In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import sys
from html.parser import HTMLParser
from html.entities import name2codepoint
sns.set(color_codes=True)
import warnings
warnings.filterwarnings("ignore")                   
import nltk                                         
nltk.download('stopwords')
from nltk.corpus import stopwords                   
from nltk.stem import PorterStemmer                 

from sklearn.feature_extraction.text import CountVectorizer          
from sklearn.feature_extraction.text import TfidfVectorizer          
from gensim.models.word2vec import Word2Vec                                  
import re
%matplotlib inline

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/chd415/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# load training data text
def load_data(filename):
    load_file = pd.read_csv(filename,delimiter=',', header=0,
                        dtype={'name':str, 'lvl1':str, 'lvl2':str, 'lvl3':str, 'descrption':str, 'type':str})
    load_file.columns = ['id', 'name','lvl1','lvl2','lvl3','descrption','price','type']
    load_file.duplicated(subset=None, keep='first')
    load_file.set_index('id', inplace = True)
    load_file.head()
    return load_file
#print(len(train_file))
def load_label(filename):
    load_label = pd.read_csv(filename,delimiter=',', header=0)
    load_label.columns = ['id', 'score']
    load_label.duplicated(subset=None, keep='first')
    load_label.set_index('id', inplace = True)
    return load_label

In [3]:
def map_mathod(column):
    values = []
    indexs = []
    mapping = {}
    index = 0
    for count in range(len(train_file)):
        value = train_file.get_value(count+1,column)
        if value in values and value != np.nan:
            continue
        values.append(value)
        indexs.append(len(values))
    for j in range(len(indexs)):
        mapping[values[j]] = indexs[j]
    mapping[np.nan] = 0.0
    return mapping
#train_file['lvl3'] = train_file['lvl3'].str.lower().replace('[^\'\w]+',' ',regex=True)
#mapping_lvl3 = map_mathod('lvl3')
#print(mapping_lvl3)

In [53]:
def text_embedding(column):
    temp_X = column.astype(str)
    stop = set(stopwords.words('english'))
    temp =[]
    snow = nltk.stem.SnowballStemmer('english')

    for sentence in temp_X: 
        words = [snow.stem(word) for word in sentence.split(' ') if word not in stopwords.words('english')]   # Stemming and removing stopwords
        temp.append(sentence)

    count_vect = CountVectorizer(max_features=5000)
    bow_data = count_vect.fit_transform(temp)

    final_tf = temp
    tf_idf = TfidfVectorizer(max_features=5000)
    tf_data = tf_idf.fit_transform(final_tf)
    w2v_data = temp
    splitted = []
    for row in w2v_data: 
        splitted.append([word for word in row.split()])     #splitting words
    
    train_w2v = Word2Vec(splitted,min_count=5,size=50, workers=4)
    avg_data = []
    for row in splitted:
        vec = np.zeros(50)
        count = 0
        for word in row:
            try:
                vec += train_w2v[word]
                count += 1
            except:
                pass
        avg_data.append(vec/count)
    
    return avg_data

In [54]:
def clean_data(filename):
    #clean up data for lvl 1&2&3
    filename['lvl1'] = filename['lvl1'].str.lower().replace('[^a-zA-Z]+',' ',regex=True)
    filename['lvl2'] = filename['lvl2'].str.lower().replace('[^\'\w]+',' ',regex=True)
    filename['lvl3'] = filename['lvl3'].str.lower().replace('[^\'\w]+',' ',regex=True)
    filename['descrption'] = filename['descrption'].str.lower()
    filename['name'] = filename['name'].str.lower()
    
    mapping_lvl1 = map_mathod('lvl1')
    mapping_lvl2 = map_mathod('lvl2')
    mapping_lvl3 = map_mathod('lvl3')
    
    filename['lvl1'] = filename['lvl1'].map(mapping_lvl1)
    filename['lvl2'] = filename['lvl2'].map(mapping_lvl2)
    filename['lvl3'] = filename['lvl3'].map(mapping_lvl3)
    
    #normalize price
    maxp = filename.price.max()
    valuethred = 600.
    filename['price'] = filename['price'].clip(lower=0.,upper=valuethred).div(valuethred,fill_value=None)
    #hist = train_file['price'].hist(bins=10)
    #maxp

    #clean up type 
    mapping_type = {'international':1.,'local':2., np.nan:0.}
    filename['type'] = filename['type'].map(mapping_type)
    
    #clean up text
    description_X = filename.descrption.str.lower().replace('<.*?>','',regex=True).replace('[^\w\s]+',' ',regex=True)
    filename['descrption'] = text_embedding(description_X)
    
    name_X = filename.name.str.lower().replace('<.*?>','',regex=True).replace('[^\w\s]+',' ',regex=True)
    filename['name'] = text_embedding(name_X)

    return filename

In [55]:
train_file = load_data('train_data.csv')
cleaned_train = clean_data(train_file)
train_score = load_label('train_label.csv')

In [56]:
cleaned_train.head(10)

Unnamed: 0_level_0,name,lvl1,lvl2,lvl3,descrption,price,type
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,"[0.2785512868847166, 0.19038709785257066, -0.0...",1.0,1.0,1.0,"[0.11597492836881429, 1.4357457332662307, 0.50...",0.213333,1.0
2,"[0.28689356201461386, 0.22276211396924087, -0....",2.0,2.0,2.0,"[0.23798063496748606, 0.1815572033325831, -0.7...",0.024483,1.0
3,"[-0.49888483695685865, 1.109314926713705, -0.4...",3.0,3.0,3.0,"[-0.21071642846800387, 1.2024616375565529, 0.2...",0.0235,1.0
4,"[-0.5269565202761441, 1.026423168642556, -0.64...",3.0,3.0,3.0,"[-1.2040237646017755, 1.446750091300124, -0.20...",0.0299,1.0
5,"[-0.309658750093409, 0.9799015782773495, -1.06...",3.0,3.0,3.0,"[-0.5666554727488093, 0.8941111771596802, -1.2...",0.011333,1.0
6,"[0.38679614745908314, 0.3356042135920789, 0.19...",4.0,4.0,4.0,"[-0.4136801449243318, 0.08116007901050827, -0....",0.648317,1.0
7,"[0.14339128910348967, 0.6498455840807694, -0.5...",2.0,5.0,5.0,"[-0.269310249805902, 0.7997061725367199, -1.22...",1.0,2.0
8,"[-0.31091215420100426, -0.012318719799319902, ...",5.0,6.0,6.0,"[-0.020183219733934814, 0.9211052132980718, 0....",0.036233,1.0
9,"[0.37643933296203613, 0.35806038975715637, 0.2...",6.0,7.0,7.0,"[0.25171623274831, 0.7570996880531311, 0.01779...",0.041667,2.0
10,"[0.36040696160246927, 0.16271462974449, 0.4959...",6.0,7.0,8.0,"[0.07560220795373122, 0.25588008761405945, 0.4...",0.0158,1.0


In [69]:
def text_temp(column):
    temp_X = column.astype(str)
    stop = set(stopwords.words('english'))
    temp =[]
    snow = nltk.stem.SnowballStemmer('english')

    for sentence in temp_X: 
        words = [snow.stem(word) for word in sentence.split(' ') if word not in stopwords.words('english')]   # Stemming and removing stopwords
        temp.append(sentence)

    count_vect = CountVectorizer(max_features=5000)
    bow_data = count_vect.fit_transform(temp)

    final_tf = temp
    tf_idf = TfidfVectorizer(max_features=5000)
    tf_data = tf_idf.fit_transform(final_tf)
    w2v_data = temp
    splitted = []
    for row in w2v_data: 
        splitted.append([word for word in row.split()])     #splitting words
    
    train_w2v = Word2Vec(splitted,min_count=5,size=50, workers=4)
    avg_data = []
    for row in splitted:
        vec = np.zeros(50, dtype=float)
        count = 0
        for word in row:
            try:
                vec += train_w2v[word]
                count += 1
            except:
                pass
        if (count == 0):
            avg_data.append(vec)
        else:
            avg_data.append(vec/count)
#        avg_data.append(vec)
    
    return avg_data

In [73]:
import numpy as np
LA = np.linalg

temp = load_data('train_data.csv')
description_X = temp.descrption.str.lower().replace('<.*?>','',regex=True).replace('[^\w\s]+',' ',regex=True)
temp['descrption'] = text_temp(description_X)

lg = temp.descrption.as_matrix(columns=None).tolist()
lg_array = np.vstack( lg )

#U, s, Vh = LA.svd(lg_array, full_matrices=True)
#assert np.allclose(lg_array, np.dot(U, np.dot(np.diag(s), Vh)))

#s[3:] = 0
#new_lg = np.dot(U, np.dot(np.diag(s), Vh))
print(lg_array)
#print(where_are_NaNs)

[[ 0.02889192  1.28863436  0.4601773  ...  0.14331316  0.00685618
   0.52905368]
 [ 0.1619879   0.37270176 -0.93766182 ... -0.80113736 -0.21356619
  -0.39530368]
 [-0.22494442  1.24141782  0.10931073 ... -0.70130394 -0.87736782
   1.0250334 ]
 ...
 [ 0.21175022  0.36488799 -0.285278   ... -0.3001667  -0.71704184
   0.48001188]
 [-0.16509218  0.51833014 -0.27890221 ... -0.0982828  -1.24664449
   0.54783445]
 [-0.63246833  0.10083609 -0.60401502 ...  0.54369822 -1.70785115
   0.9342068 ]]


In [77]:
U, s, Vh = LA.svd(lg_array, full_matrices=True)
assert np.allclose(lg_array, np.dot(U, np.dot(np.diag(s), Vh)))

s[3:] = 0
new_lg = np.dot(U, np.dot(np.diag(s), Vh))
print(new_lg)
#print(where_are_NaNs)

ValueError: shapes (18141,18141) and (50,50) not aligned: 18141 (dim 1) != 50 (dim 0)

In [8]:
la = cleaned_train.lvl1.as_matrix(columns=None).tolist()
lb = cleaned_train.lvl2.as_matrix(columns=None).tolist()
lc = cleaned_train.lvl3.as_matrix(columns=None).tolist()
ld = cleaned_train.price.as_matrix(columns=None).tolist()
le = cleaned_train.type.as_matrix(columns=None).tolist()
lf = cleaned_train.name.as_matrix(columns=None).tolist()
lg = cleaned_train.descrption.as_matrix(columns=None).tolist()
X = la
X = np.column_stack((X,lb))
X = np.column_stack((X,lc))
X = np.column_stack((X,ld))
X = np.column_stack((X,le))
X = np.column_stack((X,lf))
X = np.column_stack((X,lg))
X = X.tolist()
Y = train_score.score.as_matrix(columns=None).tolist()
print(len(X))

18141


In [16]:
from sklearn.utils import shuffle
X, Y = shuffle(X, Y)
from sklearn.model_selection import train_test_split
X_train, X_validation, y_train, y_validation = train_test_split(X, Y, test_size=0.20, random_state=0)

from keras.preprocessing import sequence
maxlen = 120
X_train = sequence.pad_sequences(X_train, maxlen=maxlen, dtype='float32')
X_validation = sequence.pad_sequences(X_validation, maxlen=maxlen, dtype='float32')
#X_train = np.any(np.isnan(X_train))
#X_train = np.all(np.isfinite(X_train))
print(X_train[1400].size)

120


In [10]:
import os
import time
from sklearn import metrics
from sklearn import preprocessing
import numpy as np
import pandas as pd
import random
import math
import warnings
warnings.filterwarnings("ignore")

from matplotlib import pyplot as plt

from sklearn import preprocessing
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.feature_selection import SelectPercentile
from sklearn.feature_selection import chi2

from sklearn import linear_model
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn import tree
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.grid_search import GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier

from sklearn.ensemble import BaggingClassifier

from sklearn.model_selection import cross_val_score

In [11]:
# Linear Regression Classifier
def linear_regression_classifier(train_x, train_y):
    model = linear_model.LinearRegression()
    model.fit(train_x, train_y)
    return model
 
# Multinomial Naive Bayes Classifier
def naive_bayes_classifier(train_x, train_y):
    model = MultinomialNB()

    param_grid = {'alpha': [math.pow(10,-i) for i in range(11)]}
    grid_search = GridSearchCV(model, param_grid, n_jobs = 1, verbose=1)
    grid_search.fit(train_x, train_y)
    best_parameters = grid_search.best_estimator_.get_params()
    
    model = MultinomialNB(alpha = best_parameters['alpha'])  
    model.fit(train_x, train_y)
    return model
 
 
# KNN Classifier
def knn_classifier(train_x, train_y):
    model = KNeighborsClassifier()

    param_grid = {'n_neighbors': list(range(1,21))}
    grid_search = GridSearchCV(model, param_grid, n_jobs = 1, verbose=1)
    grid_search.fit(train_x, train_y)
    best_parameters = grid_search.best_estimator_.get_params()
    
    model = KNeighborsClassifier(n_neighbors = best_parameters['n_neighbors'])

    bagging = BaggingClassifier(model, max_samples=0.5, max_features=1 )
    bagging.fit(train_x, train_y)
    return bagging
 
 
# Logistic Regression Classifier
def logistic_regression_classifier(train_x, train_y):
    model = LogisticRegression(penalty='l2')
    model.fit(train_x, train_y)
    return model
 
 
# Random Forest Classifier
def random_forest_classifier(train_x, train_y):
    model = RandomForestClassifier()

    param_grid = {'n_estimators': list(range(1,21))}
    grid_search = GridSearchCV(model, param_grid, n_jobs = 1, verbose=1)
    grid_search.fit(train_x, train_y)
    best_parameters = grid_search.best_estimator_.get_params()
    
    model = RandomForestClassifier(n_estimators = best_parameters['n_estimators'])
    
    model.fit(train_x, train_y)
    return model
 
 
# Decision Tree Classifier
def decision_tree_classifier(train_x, train_y):
    model = tree.DecisionTreeClassifier()
    model.fit(train_x, train_y)

    bagging = BaggingClassifier(model, max_samples=0.5, max_features=1 )
    bagging.fit(train_x, train_y)
    return bagging
 
 
# GBDT(Gradient Boosting Decision Tree) Classifier
def gradient_boosting_classifier(train_x, train_y):
    model = GradientBoostingClassifier()
    
    model = RandomForestClassifier()

    param_grid = {'n_estimators': list(range(100,300,10))}
    grid_search = GridSearchCV(model, param_grid, n_jobs = 1, verbose=1)
    grid_search.fit(train_x, train_y)
    best_parameters = grid_search.best_estimator_.get_params()
    
    model = RandomForestClassifier(n_estimators = best_parameters['n_estimators'])

    model.fit(train_x, train_y)
    return model

# SVM Classifier
def svm_classifier(train_x, train_y):
    model = SVC(kernel='linear', probability=True)
    model.fit(train_x, train_y)
    return model
 
# SVM Classifier using cross validation
def svm_cross_validation(train_x, train_y):
    model = SVC(kernel='linear', probability=True)
    param_grid = {'C': [1e-3, 1e-2, 1e-1, 1, 10, 100, 1000], 'gamma': [0.001, 0.0001]}
    grid_search = GridSearchCV(model, param_grid, n_jobs = 1, verbose=1)
    grid_search.fit(train_x, train_y)
    best_parameters = grid_search.best_estimator_.get_params()
    #for para, val in best_parameters.items():
        #print para, val
    model = SVC(kernel='rbf', C=best_parameters['C'], gamma=best_parameters['gamma'], probability=True)
    model.fit(train_x, train_y)
    return model

def feature_select(x,y):
    clf = ExtraTreesClassifier()
    clf = clf.fit(x, y)
    model = SelectFromModel(clf, prefit=True)
    return model

In [12]:
if __name__ == '__main__': 
    thresh = 0.5    
#    model_save_file = "/home/jason/datamining/model/models"     
#    model_save = {}
#    result_save_file = '/home/jason/datamining/result/results' 
     
    test_classifiers = ['NB','KNN','LR','RF','DT','GBC','SVM']    
    classifiers = { 
                    'NB':naive_bayes_classifier,
                   'KNN':knn_classifier,
                    'LR':logistic_regression_classifier,
                    'RF':random_forest_classifier,
                    'DT':decision_tree_classifier,
                   'GBC':gradient_boosting_classifier,
                   'SVM':svm_classifier,
                   
    }
        
    print('reading training and testing data...')    
    #X_train, X_validation, y_train, y_validation
    select_model = feature_select(X_train, y_train)
    X_train = select_model.transform(X_train)
    X_validation = select_model.transform(X_validation)

    result = []
        
    for classifier in test_classifiers:    
        print('******************* %s ********************' % classifier)    
        start_time = time.time()    
        model = classifiers[classifier](X_train, y_train)   
        print('training took %fs!' % (time.time() - start_time))    
        predict = model.predict(X_validation)

        precision = metrics.precision_score(y_validation, predict)    
        recall = metrics.recall_score(y_validation, predict)    
        print('precision: %.2f%%, recall: %.2f%%' % (100 * precision, 100 * recall))    
        accuracy = metrics.accuracy_score(y_validation, predict)    
        print('accuracy: %.2f%%' % (100 * accuracy))

        scores = cross_val_score(model, X_train, y_train)
        print(scores)

reading training and testing data...


ValueError: Input contains NaN, infinity or a value too large for dtype('float32').

In [13]:
from __future__ import print_function

from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.layers import Embedding
from keras.layers import Conv1D, GlobalMaxPooling1D
from keras.datasets import imdb

# set parameters:
max_features = 5000
maxlen = 400
batch_size = 32
embedding_dims = 25
filters = 250
kernel_size = 3
hidden_dims = 250
epochs = 2

print('Loading data...')
#X_train, X_validation, y_train, y_validation
#(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=max_features)
print(len(X_train), 'train sequences')
print(len(X_validation), 'test sequences')

print('Pad sequences (samples x time)')
X_train = sequence.pad_sequences(X_train, maxlen=maxlen,dtype='float32')
X_validation = sequence.pad_sequences(X_validation, maxlen=maxlen,dtype='float32')
print('X_train shape:', X_train.shape)
print('X_validation shape:', X_validation.shape)

print('Build model...')
model = Sequential()

# we start off with an efficient embedding layer which maps
# our vocab indices into embedding_dims dimensions
model.add(Embedding(max_features,
                    embedding_dims,
                    input_length=maxlen))
model.add(Dropout(0.2))

# we add a Convolution1D, which will learn filters
# word group filters of size filter_length:
model.add(Conv1D(filters,
                 kernel_size,
                 padding='valid',
                 activation='relu',
                 strides=1))
# we use max pooling:
model.add(GlobalMaxPooling1D())

# We add a vanilla hidden layer:
model.add(Dense(hidden_dims))
model.add(Dropout(0.2))
model.add(Activation('relu'))

# We project onto a single unit output layer, and squash it with a sigmoid:
model.add(Dense(1))
model.add(Activation('sigmoid'))

model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])
model.fit(X_train, y_train,
          batch_size=batch_size,
          epochs=epochs,
          validation_data=(X_validation, y_validation))

Loading data...
14512 train sequences
3629 test sequences
Pad sequences (samples x time)
X_train shape: (14512, 400)
X_validation shape: (3629, 400)
Build model...
Train on 14512 samples, validate on 3629 samples
Epoch 1/2


InvalidArgumentError: indices[22,316] = -1 is not in [0, 5000)
	 [[Node: embedding_1/embedding_lookup = GatherV2[Taxis=DT_INT32, Tindices=DT_INT32, Tparams=DT_FLOAT, _class=["loc:@training/Adam/Assign_2"], _device="/job:localhost/replica:0/task:0/device:CPU:0"](embedding_1/embeddings/read, embedding_1/Cast, training/Adam/gradients/embedding_1/embedding_lookup_grad/concat/axis)]]

In [None]:
table = pd.pivot_table(train_file,values = 'price', index=['lvl1','lvl2','lvl3'],columns=['type'],aggfunc=[min, max, np.mean])
table