In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import sys
from html.parser import HTMLParser
from html.entities import name2codepoint
sns.set(color_codes=True)
import warnings
warnings.filterwarnings("ignore")                   
import nltk                                         
nltk.download('stopwords')
from nltk.corpus import stopwords                   
from nltk.stem import PorterStemmer  
LA = np.linalg

from sklearn.feature_extraction.text import CountVectorizer          
from sklearn.feature_extraction.text import TfidfVectorizer          
from gensim.models.word2vec import Word2Vec                                  
import re
%matplotlib inline

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/chd415/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# load training data text
def load_data(filename):
    load_file = pd.read_csv(filename,delimiter=',', header=0,
                        dtype={'name':str, 'lvl1':str, 'lvl2':str, 'lvl3':str, 'descrption':str, 'type':str})
    load_file.columns = ['id', 'name','lvl1','lvl2','lvl3','descrption','price','type']
    load_file.duplicated(subset=None, keep='first')
    load_file.set_index('id', inplace = True)
    load_file.head()
    return load_file
#print(len(train_file))
def load_label(filename):
    load_label = pd.read_csv(filename,delimiter=',', header=0)
    load_label.columns = ['id', 'score']
    load_label.duplicated(subset=None, keep='first')
    load_label.set_index('id', inplace = True)
    return load_label

In [3]:
def map_mathod(column):
    values = []
    indexs = []
    mapping = {}
    index = 0
    for count in range(len(train_file)):
        value = train_file.get_value(count+1,column)
        if value in values and value != np.nan:
            continue
        values.append(value)
        indexs.append(len(values))
    for j in range(len(indexs)):
        mapping[values[j]] = indexs[j]
    mapping[np.nan] = 0.0
    return mapping
#train_file['lvl3'] = train_file['lvl3'].str.lower().replace('[^\'\w]+',' ',regex=True)
#mapping_lvl3 = map_mathod('lvl3')
#print(mapping_lvl3)

In [4]:
def text_embedding(column):
    temp_X = column.astype(str)
    stop = set(stopwords.words('english'))
    temp =[]
    snow = nltk.stem.SnowballStemmer('english')

    for sentence in temp_X: 
        words = [snow.stem(word) for word in sentence.split(' ') if word not in stopwords.words('english')]   # Stemming and removing stopwords
        temp.append(sentence)

    count_vect = CountVectorizer(max_features=5000)
    bow_data = count_vect.fit_transform(temp)

    final_tf = temp
    tf_idf = TfidfVectorizer(max_features=5000)
    tf_data = tf_idf.fit_transform(final_tf)
    w2v_data = temp
    splitted = []
    for row in w2v_data: 
        splitted.append([word for word in row.split()])     #splitting words
    
    train_w2v = Word2Vec(splitted,min_count=5,size=50, workers=4)
    avg_data = []
    for row in splitted:
        vec = np.zeros(50, dtype=float)
        count = 0
        for word in row:
            try:
                vec += train_w2v[word]
                count += 1
            except:
                pass
        if (count == 0):
            avg_data.append(vec)
        else:
            avg_data.append(vec/count)
#        avg_data.append(vec)
    
    return avg_data

In [5]:
def clean_data(filename):
    #clean up data for lvl 1&2&3
    filename['lvl1'] = filename['lvl1'].str.lower().replace('[^a-zA-Z]+',' ',regex=True)
    filename['lvl2'] = filename['lvl2'].str.lower().replace('[^\'\w]+',' ',regex=True)
    filename['lvl3'] = filename['lvl3'].str.lower().replace('[^\'\w]+',' ',regex=True)
    filename['descrption'] = filename['descrption'].str.lower()
    filename['name'] = filename['name'].str.lower()
    
    mapping_lvl1 = map_mathod('lvl1')
    mapping_lvl2 = map_mathod('lvl2')
    mapping_lvl3 = map_mathod('lvl3')
    
    filename['lvl1'] = filename['lvl1'].map(mapping_lvl1)
    filename['lvl2'] = filename['lvl2'].map(mapping_lvl2)
    filename['lvl3'] = filename['lvl3'].map(mapping_lvl3)
    
    #normalize price
    maxp = filename.price.max()
    valuethred = 600.
    filename['price'] = filename['price'].clip(lower=0.,upper=valuethred).div(valuethred,fill_value=None)
    #hist = train_file['price'].hist(bins=10)
    #maxp

    #clean up type 
    mapping_type = {'international':1.,'local':2., np.nan:0.}
    filename['type'] = filename['type'].map(mapping_type)
    
    #clean up text
    description_X = filename.descrption.str.lower().replace('<.*?>','',regex=True).replace('[^\w\s]+',' ',regex=True)
    filename['descrption'] = text_embedding(description_X)
    
    name_X = filename.name.str.lower().replace('<.*?>','',regex=True).replace('[^\w\s]+',' ',regex=True)
    filename['name'] = text_embedding(name_X)

    return filename

In [6]:
train_file = load_data('train_data.csv')
cleaned_train = clean_data(train_file)
train_score = load_label('train_label.csv')

In [7]:
cleaned_train.head(10)

Unnamed: 0_level_0,name,lvl1,lvl2,lvl3,descrption,price,type
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,"[0.1880733413355691, -0.4636911132505962, 0.26...",1.0,1.0,1.0,"[-0.5770275709219277, -0.7481932633090764, 0.0...",0.213333,1.0
2,"[0.5312821110045272, 0.026926082026745592, 0.7...",2.0,2.0,2.0,"[0.701379303385814, 0.3900831903020541, 0.2436...",0.024483,1.0
3,"[-0.7736120581626892, -0.03396958652883768, 0....",3.0,3.0,3.0,"[-0.04242800525389612, -0.41407389286905527, 0...",0.0235,1.0
4,"[-0.868405310817145, -0.09403416974579587, 0.4...",3.0,3.0,3.0,"[-0.32411699263112886, -0.37115765096885817, 0...",0.0299,1.0
5,"[-0.44530814486954895, -0.23949102259108, 0.76...",3.0,3.0,3.0,"[-0.6024488943318526, 0.6705889486604266, -0.1...",0.011333,1.0
6,"[0.42577460159858066, -0.5092235257228216, 0.1...",4.0,4.0,4.0,"[0.7746137570251118, 0.5192284722897139, 0.886...",0.648317,1.0
7,"[-0.1370278112590313, -0.2474837268774326, 0.3...",2.0,5.0,5.0,"[0.4343554526003021, -0.11628930117568058, 1.0...",1.0,2.0
8,"[-0.0919013379348649, -0.5935968094401889, 0.1...",5.0,6.0,6.0,"[-0.3752606124523276, 0.09166628048725185, 0.5...",0.036233,1.0
9,"[-0.05658536443453548, -0.6769966695989881, 0....",6.0,7.0,7.0,"[-0.13542820482612833, -0.09841193907011889, -...",0.041667,2.0
10,"[0.22501516404251257, -0.5363367721438408, -0....",6.0,7.0,8.0,"[0.41355790404809845, -0.42321546872456867, -0...",0.0158,1.0


In [69]:
def text_temp(column):
    temp_X = column.astype(str)
    stop = set(stopwords.words('english'))
    temp =[]
    snow = nltk.stem.SnowballStemmer('english')

    for sentence in temp_X: 
        words = [snow.stem(word) for word in sentence.split(' ') if word not in stopwords.words('english')]   # Stemming and removing stopwords
        temp.append(sentence)

    count_vect = CountVectorizer(max_features=5000)
    bow_data = count_vect.fit_transform(temp)

    final_tf = temp
    tf_idf = TfidfVectorizer(max_features=5000)
    tf_data = tf_idf.fit_transform(final_tf)
    w2v_data = temp
    splitted = []
    for row in w2v_data: 
        splitted.append([word for word in row.split()])     #splitting words
    
    train_w2v = Word2Vec(splitted,min_count=5,size=50, workers=4)
    avg_data = []
    for row in splitted:
        vec = np.zeros(50, dtype=float)
        count = 0
        for word in row:
            try:
                vec += train_w2v[word]
                count += 1
            except:
                pass
        if (count == 0):
            avg_data.append(vec)
        else:
            avg_data.append(vec/count)
#        avg_data.append(vec)
    
    return avg_data

In [8]:
import numpy as np
LA = np.linalg
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.dummy import DummyClassifier
from string import punctuation
from sklearn import svm
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import nltk
from nltk import ngrams
from itertools import chain
from wordcloud import WordCloud
from fractions import Fraction
import re

def text_fit(X, y, model,clf_model,coef_show=1):
    
    X_c = model.fit_transform(X)
    print('# features: {}'.format(X_c.shape[1]))
    X_train, X_test, y_train, y_test = train_test_split(X_c, y, random_state=0)
    print('# train records: {}'.format(X_train.shape[0]))
    print('# test records: {}'.format(X_test.shape[0]))
    clf = clf_model.fit(X_train, y_train)
    acc = clf.score(X_test, y_test)
    print ('Model Accuracy: {}'.format(acc))
    
    if coef_show == 1: 
        w = model.get_feature_names()
        coef = clf.coef_.tolist()[0]
        coeff_df = pd.DataFrame({'Word' : w, 'Coefficient' : coef})
        coeff_df = coeff_df.sort_values(['Coefficient', 'Word'], ascending=[0, 1])
        print('')
        print('-Top 20 positive-')
        print(coeff_df.head(20).to_string(index=False))
        print('')
        print('-Top 20 negative-')        
        print(coeff_df.tail(20).to_string(index=False))



[[-0.57702757 -0.74819326  0.09924899 ... -0.18377791  0.25190663
  -0.48338763]
 [ 0.7013793   0.39008319  0.24365189 ... -0.49847584 -0.53981707
  -0.38494914]
 [-0.04242801 -0.41407389  0.20057534 ... -0.78256806 -1.23607265
   0.1788884 ]
 ...
 [-0.6382768  -0.10912365  0.39701301 ... -0.48355715 -1.22618435
   0.06634921]
 [-0.2263508   0.14092254  0.97238378 ...  0.24495687 -0.93782678
  -0.78630965]
 [-0.10913116 -0.05172057  0.79419652 ... -0.03334181 -0.49530171
   0.00454599]]


In [None]:
#temp = load_data('train_data.csv')
#description_X = temp.descrption.str.lower().replace('<.*?>','',regex=True).replace('[^\w\s]+',' ',regex=True)
#temp['descrption'] = text_temp(description_X)
temp = load_data('train_data.csv')
temp_score = load_label('train_label.csv')
X = temp.descrption.str.lower().replace('<.*?>','',regex=True).replace('[^\w\s]+',' ',regex=True)
y = temp_score['score']


c = CountVectorizer(stop_words = 'english')
text_fit(X, y, c, LogisticRegression())

In [None]:
tfidf = TfidfVectorizer(stop_words = 'english')
text_fit(X, y, tfidf, LogisticRegression())

In [9]:
#SVD method

lg = cleaned_train.descrption.as_matrix(columns=None).tolist()
lg_array = np.vstack( lg )

U, s, Vh = LA.svd(lg_array, full_matrices=False)
assert np.allclose(lg_array, np.dot(U, np.dot(np.diag(s), Vh)))

s[3:] = 0.
new_lg = np.dot(U, np.dot(np.diag(s), Vh))
print(new_lg)
#print(where_are_NaNs)

[[-0.35198389 -0.11379996 -0.30357828 ... -0.1527949  -0.34127216
  -0.0542869 ]
 [ 0.60459277  0.16271913  0.75331262 ... -0.03694653 -0.34736228
  -0.50716436]
 [-0.13683075 -0.11227524 -0.07129964 ... -0.26827231 -0.50276253
  -0.26495914]
 ...
 [-0.07516894 -0.09759468 -0.00784303 ... -0.26397035 -0.49561554
  -0.289664  ]
 [ 0.00483437  0.20873543  0.33265187 ...  0.19107017 -0.5666628
  -0.34265145]
 [-0.14275475  0.38489335  0.35923337 ...  0.52218815 -0.60167191
  -0.27258963]]


In [14]:
la = cleaned_train.lvl1.as_matrix(columns=None).tolist()
lb = cleaned_train.lvl2.as_matrix(columns=None).tolist()
lc = cleaned_train.lvl3.as_matrix(columns=None).tolist()
ld = cleaned_train.price.as_matrix(columns=None).tolist()
le = cleaned_train.type.as_matrix(columns=None).tolist()
lf = cleaned_train.name.as_matrix(columns=None).tolist()
lg = cleaned_train.descrption.as_matrix(columns=None).tolist()
lg_array = np.vstack( lg )



X = la
X = np.column_stack((X,lb))
X = np.column_stack((X,lc))
X = np.column_stack((X,ld))
X = np.column_stack((X,le))
#X = np.column_stack((X,lf))
X = np.column_stack((X,new_lg))
X = X.tolist()
Y = train_score.score.as_matrix(columns=None).tolist()
print(len(X))

18141


In [15]:
from sklearn.utils import shuffle
X, Y = shuffle(X, Y)
from sklearn.model_selection import train_test_split
X_train, X_validation, y_train, y_validation = train_test_split(X, Y, test_size=0.20, random_state=0)

from keras.preprocessing import sequence
maxlen = 120
X_train = sequence.pad_sequences(X_train, maxlen=maxlen, dtype='float32')
X_validation = sequence.pad_sequences(X_validation, maxlen=maxlen, dtype='float32')
#X_train = np.any(np.isnan(X_train))
#X_train = np.all(np.isfinite(X_train))
print(X_train[1400].size)

120


In [16]:
import os
import time
from sklearn import metrics
from sklearn import preprocessing
import numpy as np
import pandas as pd
import random
import math
import warnings
warnings.filterwarnings("ignore")

from matplotlib import pyplot as plt

from sklearn import preprocessing
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.feature_selection import SelectPercentile
from sklearn.feature_selection import chi2

from sklearn import linear_model
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn import tree
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.grid_search import GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier

from sklearn.ensemble import BaggingClassifier

from sklearn.model_selection import cross_val_score

In [17]:
# Linear Regression Classifier
def linear_regression_classifier(train_x, train_y):
    model = linear_model.LinearRegression()
    model.fit(train_x, train_y)
    return model
 
# Multinomial Naive Bayes Classifier
def naive_bayes_classifier(train_x, train_y):
    model = MultinomialNB()

    param_grid = {'alpha': [math.pow(10,-i) for i in range(11)]}
    grid_search = GridSearchCV(model, param_grid, n_jobs = 1, verbose=1)
    grid_search.fit(train_x, train_y)
    best_parameters = grid_search.best_estimator_.get_params()
    
    model = MultinomialNB(alpha = best_parameters['alpha'])  
    model.fit(train_x, train_y)
    return model
 
 
# KNN Classifier
def knn_classifier(train_x, train_y):
    model = KNeighborsClassifier()

    param_grid = {'n_neighbors': list(range(1,21))}
    grid_search = GridSearchCV(model, param_grid, n_jobs = 1, verbose=1)
    grid_search.fit(train_x, train_y)
    best_parameters = grid_search.best_estimator_.get_params()
    
    model = KNeighborsClassifier(n_neighbors = best_parameters['n_neighbors'], algorithm='kd_tree')

    bagging = BaggingClassifier(model, max_samples=0.5, max_features=1 )
    bagging.fit(train_x, train_y)
    return bagging
 
 
# Logistic Regression Classifier
def logistic_regression_classifier(train_x, train_y):
    model = LogisticRegression(penalty='l2')
    model.fit(train_x, train_y)
    return model
 
 
# Random Forest Classifier
def random_forest_classifier(train_x, train_y):
    model = RandomForestClassifier()

    param_grid = {'n_estimators': list(range(1,21))}
    grid_search = GridSearchCV(model, param_grid, n_jobs = 1, verbose=1)
    grid_search.fit(train_x, train_y)
    best_parameters = grid_search.best_estimator_.get_params()
    
    model = RandomForestClassifier(n_estimators = best_parameters['n_estimators'])
    
    model.fit(train_x, train_y)
    return model
 
 
# Decision Tree Classifier
def decision_tree_classifier(train_x, train_y):
    model = tree.DecisionTreeClassifier()
    model.fit(train_x, train_y)

    bagging = BaggingClassifier(model, max_samples=0.5, max_features=1 )
    bagging.fit(train_x, train_y)
    return bagging
 
 
# GBDT(Gradient Boosting Decision Tree) Classifier
def gradient_boosting_classifier(train_x, train_y):
    model = GradientBoostingClassifier()
    
    model = RandomForestClassifier()

    param_grid = {'n_estimators': list(range(100,300,10))}
    grid_search = GridSearchCV(model, param_grid, n_jobs = 1, verbose=1)
    grid_search.fit(train_x, train_y)
    best_parameters = grid_search.best_estimator_.get_params()
    
    model = RandomForestClassifier(n_estimators = best_parameters['n_estimators'])

    model.fit(train_x, train_y)
    return model

# SVM Classifier
def svm_classifier(train_x, train_y):
    model = SVC(kernel='linear', probability=True)
    model.fit(train_x, train_y)
    return model
 
# SVM Classifier using cross validation
def svm_cross_validation(train_x, train_y):
    model = SVC(kernel='linear', probability=True)
    param_grid = {'C': [1e-3, 1e-2, 1e-1, 1, 10, 100, 1000], 'gamma': [0.001, 0.0001]}
    grid_search = GridSearchCV(model, param_grid, n_jobs = 1, verbose=1)
    grid_search.fit(train_x, train_y)
    best_parameters = grid_search.best_estimator_.get_params()
    #for para, val in best_parameters.items():
        #print para, val
    model = SVC(kernel='rbf', C=best_parameters['C'], gamma=best_parameters['gamma'], probability=True)
    model.fit(train_x, train_y)
    return model

def feature_select(x,y):
    clf = ExtraTreesClassifier()
    clf = clf.fit(x, y)
    model = SelectFromModel(clf, prefit=True)
    return model

In [None]:
if __name__ == '__main__': 
    thresh = 0.5    
#    model_save_file = "/home/jason/datamining/model/models"     
#    model_save = {}
#    result_save_file = '/home/jason/datamining/result/results' 
     
    test_classifiers = ['KNN','LR','RF','DT','GBC','SVM']    
    classifiers = {
                   'KNN':knn_classifier,
                    'LR':logistic_regression_classifier,
                    'RF':random_forest_classifier,
                    'DT':decision_tree_classifier,
                   'GBC':gradient_boosting_classifier,
                   'SVM':svm_classifier,
                   
    }
        
    print('reading training and testing data...')    
    #X_train, X_validation, y_train, y_validation
    select_model = feature_select(X_train, y_train)
    X_train = select_model.transform(X_train)
    X_validation = select_model.transform(X_validation)

    result = []
        
    for classifier in test_classifiers:    
        print('******************* %s ********************' % classifier)    
        start_time = time.time()    
        model = classifiers[classifier](X_train, y_train)   
        print('training took %fs!' % (time.time() - start_time))    
        predict = model.predict(X_validation)

        precision = metrics.precision_score(y_validation, predict)    
        recall = metrics.recall_score(y_validation, predict)    
        print('precision: %.2f%%, recall: %.2f%%' % (100 * precision, 100 * recall))    
        accuracy = metrics.accuracy_score(y_validation, predict)    
        print('accuracy: %.2f%%' % (100 * accuracy))

        scores = cross_val_score(model, X_train, y_train)
        print(scores)

reading training and testing data...
******************* KNN ********************
Fitting 3 folds for each of 20 candidates, totalling 60 fits


[Parallel(n_jobs=1)]: Done  60 out of  60 | elapsed:    3.6s finished


training took 3.770676s!
precision: 70.64%, recall: 97.41%
accuracy: 70.65%
[0.70070277 0.70477569 0.70105437]
******************* LR ********************
training took 0.095091s!
precision: 71.24%, recall: 91.18%
accuracy: 68.92%
[0.70173625 0.69836676 0.70105437]
******************* RF ********************
Fitting 3 folds for each of 20 candidates, totalling 60 fits


[Parallel(n_jobs=1)]: Done  60 out of  60 | elapsed:   11.7s finished


training took 12.678014s!
precision: 76.97%, recall: 85.15%
accuracy: 72.53%
[0.71992559 0.7215216  0.72793054]
******************* DT ********************
training took 0.237765s!
precision: 71.14%, recall: 87.54%
accuracy: 67.32%
[0.70421662 0.71593963 0.69464544]
******************* GBC ********************
Fitting 3 folds for each of 20 candidates, totalling 60 fits


[Parallel(n_jobs=1)]: Done  60 out of  60 | elapsed:  3.8min finished


training took 244.804178s!
precision: 77.07%, recall: 86.49%
accuracy: 73.27%
[0.72984704 0.73330577 0.73247881]
******************* SVM ********************


In [None]:
table = pd.pivot_table(train_file,values = 'price', index=['lvl1','lvl2','lvl3'],columns=['type'],aggfunc=[min, max, np.mean])
table