In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import sys
from html.parser import HTMLParser
from html.entities import name2codepoint
sns.set(color_codes=True)
import warnings
warnings.filterwarnings("ignore")                   
import nltk                                         
nltk.download('stopwords')
from nltk.corpus import stopwords                   
from nltk.stem import PorterStemmer  
LA = np.linalg

from sklearn.feature_extraction.text import CountVectorizer          
from sklearn.feature_extraction.text import TfidfVectorizer   
from sklearn.feature_extraction.text import HashingVectorizer
from scipy.sparse import csr_matrix
from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.preprocessing import Normalizer
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.decomposition import TruncatedSVD
from collections import defaultdict
#from gensim.models.word2vec import Word2Vec  
import spacy
from spacy.lang.en.examples import sentences

import re

%matplotlib inline

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/chd415/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# load training data text
def load_data(filename):
    load_file = pd.read_csv(filename,delimiter=',', header=0,
                        dtype={'name':str, 'lvl1':str, 'lvl2':str, 'lvl3':str, 'descrption':str, 'type':str})
    load_file.columns = ['id', 'name','lvl1','lvl2','lvl3','descrption','price','type']
    load_file.duplicated(subset=None, keep='first')
    load_file.set_index('id', inplace = True)
    load_file.head()
    return load_file
#print(len(train_file))
def load_label(filename):
    load_label = pd.read_csv(filename,delimiter=',', header=0)
    load_label.columns = ['id', 'score']
    load_label.duplicated(subset=None, keep='first')
    load_label.set_index('id', inplace = True)
    return load_label

In [3]:
class MultiColumnLabelEncoder:
    def __init__(self,columns = None):
        self.columns = columns # array of column names to encode

    def fit(self,X,y=None):
        return self # not relevant here

    def transform(self,X):

        output = X.copy()
        if self.columns is not None:
            for col in self.columns:
                output[col] = LabelEncoder().fit_transform(output[col])
        else:
            for colname,col in output.iteritems():
                output[colname] = LabelEncoder().fit_transform(col)
        return output

    def fit_transform(self,X,y=None):
        return self.fit(X,y).transform(X)


In [4]:
def clean_data(filename):
    
    filename['lvl1'] = filename['lvl1'].str.lower().replace('[^a-zA-Z]+',' ',regex=True)
    filename['lvl2'] = filename['lvl2'].str.lower().replace('[^\'\w]+',' ',regex=True)
    filename['lvl3'] = filename['lvl3'].str.lower().replace('[^\'\w]+',' ',regex=True)
    filename['descrption'] = filename['descrption'].str.lower()
    filename['name'] = filename['name'].str.lower()

    #clean up data for lvl 1&2&3 and type
    temp =  filename.drop(['price', 'descrption','name'], axis=1)
    outfile = MultiColumnLabelEncoder(columns = ['lvl1','lvl2','lvl3','type']).fit_transform(temp.astype(str))
    
    #one-hot encoder
    enc = preprocessing.OneHotEncoder()
    enc.fit(outfile)
    outfile = enc.transform(outfile).toarray()

    #bin price
    valuethred = 5000.
    clipped_price = filename['price'].clip(lower=0.,upper=valuethred)
    price_kbin = clipped_price.as_matrix(columns=None).reshape(-1, 1)
    kbinest = KBinsDiscretizer(n_bins=20, encode='onehot-dense')
    kbinout = kbinest.fit_transform(price_kbin)
    outfile = np.column_stack((outfile,kbinout))

    #normalize price
    valuethred = 5000.
    clipped2norm = filename['price'].clip(lower=0.,upper=valuethred)
    price_X = clipped2norm.as_matrix(columns=None).reshape(-1, 1)
    transformer = Normalizer(copy=True,norm='l2')
    ld = transformer.fit_transform(price_X)
    outfile = np.column_stack((outfile,ld))  

    #clean up text
    description_X = filename.descrption.str.lower().replace('<li>','final ',regex=True).replace('<.*?>','',
                                                                    regex=True).replace('[^\w\s]+',' ',regex=True)
    count_descrption = description_X.str.count('final').fillna(0).tolist()
    outfile = np.column_stack((outfile,count_descrption))
    
    description_X = description_X.str.lower().replace('final ','',regex=True).replace('\d+', '',regex=True)
    descrption_Xstring = pd.Series(description_X.tolist()).astype(str)
    count_wordcount = descrption_Xstring.apply(lambda x: len(x.split(' ')))
    count_lettercount = descrption_Xstring.apply(lambda x: len(x))
    outfile = np.column_stack((outfile,count_wordcount))
    outfile = np.column_stack((outfile,count_lettercount))

    clg_g = spacy.load('en_core_web_lg')
    descrption_clg = []
    for i in range(len(descrption_Xstring)):
        doc = clg_g(descrption_Xstring[i])
        descrption_clg.append(doc.vector)
    lg = np.vstack(descrption_clg)
    
    
    name_X = filename.name.str.lower().replace('<.*?>','',regex=True).replace('[^\w\s]+',' ',
                                                                        regex=True).replace('\d+', '',regex=True)
    name_Xstring = pd.Series(name_X.tolist()).astype(str)
    name_wordcount = name_Xstring.apply(lambda x: len(x.split(' ')))
    name_lettercount = name_Xstring.apply(lambda x: len(x))
    outfile = np.column_stack((outfile,name_wordcount))
    outfile = np.column_stack((outfile,name_lettercount))
    
    clg_f = spacy.load('en_core_web_lg')
    name_clg = []
    for i in range(len(name_Xstring)):
        doc = clg_f(name_Xstring[i])
        name_clg.append(doc.vector)
    lf = np.vstack(name_clg)

    #svd
    svd_n = TruncatedSVD(n_components=30,algorithm='arpack')
    new_lg = svd_n.fit_transform(lg.astype('f'))
    svd_f = TruncatedSVD(n_components=15,algorithm='arpack')
    new_lf = svd_n.fit_transform(lf.astype('f'))
    
    outfile = np.column_stack((outfile,new_lg))
    outfile = np.column_stack((outfile,new_lf))
    
    return outfile

In [5]:
train_file = load_data('train_data.csv')
test_file = load_data('test_data.csv')
combined_file = pd.concat([train_file,test_file])
cleaned_train = clean_data(combined_file)
train_score = load_label('train_label.csv')
np.shape(cleaned_train)

(36283, 340)

In [6]:
cleaned_train

array([[ 0.        ,  0.        ,  0.        , ...,  0.34309661,
         0.17233208,  0.45813119],
       [ 0.        ,  1.        ,  0.        , ..., -0.04276813,
         0.20824412, -0.23762988],
       [ 0.        ,  0.        ,  0.        , ..., -0.16593644,
        -0.04395773,  0.114619  ],
       ...,
       [ 0.        ,  0.        ,  0.        , ..., -0.14239173,
        -0.00520656, -0.13928017],
       [ 0.        ,  0.        ,  0.        , ..., -0.04370552,
         0.08029149, -0.26857373],
       [ 0.        ,  0.        ,  0.        , ...,  0.10217056,
        -0.12637366,  0.1478506 ]])

In [7]:
X = cleaned_train
w,b = np.shape(np.array(X))
print(np.shape(np.array(X)))
Y = train_score.score.as_matrix(columns=None).tolist()
b

(36283, 340)


340

In [8]:
X = cleaned_train[:18141]
XX = cleaned_train[18141:]
Y = train_score.score.as_matrix(columns=None).tolist()
print(np.size(Y))
print(np.shape(X))
print(np.shape(XX))


18141
(18141, 340)
(18142, 340)


In [9]:
from sklearn.utils import shuffle
X, Y = shuffle(X, Y)
from sklearn.model_selection import train_test_split
X_train, X_validation, y_train, y_validation = train_test_split(X, Y, test_size=0.20, random_state=0)

print(X_train[1400].size)

340


In [10]:
import os
import time
from sklearn import metrics
from sklearn import preprocessing
import numpy as np
import pandas as pd
import random
import math
import warnings
warnings.filterwarnings("ignore")

from matplotlib import pyplot as plt

from sklearn import preprocessing
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.feature_selection import SelectPercentile
from sklearn.feature_selection import chi2

from sklearn import linear_model
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn import tree
from sklearn.tree import DecisionTreeRegressor 
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier

from sklearn.ensemble import BaggingClassifier  
from sklearn.model_selection import cross_val_score

In [11]:
# Linear Regression Classifier
def linear_regression_classifier(train_x, train_y):
    model = linear_model.LinearRegression()
    model.fit(train_x, train_y)
    return model
 
# Multinomial Naive Bayes Classifier
def naive_bayes_classifier(train_x, train_y):
    model = MultinomialNB()

    param_grid = {'alpha': [math.pow(10,-i) for i in range(11)]}
    grid_search = GridSearchCV(model, param_grid, n_jobs = 1, verbose=1)
    grid_search.fit(train_x, train_y)
    best_parameters = grid_search.best_estimator_.get_params()
    
    model = MultinomialNB(alpha = best_parameters['alpha'])  
    model.fit(train_x, train_y)
    return model
 
# KNN Classifier
def knn_classifier(train_x, train_y):
    model = KNeighborsClassifier()

    param_grid = {'n_neighbors': list(range(1,21))}
    grid_search = GridSearchCV(model, param_grid, n_jobs = 1, verbose=1)
    grid_search.fit(train_x, train_y)
    best_parameters = grid_search.best_estimator_.get_params()
    
    model = KNeighborsClassifier(n_neighbors = best_parameters['n_neighbors'], algorithm='kd_tree')

    bagging = BaggingClassifier(model, max_samples=0.5, max_features=1 )
    bagging.fit(train_x, train_y)
    return bagging
 
# Logistic Regression Classifier
def logistic_regression_classifier(train_x, train_y):
    model = LogisticRegression(penalty='l2')
    model.fit(train_x, train_y)
    return model
 
# Random Forest Classifier
def random_forest_classifier(train_x, train_y):
    model = RandomForestClassifier()

    param_grid = {'n_estimators': list(range(1,21))}
    grid_search = GridSearchCV(model, param_grid, n_jobs = 1, verbose=1)
    grid_search.fit(train_x, train_y)
    best_parameters = grid_search.best_estimator_.get_params()
    
    model = RandomForestClassifier(n_estimators = best_parameters['n_estimators'])
    
    model.fit(train_x, train_y)
    return model
  
# Decision Tree Classifier
def decision_tree_classifier(train_x, train_y):
    model = tree.DecisionTreeClassifier()
    model.fit(train_x, train_y)

    bagging = BaggingClassifier(model, max_samples=0.5, max_features=1 )
    bagging.fit(train_x, train_y)
    
    return bagging
 
# GBDT(Gradient Boosting Decision Tree) Classifier
def gradient_boosting_classifier(train_x, train_y):
    model = GradientBoostingClassifier()
    
    model = RandomForestClassifier()

    param_grid = {'n_estimators': list(range(100,300,10))}
    grid_search = GridSearchCV(model, param_grid, n_jobs = 1, verbose=1)
    grid_search.fit(train_x, train_y)
    best_parameters = grid_search.best_estimator_.get_params()
    
    model = RandomForestClassifier(n_estimators = best_parameters['n_estimators'])

    model.fit(train_x, train_y)
    return model

# SVM Classifier
def svm_classifier(train_x, train_y):
    model = SVC(kernel='linear', probability=True)
    model.fit(train_x, train_y)
    return model
 
# SVM Classifier using cross validation
def svm_cross_validation(train_x, train_y):
    model = SVC(kernel='linear', probability=True)
    param_grid = {'C': [1e-3, 1e-2, 1e-1, 1, 10, 100, 1000], 'gamma': [0.001, 0.0001]}
    grid_search = GridSearchCV(model, param_grid, n_jobs = 1, verbose=1)
    grid_search.fit(train_x, train_y)
    best_parameters = grid_search.best_estimator_.get_params()

    model = SVC(kernel='rbf', C=best_parameters['C'], gamma=best_parameters['gamma'], probability=True)
    model.fit(train_x, train_y)
    return model

def feature_select(x,y):
    clf = ExtraTreesClassifier()
    clf = clf.fit(x, y)
    model = SelectFromModel(clf, prefit=True)
    return model

In [12]:
# just for my own record

if __name__ == '__main__': 
    thresh = 0.5    
    
    test_classifiers = ['LR','RF','DT']    
    classifiers = {
                    'LR':logistic_regression_classifier,
                    'RF':random_forest_classifier,
                    'DT':decision_tree_classifier,
    }
       
    print('reading training and testing data...')    

    select_model = feature_select(X_train, y_train)
    X_train = select_model.transform(X_train)
    X_validation = select_model.transform(X_validation)

    result = []
       
    for classifier in test_classifiers:    
        print('******************* %s ********************' % classifier)    
        start_time = time.time()    
        model = classifiers[classifier](X_train, y_train)   
        print('training took %fs!' % (time.time() - start_time))    
        predict = model.predict(X_validation)

        precision = metrics.precision_score(y_validation, predict)    
        recall = metrics.recall_score(y_validation, predict)    
        accuracy = metrics.accuracy_score(y_validation, predict)    
        print('accuracy: %.2f%%' % (100 * accuracy))
        logloss = metrics.log_loss(y_validation, predict)
        print('loss: %.2f' % (logloss))

        scores = cross_val_score(model, X_train, y_train)
#        print(scores)

reading training and testing data...
******************* LR ********************
training took 1.214134s!
accuracy: 80.46%
loss: 6.75
******************* RF ********************
Fitting 3 folds for each of 20 candidates, totalling 60 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  60 out of  60 | elapsed:   39.3s finished


training took 43.186757s!
accuracy: 80.99%
loss: 6.57
******************* DT ********************
training took 1.909744s!
accuracy: 67.29%
loss: 11.30


In [13]:
X_train
#X_train, y_train = X,Y

array([[ 0.        ,  0.        ,  0.        , ..., -0.08697326,
        -0.18759921,  0.11283299],
       [ 0.        ,  1.        ,  0.        , ...,  0.37877461,
         0.35219729,  0.17430373],
       [ 0.        ,  0.        ,  1.        , ..., -0.0323269 ,
        -0.10206458,  0.20948608],
       ...,
       [ 0.        ,  0.        ,  1.        , ...,  0.09015514,
         0.12230652,  0.12184646],
       [ 0.        ,  0.        ,  1.        , ...,  0.07511818,
         0.16912353,  0.06112052],
       [ 0.        ,  0.        ,  1.        , ..., -0.0028989 ,
        -0.36047363,  0.35635692]])

In [14]:
X_train = cleaned_train[:18141]
X_test = cleaned_train[18141:]
y_train = train_score.score.as_matrix(columns=None).tolist()

In [15]:
if __name__ == '__main__': 
    thresh = 0.5    
     
    test_classifiers = ['LR','RF','GBC','DT']    
    classifiers = {
                   'LR':logistic_regression_classifier,
                   'RF':random_forest_classifier,
                   'GBC':gradient_boosting_classifier,
                   'DT':decision_tree_classifier
    }
        
    print('reading training and testing data...')    

    select_model = feature_select(X_train, y_train)
    X_train = select_model.transform(X_train)
    X_test = select_model.transform(X_test)

    result = []
    
    start_time = time.time()
    modellr = classifiers['LR'](X_train, y_train)   
    print('training took %fs!' % (time.time() - start_time))    
    Y_predict_lr = modellr.predict_proba(X_test)[:,1]
    print('predict finished')
    
    modelrf = classifiers['RF'](X_train, y_train)   
    print('training took %fs!' % (time.time() - start_time))
    regr = RandomForestRegressor(max_depth=2, max_features=1)
    regr.fit(X_train, y_train)
    Y_predict_rf = regr.predict(X_test)
    print('predict finished')
    
    modelgbc = classifiers['GBC'](X_train, y_train)   
    print('training took %fs!' % (time.time() - start_time))    
    Y_predict_gbc = modelgbc.predict_proba(X_test)[:,1]
    print('predict finished')
    
    modeldt = classifiers['DT'](X_train, y_train)  
    print('training took %fs!' % (time.time() - start_time))
    regressor = DecisionTreeRegressor(max_depth=2) 
    regressor.fit(X_train, y_train)
    Y_predict_dt = regressor.predict(X_test)
    print('predict finished')


reading training and testing data...
training took 2.251265s!
predict finished
Fitting 3 folds for each of 20 candidates, totalling 60 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  60 out of  60 | elapsed:   51.6s finished


training took 58.578194s!
predict finished
Fitting 3 folds for each of 20 candidates, totalling 60 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  60 out of  60 | elapsed: 84.1min finished


training took 5182.220682s!
predict finished
training took 5185.944616s!
predict finished


In [32]:
#nn train
X_train = cleaned_train[:18141]
X_test = cleaned_train[18141:]
y_train = train_score.score.as_matrix(columns=None).tolist()
X_array = np.asarray(X_train)
Y_array = np.asarray(y_train)
Xtest_array = np.asarray(X_test) 
import warnings
warnings.filterwarnings("ignore")   

from keras.models import Sequential
from keras.layers import Dense


# fix random seed for reproducibility
np.random.seed(7)

# split into input (X) and output (Y) variables
X = X_array
Y = Y_array
# create model
nnmodel = Sequential()
nnmodel.add(Dense(100, input_dim=b, activation='relu'))
nnmodel.add(Dense(100, activation='relu'))
nnmodel.add(Dense(100, activation='relu'))
nnmodel.add(Dense(100, activation='relu'))
nnmodel.add(Dense(1, activation='sigmoid'))

nnmodel.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

nnmodel.fit(X, Y, epochs=20, batch_size=150)

scores = nnmodel.evaluate(X, Y)
print("\n%s: %.2f%%" % (nnmodel.metrics_names[1], scores[1]*100))
Y_predict_nn = nnmodel.predict(Xtest_array, verbose=0)
Y_predict_nn = np.squeeze(Y_predict_nn, axis=1)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20

acc: 82.47%


In [18]:
from sklearn.ensemble import VotingClassifier
eclf1 = VotingClassifier(estimators=[('lr', modellr), ('rf', modelrf), ('gbc', modelgbc)], voting='soft')
eclf1 = eclf1.fit(X_train, y_train)
Y_predict_emsemble = eclf1.predict_proba(X_test)[:,1]

In [28]:
Y_predict = []
for i in range(len(Y_predict_emsemble)):
    Y_predict.append(( Y_predict_emsemble[i] + Y_predict_nn[i] ) / 2.0)

In [30]:
temp_score = load_label('submission.csv')
submit_score  = temp_score
submit_score['score'] = Y_predict
submit_score.to_csv('predict_result_lr-gbc-nn.csv')