In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import sys
from html.parser import HTMLParser
from html.entities import name2codepoint
sns.set(color_codes=True)

%matplotlib inline

In [2]:
# load training data text
def load_data(filename):
    load_file = pd.read_csv(filename,delimiter=',', header=0,
                        dtype={'name':str, 'lvl1':str, 'lvl2':str, 'lvl3':str, 'descrption':str, 'type':str})
    load_file.columns = ['id', 'name','lvl1','lvl2','lvl3','descrption','price','type']
    load_file.duplicated(subset=None, keep='first')
    load_file.set_index('id', inplace = True)
    load_file.head()
    return load_file
#print(len(train_file))
def load_label(filename):
    load_label = pd.read_csv(filename,delimiter=',', header=0)
    load_label.columns = ['id', 'score']
    load_label.duplicated(subset=None, keep='first')
    load_label.set_index('id', inplace = True)
    return load_label

In [3]:
def map_mathod(column):
    values = []
    indexs = []
    mapping = {}
    index = 0
    for count in range(len(train_file)):
        value = train_file.get_value(count+1,column)
        if value in values and value != np.nan:
            continue
        values.append(value)
        indexs.append(len(values))
    for j in range(len(indexs)):
        mapping[values[j]] = indexs[j]
    mapping[np.nan] = 0.0
    return mapping
#train_file['lvl3'] = train_file['lvl3'].str.lower().replace('[^\'\w]+',' ',regex=True)
#mapping_lvl3 = map_mathod('lvl3')
#print(mapping_lvl3)

In [4]:
def clean_data(filename):
    #clean up data for lvl 1&2&3
    filename['lvl1'] = filename['lvl1'].str.lower().replace('[^a-zA-Z]+',' ',regex=True)
    filename['lvl2'] = filename['lvl2'].str.lower().replace('[^\'\w]+',' ',regex=True)
    filename['lvl3'] = filename['lvl3'].str.lower().replace('[^\'\w]+',' ',regex=True)
    
    mapping_lvl1 = map_mathod('lvl1')
    mapping_lvl2 = map_mathod('lvl2')
    mapping_lvl3 = map_mathod('lvl3')
    
    filename['lvl1'] = filename['lvl1'].map(mapping_lvl1)
    filename['lvl2'] = filename['lvl2'].map(mapping_lvl2)
    filename['lvl3'] = filename['lvl3'].map(mapping_lvl3)
    
    #normalize price
    maxp = filename.price.max()
    valuethred = 600.
    filename['price'] = filename['price'].clip(lower=0.,upper=valuethred).div(valuethred,fill_value=None)
    #hist = train_file['price'].hist(bins=10)
    #maxp

    #clean up type 
    mapping_type = {'international':1.,'local':2., np.nan:0.}
    filename['type'] = filename['type'].map(mapping_type)

    return filename

In [5]:
train_file = load_data('train_data.csv')
cleaned_train = clean_data(train_file)
#cleaned_train.head()
train_score = load_label('train_label.csv')
#train_score.info()
la = cleaned_train.lvl1.as_matrix(columns=None).tolist()
lb = cleaned_train.lvl2.as_matrix(columns=None).tolist()
lc = cleaned_train.lvl3.as_matrix(columns=None).tolist()
ld = cleaned_train.price.as_matrix(columns=None).tolist()
le = cleaned_train.type.as_matrix(columns=None).tolist()
X = la
X = np.column_stack((X,lb))
X = np.column_stack((X,lc))
X = np.column_stack((X,ld))
X = np.column_stack((X,le))
X = X.tolist()
Y = train_score.score.as_matrix(columns=None).tolist()

  import sys
  
  import sys
  
  if __name__ == '__main__':
  # Remove the CWD from sys.path while we load stuff.


In [6]:
from sklearn.utils import shuffle
X, Y = shuffle(X, Y)
from sklearn.model_selection import train_test_split
X_train, X_validation, y_train, y_validation = train_test_split(X, Y, test_size=0.20, random_state=0)

In [7]:
import os
import time
from sklearn import metrics
from sklearn import preprocessing
import numpy as np
import pandas as pd
import random
import math
import warnings
warnings.filterwarnings("ignore")

from matplotlib import pyplot as plt

from sklearn import preprocessing
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.feature_selection import SelectPercentile
from sklearn.feature_selection import chi2

from sklearn import linear_model
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn import tree
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.grid_search import GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier

from sklearn.ensemble import BaggingClassifier

from sklearn.model_selection import cross_val_score

In [8]:
# Linear Regression Classifier
def linear_regression_classifier(train_x, train_y):
    model = linear_model.LinearRegression()
    model.fit(train_x, train_y)
    return model
 
# Multinomial Naive Bayes Classifier
def naive_bayes_classifier(train_x, train_y):
    model = MultinomialNB()

    param_grid = {'alpha': [math.pow(10,-i) for i in range(11)]}
    grid_search = GridSearchCV(model, param_grid, n_jobs = 1, verbose=1)
    grid_search.fit(train_x, train_y)
    best_parameters = grid_search.best_estimator_.get_params()
    
    model = MultinomialNB(alpha = best_parameters['alpha'])  
    model.fit(train_x, train_y)
    return model
 
 
# KNN Classifier
def knn_classifier(train_x, train_y):
    model = KNeighborsClassifier()

    param_grid = {'n_neighbors': list(range(1,21))}
    grid_search = GridSearchCV(model, param_grid, n_jobs = 1, verbose=1)
    grid_search.fit(train_x, train_y)
    best_parameters = grid_search.best_estimator_.get_params()
    
    model = KNeighborsClassifier(n_neighbors = best_parameters['n_neighbors'])

    bagging = BaggingClassifier(model, max_samples=0.5, max_features=1 )
    bagging.fit(train_x, train_y)
    return bagging
 
 
# Logistic Regression Classifier
def logistic_regression_classifier(train_x, train_y):
    model = LogisticRegression(penalty='l2')
    model.fit(train_x, train_y)
    return model
 
 
# Random Forest Classifier
def random_forest_classifier(train_x, train_y):
    model = RandomForestClassifier()

    param_grid = {'n_estimators': list(range(1,21))}
    grid_search = GridSearchCV(model, param_grid, n_jobs = 1, verbose=1)
    grid_search.fit(train_x, train_y)
    best_parameters = grid_search.best_estimator_.get_params()
    
    model = RandomForestClassifier(n_estimators = best_parameters['n_estimators'])
    
    model.fit(train_x, train_y)
    return model
 
 
# Decision Tree Classifier
def decision_tree_classifier(train_x, train_y):
    model = tree.DecisionTreeClassifier()
    model.fit(train_x, train_y)

    bagging = BaggingClassifier(model, max_samples=0.5, max_features=1 )
    bagging.fit(train_x, train_y)
    return bagging
 
 
# GBDT(Gradient Boosting Decision Tree) Classifier
def gradient_boosting_classifier(train_x, train_y):
    model = GradientBoostingClassifier()
    
    model = RandomForestClassifier()

    param_grid = {'n_estimators': list(range(100,300,10))}
    grid_search = GridSearchCV(model, param_grid, n_jobs = 1, verbose=1)
    grid_search.fit(train_x, train_y)
    best_parameters = grid_search.best_estimator_.get_params()
    
    model = RandomForestClassifier(n_estimators = best_parameters['n_estimators'])

    model.fit(train_x, train_y)
    return model

# SVM Classifier
def svm_classifier(train_x, train_y):
    model = SVC(kernel='linear', probability=True)
    model.fit(train_x, train_y)
    return model
 
# SVM Classifier using cross validation
def svm_cross_validation(train_x, train_y):
    model = SVC(kernel='linear', probability=True)
    param_grid = {'C': [1e-3, 1e-2, 1e-1, 1, 10, 100, 1000], 'gamma': [0.001, 0.0001]}
    grid_search = GridSearchCV(model, param_grid, n_jobs = 1, verbose=1)
    grid_search.fit(train_x, train_y)
    best_parameters = grid_search.best_estimator_.get_params()
    #for para, val in best_parameters.items():
        #print para, val
    model = SVC(kernel='rbf', C=best_parameters['C'], gamma=best_parameters['gamma'], probability=True)
    model.fit(train_x, train_y)
    return model

def feature_select(x,y):
    clf = ExtraTreesClassifier()
    clf = clf.fit(x, y)
    model = SelectFromModel(clf, prefit=True)
    return model

In [9]:
if __name__ == '__main__': 
    thresh = 0.5    
#    model_save_file = "/home/jason/datamining/model/models"     
#    model_save = {}
#    result_save_file = '/home/jason/datamining/result/results' 
     
    test_classifiers = ['NB','KNN','LR','RF','DT','GBC','SVM']    
    classifiers = { 
                    'NB':naive_bayes_classifier,
                   'KNN':knn_classifier,
                    'LR':logistic_regression_classifier,
                    'RF':random_forest_classifier,
                    'DT':decision_tree_classifier,
                   'GBC':gradient_boosting_classifier,
                   'SVM':svm_classifier,
                   
    }
        
    print('reading training and testing data...')    
    #X_train, X_validation, y_train, y_validation
    select_model = feature_select(X_train, y_train)
    X_train = select_model.transform(X_train)
    X_validation = select_model.transform(X_validation)

    result = []
        
    for classifier in test_classifiers:    
        print('******************* %s ********************' % classifier)    
        start_time = time.time()    
        model = classifiers[classifier](X_train, y_train)   
        print('training took %fs!' % (time.time() - start_time))    
        predict = model.predict(X_validation)

        precision = metrics.precision_score(y_validation, predict)    
        recall = metrics.recall_score(y_validation, predict)    
        print('precision: %.2f%%, recall: %.2f%%' % (100 * precision, 100 * recall))    
        accuracy = metrics.accuracy_score(y_validation, predict)    
        print('accuracy: %.2f%%' % (100 * accuracy))

        scores = cross_val_score(model, X_train, y_train)
        print(scores)

reading training and testing data...
******************* NB ********************
Fitting 3 folds for each of 11 candidates, totalling 33 fits


[Parallel(n_jobs=1)]: Done  33 out of  33 | elapsed:    0.3s finished


training took 0.312381s!
precision: 70.54%, recall: 100.00%
accuracy: 70.54%
[0.68375362 0.68389498 0.68389498]
******************* KNN ********************
Fitting 3 folds for each of 20 candidates, totalling 60 fits


[Parallel(n_jobs=1)]: Done  60 out of  60 | elapsed:    2.3s finished


training took 2.357985s!
precision: 70.83%, recall: 96.09%
accuracy: 69.33%
[0.67734601 0.67769278 0.67583213]
******************* LR ********************
training took 0.011753s!
precision: 70.54%, recall: 100.00%
accuracy: 70.54%
[0.68375362 0.68389498 0.68389498]
******************* RF ********************
Fitting 3 folds for each of 20 candidates, totalling 60 fits


[Parallel(n_jobs=1)]: Done  60 out of  60 | elapsed:    7.1s finished


training took 7.353589s!
precision: 72.07%, recall: 84.69%
accuracy: 66.05%
[0.64572137 0.65040314 0.64688857]
******************* DT ********************
training took 0.153025s!
precision: 71.67%, recall: 86.17%
accuracy: 66.22%
[0.64882183 0.64399421 0.6522638 ]
******************* GBC ********************
Fitting 3 folds for each of 20 candidates, totalling 60 fits


[Parallel(n_jobs=1)]: Done  60 out of  60 | elapsed:  2.2min finished


training took 139.319282s!
precision: 72.00%, recall: 85.90%
accuracy: 66.49%
[0.64964862 0.64998966 0.65185032]
******************* SVM ********************
training took 8.969538s!
precision: 70.54%, recall: 100.00%
accuracy: 70.54%
[0.68375362 0.68389498 0.68389498]


In [None]:
#X_train, X_validation, y_train, y_validation


In [None]:
table = pd.pivot_table(train_file,values = 'price', index=['lvl1','lvl2','lvl3'],columns=['type'],aggfunc=[min, max, np.mean])
table