# We will evaluate all models on "Test" data only, which is not part of the training dataset.

# In this submission we will use TF-IDF (Term frequency - Inverse document frequency)

In [8]:
import matplotlib
import os
import numpy as np
import pandas as pd
import json
import io
import matplotlib.pyplot as plt
from sklearn import linear_model
from time import time
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.naive_bayes import GaussianNB
from IPython.core.display import display, HTML
from __future__ import print_function
import sys
from time import time
from sklearn.cross_validation import train_test_split
from sklearn import preprocessing
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.linear_model import RidgeClassifier
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import Perceptron
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import NearestCentroid
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.utils.extmath import density
from sklearn import metrics
from sklearn.metrics import make_scorer
from sklearn.grid_search import GridSearchCV
from sklearn.cross_validation import ShuffleSplit

from collections import defaultdict


%matplotlib inline  

In [2]:
# Load the category tree from JSON, derive per-category features, and keep
# only the leaf categories for modelling.

# One LabelEncoder per key, created lazily on first access.
d = defaultdict(LabelEncoder)

category_json_path = 'categories.json'
# Read through a context manager so the file handle is always closed.
with open(category_json_path) as cat_fh:
    cat_file = cat_fh.read()
cat_js = json.loads(cat_file)

# Index every category record by its id.
cat_dict_by_id = {cat['id']: cat for cat in cat_js}

# Records flagged as leaves in the raw JSON.
all_leaf_list = [cat for cat in cat_js if cat['leaf'] == True]

df = pd.read_json(cat_file)

# Depth = number of '>'-separated segments in the category path.
df['depth'] = df['path'].apply(lambda path: float(len(path.split('>'))))

# Normalise case so that name comparisons below are case-insensitive.
df['name'] = df['name'].str.lower()
df['path'] = df['path'].str.lower()

# Names that occur more than once across all categories (leaf or not).
names_count_df = df.groupby(['name']).agg(['count'])
names_duplicate_df = names_count_df[(names_count_df['id']['count'] > 1)]

# Target label: 1 for a leaf whose name is shared with another category.
# The comparison is parenthesised explicitly because `&` binds tighter than `==`.
is_duplicated_leaf = df['name'].isin(names_duplicate_df.index) & (df['leaf'] == True)
df['duplicates'] = np.where(is_duplicated_leaf, 1, 0)

# Every distinct path segment seen anywhere in the tree.
vocabulary = {segment.strip() for path in df['path'] for segment in path.split('>')}

# Keep leaves only; copy so the assignment below writes to an independent frame
# instead of a slice of the original.
df = df[df.leaf == 1].copy()

df['leaf'] = np.where(df['leaf'] == True, 1.0, 0.0)
df = df.reset_index(drop=True)
df.head()

Unnamed: 0,id,leaf,name,path,pathWithIds,depth,duplicates
0,15999,1.0,internal cd drives,computers & accessories > computer components ...,10161 > 15997 > 15998 > 15999,4.0,0
1,16004,1.0,boys,sports & outdoors > fan shop > clothing & acce...,10162 > 16000 > 16001 > 16002 > 16003 > 16004,6.0,1
2,16008,1.0,boots,automotive > motorcycle & powersports > protec...,10163 > 16005 > 16006 > 16007 > 16008,5.0,1
3,16010,1.0,kids bikes & accessories,sports & outdoors > cycling > kids bikes & acc...,10162 > 16009 > 16010,3.0,0
4,16013,1.0,bells & sleigh bells,home & kitchen > seasonal décor > ornaments > ...,10164 > 16011 > 16012 > 16013,4.0,0


# Feature Engineering:
1. Using <span>TF-IDF</span> to introduce synthetic features
2. TF-IDF has been calculated on the entire dataset.
3. TF-IDF has been applied only to the dataset of our interest, i.e. the data subset having leaf nodes.
4. Extracted labels

In [3]:
# Build one text document per leaf category from its path. Segments are
# stripped and joined with a single space so that neighbouring segments can
# never fuse into one token, even if a '>' separator has no surrounding
# whitespace.
path_documents = df['path'].apply(
    lambda path: " ".join(segment.strip() for segment in path.split('>'))
)

# Fit TF-IDF on these documents; `x` is the resulting sparse matrix.
vectorizer = TfidfVectorizer()
x = vectorizer.fit_transform(path_documents)

# Dense feature frame (one column per vocabulary term). `toarray()` yields a
# plain ndarray rather than the legacy `np.matrix` returned by `todense()`.
df_tfidf = pd.DataFrame(x.toarray())

# Labels: the `duplicates` flag of each leaf.
y = df.duplicates

# Shuffle and split dataset

In [4]:
# Hold out 33% of the rows as the test split; the fixed seed keeps the
# partition reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    df_tfidf,
    y,
    test_size=0.33,
    random_state=42,
)

In [5]:
 def classify(clf):
     """Fit ``clf`` on the training split and report its test-split accuracy.

     Uses the notebook-level ``X_train``, ``y_train``, ``X_test`` and ``y_test``.
     The accuracy is printed as a side effect.

     Returns a 3-tuple: the full ``str(clf)`` text, the part of that text
     before the first ``'('``, and the accuracy score.
     """
     clf.fit(X_train, y_train)
     accuracy = metrics.accuracy_score(y_test, clf.predict(X_test))
     print("accuracy:   %0.3f" % accuracy)
     full_repr = str(clf)
     short_name = full_repr.partition('(')[0]
     return full_repr, short_name, accuracy

# Trying various classifiers to pick the best classifier while evaluating each model on the test dataset.

In [20]:
# Candidate estimators paired with a display name. The name is unpacked in the
# loop below but not otherwise used.
classifier_candidates = (
    (GaussianProcessClassifier(1.0 * RBF(1.0), warm_start=True), "GaussianProcessClassifier"),
    (GaussianNB(), "GaussianNB"),
    (linear_model.LogisticRegression(), "linear_model.LogisticRegression"),
)

# Collect the (repr, short name, accuracy) tuple returned for each candidate.
results = []
for clf, name in classifier_candidates:
    results.append(classify(clf))

accuracy:   0.793
accuracy:   0.678
accuracy:   0.764


# From the above results we select LogisticRegression, even though the accuracy is slightly higher with GaussianProcessClassifier, because LogisticRegression converged much faster.

In [6]:
def scoring_a(y_true, y_predict):
    """Return the accuracy of ``y_predict`` measured against ``y_true``."""
    accuracy = metrics.accuracy_score(y_true, y_predict)
    return accuracy

# Base estimator that the grid searches below will tune.
clf = linear_model.LogisticRegression()

# Scorer object wrapping the accuracy helper defined in this cell.
scorer = make_scorer(scoring_a)

# Ten random 80/20 shuffle splits over the training rows, seeded for
# reproducibility.
cv_sets = ShuffleSplit(
    X_train.shape[0],
    n_iter=10,
    test_size=0.20,
    random_state=0,
)

# Trying GridSearchCV with LogisticRegression using only the 'liblinear' solver (the other solvers do not work with the l1 penalty; we will try them later).

In [9]:
# Hyper-parameter grid for LogisticRegression restricted to the liblinear
# solver; `random_state` is pinned so every fit is reproducible.
parameters = {
    'penalty': ("l1", "l2"),
    'C': (1.0, 0.9, 0.8),
    'class_weight': ('balanced', None),
    'max_iter': (90, 100, 110),
    'solver': ['liblinear'],
    'warm_start': (True, False),
    #'dual':(True,False),
    'random_state': [4242],
}

# No `scoring` is passed, so GridSearchCV falls back to the estimator's own
# `score` method.
grid_obj = GridSearchCV(clf, parameters, cv=cv_sets)

# Time the full search over the training split.
start = time()
grid_fit = grid_obj.fit(X_train, y_train)
end = time()
delta = end - start

best_clf = grid_fit.best_estimator_
print("Learning time in seconds: "+str(delta))
# Ground truth is passed first to match scoring_a(y_true, y_predict).
print("Best Accuracy with Test data: "+str(scoring_a(y_test, best_clf.predict(X_test))))
print(best_clf)

Learning time in seconds: 100.39824295
Best Accuracy with Test data: 0.763791763792
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=90, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=4242, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=True)


# Trying the other solvers with GridSearchCV.

In [7]:
# Only the solver is varied here; every other LogisticRegression
# hyper-parameter keeps its default value. `random_state` is pinned for
# reproducibility.
parameters = {
    'solver': ['liblinear', 'newton-cg', 'lbfgs', 'sag'],
    'random_state': [4242],
}

# No `scoring` is passed, so GridSearchCV falls back to the estimator's own
# `score` method.
grid_obj_outher_solver_for_l2 = GridSearchCV(clf, parameters, cv=cv_sets)
grid_fit_outher_solver_for_l2 = grid_obj_outher_solver_for_l2.fit(X_train, y_train)
best_clf_outher_solver_for_l2 = grid_fit_outher_solver_for_l2.best_estimator_

# Ground truth is passed first to match scoring_a(y_true, y_predict).
print("Best Accuracy: "+str(scoring_a(y_test, best_clf_outher_solver_for_l2.predict(X_test))))
print(best_clf_outher_solver_for_l2)

Best Accuracy: 0.763403263403
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=4242, solver='newton-cg', tol=0.0001,
          verbose=0, warm_start=False)


In [20]:
# Refit a LogisticRegression with explicitly spelled-out settings, time the
# fit, and report its accuracy on the test split.
lr_clf = linear_model.LogisticRegression(
    penalty='l2',
    solver='liblinear',
    C=1.0,
    class_weight=None,
    dual=False,
    fit_intercept=True,
    intercept_scaling=1,
    max_iter=90,
    multi_class='ovr',
    n_jobs=1,
    random_state=4242,
    tol=0.0001,
    verbose=0,
    warm_start=True,
)

# Wall-clock duration of a single fit on the training split.
start = time()
lr_clf.fit(X_train, y_train)
end = time()
delta = end - start

# Accuracy on the test split.
pred = lr_clf.predict(X_test)
score = metrics.accuracy_score(y_test, pred)

# Render both figures as highlighted HTML snippets.
timing_html = (
    'Learning time with best settings found by GridSearchCV in <span style="background-color:yellow;">seconds: '
    + str(delta)
    + '</span>'
)
accuracy_html = (
    'Accuracy with best settings found by GridSearchCV on Test data:  <span style="background-color:yellow;"> '
    + str(score*100)
    + ' % </span>'
)
display(HTML(timing_html))
display(HTML(accuracy_html))