In [1]:
import pandas as pd
from tqdm import tqdm_notebook

import sys
sys.path.append("../../../lib") # Adds higher directory to python modules path.
import helper_functions as hf
import language_processing as lp
import feature_extraction as fe

In [2]:
# Root folder of the data files, relative to this notebook.
PATH = "../../../data/"

# Full training set (tab-separated).
data_full = pd.read_csv(f'{PATH}train.tsv', sep='\t')

# English stop-word list, one word per line. The context manager closes the
# file handle as soon as the list is built instead of leaving it open.
with open(f'{PATH}stopwords/english') as stopwords_file:
    stopwords = [line.rstrip('\n') for line in stopwords_file]


## Replace NaN

In [3]:
# Replace missing values through the project helper before any filtering is done.
# NOTE(review): the 'No Brand' / 'No description yet' values visible in the preview
# are presumably the placeholders this helper inserts — confirm in helper_functions.
data_full = hf.replace_nan(data_full)

In [4]:
# Preview the first rows of the frame after the NaN replacement.
data_full.head()

Unnamed: 0,train_id,name,item_condition_id,category_name,brand_name,price,shipping,item_description
0,0,MLB Cincinnati Reds T Shirt Size XL,3,Men/Tops/T-shirts,No Brand,10.0,1,No description yet
1,1,Razer BlackWidow Chroma Keyboard,3,Electronics/Computers & Tablets/Components & P...,Razer,52.0,0,This keyboard is in great condition and works ...
2,2,AVA-VIV Blouse,1,Women/Tops & Blouses/Blouse,Target,10.0,1,Adorable top with a hint of lace and a key hol...
3,3,Leather Horse Statues,1,Home/Home Décor/Home Décor Accents,No Brand,35.0,1,New with tags. Leather horses. Retail for [rm]...
4,4,24K GOLD plated rose,1,Women/Jewelry/Necklaces,No Brand,44.0,0,Complete with certificate of authenticity


# Work with only 'Electronics/Video Games & Consoles/Video Gaming Merchandise' category

In [5]:
# Keep only the listings filed under the single category studied in this notebook.
is_video_gaming_merch = (
    data_full['category_name'] == 'Electronics/Video Games & Consoles/Video Gaming Merchandise'
)
cat_df = data_full.loc[is_video_gaming_merch]

# Number of listings in the category.
len(cat_df)

1151

## Delete items without description

In [6]:
# Drop the listings whose description is the 'No description yet' placeholder.
has_description = cat_df['item_description'] != 'No description yet'
cat_df = cat_df[has_description]

# Number of listings left.
len(cat_df)

1065

## Extract labels

In [7]:
import price_classifier

# Derive the price-range labels for the category.
# NOTE(review): the second argument (5) is presumably the number of price
# ranges — the printed summary lists five — confirm in price_classifier.
pc = price_classifier.PriceClassifier(cat_df, 5)
y = pc.extract(cat_df)

# Labels are compared as "<low>-<high>" strings, so rebuild that string for
# every range and count how many items carry it.
for range_ in pc.ranges:
    label = str(range_[0]) + "-" + str(range_[1])
    n_items = (y == label).sum()
    print("range {} has {} items".format(range_, n_items))

range (0, 9.0) has 209 items
range (9.0, 15.0) has 193 items
range (15.0, 23.0) has 227 items
range (23.0, 38.0) has 218 items
range (38.0, 589.0) has 218 items


### Train & Test split

In [8]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the category for testing; the fixed seed keeps the split reproducible.
df_train, df_test, y_train, y_test = train_test_split(
    cat_df,
    y,
    test_size=0.20,
    random_state=42,
)

### Convert GloVe into word2vec

In [9]:
# from gensim.scripts.glove2word2vec import glove2word2vec
# glove_input_file = f'{PATH}glove/glove.840B.300d.txt'
# word2vec_output_file = f'{PATH}glove/glove.840B.300d.txt.word2vec'
# glove2word2vec(glove_input_file, word2vec_output_file)

### Load the word2vec model

In [10]:
from gensim.models import KeyedVectors

# Load the pre-trained word vectors (word2vec text/binary format) into `model`.
# Exactly one of the alternatives below should be active at a time.
# NOTE: `stemmed_model` is only assigned by the commented-out GloVe Common Crawl
# alternative; with the currently active alternative only `model` is defined,
# while the grid-search cell refers to `stemmed_model` for its stemming options.

# UNCOMMENT ONE OF THE FOLLOWING

# filename = f'{PATH}glove/glove.6B.100d.txt.word2vec' # GloVe Wikipedia + Gigaword
# model = KeyedVectors.load_word2vec_format(filename, binary=False)

filename = f'{PATH}glove/glove.6B.300d.txt.word2vec' # GloVe Wikipedia + Gigaword
model = KeyedVectors.load_word2vec_format(filename, binary=False)

# filename = f'{PATH}glove/glove.840B.300d.txt.word2vec' # GloVe Common Crawl
# filename_stemmed = f'{PATH}glove/stemmed_glove.840B.300d.txt.word2vec' # GloVe Common Crawl (stemmed)
# model = KeyedVectors.load_word2vec_format(filename, binary=False)
# stemmed_model = KeyedVectors.load_word2vec_format(filename_stemmed, binary=False)



# filename = f'{PATH}google/GoogleNews-vectors-negative300.bin'# Google News
# model = KeyedVectors.load_word2vec_format(filename, binary=True)

In [97]:
# Spot-check a single training row by its index label.
# NOTE(review): this lookup only works while label 1352110 ends up in df_train
# for the current split; otherwise .loc raises a KeyError.
df_train.loc[1352110]

train_id                                                       1352110
name                                             Pokemon Mewtwo amiibo
item_condition_id                                                    2
category_name        Electronics/Video Games & Consoles/Video Gamin...
brand_name                                                    Nintendo
price                                                               12
shipping                                                             0
item_description                                                Amiibo
Name: 1352110, dtype: object

In [12]:
import importlib

In [110]:
# Re-import language_processing so edits to the module are picked up without restarting the kernel.
importlib.reload(lp)

<module 'language_processing' from '../../../lib/language_processing.py'>

In [111]:
# Extractor for the item descriptions: 9 directions, stemming off, no stop-word list.
mwe = lp.PrincipalEmbeddingExtractor(
    model,
    n_directions=9,
    column_name='item_description',
    stem=False,
    stopwords=None,
)

In [112]:
# Extract the features for the training rows only; the test-set call is left disabled.
# NOTE(review): the repeated `exp_var / full_var` lines in the output are emitted during
# this call — presumably a division by a zero total variance; confirm the cause.
X_train = mwe.extract(df_train)
# X_test = mwe.extract(df_test)

  self.explained_variance_ratio_ = exp_var / full_var
  self.explained_variance_ratio_ = exp_var / full_var
  self.explained_variance_ratio_ = exp_var / full_var
  self.explained_variance_ratio_ = exp_var / full_var
  self.explained_variance_ratio_ = exp_var / full_var
  self.explained_variance_ratio_ = exp_var / full_var
  self.explained_variance_ratio_ = exp_var / full_var
  self.explained_variance_ratio_ = exp_var / full_var
  self.explained_variance_ratio_ = exp_var / full_var
  self.explained_variance_ratio_ = exp_var / full_var
  self.explained_variance_ratio_ = exp_var / full_var
  self.explained_variance_ratio_ = exp_var / full_var
  self.explained_variance_ratio_ = exp_var / full_var
  self.explained_variance_ratio_ = exp_var / full_var
  self.explained_variance_ratio_ = exp_var / full_var
  self.explained_variance_ratio_ = exp_var / full_var


In [113]:
# Display the extracted features (columns are named princ_axis_<i>_dim_<j>).
X_train

Unnamed: 0,princ_axis_0_dim_0,princ_axis_0_dim_1,princ_axis_0_dim_2,princ_axis_0_dim_3,princ_axis_0_dim_4,princ_axis_0_dim_5,princ_axis_0_dim_6,princ_axis_0_dim_7,princ_axis_0_dim_8,princ_axis_0_dim_9,...,princ_axis_8_dim_290,princ_axis_8_dim_291,princ_axis_8_dim_292,princ_axis_8_dim_293,princ_axis_8_dim_294,princ_axis_8_dim_295,princ_axis_8_dim_296,princ_axis_8_dim_297,princ_axis_8_dim_298,princ_axis_8_dim_299
79206,-0.065026,-0.035790,0.003831,-0.017795,0.005396,0.032713,-0.013887,-0.008991,0.038507,-0.408733,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
1249311,-0.091864,0.019626,-0.009511,-0.015431,0.021523,-0.007041,0.001369,0.017711,0.043429,-0.374320,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
49447,-0.086543,-0.009805,-0.022434,-0.000544,0.048609,0.060991,-0.023441,0.023484,0.033727,-0.313851,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
392497,-0.053651,0.036911,-0.006998,-0.019122,0.000754,0.028214,-0.019111,0.036884,0.021419,-0.449715,...,-0.038956,0.000851,0.143368,0.032093,0.050861,-0.055819,-0.021108,0.042404,0.117747,-0.033421
1104640,-0.037487,0.032680,0.014065,-0.022010,-0.090606,0.034851,-0.049264,0.087447,0.008864,-0.233447,...,-0.026933,0.017106,-0.114878,0.026388,0.077121,0.136332,0.091245,0.088326,-0.003959,0.010019
428832,-0.029782,-0.003217,-0.006104,-0.073143,-0.052144,0.022141,-0.021820,0.040666,0.015891,-0.340149,...,-0.043724,-0.049791,0.043436,0.006587,0.030967,-0.166271,0.041230,0.006059,0.036129,-0.032567
452323,-0.039589,0.022079,0.031348,-0.018624,-0.039459,0.001701,0.022064,0.021702,-0.005747,-0.243896,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
474929,-0.052145,0.024921,-0.027527,-0.018267,0.012399,0.006883,-0.008479,-0.018400,0.038173,-0.425026,...,0.066855,0.040928,0.071769,0.053134,0.024678,-0.035724,-0.047973,-0.047936,-0.039355,-0.023306
1131082,-0.053224,0.047125,-0.002231,-0.013056,-0.016452,0.020651,-0.002649,0.012007,0.013949,-0.450801,...,-0.000933,0.012842,-0.041787,0.068011,-0.012081,0.056029,0.099535,0.009994,0.049172,0.007354
1363611,-0.027164,-0.030902,0.007677,-0.036685,0.001564,0.052542,-0.038973,-0.015506,0.014776,-0.381040,...,-0.070096,0.008298,-0.059944,-0.097134,0.107609,0.076021,0.034970,-0.003036,-0.041464,-0.026914


## Run grid search

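Run a grid search over all combinations of the text-processing options (stop-word list, stemming, number of principal directions), chosen independently for the item description and for the name.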

In [12]:
from sklearn.model_selection import ParameterGrid
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# `stemmed_model` is only assigned by the commented-out GloVe Common Crawl
# alternative of the model-loading cell, so it may not exist in the kernel.
# Look it up defensively and search the stem=True configurations only when it
# is loaded, instead of failing with a NameError in the middle of the search.
stemmed_embeddings = globals().get('stemmed_model')
stem_options = [True, False] if stemmed_embeddings is not None else [False]
if stemmed_embeddings is None:
    print("stemmed_model is not loaded, skipping the stem=True configurations")

# Text-processing options, chosen independently for the description and the name:
# stop-word list (or none), stemmed vs. plain embeddings, number of directions.
n_directions_lst = [1, 3, 5, 10, 15, 20, 30, 40, 50]
NLP_param_grid = {
    'desc_sw': [stopwords, None], 'desc_stem': stem_options, 'desc_n_dirs': n_directions_lst,
    'name_sw': [stopwords, None], 'name_stem': stem_options, 'name_n_dirs': n_directions_lst,
}

# Random-forest hyper-parameters tried by GridSearchCV for every NLP configuration.
grid_search_param_grid = {
#     'n_estimators':[1, 3, 10, 30, 100, 300, 1000],
    'n_estimators':[1],
#     'max_depth':[1,3,10,30,100],
    'min_samples_leaf':[1,3,10],
    'random_state':[42]
}

# Holds the best configuration seen so far (None until the first one is scored).
optimisation_result = None

for gp in tqdm_notebook(list(ParameterGrid(NLP_param_grid))):
#     print("NLP params:", gp)

    # Create pipeline: one embedding extractor per text column.
    # NOTE(review): the constructor is called with an explicit `stem=` argument
    # elsewhere in this notebook but not here; confirm that choosing the stemmed
    # embeddings alone is enough for the stem=True configurations.
    pipe = fe.Pipeline(steps=[
        ('item_desc', lp.PrincipalEmbeddingExtractor(
            model=stemmed_embeddings if gp['desc_stem'] else model,
            n_directions=gp['desc_n_dirs'],
            column_name='item_description',
            stopwords=gp['desc_sw'],
        )),
        ('name', lp.PrincipalEmbeddingExtractor(
            model=stemmed_embeddings if gp['name_stem'] else model,
            n_directions=gp['name_n_dirs'],
            column_name='name',
            stopwords=gp['name_sw'],
        )),
    ])

    # Extract features
#     print("Extracting features")
    X_train = pipe.extract_features(df_train)
    X_test = pipe.extract_features(df_test)

    # Run grid search
#     print("Running grid search")
    random_forest = RandomForestClassifier()
    # NOTE: `iid` was deprecated in scikit-learn 0.22 and removed in 0.24;
    # drop the argument when running on a newer scikit-learn.
    clf = GridSearchCV(random_forest, grid_search_param_grid, iid=False, cv=5, n_jobs=-1)
    clf.fit(X_train, y_train)

    current_result = {
        'best_score_': clf.best_score_,
        'best_NLP_gp': gp,
        'best_params_': clf.best_params_,
        'best_estimator_': clf.best_estimator_,
        'X_train': X_train,
        'X_test': X_test,
        'pipe': pipe,
    }

    # The first configuration is adopted silently; later ones must strictly
    # improve on the best cross-validation score to replace it.
    is_first = optimisation_result is None
    if is_first or current_result['best_score_'] > optimisation_result['best_score_']:
        if not is_first:
            print("New best score achieved", current_result['best_score_'])
        optimisation_result = current_result

# Add the parameters for grid search and the training data
print("Optimization done, adding additional data into optimisation_result before saving pickle...")
optimisation_result['NLP_param_grid'] = NLP_param_grid
optimisation_result['grid_search_param_grid'] = grid_search_param_grid
optimisation_result['df_train'] = df_train
optimisation_result['df_test'] = df_test
optimisation_result['y_train'] = y_train
optimisation_result['y_test'] = y_test

# The actual save is currently disabled; only the progress messages are printed.
print("Saving pickle")
# hf.save_pickle(optimisation_result, f'{PATH}pickle/BOW_optimisation_result')
print("Done")

HBox(children=(IntProgress(value=0, max=4), HTML(value='')))




Optimization done, adding additional data into optimisation_result before saving pickle...
Saving pickle
Done


### Extract features

In [16]:
# After the grid-search loop, `pipe` is simply the pipeline built in the LAST
# iteration. Switch to the pipeline stored with the best cross-validation score
# so the features below (and everything trained on them) use the best settings.
pipe = optimisation_result['pipe']

print("Extracting features for the training set...")
X_train = pipe.extract_features(df_train)
print("Done")
print("Extracting features for the test set...")
X_test = pipe.extract_features(df_test)
print("Done")
print("Done")

Extracting features for the training set...
Done
Extracting features for the test set...
Done


### Analysis on word embeddings (item_description)

In [17]:
# The pipeline built in the grid-search cell names its steps 'item_desc' and
# 'name'; there is no 'word2vec_item_description' step, so look the
# description extractor up under the name it was registered with.
item_desc_extractor = pipe.named_steps['item_desc']

# NOTE(review): assumes the extractor exposes these voc_set_* attributes after
# feature extraction — confirm in language_processing.
voc_set_model = item_desc_extractor.voc_set_model
voc_set_df_train = item_desc_extractor.voc_set_df_train
voc_set_intersect = item_desc_extractor.voc_set_intersect

In [18]:
# Report the size of each vocabulary set, in a fixed order.
for template, vocabulary in (
    ("Size of model vocabulary: {}", voc_set_model),
    ("Size of df_train vocabulary: {}", voc_set_df_train),
    ("Size of intersect vocabulary: {}", voc_set_intersect),
):
    print(template.format(len(vocabulary)))

Size of model vocabulary: 400000
Size of df_train vocabulary: 3307
Size of intersect vocabulary: 2406


In [19]:
# Print the whole intersect vocabulary (the resulting output is very long).
print("Intersect vocabulary: {}".format(voc_set_intersect))

Intersect vocabulary: {'us', 'kind', 'production', 'improves', 'control', 'applicant', 'outs', 'ab', 'smooth', 'literally', 'optically', 'reminder', 'aligning', 'webcam', 'alot', 'full', 'please', 'released', 'properly', 'slot', 'lighting', 'bangs', 'sticky', 'zte', 'enhanced', 'directly', 'lens', 'line', 'extension', 'fitted', 'll', 'high', 'specially', 'package', 'turn', 'very', 'panels', 'cup', 'yellowing', 'returned', 'burlap', '@', 'protection', 'your', 'rm', 'reinforces', 'ship', 'fit', 'maintains', 'typical', 'compliments', '4-6', 'said', 'prime', 'etched', 'cheapest', 'bucks', '360', 'point', 'packaged', 'nor', 'smartphones', 'separate', 'unbranded', 'one', '36', 'nwt', 'bluetooth', 'i', '8h', 'wrap', 'exposed', 'form', 'charger', 'worn', 'google', 'solve', 'diminish', 'trees', 'skin', 'military', '2.5', 'tough', 'sports', 'unscratched', 'fast', 's4', '1-3', 'holder', 'precise', 'allows', ';', 'proprietary', 'thinking', '20', 'ups', 'reduction', 'fyi', 'motorola', 'big', 'unloc

In [20]:
# Entries of the training vocabulary that the embedding model does not contain.
voc_diff = voc_set_df_train.difference(voc_set_model)

report_template = "Vocabulary that is in df_train but is NOT in model:\nlentght: {}\nvoc: {}"
print(report_template.format(len(voc_diff), voc_diff))

Vocabulary that is in df_train but is NOT in model:
lentght: 901


### Random Forest

In [23]:
from sklearn.ensemble import RandomForestClassifier

# Fixed hyper-parameters for the forest trained on the extracted features.
rf_params = dict(
    n_estimators=100,
    max_depth=100,
    min_samples_leaf=1,
    random_state=0,
)
clf = RandomForestClassifier(**rf_params)
clf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=100, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

In [24]:
# Predict the labels of both splits with the fitted forest (train first, then test).
pred_train, pred_test = (clf.predict(features) for features in (X_train, X_test))

In [25]:
from sklearn.metrics import accuracy_score

# Score both splits first, then report them together.
acc_train = accuracy_score(y_train, pred_train)
acc_test = accuracy_score(y_test, pred_test)

print('Train accuracy: ', acc_train)
print('Test accuracy: ', acc_test)

Train accuracy:  0.9835680751173709
Test accuracy:  0.3755868544600939


# Naive algorithm: always predicting the most frequent class

In [24]:
# Baseline: predict the most frequent training label for every item.
mode = y_train.mode()
majority_label = mode.values[0]  # first entry if several labels tie

# Copy the true labels to keep their index, then overwrite every value.
pred_train_naive = y_train.copy()
pred_train_naive.loc[:] = majority_label

pred_test_naive = y_test.copy()
pred_test_naive.loc[:] = majority_label

In [25]:
# Accuracy of the constant-label baseline on both splits.
for label_text, y_true, y_naive in (
    ('Naive train accuracy: ', y_train, pred_train_naive),
    ('Naive test accuracy: ', y_test, pred_test_naive),
):
    print(label_text, accuracy_score(y_true, y_naive))

Naive train accuracy:  0.29120198265179675
Naive test accuracy:  0.2732342007434944


In [26]:
# from importlib import reload
# fe = reload(fe)
# lp = reload(lp)