In [1]:
import numpy as np
import pandas as pd
from tqdm import tqdm_notebook

import sys
sys.path.append("../../../lib") # Adds higher directory to python modules path.
import helper_functions as hf
import language_processing as lp
import feature_extraction as fe

In [2]:
PATH = "../../../data/"
data_full = pd.read_csv(f'{PATH}train.tsv', sep='\t')
stopwords = [line.rstrip('\n') for line in open(f'{PATH}stopwords/english')]

## Replace NaN

In [3]:
data_full = hf.replace_nan(data_full)

In [4]:
data_full.head()

Unnamed: 0,train_id,name,item_condition_id,category_name,brand_name,price,shipping,item_description
0,0,MLB Cincinnati Reds T Shirt Size XL,3,Men/Tops/T-shirts,No Brand,10.0,1,No description yet
1,1,Razer BlackWidow Chroma Keyboard,3,Electronics/Computers & Tablets/Components & P...,Razer,52.0,0,This keyboard is in great condition and works ...
2,2,AVA-VIV Blouse,1,Women/Tops & Blouses/Blouse,Target,10.0,1,Adorable top with a hint of lace and a key hol...
3,3,Leather Horse Statues,1,Home/Home Décor/Home Décor Accents,No Brand,35.0,1,New with tags. Leather horses. Retail for [rm]...
4,4,24K GOLD plated rose,1,Women/Jewelry/Necklaces,No Brand,44.0,0,Complete with certificate of authenticity


# Work with only 'Electronics/Video Games & Consoles/Video Gaming Merchandise' category

In [5]:
cat_df = data_full.loc[data_full.category_name == 'Electronics/Video Games & Consoles/Video Gaming Merchandise']

len(cat_df)

1151

## Delete items without description

In [6]:
cat_df = cat_df[cat_df.item_description != 'No description yet']

len(cat_df)

1065

## Extract labels

In [7]:
import price_classifier

pc = price_classifier.PriceClassifier(cat_df, 5)
y = pc.extract(cat_df)

for range_ in pc.ranges:
    print("range {} has {} items".format(range_, (y == (str(range_[0]) + "-" + str(range_[1]))).sum()))

range (0, 9.0) has 209 items
range (9.0, 15.0) has 193 items
range (15.0, 23.0) has 227 items
range (23.0, 38.0) has 218 items
range (38.0, 589.0) has 218 items


## Train & Test split

In [8]:
from sklearn.model_selection import train_test_split

df_train, df_test, y_train, y_test = train_test_split(cat_df, y, test_size=0.20, random_state=42)

## Run grid search

Run grid search for all combinations of text processing

In [12]:
from sklearn.model_selection import ParameterGrid
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

NLP_param_grid = {
    'desc_stem': [True, False], 'desc_norm': [True, False], 'desc_sw': [stopwords, None],
    'name_stem': [True, False], 'name_norm': [True, False], 'name_sw': [stopwords, None],
}

grid_search_param_grid = {
#     'n_estimators':[1, 3, 10, 30, 100, 300, 1000],
    'n_estimators':[1],
#     'max_depth':[1,3,10,30,100],
    'min_samples_leaf':[1,3,10],
    'random_state':[42]
}

optimisation_result = None

for gp in tqdm_notebook(list(ParameterGrid(NLP_param_grid))):
#     print("NLP params:", gp)

    # Create pipeline
    pipe = fe.Pipeline(steps=[
    #     ('base', fe.BaseFeatureExtractor()),
        ('stemmed_tokens_item_desc', lp.CountVectorizer(
            df_train,
            'item_description',
            stem=gp['desc_stem'],
            normalize=gp['desc_norm'],
            stopwords=gp['desc_sw']
        )),
        ('stemmed_tokens_name', lp.CountVectorizer(
            df_train,
            'name',
            stem=gp['name_stem'],
            normalize=gp['name_norm'],
            stopwords=gp['name_sw']
        ))    
                         ])
    # Exctract features
#     print("Extracting features")
    X_train = pipe.extract_features(df_train)
    X_test = pipe.extract_features(df_test)

    # Run grid search
#     print("Running grid search")
    random_forest = RandomForestClassifier()
    clf = GridSearchCV(random_forest, grid_search_param_grid, iid=False, cv=5, n_jobs=-1)
    clf.fit(X_train, y_train)
    
    current_result = {
        'best_score_': clf.best_score_,
        'best_NLP_gp': gp,
        'best_params_': clf.best_params_,
        'best_estimator_': clf.best_estimator_,
        'X_train': X_train,
        'X_test': X_test,
    }
    
    if optimisation_result is None:
        optimisation_result = current_result
#         print()
        continue
        
    if current_result['best_score_'] > optimisation_result['best_score_']:
        print("New best score achieved", current_result['best_score_'])
        optimisation_result = current_result
#     print()

# Add the parameters for grid search and the training data
print("Optimization done, adding additional data into optimisation_result before saving pickle...")
optimisation_result['NLP_param_grid'] = NLP_param_grid
optimisation_result['grid_search_param_grid'] = grid_search_param_grid
optimisation_result['df_train'] = df_train
optimisation_result['df_test'] = df_test
optimisation_result['y_train'] = y_train
optimisation_result['y_test'] = y_test

print("Optimization completed")

print("Saving pickle")
# hf.save_pickle(optimisation_result, f'{PATH}pickle/BOW_optimisation_result')
print("Done")

HBox(children=(IntProgress(value=0, max=64), HTML(value='')))

New best score achieved 0.34861987776269554
Optimization completed
Saving pickle
Done


In [4]:
clf_best = optimisation_result['best_estimator_']
X_train = optimisation_result['X_train']
X_test = optimisation_result['X_test']

In [5]:
pred_train = clf_best.predict(X_train)
pred_test = clf_best.predict(X_test)

In [6]:
from sklearn.metrics import accuracy_score

acc_train = accuracy_score(y_train, pred_train)
print('Train accuracy: ', acc_train)

acc_test = accuracy_score(y_test, pred_test)
print('Test accuracy: ', acc_test)

NameError: name 'y_train' is not defined

# Naive algorithm: predicting average of the class

In [96]:
mode = y_train.mode()

pred_train_naive = y_train.copy()
pred_test_naive = y_test.copy()
pred_train_naive.loc[:] = mode.values[0]
pred_test_naive.loc[:] = mode.values[0]

In [97]:
print('Naive train accuracy: ', accuracy_score(y_train, pred_train_naive))
print('Naive test accuracy: ', accuracy_score(y_test, pred_test_naive))

Naive train accuracy:  0.21679197994987467
Naive test accuracy:  0.20224719101123595


In [20]:
# from importlib import reload
# fe = reload(fe)
# lp = reload(lp)