In [1]:
import numpy as np
import pandas as pd
# import spacy
# nlp = spacy.load('en_core_web_sm')
from collections import OrderedDict
from tqdm import tqdm_notebook

import sys
sys.path.append("../../lib") # Adds higher directory to python modules path.
import helper_functions as hf
import language_processing as lp
import feature_extraction as fe

Following functions has been loaded:

replace_nan
rmse
extract_n_random_cats
tokenize



In [2]:
# Root directory for every data file this notebook reads (relative to the notebook).
PATH = "../../data/"

# The training set is tab-separated, so the separator has to be given explicitly.
train_file = f'{PATH}train.tsv'
data_full = pd.read_csv(train_file, sep='\t')

## Replace NaN

In [3]:
# Fill in missing values with the project helper `hf.replace_nan`.
# NOTE(review): the result overwrites `data_full`, so re-running this cell feeds
# already-processed data back into the helper — confirm `replace_nan` is idempotent.
data_full = hf.replace_nan(data_full)

In [4]:
# Preview the first rows of the full dataset after NaN replacement.
data_full.head()

Unnamed: 0,train_id,name,item_condition_id,category_name,brand_name,price,shipping,item_description
0,0,MLB Cincinnati Reds T Shirt Size XL,3,Men/Tops/T-shirts,No Brand,10.0,1,No description yet
1,1,Razer BlackWidow Chroma Keyboard,3,Electronics/Computers & Tablets/Components & P...,Razer,52.0,0,This keyboard is in great condition and works ...
2,2,AVA-VIV Blouse,1,Women/Tops & Blouses/Blouse,Target,10.0,1,Adorable top with a hint of lace and a key hol...
3,3,Leather Horse Statues,1,Home/Home Décor/Home Décor Accents,No Brand,35.0,1,New with tags. Leather horses. Retail for [rm]...
4,4,24K GOLD plated rose,1,Women/Jewelry/Necklaces,No Brand,44.0,0,Complete with certificate of authenticity


## Randomly select N categories

In [5]:
# Keep only the rows belonging to 10 categories chosen by the project helper.
# `random_seed=27` is passed so the selection is presumably repeatable between
# runs — confirm against `helper_functions.extract_n_random_cats`.
data_10_cats = hf.extract_n_random_cats(data_full, 10, random_seed=27)

In [6]:
# List every sampled category (in order of first appearance) together with
# the number of rows that belong to it.
extracted_cats = data_10_cats.category_name.unique()
category_column = data_10_cats.category_name
for cat in extracted_cats:
    # Counting the True entries of the mask gives the same number as the
    # length of the filtered frame.
    n_items = int((category_column == cat).sum())
    print('{}\titems in\t{}'.format(n_items, cat))

8648	items in	Women/Underwear/Panties
756	items in	Men/Jeans/Boot Cut
5325	items in	Women/Women's Accessories/Hair Accessories
2152	items in	Electronics/Cell Phones & Accessories/Screen Protectors
146	items in	Home/Bedding/Quilts
313	items in	Home/Storage & Organization/Racks, Shelves & Drawers
44	items in	Handmade/Others/Other
6	items in	Handmade/Children/Other
3	items in	Vintage & Collectibles/Serving/Butter Dish
1	items in	Handmade/Accessories/Mirror


# For now, work with just one category

In [7]:
# Restrict the working set to a single category. To experiment with another
# category, swap in the commented-out selection below.
in_target_category = (
    data_10_cats.category_name == 'Electronics/Cell Phones & Accessories/Screen Protectors'
)
cat_df = data_10_cats.loc[in_target_category]
# cat_df = data_10_cats.loc[data_10_cats.category_name == 'Women/Underwear/Panties']

In [8]:
# Preview the first rows of the single-category subset.
cat_df.head()

Unnamed: 0,train_id,name,item_condition_id,category_name,brand_name,price,shipping,item_description
1063,1063,Tempered glass screen protectors,1,Electronics/Cell Phones & Accessories/Screen P...,No Brand,6.0,1,"New, (3) iPhone 6-6s tempered glass screen pro..."
2480,2480,iPhone 6/6s/7 tempered glass,1,Electronics/Cell Phones & Accessories/Screen P...,No Brand,7.0,1,Brand new free shipping You will get 2 tempere...
2486,2486,Galaxy S6 Edge Invisible Shield,1,Electronics/Cell Phones & Accessories/Screen P...,No Brand,5.0,1,INVISIBLE SHIELD for Samsung Galaxy S 6 Edge. ...
2505,2505,Tempered Glass for iPhone 7,1,Electronics/Cell Phones & Accessories/Screen P...,Apple,6.0,1,The screen protector is made of specially proc...
3500,3500,iPhone 7 plus screen protector glass,1,Electronics/Cell Phones & Accessories/Screen P...,Apple,8.0,1,•TEMPERED GLASS SCREEN PROTECTOR PRIVACY• Avai...


### Extract labels

In [9]:
import price_classifier
# from importlib import reload
# reload(price_classifier)

In [10]:
# Build the label extractor from the category's rows and derive one label per row.
# The recorded output of `y.head()` shows labels that are price-range strings
# such as "6.0-8.0".
# NOTE(review): the second argument (5) is presumably the number of price
# bins — confirm against `price_classifier.PriceClassifier`.
# NOTE(review): the classifier is built from all of `cat_df`, i.e. before the
# train/test split — confirm that deriving the bins from test rows is acceptable.
pc = price_classifier.PriceClassifier(cat_df, 5)
y = pc.extract(cat_df)

In [11]:
# Preview the first extracted labels.
y.head()

1063     6.0-8.0
2480     6.0-8.0
2486     5.0-6.0
2505     6.0-8.0
3500    8.0-10.0
dtype: object

### Train & Test split

In [12]:
from sklearn.model_selection import train_test_split

# Hold out a quarter of the rows for testing; the fixed random_state keeps the
# partition identical from run to run.
split_parts = train_test_split(
    cat_df,
    y,
    test_size=0.25,
    random_state=42,
)
df_train, df_test, y_train, y_test = split_parts

### Convert GloVe into word2vec

In [211]:
# from gensim.scripts.glove2word2vec import glove2word2vec
# glove_input_file = f'{PATH}glove.6B/glove.6B.300d.txt'
# word2vec_output_file = f'{PATH}glove.6B/glove.6B.300d.txt.word2vec'
# glove2word2vec(glove_input_file, word2vec_output_file)

(400000, 300)

### Load the word2vec model

In [219]:
from gensim.models import KeyedVectors

# Load pre-trained word vectors into `model`, which the feature-extraction
# pipeline later receives. Exactly one of the two loaders below should be
# active at a time.
# UNCOMMENT ONE OF THE FOLLOWING
# Option 1: GloVe vectors previously converted to word2vec text format
# (hence binary=False).
filename = f'{PATH}glove.6B/glove.6B.300d.txt.word2vec' # GloVe Wikipedia + Gigaword
model = KeyedVectors.load_word2vec_format(filename, binary=False)


# Option 2: Google News vectors, stored as a binary file (hence binary=True).
# filename = f'{PATH}google/GoogleNews-vectors-negative300.bin'# Google News
# model = KeyedVectors.load_word2vec_format(filename, binary=True)

### Extract features

In [222]:
# Feature-extraction steps as (step name, extractor) pairs. Both embedding
# vectorizers are constructed from the training frame only.
feature_steps = [
    ('base', fe.BaseFeatureExtractor()),
    ('word2vec_item_description', lp.MeanEmbeddingVectorizer(model, df_train, 'item_description')),
    ('word2vec_tokens_name', lp.MeanEmbeddingVectorizer(model, df_train, 'name')),
    # Currently disabled count-based alternatives:
    # ('stemmed_tokens_item_description', lp.CountVectorizer(df_train, 'item_description', stem=True, normalize=True)),
    # ('stemmed_tokens_name', lp.CountVectorizer(df_train, 'name', stem=True, normalize=True))
]
pipe = fe.Pipeline(steps=feature_steps)

In [223]:
# Run the same pipeline over both splits, announcing progress for each one.
extraction_jobs = (
    ("Extracting features for the training set...", df_train),
    ("Extracting features for the test set...", df_test),
)
extracted_frames = []
for banner, split_df in extraction_jobs:
    print(banner)
    extracted_frames.append(pipe.extract_features(split_df))
    print("Done")
X_train, X_test = extracted_frames

Extracting features for the training set...
Done
Extracting features for the test set...
Done


In [224]:
# Preview the first rows of the extracted training features.
X_train.head()

Unnamed: 0,name_len,item_description_len,item_condition_id,shipping,mev_item_description_minimum,mev_item_description_normal,mev_item_description_together,mev_item_description_return,mev_item_description_go,mev_item_description_6s,...,mev_name_reserve,mev_name_deal,mev_name_glass,mev_name_marble,mev_name_s6,mev_name_1x,mev_name_pixel,mev_name_one,mev_name_protection,mev_name_8
185890,40,588,1,1,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,-0.012019,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1228366,39,468,1,1,0.0,0.0,0.001974,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1131355,37,135,1,1,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,-0.012019,0.0,0.0,0.0,0.0,0.0,0.0,0.0
384365,37,875,1,1,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1458638,40,18,1,1,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,-0.012019,0.0,0.003425,0.0,0.0,0.0,0.0,0.0


### Random Forest

In [225]:
from sklearn.ensemble import RandomForestClassifier

# Hyper-parameters gathered in one place; random_state is pinned so repeated
# fits on the same data give the same forest.
rf_params = {
    'n_estimators': 100,
    'max_depth': 50,
    'min_samples_leaf': 1,
    'random_state': 0,
}
clf = RandomForestClassifier(**rf_params)
# Left as the last expression so the notebook displays the fitted estimator.
clf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=50, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

In [226]:
# Predict labels for both feature matrices with the fitted forest.
pred_train, pred_test = map(clf.predict, (X_train, X_test))

In [227]:
from sklearn.metrics import accuracy_score

# Score both splits first, then print them in train/test order.
acc_train = accuracy_score(y_train, pred_train)
acc_test = accuracy_score(y_test, pred_test)

for caption, score in (('Train accuracy: ', acc_train), ('Test accuracy: ', acc_test)):
    print(caption, score)

Train accuracy:  0.9876084262701363
Test accuracy:  0.5408921933085502


# Naive baseline: always predicting the most frequent class

In [193]:
# Baseline that ignores the features: every row gets the most frequent
# training label (the first entry when several labels tie).
mode = y_train.mode()
majority_label = mode.values[0]

# Constant prediction series that keep the index, dtype and name of the
# corresponding label series.
pred_train_naive = pd.Series(
    majority_label, index=y_train.index, dtype=y_train.dtype, name=y_train.name
)
pred_test_naive = pd.Series(
    majority_label, index=y_test.index, dtype=y_test.dtype, name=y_test.name
)

In [194]:
# Accuracy of the constant-label baseline on both splits.
naive_scores = (
    ('Naive train accuracy: ', accuracy_score(y_train, pred_train_naive)),
    ('Naive test accuracy: ', accuracy_score(y_test, pred_test_naive)),
)
for caption, score in naive_scores:
    print(caption, score)

Naive train accuracy:  0.29120198265179675
Naive test accuracy:  0.2732342007434944


In [None]:
# Preview the training features again.
# NOTE(review): this cell has no execution count and repeats an earlier preview.
X_train.head()

In [None]:
# Count the training rows whose `name_fit` feature is non-zero.
# NOTE(review): this cell has no execution count, and `name_fit` is not among
# the columns visible in the (truncated) X_train preview — confirm the current
# pipeline still produces this column, otherwise the attribute lookup will fail.
(X_train.name_fit != 0).sum()

In [204]:
from importlib import reload

# Re-import the project modules so edits made to them on disk take effect,
# and rebind the aliases to the reloaded module objects.
fe, lp = reload(fe), reload(lp)