In [1]:
import numpy as np
import pandas as pd
# import spacy
# nlp = spacy.load('en_core_web_sm')
from collections import OrderedDict
from tqdm import tqdm_notebook

import sys
sys.path.append("../../lib") # Adds higher directory to python modules path.
import helper_functions as hf
import language_processing

Following functions has been loaded:

replace_nan
rmse
extract_n_random_cats
tokenize



In [2]:
# Directory holding the raw data files, relative to this notebook.
PATH = "../../data/"
# train.tsv is tab-separated, so the delimiter has to be given explicitly.
data_full = pd.read_csv(PATH + 'train.tsv', delimiter='\t')

## Replace NaN

In [3]:
# Fill missing values with the project helper. Judging by the preview below it presumably
# substitutes placeholders such as 'No Brand' -- TODO confirm in helper_functions.
data_full = hf.replace_nan(data_full)

In [4]:
# Quick look at the first rows after the NaN replacement.
data_full.head()

Unnamed: 0,train_id,name,item_condition_id,category_name,brand_name,price,shipping,item_description
0,0,MLB Cincinnati Reds T Shirt Size XL,3,Men/Tops/T-shirts,No Brand,10.0,1,No description yet
1,1,Razer BlackWidow Chroma Keyboard,3,Electronics/Computers & Tablets/Components & P...,Razer,52.0,0,This keyboard is in great condition and works ...
2,2,AVA-VIV Blouse,1,Women/Tops & Blouses/Blouse,Target,10.0,1,Adorable top with a hint of lace and a key hol...
3,3,Leather Horse Statues,1,Home/Home Décor/Home Décor Accents,No Brand,35.0,1,New with tags. Leather horses. Retail for [rm]...
4,4,24K GOLD plated rose,1,Women/Jewelry/Necklaces,No Brand,44.0,0,Complete with certificate of authenticity


## Randomly select N categories

In [5]:
# Restrict the data to 10 categories chosen by the project helper; the fixed random_seed
# is presumably what makes the same categories come out on every run -- TODO confirm.
data_10_cats = hf.extract_n_random_cats(data_full, 10, random_seed=27)

In [6]:
# Report how many of the sampled items belong to each extracted category.
extracted_cats = data_10_cats.category_name.unique()
for cat in extracted_cats:
    n_items = (data_10_cats.category_name == cat).sum()
    print(f'{n_items}\titems in\t{cat}')

8648	items in	Women/Underwear/Panties
756	items in	Men/Jeans/Boot Cut
5325	items in	Women/Women's Accessories/Hair Accessories
2152	items in	Electronics/Cell Phones & Accessories/Screen Protectors
146	items in	Home/Bedding/Quilts
313	items in	Home/Storage & Organization/Racks, Shelves & Drawers
44	items in	Handmade/Others/Other
6	items in	Handmade/Children/Other
3	items in	Vintage & Collectibles/Serving/Butter Dish
1	items in	Handmade/Accessories/Mirror


# For now, work with just one category

In [7]:
# Keep only the screen-protector listings out of the sampled categories.
is_selected = data_10_cats['category_name'] == 'Electronics/Cell Phones & Accessories/Screen Protectors'
cat_df = data_10_cats[is_selected]

In [8]:
# Preview the rows of the selected category (the original row labels are kept).
cat_df.head()

Unnamed: 0,train_id,name,item_condition_id,category_name,brand_name,price,shipping,item_description
1063,1063,Tempered glass screen protectors,1,Electronics/Cell Phones & Accessories/Screen P...,No Brand,6.0,1,"New, (3) iPhone 6-6s tempered glass screen pro..."
2480,2480,iPhone 6/6s/7 tempered glass,1,Electronics/Cell Phones & Accessories/Screen P...,No Brand,7.0,1,Brand new free shipping You will get 2 tempere...
2486,2486,Galaxy S6 Edge Invisible Shield,1,Electronics/Cell Phones & Accessories/Screen P...,No Brand,5.0,1,INVISIBLE SHIELD for Samsung Galaxy S 6 Edge. ...
2505,2505,Tempered Glass for iPhone 7,1,Electronics/Cell Phones & Accessories/Screen P...,Apple,6.0,1,The screen protector is made of specially proc...
3500,3500,iPhone 7 plus screen protector glass,1,Electronics/Cell Phones & Accessories/Screen P...,Apple,8.0,1,•TEMPERED GLASS SCREEN PROTECTOR PRIVACY• Avai...


In [9]:
def price_to_classes(df, n_classes, print_ranges=False):
    """
    Assign every item of a dataframe to a price class (range) of roughly equal size.

    The rows are ordered by price and the price of every k-th item (k = number of rows
    divided by n_classes) is used as a boundary between two neighbouring ranges. Each range
    is half-open: the lower bound is included, the upper bound is excluded. The first range
    always starts at 0 and the last range is open-ended (upper bound is infinity).

    Parameters
    ----------
    df : pandas.DataFrame
        Dataframe containing a numeric column 'price'. It is not modified.
    n_classes : int
        Maximum number of classes to create. Fewer classes are returned when several
        boundaries coincide (many items with the same price) or when the dataframe has
        fewer rows than n_classes, because duplicate boundaries would only create
        empty ranges.
    print_ranges : bool, optional
        If True, print the list of (lower, upper) tuples describing the ranges.

    Returns
    -------
    pandas.DataFrame
        A copy of df with an added column 'price_range' holding labels of the form
        '<lower>-<upper>' (e.g. '0-5.0', '10.0-inf'). Rows whose price is negative or
        NaN match no range and keep NaN in that column.

    Raises
    ------
    ValueError
        If n_classes is smaller than 1.
    """
    if n_classes < 1:
        raise ValueError('n_classes must be a positive integer, got {}'.format(n_classes))

    df_c = df.copy()
    prices = df_c.price
    # Plain Python numbers keep the printed ranges and the labels free of numpy reprs.
    sorted_prices = prices.sort_values().tolist()
    n_items = len(sorted_prices)
    # Ideal number of items in each class; at least 1 so that a dataframe with fewer
    # rows than classes does not lead to a modulo/division by zero.
    per_class = max(1, n_items // n_classes)

    # Range edges: start at 0, then the price of every `per_class`-th item.
    # Only n_classes - 1 interior boundaries are taken, so the leftover rows of an uneven
    # division end up in the last (open-ended) range instead of forming an extra class.
    edges = [0]
    for k in range(1, n_classes):
        position = k * per_class - 1
        if position >= n_items:
            break
        cut = sorted_prices[position]
        # Repeated boundaries (and NaN, which compares False) would only yield empty ranges.
        if cut > edges[-1]:
            edges.append(cut)
    # Extend the upper bound of the last range to infinity.
    edges.append(np.inf)
    ranges = list(zip(edges[:-1], edges[1:]))  # list of (lower, upper) tuples

    # Labels are written positionally through a boolean mask, so rows are labelled
    # correctly even when the dataframe index contains duplicate labels.
    labels = np.full(len(df_c), np.nan, dtype=object)
    for lower_bound, upper_bound in ranges:
        in_range = ((prices >= lower_bound) & (prices < upper_bound)).to_numpy()
        labels[in_range] = '{}-{}'.format(lower_bound, upper_bound)
    df_c['price_range'] = labels

    if print_ranges:
        print(ranges)
    return df_c

In [10]:
# Bucket the category's prices into 5 ranges and print the resulting (lower, upper) bounds.
cat_class = price_to_classes(cat_df, 5, print_ranges=True)

[(0, 5.0), (5.0, 6.0), (6.0, 8.0), (8.0, 10.0), (10.0, inf)]


In [11]:
# Class sizes, to see how evenly the items are spread over the price ranges.
for range_ in cat_class.price_range.unique():
    in_class = cat_class.price_range == range_
    print(f"{range_}: {in_class.sum()} items")

6.0-8.0: 617 items
5.0-6.0: 298 items
8.0-10.0: 512 items
10.0-inf: 455 items
0-5.0: 270 items


### Extract labels

In [12]:
# The labels are simply the price_range column. It already carries cat_class's index, so no
# re-indexing through .loc is needed (that lookup would duplicate rows if index labels
# repeated, leaving y longer than cat_class). A copy keeps y independent of cat_class.
y = cat_class['price_range'].copy()

In [13]:
# Sanity check: the labels are the price-range strings, keyed by the original row labels.
y.head()

1063     6.0-8.0
2480     6.0-8.0
2486     5.0-6.0
2505     6.0-8.0
3500    8.0-10.0
Name: price_range, dtype: object

### Train & Test split

In [14]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows for testing; the fixed random_state makes the split repeatable.
# NOTE(review): no `stratify` argument is passed, so the class proportions of the two parts
# may differ. The frames still contain the 'price' and 'price_range' columns, so only
# explicitly chosen columns should be used as features.
df_train, df_test, y_train, y_test = train_test_split(cat_class, y, test_size=0.25, random_state=42)

## Item description

### Build a custom vocabulary based on all the words in the category

In [15]:
# Project-local vectorizer (not scikit-learn's), constructed from the training rows only.
# Presumably it builds its vocabulary from df_train['item_description'] -- TODO confirm.
count_vectorizer = language_processing.CountVectorizer(df_train, 'item_description')

In [16]:
# Turn both splits into feature matrices using the same vectorizer instance.
X_train = count_vectorizer.extract_features(df_train)
X_test = count_vectorizer.extract_features(df_test)

### Random Forest

In [17]:
from sklearn.ensemble import RandomForestClassifier

# 100 trees limited to depth 10; random_state is fixed so that the fit is repeatable.
clf = RandomForestClassifier(n_estimators=100, max_depth=10, random_state=0)
# Kept as the last expression so the fitted estimator and its parameters are displayed.
clf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=10, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

In [18]:
# Predicted price ranges for the training and the held-out rows, in that order.
pred_train, pred_test = (clf.predict(features) for features in (X_train, X_test))

In [19]:
from sklearn.metrics import accuracy_score

# Share of items whose predicted price range matches the true one, per split.
acc_train = accuracy_score(y_true=y_train, y_pred=pred_train)
acc_test = accuracy_score(y_true=y_test, y_pred=pred_test)

print('Train accuracy: ', acc_train)
print('Test accuracy: ', acc_test)

Train accuracy:  0.4758364312267658
Test accuracy:  0.3643122676579926


# Naive baseline: predicting the most frequent class

In [20]:
# Majority-class baseline: every item gets the most frequent price range of the training labels.
mode = y_train.mode()

# Constant predictions that share the index, name and dtype of the true labels.
pred_train_naive = pd.Series(mode.values[0], index=y_train.index, name=y_train.name, dtype=y_train.dtype)
pred_test_naive = pd.Series(mode.values[0], index=y_test.index, name=y_test.name, dtype=y_test.dtype)

In [21]:
# Accuracy of the constant prediction -- the reference the classifier has to beat.
naive_acc_train = accuracy_score(y_true=y_train, y_pred=pred_train_naive)
naive_acc_test = accuracy_score(y_true=y_test, y_pred=pred_test_naive)
print('Naive train accuracy: ', naive_acc_train)
print('Naive test accuracy: ', naive_acc_test)

Naive train accuracy:  0.29120198265179675
Naive test accuracy:  0.2732342007434944


In [23]:
# Preview of the category frame including the added 'price_range' column.
cat_class.head()

Unnamed: 0,train_id,name,item_condition_id,category_name,brand_name,price,shipping,item_description,price_range
1063,1063,Tempered glass screen protectors,1,Electronics/Cell Phones & Accessories/Screen P...,No Brand,6.0,1,"New, (3) iPhone 6-6s tempered glass screen pro...",6.0-8.0
2480,2480,iPhone 6/6s/7 tempered glass,1,Electronics/Cell Phones & Accessories/Screen P...,No Brand,7.0,1,Brand new free shipping You will get 2 tempere...,6.0-8.0
2486,2486,Galaxy S6 Edge Invisible Shield,1,Electronics/Cell Phones & Accessories/Screen P...,No Brand,5.0,1,INVISIBLE SHIELD for Samsung Galaxy S 6 Edge. ...,5.0-6.0
2505,2505,Tempered Glass for iPhone 7,1,Electronics/Cell Phones & Accessories/Screen P...,Apple,6.0,1,The screen protector is made of specially proc...,6.0-8.0
3500,3500,iPhone 7 plus screen protector glass,1,Electronics/Cell Phones & Accessories/Screen P...,Apple,8.0,1,•TEMPERED GLASS SCREEN PROTECTOR PRIVACY• Avai...,8.0-10.0


In [26]:
# Character count of the item name for the first five rows (a candidate numeric feature).
cat_class.head().name.str.len()

1063    32
2480    28
2486    31
2505    27
3500    36
Name: name, dtype: int64

In [37]:
import feature_extraction

In [42]:
# Project-local extractor; in the output below it yields the columns
# 'name_len' and 'item_description_len'.
bfe = feature_extraction.BaseFeatureExtractor()
bfe.extract_features(cat_class)

Unnamed: 0,name_len,item_description_len
1063,32,89
2480,28,148
2486,31,533
2505,27,753
3500,36,680
3530,37,536
4221,28,851
5246,31,57
5882,30,447
6992,39,711


In [67]:
# Re-import the module so edits made to feature_extraction.py take effect without a kernel
# restart. NOTE(review): `%load_ext autoreload` / `%autoreload 2` in the first cell would make
# this manual step unnecessary and keep the notebook runnable top to bottom.
from importlib import reload
reload(feature_extraction)

<module 'feature_extraction' from '../../lib/feature_extraction.py'>

In [68]:
# Project-local Pipeline (not scikit-learn's) with the base extractor as its only named step.
pipe = feature_extraction.Pipeline(steps=[('base', feature_extraction.BaseFeatureExtractor())])

In [69]:
# Run the pipeline on the category frame; after the reload the output below also contains
# the 'item_condition_id' and 'shipping' columns.
pipe.extract_features(cat_class)

Unnamed: 0,name_len,item_description_len,item_condition_id,shipping
1063,32,89,1,1
2480,28,148,1,1
2486,31,533,1,1
2505,27,753,1,1
3500,36,680,1,1
3530,37,536,1,1
4221,28,851,1,1
5246,31,57,1,0
5882,30,447,1,1
6992,39,711,1,1
