In [1]:
# memory usage for jupyter
!pip install nbresuse
# progress bar
!pip install tqdm
# other libraries
!pip install pandas numpy sklearn



In [2]:
import numpy as np
import pandas as pd

# Settings

In [3]:
settings = {
    # Number of rows kept after the shuffle; None keeps every row.
    'data_size': None,
    # Fraction of the data set assigned to the test split.
    'test_size': 0.25,
    # Whether to save the environment.
    'save_env': False,
    # Delete variables that are no longer needed, to free memory.
    'del': True,
    # File for the training and test RMSEs; the current date (yymmdd) is prepended.
    'filename_str': 'learning_curve.csv',
    # Training set sizes used to generate the learning curve.
    # For a quick trial run use start=50, stop=1_025, step=25 instead.
    'learning_curve': {
        'start': 50_000,    # first training set size (inclusive)
        'stop': 1_025_000,  # training set size limit (exclusive)
        'step': 25_000,     # increase between iterations
    },
    # Fixed random states so that the results are repeatable.
    'random_state': {
        'shuffle': 42,  # sklearn's shuffle method
        'split': 17,    # sklearn's train_test_split method
    },
}

# Load Data: load dataset, shuffle it and take subset

In [4]:
PATH = "../../../data/"

In [5]:
data_full = pd.read_csv(f'{PATH}train.tsv', sep='\t')

In [6]:
data_full.head()

Unnamed: 0,train_id,name,item_condition_id,category_name,brand_name,price,shipping,item_description
0,0,MLB Cincinnati Reds T Shirt Size XL,3,Men/Tops/T-shirts,,10.0,1,No description yet
1,1,Razer BlackWidow Chroma Keyboard,3,Electronics/Computers & Tablets/Components & P...,Razer,52.0,0,This keyboard is in great condition and works ...
2,2,AVA-VIV Blouse,1,Women/Tops & Blouses/Blouse,Target,10.0,1,Adorable top with a hint of lace and a key hol...
3,3,Leather Horse Statues,1,Home/Home Décor/Home Décor Accents,,35.0,1,New with tags. Leather horses. Retail for [rm]...
4,4,24K GOLD plated rose,1,Women/Jewelry/Necklaces,,44.0,0,Complete with certificate of authenticity


In [7]:
len(data_full)

1482535

In [8]:
from sklearn.utils import shuffle
data_shuffled = shuffle(data_full, random_state=settings['random_state']['shuffle'])

In [9]:
if (settings['del']):
    del data_full

In [10]:
data = data_shuffled.iloc[:settings['data_size'], :]

In [11]:
if (settings['del']):
    del data_shuffled

In [12]:
len(data)

1482535

In [13]:
data.head()

Unnamed: 0,train_id,name,item_condition_id,category_name,brand_name,price,shipping,item_description
777341,777341,F/ship 4 Totoro Washi + 1 pen,1,Handmade/Paper Goods/Stationery,,12.0,1,This listing is for all 4 Totoro washi tape fo...
1463629,1463629,UCLA Men's Bundle + Shorts,1,Women/Other/Other,Adidas,76.0,1,7 items. 1: XL. 2: 2XL. 3:2XL. 4: XL. 5: 2XL. ...
350669,350669,Listing for lol,1,Beauty/Makeup/Lips,,12.0,1,- sunglasses and necklace :)
310222,310222,25 pcs kawaii sticker flakes,1,Kids/Toys/Arts & Crafts,,3.0,1,I ordered a bunch of stickers so you will reci...
759257,759257,Chanel Mini Lipgloss Set,2,Beauty/Makeup/Lips,Chanel,30.0,1,Brand new never used authentic Mini Lipgloss g...


# Data preparation

## Item Category

Split category_name into main_cat, subcat_1 and subcat_2

In [14]:
# reference: BuryBuryZymon at https://www.kaggle.com/maheshdadhich/i-will-sell-everything-for-free-0-55
def split_cat(text):
    """Split a slash-separated category path into its levels.

    Parameters
    ----------
    text : object
        Category path such as "Women/Tops & Blouses/Blouse". Anything that is
        not a string (for example the NaN pandas uses for a missing category)
        is treated as "no category".

    Returns
    -------
    list of str
        The pieces of ``text`` between "/" separators when ``text`` is a string.
    tuple of str
        ("No Label", "No Label", "No Label") when ``text`` is not a string.
    """
    # Explicit type check instead of a bare `except:`, which would also have
    # swallowed KeyboardInterrupt / SystemExit while the column is processed.
    if not isinstance(text, str):
        return ("No Label", "No Label", "No Label")
    return text.split("/")

In [15]:
# Split every category path into its levels, then transpose the per-row
# results so that each level can be stored as its own column.
category_levels = zip(*data.loc[:, 'category_name'].apply(split_cat))
data.loc[:, 'main_cat'], data.loc[:, 'subcat_1'], data.loc[:, 'subcat_2'] = category_levels
# data.head()

In [16]:
data[:15]

Unnamed: 0,train_id,name,item_condition_id,category_name,brand_name,price,shipping,item_description,main_cat,subcat_1,subcat_2
777341,777341,F/ship 4 Totoro Washi + 1 pen,1,Handmade/Paper Goods/Stationery,,12.0,1,This listing is for all 4 Totoro washi tape fo...,Handmade,Paper Goods,Stationery
1463629,1463629,UCLA Men's Bundle + Shorts,1,Women/Other/Other,Adidas,76.0,1,7 items. 1: XL. 2: 2XL. 3:2XL. 4: XL. 5: 2XL. ...,Women,Other,Other
350669,350669,Listing for lol,1,Beauty/Makeup/Lips,,12.0,1,- sunglasses and necklace :),Beauty,Makeup,Lips
310222,310222,25 pcs kawaii sticker flakes,1,Kids/Toys/Arts & Crafts,,3.0,1,I ordered a bunch of stickers so you will reci...,Kids,Toys,Arts & Crafts
759257,759257,Chanel Mini Lipgloss Set,2,Beauty/Makeup/Lips,Chanel,30.0,1,Brand new never used authentic Mini Lipgloss g...,Beauty,Makeup,Lips
288846,288846,Maroon Foamposites,3,Men/Shoes/Fashion Sneakers,Nike,225.0,1,9/10 Condition N Sz 12,Men,Shoes,Fashion Sneakers
1178450,1178450,INC studdedHeart Black Blouse Dolman,2,Women/Tops & Blouses/Blouse,INC International Concepts,16.0,1,New without tags INC International Concepts Sh...,Women,Tops & Blouses,Blouse
726296,726296,Leggo silicone molds,3,Home/Kitchen & Dining/Bakeware,,12.0,0,I used these for my son's leggo birthday party...,Home,Kitchen & Dining,Bakeware
840510,840510,Supreme Uzi Chain,1,Handmade/Accessories/Men,,15.0,1,10/10 New,Handmade,Accessories,Men
1473033,1473033,Women Gold Palm Pendant Necklace FC,1,Vintage & Collectibles/Jewelry/Necklace,,17.0,1,High quality Immediate purchase Ok? Free shipp...,Vintage & Collectibles,Jewelry,Necklace


## Overview

In [17]:
print("Number of unique fields:\n")

print("main_cat: \t%d" % data['main_cat'].nunique())
print("subcat_1: \t%d" % data['subcat_1'].nunique())
print("subcat_2: \t%d" % data['subcat_2'].nunique())
print("brand_name: \t%d" % data['brand_name'].nunique())
print()

print("%d items have no category" % len(data.loc[data['main_cat'] == 'No Label']))
print("%d items have no brand" % data['brand_name'].isna().sum())

Number of unique fields:

main_cat: 	11
subcat_1: 	114
subcat_2: 	871
brand_name: 	4809

6327 items have no category
632682 items have no brand


## Numerically represent features

### train_id: copy

Create new DataFrame called data_num for numerical representations

In [17]:
data_num = pd.DataFrame(data.loc[:,'train_id'], columns=['train_id'])

### name: represent name by its length as name_len

In [18]:
data_num.loc[:,'name_len'] = data['name'].str.len()

### item_condition_id, price, shipping: copy

In [19]:
data_num[['item_condition_id', 'price', 'shipping']] = data.loc[:,['item_condition_id', 'price', 'shipping']]

### item_description

Represent item_description by its length as item_description_len

In [20]:
item_description_len = data.loc[:,'item_description'].str.len()
data_num['item_description_len'] = item_description_len

# Replace NaN in data_num.item_description_len column by zeros
data_num['item_description_len'] = data_num['item_description_len'].fillna(0)

# Change data type of this column to uint16 provided the max val is less than 65535
if (data_num.item_description_len.max() < 65535):
    data_num['item_description_len'] = data_num['item_description_len'].astype(np.uint16)

In [21]:
data_num.head()

Unnamed: 0,train_id,name_len,item_condition_id,price,shipping,item_description_len
777341,777341,29,1,12.0,1,158
1463629,1463629,26,1,76.0,1,57
350669,350669,15,1,12.0,1,28
310222,310222,28,1,3.0,1,68
759257,759257,24,2,30.0,1,53


In [22]:
# data.head()

### Define function for making binary columns

In [23]:
def make_binary_columns(df_str, df_num, column_name):
    """
    One-hot encode one categorical column of df_str into binary columns of df_num.

    Turns a single column named column_name (with various categories) into k binary columns, where k is the number
    of unique values in the original column. For each sample, the value of all new columns is 0 apart from the one
    that matches the value of the original column. Names of new binary columns are formed as follows:
    column_name + '_' + str(original column value)
    Missing values are not dropped: they end up in the column named after str() of the missing value
    (column_name + '_nan' for NaN).
    Inputs:
        two pandas DataFrames: df_str where a single column contains information about given category
                               df_num that will receive the binary columns (dtype int8)
        column_name: name of the column that will be split into several binary columns
    Returns nothing. It appends the columns directly into df_num to avoid building a second large frame.
    Raises:
        ValueError if df_str and df_num do not have the same number of rows.
    Rows are matched through the index of df_str, so both frames are expected to share the same index.
    """
    m = len(df_str)
    if m != len(df_num):
        raise ValueError("df_str and df_num must have the same size.")

    # The progress bar is a convenience only; fall back to plain iteration
    # when tqdm is not installed.
    try:
        from tqdm.auto import tqdm
    except ImportError:
        def tqdm(iterable, **_unused):
            return iterable

    # Apply str() to every value first so the labels match the column-name
    # suffixes exactly (a missing value becomes the label 'nan'). factorize
    # then gives each row the integer code of its label, with labels listed
    # in order of first appearance.
    codes, labels = pd.factorize(df_str[column_name].map(str))

    # One vectorised comparison per category replaces the former per-row loop.
    for code, label in enumerate(tqdm(labels, desc=column_name)):
        indicator = (codes == code).astype(np.int8)
        df_num[column_name + '_' + label] = pd.Series(indicator, index=df_str.index)

### brand_name: for each unique one create new binary feature

In [24]:
make_binary_columns(data, data_num, 'brand_name')

HBox(children=(IntProgress(value=0, description='1/2', max=4810, style=ProgressStyle(description_width='initia…




HBox(children=(IntProgress(value=0, description='2/2', max=1482535, style=ProgressStyle(description_width='ini…




In [25]:
# data.head()

### main_cat, subcat_1, subcat_2: for each unique one create new binary feature

In [26]:
make_binary_columns(data, data_num, 'main_cat')
make_binary_columns(data, data_num, 'subcat_1')
make_binary_columns(data, data_num, 'subcat_2')

HBox(children=(IntProgress(value=0, description='1/2', max=11, style=ProgressStyle(description_width='initial'…




HBox(children=(IntProgress(value=0, description='2/2', max=1482535, style=ProgressStyle(description_width='ini…




HBox(children=(IntProgress(value=0, description='1/2', max=114, style=ProgressStyle(description_width='initial…




HBox(children=(IntProgress(value=0, description='2/2', max=1482535, style=ProgressStyle(description_width='ini…




HBox(children=(IntProgress(value=0, description='1/2', max=871, style=ProgressStyle(description_width='initial…




HBox(children=(IntProgress(value=0, description='2/2', max=1482535, style=ProgressStyle(description_width='ini…




In [27]:
data_num.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1482535 entries, 777341 to 121958
Columns: 5812 entries, train_id to subcat_2_Child Friendly
dtypes: float64(1), int64(4), int8(5806), uint16(1)
memory usage: 8.1 GB


In [28]:
if (settings['del']):
    del data

In [29]:
data_num.head()

Unnamed: 0,train_id,name_len,item_condition_id,price,shipping,item_description_len,brand_name_nan,brand_name_Adidas,brand_name_Chanel,brand_name_Nike,...,subcat_2_Entertainment,subcat_2_Bathroom Shelves,subcat_2_Suits & Blazers,subcat_2_Seasonal,subcat_2_Bathroom,subcat_2_Tiles,subcat_2_Dishwashers,subcat_2_Ephemera,subcat_2_Towel,subcat_2_Child Friendly
777341,777341,29,1,12.0,1,158,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1463629,1463629,26,1,76.0,1,57,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
350669,350669,15,1,12.0,1,28,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
310222,310222,28,1,3.0,1,68,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
759257,759257,24,2,30.0,1,53,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


# Split data and extract X, y and train_id

In [30]:
from sklearn.model_selection import train_test_split
X_train_unscaled, X_test_unscaled = train_test_split(data_num, test_size = settings['test_size'], random_state=settings['random_state']['split']) # randomly split data
# ! X_train_unscaled and X_test_unscaled STILL CONTAINS PRICE AT THIS MOMENT !

In [31]:
if (settings['del']):
    del data_num

In [32]:
# Pop price from X_train and X_test
y_train = X_train_unscaled.pop('price')
y_test = X_test_unscaled.pop('price')

In [33]:
# Pop id_train from both training and test data set

id_train = X_train_unscaled.pop('train_id')
id_test = X_test_unscaled.pop('train_id')

# Scale data

In [34]:
from sklearn.preprocessing import MaxAbsScaler

# Work on copies so the unscaled frames stay intact until they are deleted below.
X_train = X_train_unscaled.copy()
X_test = X_test_unscaled.copy()

# Only these numeric columns are rescaled; every other column is left as is.
columns_to_scale = ['name_len', 'item_condition_id', 'item_description_len']

scaler = MaxAbsScaler()
scaler.fit(X_train_unscaled[columns_to_scale]) # Learn the scaling from the training data only (MaxAbsScaler uses each column's maximum absolute value, not mean and std)

# Apply the training-set scaling to the training data, keeping the original row index.
X_train[columns_to_scale] = pd.DataFrame(scaler.transform(X_train_unscaled[columns_to_scale]), index=X_train_unscaled.index, columns=columns_to_scale)
if (settings['del']):
    del X_train_unscaled

# The test data is transformed with the scaler fitted on the training data; it is never re-fitted.
X_test[columns_to_scale] = pd.DataFrame(scaler.transform(X_test_unscaled[columns_to_scale]), index=X_test_unscaled.index, columns=columns_to_scale)
if (settings['del']):
    del X_test_unscaled

In [35]:
# X_train_unscaled.info()

In [36]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1111901 entries, 596936 to 75785
Columns: 5810 entries, name_len to subcat_2_Child Friendly
dtypes: float64(3), int64(1), int8(5806)
memory usage: 6.1 GB


In [37]:
X_train.head()

Unnamed: 0,name_len,item_condition_id,shipping,item_description_len,brand_name_nan,brand_name_Adidas,brand_name_Chanel,brand_name_Nike,brand_name_INC International Concepts,brand_name_Starbucks,...,subcat_2_Entertainment,subcat_2_Bathroom Shelves,subcat_2_Suits & Blazers,subcat_2_Seasonal,subcat_2_Bathroom,subcat_2_Tiles,subcat_2_Dishwashers,subcat_2_Ephemera,subcat_2_Towel,subcat_2_Child Friendly
596936,0.302326,0.2,0,0.028487,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
856984,0.627907,0.2,1,0.112967,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1055728,0.418605,0.2,0,0.098232,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
256961,0.418605,0.4,0,0.068762,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1217735,0.72093,0.6,1,0.194499,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [38]:
# X_test_unscaled.info()

In [39]:
X_test.head()

Unnamed: 0,name_len,item_condition_id,shipping,item_description_len,brand_name_nan,brand_name_Adidas,brand_name_Chanel,brand_name_Nike,brand_name_INC International Concepts,brand_name_Starbucks,...,subcat_2_Entertainment,subcat_2_Bathroom Shelves,subcat_2_Suits & Blazers,subcat_2_Seasonal,subcat_2_Bathroom,subcat_2_Tiles,subcat_2_Dishwashers,subcat_2_Ephemera,subcat_2_Towel,subcat_2_Child Friendly
1296364,0.790698,0.6,1,0.163065,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
822285,0.55814,0.2,0,0.059921,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
81175,0.55814,0.2,0,0.166994,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1193836,0.348837,0.4,0,0.044204,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
648318,0.860465,0.2,0,0.030452,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [40]:
# X_train.to_csv('X_train.csv')
# X_train_scaled.to_csv('X_train_scaled.csv')
# X_test.to_csv('X_test.csv')
# X_test_scaled.to_csv('X_test_scaled.csv')

In [41]:
if (settings['save_env']):
    import dill                            #pip install dill --user
    dill.dump_session('splittedData.pkl')

# Check size of variables

In [42]:
import sys
def sizeof_fmt(num, suffix='B'):
    """Format a size as a short string with a binary (1024-based) unit prefix.

    By Fred Cirera, after https://stackoverflow.com/a/1094933/1870254

    The value is divided by 1024 until its magnitude drops below 1024 and is
    then printed with one decimal, the matching prefix and ``suffix``.
    Values too large for the 'Zi' prefix are reported with 'Yi'.
    """
    prefixes = ('', 'Ki', 'Mi', 'Gi', 'Ti', 'Pi', 'Ei', 'Zi')
    remaining = num
    position = 0
    while position < len(prefixes):
        if abs(remaining) < 1024.0:
            return "%3.1f%s%s" % (remaining, prefixes[position], suffix)
        remaining = remaining / 1024.0
        position += 1
    # Still at least 1024 after all eight prefixes: fall through to 'Yi'.
    return "%.1f%s%s" % (remaining, 'Yi', suffix)

# Print the ten largest objects reachable through locals(), biggest first.
# Sizes come from sys.getsizeof; sorting on the negated size gives descending order.
# The sorted list is fully built before the loop binds `name` and `size`,
# so the namespace is not modified while it is being iterated.
for name, size in sorted(((name, sys.getsizeof(value)) for name,value in locals().items()),
                         key= lambda x: -x[1])[:10]:
    # Right-align the name in 30 characters and the formatted size in 8.
    print("{:>30}: {:>8}".format(name,sizeof_fmt(size)))

                       X_train:   6.1GiB
                        X_test:   2.0GiB
          item_description_len:  62.6MiB
                       y_train:  17.0MiB
                      id_train:  17.0MiB
                        y_test:   5.7MiB
                       id_test:   5.7MiB
                           ___:  28.6KiB
                           _29:  28.6KiB
                             _:  28.6KiB


# Performance measure: RMSE

$$\text{RMSE} \left( \mathbf{Y} , \mathbf{\hat{Y}} \right) = \sqrt{\frac{1}{n} \sum_{i=1}^n \left( y_i - \hat{y_i} \right)^2 } $$

In [43]:
def rmse(y_test, y_pred):
    """Return the root mean squared error between true and predicted values.

    y_test: the true target values.
    y_pred: the predicted target values.
    The result is the square root of sklearn's mean_squared_error(y_test, y_pred).
    """
    from sklearn.metrics import mean_squared_error
    mse = mean_squared_error(y_test, y_pred)
    return np.sqrt(mse)

# Create directory and file for saving data

In [None]:
# Create directory for saving data if it does not already exist
import os

# Make sure the output directory exists and report which case applied.
if not os.path.isdir('generated_data'):
    os.mkdir('generated_data')
    print("Successfully created the directory called generated_data")
else:
    print("Directory generated_data already exists")

In [None]:
# Create file with headings to save training and test RMSEs for learning curves
import datetime
# The file name is today's date (yymmdd), an underscore, then the configured suffix.
date_prefix = datetime.datetime.now().strftime('%y%m%d')
filename = '_'.join([date_prefix, settings['filename_str']])
# Mode 'w' starts the file from scratch, so it contains only the header row.
with open('generated_data/' + filename, 'w') as f:
    f.write('training_set_size,training_error,test_error\n')

# Learning curve for linear regression

In [None]:
from sklearn.linear_model import LinearRegression
from tqdm.auto import tqdm # progress bar (tqdm_notebook is deprecated)

# Per-iteration results, kept in memory for plotting the learning curves
training_set_sizes = []
train_rmses = []
test_rmses = []

curve = settings['learning_curve']

# Generate learning curves and save them
for m in tqdm(range(curve['start'], curve['stop'], curve['step'])):
    # Slice dataset: use only the first m training rows
    X_train_red = X_train[:m]
    y_train_red = y_train[:m]

    # Apply linear regression (a fresh model for every training set size)
    lnr_regr = LinearRegression(n_jobs=-1)
    print("Trainig for training set size of: " + str(m) + "...")
    lnr_regr.fit(X_train_red, y_train_red)

    # Make predictions and compute RMSEs
    pred_train = lnr_regr.predict(X_train_red)
    rmse_train = rmse(y_train_red, pred_train)
    print("Training set RMSE: %.2f" % rmse_train)
    # The test error is always measured on the complete test set
    pred_test = lnr_regr.predict(X_test)
    rmse_test = rmse(y_test, pred_test)
    print("Test set RMSE: %.2f" % rmse_test)

    # Record the results so the lists initialised above can be plotted
    training_set_sizes.append(m)
    train_rmses.append(rmse_train)
    test_rmses.append(rmse_test)

    # Save to csv file (appended after every iteration, so partial runs are kept)
    print("Saving to file...")
    with open('generated_data/' + filename, 'a') as f:
        f.write(str(m) + ',' + str(rmse_train) + ',' + str(rmse_test) +'\n')
    print("")
print("Done.")
print("Your training and test RMSEs are saved in generated_data/" + filename)

# Apply linear regression to the full training set, compute training and test RMSE and add them to the file created above

In [None]:
# Fit linear regression
# Fit linear regression on the complete training set
from sklearn.linear_model import LinearRegression
lnr_regr = LinearRegression(n_jobs=-1)
print("Trainig...")
lnr_regr.fit(X_train, y_train)
print("Training done.")

if (settings['save_env']):
    # Import here so this cell does not rely on another cell having imported dill
    import dill                            #pip install dill --user
    dill.dump_session('linearModel.pkl')

# Make predictions and report train and test RMSEs

print("Evaluating performance on the training set...")
pred_train = lnr_regr.predict(X_train)
rmse_train = rmse(y_train, pred_train)
print("Training set RMSE: %.2f" % rmse_train)

print("Evaluating performance on the test test...")
pred_test = lnr_regr.predict(X_test)
rmse_test = rmse(y_test, pred_test)
print("Test set RMSE: %.2f" % rmse_test)

# Save to csv file: one more row, with the full training set size in the first column
print("Saving to file...")
with open('generated_data/' + filename, 'a') as f:
    f.write(str(len(X_train)) + ',' + str(rmse_train) + ',' + str(rmse_test) +'\n')
print("Done.")

# Gradient Descent

In [None]:
eta = 0.1  # learning rate
n_iterations = 1000
m, n = X_train.shape  # m samples, n features

# NOTE: this builds a dense float64 array of shape (m, n + 1). For the full
# training set shown above (1111901 x 5810) that is roughly 50 GB.
X_b = np.c_[np.ones((m, 1)), X_train]  # add x0 = 1 to each instance

# One weight per column of X_b: n features plus the bias column, hence n + 1 rows.
theta = np.random.randn(n + 1, 1)  # random initialization

In [None]:
if (settings['del']):
    del X_train

In [None]:
# Targets as a column vector so the residuals have the same (m, 1) shape as X_b.dot(theta)
y_col = np.asarray(y_train).reshape(-1, 1)

for i in range(n_iterations):
    print("Iteration: ", i)
    # Gradient of the mean squared error with respect to theta
    gradients = 2/m * X_b.T.dot(X_b.dot(theta) - y_col)
    theta = theta - eta * gradients
    # Predictions are a matrix product, not an element-wise one
    pred = X_b.dot(theta)
    # Use a separate name for the result so the rmse() function is not overwritten
    rmse_iter = rmse(y_col, pred)
    print(rmse_iter)

# Stochastic Gradient Descent


In [45]:
from sklearn.linear_model import SGDRegressor

# No regularisation. Pass None rather than the string 'none': recent
# scikit-learn releases validate parameters and reject the string, while
# None is accepted by old and new releases alike.
sgd_clf = SGDRegressor(penalty=None)

print("Trainig...")
sgd_clf.fit(X_train, y_train)
print("Training done.")

Trainig...




Training done.


In [48]:
# Evaluate the fitted SGD model: RMSE on the data it was trained on ...
# (pred_train / rmse_train and pred_test / rmse_test are rebound here, replacing
# the values computed for the linear regression model)
print("Evaluating performance on the training set...")
pred_train = sgd_clf.predict(X_train)
rmse_train = rmse(y_train, pred_train)
print("Training set RMSE: %.2f" % rmse_train)

# ... and RMSE on the held-out test split.
print("Evaluating performance on the test test...")
pred_test = sgd_clf.predict(X_test)
rmse_test = rmse(y_test, pred_test)
print("Test set RMSE: %.2f" % rmse_test)

Evaluating performance on the training set...
Training set RMSE: 33.29
Evaluating performance on the test test...
Test set RMSE: 32.62


In [49]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1111901 entries, 596936 to 75785
Columns: 5810 entries, name_len to subcat_2_Child Friendly
dtypes: float64(3), int64(1), int8(5806)
memory usage: 6.1 GB


In [53]:
y_train.mean()

26.757328665052015

In [None]:
# import dill

In [None]:
# dill.load_session('linearModel.pkl')

In [None]:
# who