In [1]:
import numpy as np
import pandas as pd

# Load Data: load only partion of the whole dataset

In [2]:
PATH = "../data/"

In [3]:
data = pd.read_csv(f'{PATH}train.tsv', sep='\t', nrows=None)

In [4]:
data.head()

Unnamed: 0,train_id,name,item_condition_id,category_name,brand_name,price,shipping,item_description
0,0,MLB Cincinnati Reds T Shirt Size XL,3,Men/Tops/T-shirts,,10.0,1,No description yet
1,1,Razer BlackWidow Chroma Keyboard,3,Electronics/Computers & Tablets/Components & P...,Razer,52.0,0,This keyboard is in great condition and works ...
2,2,AVA-VIV Blouse,1,Women/Tops & Blouses/Blouse,Target,10.0,1,Adorable top with a hint of lace and a key hol...
3,3,Leather Horse Statues,1,Home/Home Décor/Home Décor Accents,,35.0,1,New with tags. Leather horses. Retail for [rm]...
4,4,24K GOLD plated rose,1,Women/Jewelry/Necklaces,,44.0,0,Complete with certificate of authenticity


In [5]:
# data.info()

In [6]:
len(data)

1000

# Data preparation

## Item Category

Split category_name into main_cat, subcat_1 and subcat_2

In [7]:
# reference: BuryBuryZymon at https://www.kaggle.com/maheshdadhich/i-will-sell-everything-for-free-0-55
def split_cat(text):
    try: return text.split("/")
    except: return ("No Label", "No Label", "No Label")

In [8]:
data['main_cat'], data['subcat_1'], data['subcat_2'] = \
zip(*data['category_name'].apply(lambda x: split_cat(x)))
# data.head()

## Overview

In [9]:
print("Number of unique fields:\n")

print("main_cat: \t%d" % data['main_cat'].nunique())
print("subcat_1: \t%d" % data['subcat_1'].nunique())
print("subcat_2: \t%d" % data['subcat_2'].nunique())
print("brand_name: \t%d" % data['brand_name'].nunique())
print()

print("%d items have no category" % len(data.loc[data['main_cat'] == 'No Label']))
print("%d items have no brand" % data['brand_name'].isna().sum())

Number of unique fields:

main_cat: 	11
subcat_1: 	78
subcat_2: 	216
brand_name: 	229

4 items have no category
418 items have no brand


## Numerically represent features

### train_id: copy

Create new DataFrame called data_num for numerical representations

In [10]:
data_num = pd.DataFrame(data.loc[:,'train_id'], columns=['train_id'])

### name: represent name by its length as name_len

In [11]:
name_len = data['name'].str.len()
data_num['name_len'] = name_len

### item_condition_id, price, shipping: copy

In [12]:
data_num[['item_condition_id', 'price', 'shipping']] = data[['item_condition_id', 'price', 'shipping']]

### item_description

Represent item_description by its length as item_description_len

In [13]:
item_description_len = data['item_description'].str.len()
data_num['item_description_len'] = item_description_len

In [14]:
data_num.head()

Unnamed: 0,train_id,name_len,item_condition_id,price,shipping,item_description_len
0,0,35,3,10.0,1,18
1,1,32,3,52.0,0,188
2,2,14,1,10.0,1,124
3,3,21,1,35.0,1,173
4,4,20,1,44.0,0,41


In [15]:
data.head()

Unnamed: 0,train_id,name,item_condition_id,category_name,brand_name,price,shipping,item_description,main_cat,subcat_1,subcat_2
0,0,MLB Cincinnati Reds T Shirt Size XL,3,Men/Tops/T-shirts,,10.0,1,No description yet,Men,Tops,T-shirts
1,1,Razer BlackWidow Chroma Keyboard,3,Electronics/Computers & Tablets/Components & P...,Razer,52.0,0,This keyboard is in great condition and works ...,Electronics,Computers & Tablets,Components & Parts
2,2,AVA-VIV Blouse,1,Women/Tops & Blouses/Blouse,Target,10.0,1,Adorable top with a hint of lace and a key hol...,Women,Tops & Blouses,Blouse
3,3,Leather Horse Statues,1,Home/Home Décor/Home Décor Accents,,35.0,1,New with tags. Leather horses. Retail for [rm]...,Home,Home Décor,Home Décor Accents
4,4,24K GOLD plated rose,1,Women/Jewelry/Necklaces,,44.0,0,Complete with certificate of authenticity,Women,Jewelry,Necklaces


### Define function for making binary columns

In [16]:
def make_binary_columns(df_str, df_num, column_name):
    """
    Turns a single column named column_name (with various categories) into m binary columns, where m is the number
    of unique values in the original column. For each sample, the value for all new columns is 0 apart the one
    that matches the value of the original column. Names of new binary columns are formed as follows:
    column_name + '_' + str(original column value)
    Inputs:
        two pandas DataFrames: df_str where a single column contains information about given category
                               df_num that will later contain many binary columns
        column_name: name of the column that will be split into several binary columns
    Returns df_num
    """
    df_str_c = df_str.copy()
    df_num_c = df_num.copy()
    
    m = len(df_str_c)
    if (m != len(df_num_c)):
        raise Exception("df_str and df_num must have the same size.")
 
    categories = df_str_c[column_name].unique()

    from tqdm import tqdm_notebook # progress bar

    # Create a new feature for each category and initialize it to 0
    for i in tqdm_notebook(categories, desc='1/2'):
        df_num_c[column_name + '_' + str(i)] = 0

    # Loop thorugh all rows and assign 1 to the column whose name is the same as category
    for i in tqdm_notebook(df_str_c.index, desc='2/2'): # loop through all rows
        category = str(df_str_c.at[i, column_name])
        df_num_c.at[i, column_name + '_' + category] = 1
    return df_num_c

### brand_name: for each unique one create new binary feature

In [17]:
data_num = make_binary_columns(data, data_num, 'brand_name')

HBox(children=(IntProgress(value=0, description='1/2', max=230), HTML(value='')))




HBox(children=(IntProgress(value=0, description='2/2', max=1000), HTML(value='')))




In [18]:
# data_num.head()

In [19]:
# data.head()

### main_cat, subcat_1, subcat_2: for each unique one create new binary feature

In [20]:
data_num = make_binary_columns(data, data_num, 'main_cat')
data_num = make_binary_columns(data, data_num, 'subcat_1')
data_num = make_binary_columns(data, data_num, 'subcat_2')

HBox(children=(IntProgress(value=0, description='1/2', max=11), HTML(value='')))




HBox(children=(IntProgress(value=0, description='2/2', max=1000), HTML(value='')))




HBox(children=(IntProgress(value=0, description='1/2', max=78), HTML(value='')))




HBox(children=(IntProgress(value=0, description='2/2', max=1000), HTML(value='')))




HBox(children=(IntProgress(value=0, description='1/2', max=216), HTML(value='')))




HBox(children=(IntProgress(value=0, description='2/2', max=1000), HTML(value='')))




# Split data and extract X and y

In [31]:
from sklearn.model_selection import train_test_split
train, test = train_test_split(data_num, test_size = 0.25, random_state=None) # randomly split data

NameError: name 'data_num' is not defined

In [None]:
y_train = train.pop('price')
y_test = test.pop('price')

In [32]:
X_train = train
X_test = test

NameError: name 'train' is not defined

# Scale data

In [33]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaler.fit(X_train) # Compute the mean adn std of training data to be used for future scaling

X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

NameError: name 'X_train' is not defined

In [34]:
import dill                            #pip install dill --user
dill.dump_session('splittedData.pkl')

# Delete data that are not needed anymore

In [35]:
# del data, data_num, train, test, X_train, X_test

NameError: name 'data' is not defined

# Apply linear regression

### Performance measure: RMSE

$$\text{RMSE} \left( \mathbf{Y} , \mathbf{\hat{Y}} \right) = \sqrt{\frac{1}{n} \sum_{i=1}^n \left( y_i - \hat{y_i} \right)^2 } $$

In [36]:
def rmse(y_test, y_pred):
    from sklearn.metrics import mean_squared_error
    return np.sqrt(mean_squared_error(y_test, y_pred))

In [28]:
# Fit linear regression
from sklearn.linear_model import LinearRegression
lnr_regr = LinearRegression()
print("Trainig...")
lnr_regr.fit(X_train_scaled, y_train)
print("Training done.")

Trainig...
Training done.


In [29]:
dill.dump_session('linearModel.pkl')

In [30]:
# Make predictions and report train and test RMSEs

print("Evaluating performance on the training set...")
pred_train = lnr_regr.predict(X_train_scaled)
rmse_train = rmse(y_train, pred_train)
print("Training set RMSE: %.2f" % rmse_train)

print("Evaluating performance on the test test...")
pred_test = lnr_regr.predict(X_test_scaled)
rmse_test = rmse(y_test, pred_test)
print("Test set RMSE: %.2f" % rmse_test)

Evaluating performance on the training set...
Training set RMSE: 50.46
Evaluating performance on the test test...
Test set RMSE: 5752071901278255.00


In [4]:
# import dill

In [6]:
# dill.load_session('linearModel.pkl')

In [7]:
# who

In	 LinearRegression	 Out	 PATH	 StandardScaler	 X_test_scaled	 X_train_scaled	 dill	 item_description_len	 
lnr_regr	 make_binary_columns	 name_len	 np	 pd	 pickle	 rmse	 scaler	 split_cat	 
train_test_split	 y_test	 y_train	 
