In [None]:
import numpy as np
import pandas as pd

# Settings

In [None]:
# Notebook-wide knobs, read by the cells below.
settings = {
    # number of rows kept after shuffling; None keeps every row
    'data_size': None,
    # fraction of the rows held out as test data
    'test_size': 0.25,
    # when True, the interpreter session is dumped to disk with dill
    'save_env': False,
}

# Load Data: load dataset, shuffle it and take subset

In [None]:
# Relative path (with trailing slash) of the directory train.tsv is read from
PATH = "../data/"

In [None]:
# Read the complete tab-separated training set into memory
data_full = pd.read_csv(
    f'{PATH}train.tsv',
    sep='\t',
)

In [None]:
data_full.head()

In [None]:
len(data_full)

In [None]:
from sklearn.utils import shuffle

# Shuffle the rows with a fixed seed so that "Restart & Run All" always selects
# the same subset. Add a 'random_state' entry to `settings` to pick another seed
# (None restores the previous unseeded behaviour).
data_shuffled = shuffle(data_full, random_state=settings.get('random_state', 0))

In [None]:
# Keep the first `data_size` shuffled rows (None keeps all of them). The explicit
# copy makes `data` an independent frame, so the columns assigned to it later
# are written to `data` itself and pandas does not emit SettingWithCopyWarning.
data = data_shuffled.iloc[:settings['data_size'], :].copy()

In [None]:
len(data)

In [None]:
data.head()

# Data preparation

## Item Category

Split category_name into main_cat, subcat_1 and subcat_2

In [None]:
# reference: BuryBuryZymon at https://www.kaggle.com/maheshdadhich/i-will-sell-everything-for-free-0-55
def split_cat(text):
    """
    Split a '/'-separated category path into exactly three levels.

    Inputs:
        text: category string such as "Men/Tops/T-shirts"; anything that is not
              a string (e.g. NaN for a missing category) counts as unlabeled
    Returns a 3-tuple (main category, subcategory 1, subcategory 2). Levels that
    are absent are filled with "No Label"; levels beyond the third are dropped,
    so every row yields the three values the caller unpacks.
    """
    missing = "No Label"
    # Missing categories arrive as float NaN, which has no .split()
    if not isinstance(text, str):
        return (missing, missing, missing)

    levels = text.split("/")[:3]
    # Pad short paths so the result always has three entries
    levels.extend([missing] * (3 - len(levels)))
    return tuple(levels)

In [None]:
# Split every category path and fan the levels out into three new columns
data.loc[:, 'main_cat'], data.loc[:, 'subcat_1'], data.loc[:, 'subcat_2'] = zip(
    *data.loc[:, 'category_name'].apply(split_cat)
)
# data.head()

In [None]:
data.head()

## Overview

In [None]:
# Cardinality of every categorical column
print("Number of unique fields:\n")

print("main_cat: \t%d" % data.main_cat.nunique())
print("subcat_1: \t%d" % data.subcat_1.nunique())
print("subcat_2: \t%d" % data.subcat_2.nunique())
print("brand_name: \t%d" % data.brand_name.nunique())
print()

# Rows without a category / without a brand
print("%d items have no category" % (data['main_cat'] == 'No Label').sum())
print("%d items have no brand" % data['brand_name'].isnull().sum())

## Numerically represent features

### train_id: copy

Create new DataFrame called data_num for numerical representations

In [None]:
# Start the numeric frame from the id column; it keeps the row index of `data`
data_num = data['train_id'].to_frame(name='train_id')

### name: represent name by its length as name_len

In [None]:
# Number of characters in every item name
name_len = data.loc[:, 'name'].str.len()
data_num['name_len'] = name_len

### item_condition_id, price, shipping: copy

In [None]:
# These columns are already numeric, so they are taken over unchanged
data_num[['item_condition_id', 'price', 'shipping']] = data[
    ['item_condition_id', 'price', 'shipping']
]

### item_description

Represent item_description by its length as item_description_len

In [None]:
# Number of characters in every description (NaN where the description is missing)
item_description_len = data.loc[:,'item_description'].str.len()
data_num['item_description_len'] = item_description_len

# A missing description counts as length 0
data_num['item_description_len'] = data_num['item_description_len'].fillna(0)

# Downcast to uint16 whenever every length fits; the bound is inclusive because
# np.iinfo(np.uint16).max (65535) is itself representable
if data_num.item_description_len.max() <= np.iinfo(np.uint16).max:
    data_num['item_description_len'] = data_num['item_description_len'].astype(np.uint16)

In [None]:
# data_num.head()

In [None]:
# data.head()

### Define function for making binary columns

In [None]:
def make_binary_columns(df_str, df_num, column_name):
    """
    One-hot encode df_str[column_name] into int8 columns appended to df_num.

    One column is added to df_num for every distinct value of
    df_str[column_name]. It is named column_name + '_' + str(value) and holds 1
    in the rows whose value matches and 0 everywhere else. Missing values (NaN)
    form their own category and end up in column_name + '_nan'. The new columns
    appear in order of first occurrence of their value.

    The encoding is computed with vectorised numpy comparisons rather than a
    Python-level loop over all rows.

    Inputs:
        df_str: pandas DataFrame holding the categorical column
        df_num: pandas DataFrame with the same number of rows that receives the
                binary columns; its rows are matched to df_str by index label
        column_name: name of the column of df_str to encode
    Returns nothing. The columns are appended directly to df_num.
    Raises Exception if df_str and df_num differ in length.
    """
    m = len(df_str)
    if m != len(df_num):
        raise Exception("df_str and df_num must have the same size.")

    # str() is the naming rule of the binary columns; it also turns NaN into 'nan'
    labels = np.array([str(value) for value in df_str[column_name]], dtype=object)

    # codes[k] is the position of labels[k] within `categories`
    # (categories are listed in order of first appearance)
    codes, categories = pd.factorize(labels)

    # Line the codes up with the rows of df_num. Rows of df_num that have no
    # counterpart in df_str get code -1 and therefore 0 in every new column.
    if not df_num.index.equals(df_str.index):
        codes = (
            pd.Series(codes, index=df_str.index)
            .reindex(df_num.index, fill_value=-1)
            .to_numpy()
        )

    for code, category in enumerate(categories):
        df_num[column_name + '_' + category] = (codes == code).astype(np.int8)

### brand_name: create a new binary feature for each unique value

In [None]:
# One 0/1 column per brand is appended to data_num
make_binary_columns(df_str=data, df_num=data_num, column_name='brand_name')

In [None]:
# data.head()

### main_cat, subcat_1, subcat_2: create a new binary feature for each unique value

In [None]:
# Same encoding for each of the three category levels
make_binary_columns(df_str=data, df_num=data_num, column_name='main_cat')
make_binary_columns(df_str=data, df_num=data_num, column_name='subcat_1')
make_binary_columns(df_str=data, df_num=data_num, column_name='subcat_2')

# Split data and extract X, y and train_id

In [None]:
from sklearn.model_selection import train_test_split

# Random but seeded split, so the reported train/test RMSEs can be reproduced.
# Add a 'random_state' entry to `settings` to pick another seed (None restores
# the previous unseeded behaviour).
train, test = train_test_split(
    data_num,
    test_size=settings['test_size'],
    random_state=settings.get('random_state', 0),
)

In [None]:
# pop() removes the target column from the frames and hands it back
y_train, y_test = train.pop('price'), test.pop('price')

In [None]:
# What is left after removing the target are the features (same objects, new names)
X_train, X_test = train, test

In [None]:
# Detach the train_id column from both the training and the test features

id_train, id_test = X_train.pop('train_id'), X_test.pop('train_id')

# Scale data

In [None]:
from sklearn.preprocessing import MaxAbsScaler

scaler = MaxAbsScaler()

# Learn the per-feature maximum absolute value on the training data only, then
# apply that same scaling to both sets; index and column labels are carried over
X_train_scaled = pd.DataFrame(
    scaler.fit_transform(X_train),
    index=X_train.index,
    columns=X_train.columns,
)

X_test_scaled = pd.DataFrame(
    scaler.transform(X_test),
    index=X_test.index,
    columns=X_test.columns,
)

In [None]:
# X_train_scaled.head()

In [None]:
# X_train.head()

In [None]:
# X_test_scaled.head()

In [None]:
# X_test.head()

In [None]:
who

In [None]:
# Optionally snapshot the whole interpreter session after the split
if settings['save_env']:
    import dill  # pip install dill --user
    dill.dump_session('splittedData.pkl')

# Delete data that are not needed anymore

In [None]:
import sys
def sizeof_fmt(num, suffix='B'):
    ''' Format a byte count with binary prefixes (Ki, Mi, ...), one decimal place.

    By Fred Cirera, after https://stackoverflow.com/a/1094933/1870254
    '''
    prefixes = ('', 'Ki', 'Mi', 'Gi', 'Ti', 'Pi', 'Ei', 'Zi')
    step = 0
    # Divide by 1024 until the magnitude drops below 1024 or the prefixes run out
    while step < len(prefixes) and not abs(num) < 1024.0:
        num /= 1024.0
        step += 1
    if step == len(prefixes):
        return "%.1f%s%s" % (num, 'Yi', suffix)
    return "%3.1f%s%s" % (num, prefixes[step], suffix)

# Print the ten largest objects of the current namespace, biggest first
for name, size in sorted(
        ((name, sys.getsizeof(value)) for name, value in locals().items()),
        key=lambda entry: entry[1],
        reverse=True,
)[:10]:
    print("{:>30}: {:>8}".format(name, sizeof_fmt(size)))

In [None]:
# Drop the names of the raw frames and of the unscaled intermediates
del data_full, data_shuffled, data
del data_num, train, test, X_train, X_test

# Performance measure: RMSE

$$\text{RMSE} \left( \mathbf{Y} , \mathbf{\hat{Y}} \right) = \sqrt{\frac{1}{n} \sum_{i=1}^n \left( y_i - \hat{y_i} \right)^2 } $$

In [None]:
def rmse(y_test, y_pred):
    """
    Root mean squared error between true and predicted targets.

    Inputs:
        y_test: array-like of true values (list, numpy array or pandas object)
        y_pred: array-like of predictions with the same number of samples
    Returns the square root of the mean of the squared differences. Values are
    compared by position, not by pandas index label.
    Raises ValueError if the inputs are empty or their shapes do not match.
    """
    actual = np.atleast_1d(np.asarray(y_test, dtype=float))
    predicted = np.atleast_1d(np.asarray(y_pred, dtype=float))

    # Treat 1-D input as a single output column so that (n,) and (n, 1) agree
    if actual.ndim == 1:
        actual = actual[:, np.newaxis]
    if predicted.ndim == 1:
        predicted = predicted[:, np.newaxis]

    if actual.shape != predicted.shape:
        raise ValueError(
            "y_test and y_pred have inconsistent shapes: %s vs %s"
            % (actual.shape, predicted.shape)
        )
    if actual.size == 0:
        raise ValueError("y_test and y_pred must not be empty.")

    return np.sqrt(np.mean((actual - predicted) ** 2))

# Apply linear regression

In [None]:
# Fit linear regression
from sklearn.linear_model import LinearRegression
lnr_regr = LinearRegression(n_jobs=-1)
print("Training...")
lnr_regr.fit(X_train_scaled, y_train)
print("Training done.")

if settings['save_env']:
    # Imported here so that this cell does not depend on an earlier cell having
    # already imported dill
    import dill  # pip install dill --user
    dill.dump_session('linearModel.pkl')

# Make predictions and report train and test RMSEs

print("Evaluating performance on the training set...")
pred_train = lnr_regr.predict(X_train_scaled)
rmse_train = rmse(y_train, pred_train)
print("Training set RMSE: %.2f" % rmse_train)

print("Evaluating performance on the test set...")
pred_test = lnr_regr.predict(X_test_scaled)
rmse_test = rmse(y_test, pred_test)
print("Test set RMSE: %.2f" % rmse_test)

# Learning curves

In [None]:
# import dill

In [None]:
# dill.load_session('linearModel.pkl')

In [None]:
# who