In [161]:
import gc
import os
import pickle
import time

import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split, cross_val_score

# preprocessing
from sklearn.preprocessing import LabelBinarizer

# models
from sklearn.linear_model import Ridge
import lightgbm as lgb

# for sparse matrixes
from scipy.sparse import csr_matrix, hstack

In [162]:
# Print the name of every file found under ./train/, including subdirectories.
for walk_entry in os.walk('./train/'):
    # os.walk yields (dirpath, dirnames, filenames); only the file names are shown.
    for found_file in walk_entry[2]:
        print(found_file)

train.tsv


In [163]:
# Read the tab-separated train and test files and report how long both reads took.
start = time.time()
train = pd.read_csv('./train/train.tsv', sep='\t')
test = pd.read_csv('./test/test.tsv', sep='\t')
elapsed = time.time() - start
print('Time taken to load:', elapsed)

Time taken to load: 20.594209671020508


In [164]:
# Preview the first rows of the training frame.
train.head()

Unnamed: 0,train_id,name,item_condition_id,category_name,brand_name,price,shipping,item_description
0,0,MLB Cincinnati Reds T Shirt Size XL,3,Men/Tops/T-shirts,,10.0,1,No description yet
1,1,Razer BlackWidow Chroma Keyboard,3,Electronics/Computers & Tablets/Components & P...,Razer,52.0,0,This keyboard is in great condition and works ...
2,2,AVA-VIV Blouse,1,Women/Tops & Blouses/Blouse,Target,10.0,1,Adorable top with a hint of lace and a key hol...
3,3,Leather Horse Statues,1,Home/Home Décor/Home Décor Accents,,35.0,1,New with tags. Leather horses. Retail for [rm]...
4,4,24K GOLD plated rose,1,Women/Jewelry/Necklaces,,44.0,0,Complete with certificate of authenticity


In [165]:
# Number of training rows; used later to split the stacked feature matrix.
nrows_train = len(train)
# Regression target: log(1 + price).
y = np.log1p(train['price'])
print('Number of rows', nrows_train)

Number of rows 1482535


In [166]:
# Stack train and test rows into one frame so every encoder is fitted on both.
merge: pd.DataFrame = pd.concat([train, test], sort=False)
# Take an explicit copy of the id column: a 'price' column is assigned to this
# frame later, and writing into a slice of `test` raises SettingWithCopyWarning.
submission: pd.DataFrame = test[['test_id']].copy()

In [167]:
# psutil is imported in this cell rather than with the imports at the top.
import psutil
# Display system-wide CPU counters (context switches, interrupts, syscalls).
psutil.cpu_stats()

scpustats(ctx_switches=216476474, interrupts=175255337, soft_interrupts=0, syscalls=1635375935)

In [168]:
# Display system memory statistics (total, available, percent used, used, free).
psutil.virtual_memory()

svmem(total=8151916544, available=943820800, percent=88.4, used=7208095744, free=943820800)

In [169]:
# Drop the names of the raw frames (their rows live on in `merge`), then run
# the garbage collector; the cell displays the value gc.collect() returns.
del train, test
gc.collect()

89

In [170]:
# Count the missing values in each column of the combined frame.
merge.isnull().sum()

train_id              693359
name                       0
item_condition_id          0
category_name           9385
brand_name            928207
price                 693359
shipping                   0
item_description           4
test_id              1482535
dtype: int64

In [171]:
# fill the missing values
def handle_missing_features(df):
    """Replace missing text fields of ``df`` with the string ``'missing'``.

    The columns ``category_name``, ``brand_name`` and ``item_description``
    are updated on ``df`` itself; all other columns are left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the three text columns listed above.

    Returns
    -------
    None
        ``df`` is modified in place.
    """
    for column in ('category_name', 'brand_name', 'item_description'):
        # Assign the filled column back instead of calling
        # df[column].fillna(..., inplace=True): that form operates on a
        # temporary selection and does not update df under copy-on-write.
        df[column] = df[column].fillna('missing')

In [172]:
# Fill the missing text fields of the combined frame and report the time taken.
start = time.time()
handle_missing_features(merge)
elapsed = time.time() - start
print('time taken to handle missing features is: ', elapsed)

time taken to handle missing features is:  0.44700074195861816


In [173]:
# Count the missing values per column again, after handle_missing_features(merge).
merge.isnull().sum()

train_id              693359
name                       0
item_condition_id          0
category_name              0
brand_name                 0
price                 693359
shipping                   0
item_description           0
test_id              1482535
dtype: int64

In [175]:
# Cap on how many of the most frequent brand names cutting() keeps.
NUM_BRANDS = 4004
# Cap meant for the number of category names to keep — presumably consumed by
# cutting(); verify against that function.
NUM_CATEGORIES = 1001
# min_df passed to the CountVectorizer built for the item names.
NAME_MIN_DF = 10
# max_features passed to the TF-IDF vectorizer for item descriptions.
# NOTE(review): 3 limits the description vocabulary to three terms — confirm
# this value is intended.
MAX_FEATURES_ITEM_DESCRIPTION = 3

In [176]:
# pop_brand = merge['brand_name'].value_counts().loc[lambda x: x.index != 'missing'].index[:NUM_BRANDS]
# pop_brand

In [177]:
#merge['brand_name'].value_counts()[merge['brand_name'].value_counts().index != 'missing'].index[:40]
# whatever is not in the popular brands is set to 'missing'
# merge.loc[~merge['brand_name'].isin(pop_brand), 'brand_name'] = 'missing'
# pop_category = merge['category_name'].value_counts().loc[lambda x: x.index != 'missing'].index[:NUM_BRANDS]
# merge.loc[~merge['category_name'].isin(pop_category), 'category_name'] = 'missing'

In [178]:
def cutting(df, num_brands=None, num_categories=None):
    """Collapse rare brands and categories of ``df`` into ``'missing'``, in place.

    The most frequent brand names and category names are kept (the value
    ``'missing'`` itself is never counted as one of them); every other value
    in ``brand_name`` / ``category_name`` is overwritten with ``'missing'``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``brand_name`` and ``category_name`` columns.
    num_brands : int, optional
        How many of the most frequent brands to keep. Defaults to the
        module-level ``NUM_BRANDS``.
    num_categories : int, optional
        How many of the most frequent categories to keep. Defaults to the
        module-level ``NUM_CATEGORIES``.

    Returns
    -------
    None
        ``df`` is modified in place.
    """
    # Resolve the defaults at call time so later edits to the constants apply.
    if num_brands is None:
        num_brands = NUM_BRANDS
    if num_categories is None:
        num_categories = NUM_CATEGORIES

    # value_counts() sorts by descending frequency, so the head of the index
    # holds the most popular values.
    brand_counts = df['brand_name'].value_counts()
    pop_brand = brand_counts[brand_counts.index != 'missing'].index[:num_brands]
    df.loc[~df['brand_name'].isin(pop_brand), 'brand_name'] = 'missing'

    # Categories get their own cap, and the mask is built from df itself
    # rather than from a global frame.
    category_counts = df['category_name'].value_counts()
    pop_category = category_counts[category_counts.index != 'missing'].index[:num_categories]
    df.loc[~df['category_name'].isin(pop_category), 'category_name'] = 'missing'

In [179]:
# Collapse rare brands and categories in the combined frame and time the step.
start = time.time()
cutting(merge)
# The label names the step that is actually timed here.
print('time taken to cut rare brands and categories is: ', time.time() - start)

time taken to handle missing features is:  0.8640005588531494


In [180]:
# Show the column dtypes and memory usage of the combined frame.
merge.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2175894 entries, 0 to 693358
Data columns (total 9 columns):
train_id             float64
name                 object
item_condition_id    int64
category_name        object
brand_name           object
price                float64
shipping             int64
item_description     object
test_id              float64
dtypes: float64(3), int64(2), object(4)
memory usage: 166.0+ MB


In [181]:
def to_categorical(df):
    """Cast the four categorical feature columns of ``df`` to ``object`` dtype.

    ``category_name``, ``brand_name``, ``item_condition_id`` and ``shipping``
    are reassigned on ``df`` itself; nothing is returned.
    """
    for column in ('category_name', 'brand_name', 'item_condition_id', 'shipping'):
        df[column] = df[column].astype('object')

In [182]:
# Cast the categorical feature columns of the combined frame to object dtype, in place.
to_categorical(merge)

In [183]:
# Token counts for the 'name' column; min_df is set to NAME_MIN_DF.
start = time.time()
cv = CountVectorizer(min_df=NAME_MIN_DF)
X_name = cv.fit_transform(merge['name'])
elapsed = time.time() - start
print('time taken for Count Vectorizer is: ', elapsed)

time taken for Count Vectorizer is:  14.340493202209473


In [184]:
# Token counts for the 'category_name' column.
# A dedicated vectorizer is used (same min_df as before) so that `cv` keeps the
# vocabulary it learned from the item names instead of being refitted here;
# the resulting X_category is unchanged.
start = time.time()
cv_category = CountVectorizer(min_df=NAME_MIN_DF)
X_category = cv_category.fit_transform(merge['category_name'])
print('time taken for Count Vectorizer is: ', time.time() - start)

time taken for Count Vectorizer is:  10.678362369537354


In [185]:
# TF-IDF vectorizer for the item descriptions: 1- to 3-grams, English stop
# words removed, max_features set to MAX_FEATURES_ITEM_DESCRIPTION.
tv = TfidfVectorizer(
    max_features=MAX_FEATURES_ITEM_DESCRIPTION,
    ngram_range=(1, 3),
    stop_words='english',
)

In [186]:
# Fit the TF-IDF vectorizer on the descriptions and transform them in one pass.
start = time.time()
X_description = tv.fit_transform(merge['item_description'])
elapsed = time.time() - start
print('time taken for tfidf Vectorizer is: ', elapsed)

time taken for tfidf Vectorizer is:  384.1552941799164


In [213]:
# Persist the fitted TF-IDF vectorizer; the context manager closes the file
# handle even if pickling fails.
with open('tfidf_model.pkl', 'wb') as model_file:
    pickle.dump(tv, model_file)

In [187]:
# One-hot encode the brand names into a sparse matrix and time the encoding.
lb = LabelBinarizer(sparse_output=True)
start = time.time()
X_brand = lb.fit_transform(merge['brand_name'])
elapsed = time.time() - start
print('time taken for Label Binarizer is: ', elapsed)

time taken for Label Binarizer is:  154.28654170036316


In [215]:
# Persist the fitted label binarizer; the context manager closes the file
# handle even if pickling fails.
with open('label_binarizer_model.pkl', 'wb') as model_file:
    pickle.dump(lb, model_file)

In [188]:
# Preview the first rows of the combined frame.
merge.head()

Unnamed: 0,train_id,name,item_condition_id,category_name,brand_name,price,shipping,item_description,test_id
0,0.0,MLB Cincinnati Reds T Shirt Size XL,3,Men/Tops/T-shirts,missing,10.0,1,No description yet,
1,1.0,Razer BlackWidow Chroma Keyboard,3,Electronics/Computers & Tablets/Components & P...,Razer,52.0,0,This keyboard is in great condition and works ...,
2,2.0,AVA-VIV Blouse,1,Women/Tops & Blouses/Blouse,Target,10.0,1,Adorable top with a hint of lace and a key hol...,
3,3.0,Leather Horse Statues,1,Home/Home Décor/Home Décor Accents,missing,35.0,1,New with tags. Leather horses. Retail for [rm]...,
4,4.0,24K GOLD plated rose,1,Women/Jewelry/Necklaces,missing,44.0,0,Complete with certificate of authenticity,


In [189]:
# Show the column dtypes and memory usage of the combined frame again.
merge.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2175894 entries, 0 to 693358
Data columns (total 9 columns):
train_id             float64
name                 object
item_condition_id    object
category_name        object
brand_name           object
price                float64
shipping             object
item_description     object
test_id              float64
dtypes: float64(3), object(6)
memory usage: 166.0+ MB


In [190]:
# One-hot encode 'item_condition_id' and 'shipping' with pd.get_dummies(),
# then store the result as a compressed sparse row (CSR) matrix.
start = time.time()
X_dummies = csr_matrix(pd.get_dummies(merge[['item_condition_id', 'shipping']], sparse=True).values)
print('time taken to get the dummies is: ', time.time() - start)

time taken to get the dummies is:  5.329585075378418


In [191]:
# Horizontally stack every feature block and convert the result to CSR format.
start = time.time()
feature_blocks = [X_dummies, X_brand, X_description, X_category, X_name]
sparse_merge = hstack(feature_blocks).tocsr()
elapsed = time.time() - start
print('time taken for the sparse_merge is: ', elapsed)

time taken for the sparse_merge is:  2.3150148391723633


In [193]:
# Split the stacked matrix back into train and test parts: the first
# nrows_train rows are the training rows, the remainder are the test rows.
X, X_test = sparse_merge[:nrows_train], sparse_merge[nrows_train:]

In [198]:
# Wrap the training features and the log1p(price) target in a LightGBM dataset.
lgb_train = lgb.Dataset(X, label = y)

# Training parameters handed to lgb.train().
# NOTE(review): max_depth=3 is combined with num_leaves=100 — a depth-3 tree
# presumably cannot grow that many leaves; confirm which limit is intended.
# NOTE(review): learning_rate=0.75 looks high for a boosted model — confirm.
params = {
        'learning_rate': 0.75,
        'application': 'regression',
        'max_depth': 3,
        'num_leaves': 100,
        'verbosity': -1,
        'metric': 'RMSE',
    }

In [206]:
# Train the LightGBM model for 3200 boosting rounds and time it.
# `verbose_eval` is not passed: no validation set is supplied, so there is
# nothing for it to report, and LightGBM >= 4.0 no longer accepts the argument.
start = time.time()
lgb_model = lgb.train(params, train_set = lgb_train, num_boost_round=3200)
print('time taken for the lgb model is: ', time.time() - start)

time taken for the lgb model is:  220.9490737915039


In [212]:
# Persist the trained LightGBM model; the context manager closes the file
# handle even if pickling fails. (pickle is imported with the other imports
# at the top of the notebook.)
with open('lgb_model.pkl', 'wb') as model_file:
    pickle.dump(lgb_model, model_file)

In [209]:
# Predict the test rows with the LightGBM model (trained on log1p(price)).
lgb_prediction = lgb_model.predict(X_test)

In [210]:
# Fit a Ridge regression (SAG solver, fixed seed) on the same features and
# target, and report the training time.
start = time.time()
ridge_model = Ridge(solver='sag', fit_intercept=True, random_state=123).fit(X, y)
elapsed = time.time() - start
print('time taken for the ridge model is: ', elapsed)

time taken for the ridge model is:  85.06183791160583


In [211]:
# Persist the trained Ridge model; the context manager closes the file handle
# even if pickling fails.
with open('ridge_model.pkl', 'wb') as model_file:
    pickle.dump(ridge_model, model_file)

In [217]:
# Predict the test rows with the Ridge model (trained on log1p(price)).
ridge_prediction = ridge_model.predict(X_test)

In [218]:
# Blend the two models' predictions with fixed weights (0.57 LightGBM, 0.43 Ridge).
LGB_WEIGHT, RIDGE_WEIGHT = 0.57, 0.43
combined_prediction = LGB_WEIGHT * lgb_prediction + RIDGE_WEIGHT * ridge_prediction

In [220]:
# The models were trained on log1p(price), so the inverse transform is expm1;
# plain exp would overstate every predicted price by exactly 1.
submission['price'] = np.expm1(combined_prediction)

In [224]:
# Release memory: drop the fitted transformers and models, then run the
# garbage collector (the cell displays the value gc.collect() returns).
del tv, lb, lgb_model, ridge_model
gc.collect()

28