Import libraries

In [1]:
import pandas as pd
import numpy as np

In [None]:
data=pd.read_csv('./notebooks/preprocessed_data.csv')

data

In [None]:
data.info()

In [None]:
def downcast_dtypes(df):
    float_cols = [c for c in df if df[c].dtype == "float64"]
    int_cols = [c for c in df if df[c].dtype in ["int64", "int32"]]
    df[float_cols] = df[float_cols].astype(np.float32)
    df[int_cols] = df[int_cols].astype(np.int16)
    return df

data = downcast_dtypes(data)
print(data.info())

In [None]:
train_set = data.query('date_block_num >= 3 and date_block_num < 28').copy()
validation_set = data.query('date_block_num >= 28 and date_block_num < 33').copy()
test_set = data.query('date_block_num == 33').copy()

print('Train set records:', train_set.shape[0])
print('Validation set records:', validation_set.shape[0])
print('Test set records:', test_set.shape[0])

print('Train set records: %s (%.f%% of complete data)' % (train_set.shape[0], ((train_set.shape[0]/data.shape[0])*100)))
print('Validation set records: %s (%.f%% of complete data)' % (validation_set.shape[0], ((validation_set.shape[0]/data.shape[0])*100)))

In [9]:
# Merge with test competition data to ensure test data is in the correct order.

# Load in competition test  dataset proviided
test_competition  = pd.read_csv('./input/test.csv', 
                    dtype={'ID': 'int16', 'shop_id': 'int16', 'item_id': 'int16'}
                   ).set_index('ID')
test_competition.head()

Unnamed: 0_level_0,shop_id,item_id
ID,Unnamed: 1_level_1,Unnamed: 2_level_1
0,5,5037
1,5,5320
2,5,5233
3,5,5232
4,5,5268


In [18]:
# Merge and check
test_set=pd.read_csv('./data/output/test_X.csv')
test_X = pd.merge(test_competition, test_set, on=['shop_id', 'item_id'], how='left')
print(len(test_X))
test_X=test_X.drop(['Unnamed: 0'], axis=1)
test_X.head()

220192


Unnamed: 0,shop_id,item_id,item_category_type_code,item_category_subtype_code,shop_city_code,item_category_id,year,month,item_platform,item_digital,...,price_increase,price_decrease,item_cnt_min,item_cnt_max,item_cnt_mean,item_cnt_std,item_cnt_shifted1,item_cnt_shifted2,item_cnt_shifted3,item_trend
0,5,5037,0,0,0,0,0,0,0,0,...,0.0,25990.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,5,5320,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,5,5233,0,0,0,0,0,0,0,0,...,0.0,7191.75,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,5,5232,3,16,10,23,2015,9,0,0,...,942.0,3854.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0
4,5,5268,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# Create X and Y Subsets for train, val and test
train_X = train_set.drop(['date_block_num', 'sum_item_cnt_next_month'], axis=1)
train_Y = train_set['sum_item_cnt_next_month']

validation_X = validation_set.drop(['date_block_num', 'sum_item_cnt_next_month'], axis=1)
validation_Y = validation_set['sum_item_cnt_next_month']

test_X = test_X.drop(['date_block_num', 'sum_item_cnt_next_month'], axis=1)

In [None]:
# Too slow so will not replace missing values with mean
datasets = [train_X, train_Y, validation_X, validation_Y, test_X]

# Replace missing values with the median of the column. 
for dataset in datasets:
    dataset.fillna(dataset.median(), inplace=True)

In [None]:
# Sanity check number we have no na.
print("Train X Null:", train_X.isnull().sum().sum())
print("Test X Null:", validation_X.isnull().sum().sum())
print("Test X Null:", test_X.isnull().sum().sum())

In [None]:
# Sanity check number we have no na.
print("Train Y Null:", train_Y.isnull().sum())
print("Test X Null:", validation_Y.isnull().sum())

In [None]:
# Sanity check the order
test_X.head()

In [None]:
# Sanity check the order
train_X.head()

In [None]:
# Sanity check the order
validation_X.head()

In [None]:
import os
# Build output directory
data_dir = './data/output/'
if not os.path.exists(data_dir):
    os.makedirs(data_dir)

In [None]:
# Save dataframes as csv files 
pd.DataFrame(train_X).to_csv(os.path.join(data_dir, 'train_X.csv'), header=True, index=True)
pd.DataFrame(train_Y).to_csv(os.path.join(data_dir, 'train_Y.csv'), header=True, index=True)

pd.DataFrame(validation_X).to_csv(os.path.join(data_dir, 'validation_X.csv'), header=True, index=True)
pd.DataFrame(validation_Y).to_csv(os.path.join(data_dir, 'validation_Y.csv'), header=True, index=True)

pd.DataFrame(test_X).to_csv(os.path.join(data_dir, 'test_X.csv'), header=True, index=True)

In [None]:
train=train_set.drop(['date_block_num'], axis=1)
validation=validation_set.drop(['date_block_num'], axis=1)


In [None]:
train.to_csv('./data/output/train.csv', index=False)
validation.to_csv('./data/output/validation.csv', index=False)

In [None]:
validation.columns

In [None]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler, RobustScaler,MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV


from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.metrics import mean_squared_error

from catboost import CatBoostRegressor
import argparse
import logging
import os

In [None]:
import pandas as   pd
train=pd.read_csv('./data/output/train.csv')
train

In [None]:
train.columns

In [None]:
X_train=train[['item_category_type_code', 'item_category_subtype_code', 'item_id',
       'shop_id', 'shop_city_code', 'item_category_id', 'year', 'month',
       'item_platform', 'item_digital', 'item_lang', 'sum_item_price',
       'mean_item_price', 'sum_item_count', 'mean_item_count', 'transactions',
       'item_price_unit', 'hist_min_item_price',
       'hist_max_item_price', 'price_increase', 'price_decrease',
       'item_cnt_min', 'item_cnt_max', 'item_cnt_mean', 'item_cnt_std',
       'item_cnt_shifted1', 'item_cnt_shifted2', 'item_cnt_shifted3',
       'item_trend']]
y_train=train['sum_item_cnt_next_month'] 

In [None]:
validation=pd.read_csv('./data/output/validation.csv')
validation

In [None]:
X_validation=validation[['item_category_type_code', 'item_category_subtype_code', 'item_id',
       'shop_id', 'shop_city_code', 'item_category_id', 'year', 'month',
       'item_platform', 'item_digital', 'item_lang', 'sum_item_price',
       'mean_item_price', 'sum_item_count', 'mean_item_count', 'transactions',
       'item_price_unit', 'hist_min_item_price',
       'hist_max_item_price', 'price_increase', 'price_decrease',
       'item_cnt_min', 'item_cnt_max', 'item_cnt_mean', 'item_cnt_std',
       'item_cnt_shifted1', 'item_cnt_shifted2', 'item_cnt_shifted3',
       'item_trend']]
y_validation=validation['sum_item_cnt_next_month'] 

The first base catboost regression running by default

In [None]:


# Build output directory
model_dir = './models/output/'
if not os.path.exists(model_dir):
    os.makedirs(model_dir)


 
# define and train model
model = CatBoostRegressor()



model.fit(X_train, y_train, eval_set=(X_validation, y_validation), logging_level='Silent') 

logging.info('validating model')
abs_err = np.abs(model.predict(X_validation) - y_validation)

# print couple perf metrics
for q in [10, 50, 90]:
    logging.info('AE-at-' + str(q) + 'th-percentile: '
        + str(np.percentile(a=abs_err, q=q)))

In [None]:
import numpy as np
# print abs error
logging.info('validating model')
abs_err = np.abs(model.predict(X_validation) - y_validation)

# print couple perf metrics
for q in [10, 50, 90]:
    logging.info('AE-at-' + str(q) + 'th-percentile: '
        + str(np.percentile(a=abs_err, q=q)))



In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error
y_predict=model.predict(X_validation)
print('Model  validation rmse:', np.sqrt(mean_squared_error(y_validation, y_predict)))
print('Model  validation mae:', mean_absolute_error(y_validation, y_predict))

In [None]:
for q in [10, 50, 90]:
    print(str(np.percentile(a=abs_err, q=q)))

In [None]:
import pickle
model_savepath = os.path.join(model_dir)
  # Save our model
pickle.dump(model, open( 'CB_MODEL.pickle', "wb"))

The secong runningv of catboost

In [None]:
import numpy as np
cat_columns = [i for i, col in enumerate(X_train) if not issubclass(X_train[col].dtype.type, np.floating)]
cat_columns

In [None]:
for feature in cat_columns:
    print(X_train[feature])

In [None]:
clf_params = dict(
    random_state=242,
    od_type="Iter",
    od_wait=20,
    verbose=10,
    learning_rate=0.1,
    iterations=200,
)

clf = CatBoostRegressor(**clf_params)

fit_params = dict(
    X=X_train, 
    y=y_train,
    cat_features=cat_columns,
    eval_set=(X_validation, y_validation),
    plot=True
)

clf.fit(**fit_params)

In [None]:
X_test=pd.read_csv('./data/output/test_X.csv')
X_test=X_test.drop(['Unnamed: 0'], axis=1)
X_test

In [None]:
y_pred=clf.predict(X_test)

In [None]:
X_test.info()

In [None]:
submission=pd.read_csv('./input/sample_submission.csv')

In [None]:
clf.get_feature_importance(prettified=True)

In [None]:
for i, v in clf.get_feature_importance(prettified=True):
     print(i.ljust(20), v)

In [None]:
for i, j, value in clf.get_feature_importance(fstr_type="Interaction", prettified=True)[:10]:
    print(X_train.columns[i].ljust(20), X_train.columns[j].ljust(20), value)

In [None]:
models = [('LR', LinearRegression()),
          ('LASSO', Lasso()),
          ('EN', ElasticNet()),
          ('KNN', KNeighborsRegressor()),
          ('CART', DecisionTreeRegressor()),
          ('S-LR', Pipeline([('Scaler', StandardScaler()), ('LR', LinearRegression())])),
          ('S-LASSO', Pipeline([('Scaler', StandardScaler()), ('LASSO', Lasso())])),
          ('S-EN', Pipeline([('Scaler', StandardScaler()), ('EN', ElasticNet())])),
          ('S-KNN', Pipeline([('Scaler', StandardScaler()), ('KNN', KNeighborsRegressor())])),
          ('S-CART', Pipeline([('Scaler', StandardScaler()), ('CART', DecisionTreeRegressor())])), 
          ('S-SVR', Pipeline([('Scaler', StandardScaler()), ('SVR', SVR(gamma='auto'))])), 
          ('S-AB', Pipeline([('Scaler', StandardScaler()), ('AB', AdaBoostRegressor())])), 
          ('S-GBM', Pipeline([('Scaler', StandardScaler()), ('GBM', GradientBoostingRegressor())])), 
          ('S-RF', Pipeline([('Scaler', StandardScaler()), ('RF', RandomForestRegressor(n_estimators=10))])), 
          ('S-ET', Pipeline([('Scaler', StandardScaler()), ('ET', ExtraTreesRegressor(n_estimators=10))]))]

# ('SVR', SVR(gamma='auto'))

In [None]:
from matplotlib import pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler, RobustScaler,MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

In [None]:
# Test options and evaluation metric
num_folds = 10
seed = 7
scoring = 'neg_mean_absolute_error'
kfold = KFold(n_splits=num_folds, random_state=seed, shuffle=True)

In [None]:
results = []
names = []
for name, model in models:
    cv_results = cross_val_score(model, X_train, y_train, cv=kfold, scoring=scoring)
    results.append(cv_results)
    names.append(name)
    print("%s: %f (%f)" % (name, cv_results.mean(), cv_results.std()))

In [None]:
fig = plt.figure(figsize=(10,4), dpi= 100, facecolor='lightblue', edgecolor='w')
fig.suptitle('Algorithm Performance Comparison')
ax = fig.add_subplot(111)
plt.boxplot(results)
ax.set_xticklabels(names)
plt.show()