In [92]:
import numpy as np
import pandas as pd

# Read the prepared training table from disk and preview its first five rows.
training_data = pd.read_csv(filepath_or_buffer='input/training_data.csv')
training_data.head(n=5)

Unnamed: 0,item_id,date_block_num,shop_id,item_cnt_day,item_category_id,year,month,item_cnt_tm1,item_cnt_tm2,item_cnt_tm12,item_cnt_tp1
0,27,17,2,1.0,19,2014,5,0.0,0.0,0.0,0.0
1,30,15,2,1.0,40,2014,3,0.0,0.0,0.0,1.0
2,30,16,2,1.0,40,2014,4,1.0,0.0,0.0,0.0
3,31,16,2,1.0,37,2014,4,0.0,0.0,0.0,0.0
4,32,12,2,1.0,40,2014,0,0.0,0.0,0.0,0.0


In [94]:
# Target: the 'item_cnt_tp1' column of the training table.
y = training_data['item_cnt_tp1']

# Build the feature frame directly from training_data so this cell works on a
# fresh kernel. Previously the only definition of X was a commented-out line,
# so the rename raised NameError after "Restart & Run All".
# The count columns get short names (t, tm1, tm2, tm12); the explicit column
# list below fixes the feature order and leaves out 'date_block_num' and the
# target 'item_cnt_tp1'.
# The former `index=str` argument is intentionally not carried over: it only
# converted the row labels to strings, leaving X's index out of step with y's.
X = training_data.rename(
    columns={
        "item_cnt_day": "t",
        "item_cnt_tm1": "tm1",
        "item_cnt_tm2": "tm2",
        "item_cnt_tm12": "tm12",
    }
)
X = X[['year','month','shop_id','item_category_id','item_id','t','tm1','tm2','tm12']]
X.head()

Unnamed: 0,year,month,shop_id,item_category_id,item_id,t,tm1,tm2,tm12
0,2014,5,2,19,27,1.0,0.0,0.0,0.0
1,2014,3,2,40,30,1.0,0.0,0.0,0.0
2,2014,4,2,40,30,1.0,1.0,0.0,0.0
3,2014,4,2,37,31,1.0,0.0,0.0,0.0
4,2014,0,2,40,32,1.0,0.0,0.0,0.0


In [95]:
# Import train_test_split
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows of X / y as a test set; the fixed seed keeps the
# split repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Report the size of each partition.
print("Training set has {} samples.".format(len(X_train)))
print("Training set has {} columns.".format(len(X_train.columns)))
print("Testing set has {} samples.".format(len(X_test)))

Training set has 701061 samples.
Training set has 9 columns.
Testing set has 175266 samples.


In [96]:
from sklearn.metrics import mean_squared_error

def calc_RMSE(actuals, predictions):
    """Return the root-mean-squared error between `actuals` and `predictions`.

    The value is the numpy square root of scikit-learn's
    ``mean_squared_error(actuals, predictions)``.
    """
    mse = mean_squared_error(actuals, predictions)
    return np.sqrt(mse)

In [7]:
from lightgbm.sklearn import LGBMRegressor
from sklearn.metrics import make_scorer
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import ShuffleSplit

def fit_model(inputs, targets, random_state=42):
    """Grid-search a LightGBM regressor and return the best estimator found.

    A 300-tree ``LGBMRegressor`` is tuned over ``max_depth``, ``num_leaves``
    and ``learning_rate`` (3 x 3 x 3 = 27 combinations). Each combination is
    scored with RMSE on 10 shuffled splits that each hold out 10% of the rows.

    Parameters
    ----------
    inputs : feature table passed to ``GridSearchCV.fit``.
    targets : target values aligned row-for-row with ``inputs``.
    random_state : seed for the ``ShuffleSplit`` cross-validation splits.
        Defaults to 42, the seed used for the train/test split in this
        notebook, so repeated runs evaluate the same splits. Pass ``None``
        to get a different random set of splits on every call.

    Returns
    -------
    The ``best_estimator_`` of the fitted grid search.
    """
    # Seeded splitter: without a seed the selected hyper-parameters could
    # differ from run to run purely because of different CV splits.
    cv_sets = ShuffleSplit(n_splits=10, test_size=0.1, random_state=random_state)
    regressor = LGBMRegressor(n_estimators=300)
    params = {'max_depth':[50,75,100],'num_leaves':[600,900,1200],'learning_rate':[0.1,0.01,0.001]}
    # greater_is_better=False tells scikit-learn that a lower RMSE is better.
    scoring_fnc = make_scorer(calc_RMSE, greater_is_better=False)
    grid = GridSearchCV(regressor, param_grid=params, scoring=scoring_fnc, cv=cv_sets)
    grid.fit(inputs, targets)
    return grid.best_estimator_

# Run the grid search on the training split and report the winning settings.
best_model = fit_model(X_train, y_train)

# The original calls passed the value as a second print() argument via the
# builtin format(), i.e. `print("... {} ...", format(v))`, so the '{}'
# placeholder was printed literally. str.format substitutes the value properly.
best_params = best_model.get_params()
for param_name in ('max_depth', 'num_leaves', 'learning_rate'):
    print("Parameter '{}' is {} for the optimal model.".format(param_name, best_params[param_name]))

Parameter 'max_depth' is {} for the optimal model. 75
Parameter 'num_leaves' is {} for the optimal model. 1200
Parameter 'learning_rate' is {} for the optimal model. 0.1


In [116]:
# Fit one LightGBM regressor with fixed hyper-parameters (max_depth=75,
# num_leaves=1200, learning_rate=0.1, 300 trees), declaring the calendar and
# id columns as categorical features.
regressor = LGBMRegressor(
    n_estimators=300,
    learning_rate=0.1,
    num_leaves=1200,
    max_depth=75,
    n_jobs=2,
)
regressor_model = regressor.fit(
    X_train,
    y_train,
    categorical_feature=['year','month','shop_id','item_id','item_category_id'],
)

# Score the fitted model on both partitions of the split.
predictions_train = regressor_model.predict(X_train)
predictions_test = regressor_model.predict(X_test)
print('Training RMSE:', calc_RMSE(y_train, predictions_train))
print('Testing RMSE:', calc_RMSE(y_test, predictions_test))



Training RMSE: 8.13922054059074
Testing RMSE: 6.267020280125137


In [108]:
# Load the rows to predict for, print the table's dimensions and preview it.
test_file = pd.read_csv(filepath_or_buffer='input/test.csv')
print(test_file.shape)
test_file.head(n=5)

(214200, 3)


Unnamed: 0,ID,shop_id,item_id
0,0,5,5037
1,1,5,5320
2,2,5,5233
3,3,5,5232
4,4,5,5268


In [109]:
# Tag every test row with date_block_num 33 so it can be joined to that block's
# rows in the staged monthly totals.
test_file['date_block_num'] = 33
monthly_totals_all = pd.read_csv('staging/monthly_totals_prevs_all.csv')

# Left merge: every test row is kept, and (shop, item) pairs without a
# block-33 row get NaN in the merged columns.
test_data = pd.merge(
    test_file,
    monthly_totals_all,
    how='left',
    on=['date_block_num','shop_id','item_id'],
)
print(test_data.shape)
test_data.head()

(214200, 12)


Unnamed: 0,ID,shop_id,item_id,date_block_num,item_cnt_day,item_category_id,year,month,item_cnt_tm1,item_cnt_tm2,item_cnt_tm12,item_cnt_tp1
0,0,5,5037,33,,,,,,,,
1,1,5,5320,33,,,,,,,,
2,2,5,5233,33,1.0,19.0,2015.0,9.0,3.0,1.0,0.0,
3,3,5,5232,33,,,,,,,,
4,4,5,5268,33,,,,,,,,


In [110]:
# Keep the merge keys plus three count columns and relabel them for the model:
# 'item_cnt_day' -> 'tm1', 'item_cnt_tm1' -> 'tm2', 'item_cnt_tp1' -> 't'.
# Rows that found no match in the left merge (NaN) are filled with 0.0.
# NOTE(review): in the training features 't'/'tm1'/'tm2' are built from
# 'item_cnt_day'/'item_cnt_tm1'/'item_cnt_tm2' and 'item_cnt_tp1' is the
# target, whereas here 't' is taken from 'item_cnt_tp1' and the lags are
# shifted by one column — confirm this train/test alignment is intended.
test_data = (
    test_data
    .drop(columns=['ID','year','month','item_category_id','item_cnt_tm2','item_cnt_tm12'])
    .rename(index=str, columns={"item_cnt_day":"tm1","item_cnt_tm1":"tm2","item_cnt_tp1":"t"})
    .fillna(0.0)
)
test_data.head()

Unnamed: 0,shop_id,item_id,date_block_num,tm1,tm2,t
0,5,5037,33,0.0,0.0,0.0
1,5,5320,33,0.0,0.0,0.0
2,5,5233,33,1.0,3.0,0.0
3,5,5232,33,0.0,0.0,0.0
4,5,5268,33,0.0,0.0,0.0


In [111]:
# Re-key every row to date_block_num 22 and pull that block's 'item_cnt_day'
# from the monthly totals to serve as the 'tm12' feature. All other columns
# brought in by the merge are discarded, and unmatched rows become 0.0.
test_data = (
    test_data
    .assign(date_block_num=22)
    .merge(monthly_totals_all, how='left', on=['date_block_num','shop_id','item_id'])
    .drop(columns=['year','month','item_category_id','item_cnt_tm1','item_cnt_tm2','item_cnt_tm12','item_cnt_tp1'])
    .rename(index=str, columns={"item_cnt_day":"tm12"})
    .fillna(0.0)
)
test_data.head()

Unnamed: 0,shop_id,item_id,date_block_num,tm1,tm2,t,tm12
0,5,5037,22,0.0,0.0,0.0,1.0
1,5,5320,22,0.0,0.0,0.0,0.0
2,5,5233,22,1.0,3.0,0.0,0.0
3,5,5232,22,0.0,0.0,0.0,0.0
4,5,5268,22,0.0,0.0,0.0,0.0


In [112]:
# Attach each item's category id. items.csv is read with its second column as
# the index (presumably 'item_id', since the merge below joins on that name),
# and the 'item_name' column is discarded after the join.
items = pd.read_csv('input/items.csv', index_col=1)
test_data = pd.merge(test_data, items, how='left', on=['item_id'])
test_data = test_data.drop(columns=['item_name'])
test_data.head()

Unnamed: 0,shop_id,item_id,date_block_num,tm1,tm2,t,tm12,item_category_id
0,5,5037,22,0.0,0.0,0.0,1.0,19
1,5,5320,22,0.0,0.0,0.0,0.0,55
2,5,5233,22,1.0,3.0,0.0,0.0,19
3,5,5232,22,0.0,0.0,0.0,0.0,23
4,5,5268,22,0.0,0.0,0.0,0.0,20


In [113]:
# Stamp every test row with a constant year and month.
# NOTE(review): the merged preview for date_block_num 33 shows month 9, so 10
# appears to denote the following month — confirm this matches how 'month' is
# encoded in the training features.
test_data = test_data.assign(year=2015, month=10)
test_data.head()

Unnamed: 0,shop_id,item_id,date_block_num,tm1,tm2,t,tm12,item_category_id,year,month
0,5,5037,22,0.0,0.0,0.0,1.0,19,2015,10
1,5,5320,22,0.0,0.0,0.0,0.0,55,2015,10
2,5,5233,22,1.0,3.0,0.0,0.0,19,2015,10
3,5,5232,22,0.0,0.0,0.0,0.0,23,2015,10
4,5,5268,22,0.0,0.0,0.0,0.0,20,2015,10


In [114]:
# Keep only the model's feature columns, in the same order as the training
# feature frame.
test_data = test_data.loc[
    :,
    ['year','month','shop_id','item_category_id','item_id','t','tm1','tm2','tm12'],
]
print(test_data.shape)
test_data.head()

(214200, 9)


Unnamed: 0,year,month,shop_id,item_category_id,item_id,t,tm1,tm2,tm12
0,2015,10,5,19,5037,0.0,0.0,0.0,1.0
1,2015,10,5,55,5320,0.0,0.0,0.0,0.0
2,2015,10,5,19,5233,0.0,1.0,3.0,0.0
3,2015,10,5,23,5232,0.0,0.0,0.0,0.0
4,2015,10,5,20,5268,0.0,0.0,0.0,0.0


In [115]:
# Predict with the fitted regressor, limit the predictions to the range
# [0, 20], and write them next to the test IDs as the submission file.
preds_submissions = regressor_model.predict(test_data)

submissions = pd.DataFrame(
    {
        "ID": test_file["ID"],
        "item_cnt_month": np.clip(preds_submissions, 0., 20.),
    }
)
submissions.to_csv("output/submission.csv", index=False)

In [None]:
#First submission: score of 1.23840 