In [1]:
import numpy as np
import pandas as pd

# Read the training table from CSV, then preview its first five rows.
training_csv_path = 'input/training_data.csv'
training_data = pd.read_csv(training_csv_path)
training_data.head(n=5)

Unnamed: 0,item_id,date_block_num,shop_id,item_cnt_day,item_category_id,year,month,item_cnt_tm1,item_cnt_tm2,item_cnt_tm12,item_cnt_tp1
0,27,17,2,1.0,19,2014,5,0.0,0.0,0.0,0.0
1,30,15,2,1.0,40,2014,3,0.0,0.0,0.0,1.0
2,30,16,2,1.0,40,2014,4,1.0,0.0,0.0,0.0
3,31,16,2,1.0,37,2014,4,0.0,0.0,0.0,0.0
4,32,12,2,1.0,40,2014,0,0.0,0.0,0.0,0.0


In [2]:
# Target: the 'item_cnt_tp1' column.
y = training_data['item_cnt_tp1']

# Features: every remaining column except 'date_block_num' and 'Unnamed: 0'.
# 'Unnamed: 0' is the row index that was saved into the CSV and loaded back as
# an ordinary column (0, 1, 2, ... in the preview); it is an artefact of how the
# file was written, not a property of the sample, so it must not be fed to the
# models. errors='ignore' keeps this cell working if the CSV is ever loaded
# with index_col=0, in which case the column does not exist.
X = (
    training_data
    .drop(['date_block_num', 'item_cnt_tp1'], axis=1)
    .drop(['Unnamed: 0'], axis=1, errors='ignore')
)
X.head()

Unnamed: 0,item_id,shop_id,item_cnt_day,item_category_id,year,month,item_cnt_tm1,item_cnt_tm2,item_cnt_tm12
0,27,2,1.0,19,2014,5,0.0,0.0,0.0
1,30,2,1.0,40,2014,3,0.0,0.0,0.0
2,30,2,1.0,40,2014,4,1.0,0.0,0.0
3,31,2,1.0,37,2014,4,0.0,0.0,0.0
4,32,2,1.0,40,2014,0,0.0,0.0,0.0


In [3]:
# Import train_test_split
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as a test set; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Report the size of each partition.
n_train_rows, n_train_cols = X_train.shape
n_test_rows = X_test.shape[0]
print("Training set has {} samples.".format(n_train_rows))
print("Training set has {} columns.".format(n_train_cols))
print("Testing set has {} samples.".format(n_test_rows))

Training set has 701061 samples.
Training set has 9 columns.
Testing set has 175266 samples.


In [4]:
from sklearn.metrics import mean_squared_error

def calc_RMSE(actuals, predictions):
    """Return the root-mean-squared error between `actuals` and `predictions`.

    Computed as the square root of sklearn's `mean_squared_error`.
    """
    mse = mean_squared_error(actuals, predictions)
    return np.sqrt(mse)

In [5]:
def assess_model(regressor):
    """Fit `regressor` on the training split and print its train and test RMSE.

    Reads the notebook-level X_train / y_train / X_test / y_test splits and
    prints two lines ('Training RMSE:' then 'Testing RMSE:'). Returns None.
    """
    fitted = regressor.fit(X_train, y_train)

    # Score the training split first, then the held-out split.
    splits = (
        ('Training RMSE:', X_train, y_train),
        ('Testing RMSE:', X_test, y_test),
    )
    for label, features, targets in splits:
        print(label, calc_RMSE(targets, fitted.predict(features)))

In [6]:
from sklearn.linear_model import LinearRegression

# Baseline: linear regression with default settings.
algo = LinearRegression()
assess_model(regressor=algo)

Training RMSE: 6.373731660761329
Testing RMSE: 4.952904288191872


In [13]:
from sklearn.tree import DecisionTreeRegressor

# Single, fully grown decision tree.
# Seeded (same seed as the train/test split) so the reported RMSE is repeatable
# across kernel restarts.
algo = DecisionTreeRegressor(random_state=42)
assess_model(algo)

Training RMSE: 0.0
Testing RMSE: 6.172087777561161


In [None]:
from sklearn.svm import SVR

# Support-vector regression with a linear kernel.
# NOTE(review): no output is recorded for this cell; SVR training cost grows
# steeply with sample count -- confirm it completes on the full training split.
algo = SVR(C=1e3, kernel='linear')
assess_model(regressor=algo)

In [None]:
# Support-vector regression with an RBF kernel.
# NOTE(review): no output is recorded for this cell -- confirm it completes on
# the full training split.
algo = SVR(C=1e3, gamma=0.1, kernel='rbf')
assess_model(regressor=algo)

In [None]:
# Support-vector regression with a degree-2 polynomial kernel.
# NOTE(review): no output is recorded for this cell -- confirm it completes on
# the full training split.
algo = SVR(C=1e3, degree=2, kernel='poly')
assess_model(regressor=algo)

In [11]:
from sklearn.ensemble import AdaBoostRegressor

# AdaBoost over unpruned decision trees.
# Seeding the ensemble makes the boosting resampling, and therefore the
# reported RMSE, repeatable across kernel restarts.
algo = AdaBoostRegressor(DecisionTreeRegressor(), random_state=42)
assess_model(algo)

Training RMSE: 0.04754848996714612
Testing RMSE: 4.2451923993665845


In [12]:
from sklearn.ensemble import BaggingRegressor

# Bagging ensemble with default settings.
# Seeded so the bootstrap samples, and therefore the reported RMSE, are
# repeatable across kernel restarts.
algo = BaggingRegressor(random_state=42)
assess_model(algo)

Training RMSE: 2.6829997536269734
Testing RMSE: 4.7375427706414746


In [10]:
from sklearn.ensemble import ExtraTreesRegressor

# Extremely randomized trees with default settings.
# Split thresholds are drawn at random, so seed the ensemble to make the
# reported RMSE repeatable across kernel restarts.
algo = ExtraTreesRegressor(random_state=42)
assess_model(algo)

Training RMSE: 0.0
Testing RMSE: 4.536129501019812


In [11]:
from sklearn.ensemble import GradientBoostingRegressor

# Gradient boosting over 100 depth-1 trees (stumps).
# The loss is left at its default, which is squared error. The previous explicit
# spelling loss='ls' was deprecated in scikit-learn 1.0 and removed in 1.2,
# while its replacement 'squared_error' is not accepted by older releases;
# omitting the argument selects the same loss on every version.
algo = GradientBoostingRegressor(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=1,
    random_state=0,
)
assess_model(algo)

Training RMSE: 6.089925652944762
Testing RMSE: 4.719512195719846


In [14]:
from sklearn.ensemble import RandomForestRegressor

# Random forest with default settings. A depth-capped variant,
# RandomForestRegressor(max_depth=10), is a possible alternative.
# Seeded so the bootstrap samples and feature draws, and therefore the
# reported RMSE, are repeatable across kernel restarts.
algo = RandomForestRegressor(random_state=42)
assess_model(algo)

Training RMSE: 2.8496578733132583
Testing RMSE: 4.61697840816801


In [6]:
import xgboost as xgb
from sklearn.model_selection import KFold

# Disabled 2-fold cross-validation sketch, kept for reference only.
# NOTE(review): X is a DataFrame, so X[train_index] would look up columns
# rather than rows; the sketch below indexes rows positionally with .iloc.
#rng = np.random.RandomState(1)
#kf = KFold(n_splits=2, shuffle=True, random_state=rng)
#for train_index, test_index in kf.split(X):
#    xgb_model = xgb.XGBRegressor().fit(X.iloc[train_index], y.iloc[train_index])
#    predictions = xgb_model.predict(X.iloc[test_index])
#    actuals = y.iloc[test_index]
#    print('Testing RMSE:', np.sqrt(mean_squared_error(actuals, predictions)))

# XGBoost regressor with library defaults.
algo = xgb.XGBRegressor()
assess_model(regressor=algo)

Training RMSE: 4.8801253644189195
Testing RMSE: 4.171941026549597


In [8]:
from lightgbm.sklearn import LGBMRegressor

# LightGBM regressor with library defaults.
algo = LGBMRegressor()
assess_model(regressor=algo)

Training RMSE: 4.239085256647325
Testing RMSE: 4.116389459567357


In [17]:
from sklearn.metrics import make_scorer
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import ShuffleSplit

def fit_model(inputs, targets):
    """Grid-search LightGBM tree-shape parameters and return the best estimator.

    Every combination of `max_depth`, `num_leaves` and `min_data_in_leaf`
    listed below is scored by RMSE over ten random 90/10 shuffle splits of
    (`inputs`, `targets`).

    Args:
        inputs: feature matrix passed to `GridSearchCV.fit`.
        targets: target values aligned with `inputs`.

    Returns:
        The `best_estimator_` of the fitted grid search.

    NOTE(review): the recorded run selected the extreme value on every grid
    axis (12 / 100 / 250), which suggests the grid may be too narrow -- confirm.
    """
    # Seeded so the random splits, and hence the selected parameters, are
    # repeatable from run to run.
    cv_sets = ShuffleSplit(n_splits=10, test_size=0.1, train_size=None, random_state=42)
    regressor = LGBMRegressor(learning_rate=0.001)
    params = {
        'max_depth': [3, 4, 5, 6, 7, 8, 9, 10, 11, 12],
        'num_leaves': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100],
        'min_data_in_leaf': [250, 500, 750, 1000],
    }
    # greater_is_better=False makes the scorer return negated RMSE, so the
    # grid search (which maximises the score) picks the lowest error.
    scoring_fnc = make_scorer(calc_RMSE, greater_is_better=False)
    grid = GridSearchCV(regressor, param_grid=params, scoring=scoring_fnc, cv=cv_sets)
    grid = grid.fit(inputs, targets)
    return grid.best_estimator_

# Run the grid search on the training split and report the winning parameters.
best_model = fit_model(X_train, y_train)
best_params = best_model.get_params()
# str.format (with a dot) fills the placeholders; passing the builtin
# format(...) as a second print argument left a literal "{}" in the message.
for param_name in ('max_depth', 'num_leaves', 'min_data_in_leaf'):
    print("Parameter '{}' is {} for the optimal model.".format(param_name, best_params[param_name]))

Parameter 'max_depth' is {} for the optimal model. 12
Parameter 'num_leaves' is {} for the optimal model. 100
Parameter 'min_data_in_leaf' is {} for the optimal model. 250


In [26]:
# LightGBM with hand-picked settings: deeper, wider trees, a lower learning
# rate and more boosting rounds than the defaults.
# NOTE(review): these values differ from the recorded grid-search result
# (max_depth=12, num_leaves=100) -- confirm this is intentional.
algo = LGBMRegressor(
    max_depth=75,
    num_leaves=1200,
    learning_rate=0.01,
    n_estimators=300,
)
assess_model(regressor=algo)

Training RMSE: 4.8143035054691286
Testing RMSE: 4.106802898225224
