In [1]:
import numpy as np
import pandas as pd

# Read the prepared training table from CSV and preview its first rows.
training_data = pd.read_csv('input/training_data_0_categ_date.csv')
training_data.head()

Unnamed: 0,date_block_num,shop_id,item_id,item_cnt_day,item_cnt_tm1,item_cnt_tm2,item_cnt_tm12,item_cnt_tp1,categ_id,year,month
0,12,2,32,1.0,0.0,0.0,0.0,0.0,40,2014,0
1,12,2,33,1.0,1.0,2.0,1.0,0.0,37,2014,0
2,12,2,99,1.0,0.0,0.0,0.0,0.0,37,2014,0
3,12,2,482,2.0,1.0,2.0,1.0,1.0,73,2014,0
4,12,2,485,1.0,1.0,0.0,0.0,1.0,73,2014,0


In [2]:
# Feature matrix: every column except the block index and the target itself.
X = training_data.drop(labels=['date_block_num', 'item_cnt_tp1'], axis=1)

# Regression target.
# NOTE(review): presumably 'item_cnt_tp1' is the item count one period ahead — confirm.
y = training_data['item_cnt_tp1']

X.head()

Unnamed: 0,shop_id,item_id,item_cnt_day,item_cnt_tm1,item_cnt_tm2,item_cnt_tm12,categ_id,year,month
0,2,32,1.0,0.0,0.0,0.0,40,2014,0
1,2,33,1.0,1.0,2.0,1.0,37,2014,0
2,2,99,1.0,0.0,0.0,0.0,37,2014,0
3,2,482,2.0,1.0,2.0,1.0,73,2014,0
4,2,485,1.0,1.0,0.0,0.0,73,2014,0


In [3]:
# Import train_test_split
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows (X features, y target) as a test set; the fixed
# random_state keeps the partition identical from run to run.
# NOTE(review): train_test_split shuffles rows by default, while the source
# table carries a date_block_num column — confirm that a random split (rather
# than a time-based hold-out) is what is intended here.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Report the size of each partition
print("Training set has {} samples.".format(len(X_train)))
print("Training set has {} columns.".format(len(X_train.columns)))
print("Testing set has {} samples.".format(len(X_test)))

Training set has 73912 samples.
Training set has 9 columns.
Testing set has 18478 samples.


In [4]:
from sklearn.metrics import mean_squared_error

def calc_RMSE(actuals, predictions):
    """Return the root-mean-squared error between `actuals` and `predictions`.

    Both arguments are passed straight to `mean_squared_error`; the square
    root of that value is returned.
    """
    mse = mean_squared_error(actuals, predictions)
    return np.sqrt(mse)

In [5]:
def assess_model(regressor):
    """Fit `regressor` on the training split and print train/test RMSE.

    Uses the notebook-level `X_train`, `y_train`, `X_test` and `y_test`
    variables. `regressor.fit` is called directly on the object passed in,
    and predictions are taken from whatever `fit` returns. Nothing is
    returned; the two scores are only printed.
    """
    fitted = regressor.fit(X_train, y_train)

    # Score the training partition first, then the held-out partition.
    partitions = (
        ('Training RMSE:', X_train, y_train),
        ('Testing RMSE:', X_test, y_test),
    )
    for label, features, actuals in partitions:
        print(label, calc_RMSE(actuals, fitted.predict(features)))

In [6]:
from sklearn.linear_model import LinearRegression

# Baseline: linear regression with default settings, scored via assess_model.
algo = LinearRegression()
assess_model(algo)

Training RMSE: 2.32661726961
Testing RMSE: 2.6823616003


In [7]:
from sklearn.tree import DecisionTreeRegressor

# Single regression tree limited to depth 4, scored via assess_model.
algo = DecisionTreeRegressor(max_depth=4)
assess_model(algo)

Training RMSE: 2.01804112145
Testing RMSE: 2.32785924455


In [None]:
from sklearn.svm import SVR

# Support vector regression with a linear kernel and C=1e3.
# NOTE(review): this cell has no recorded output in the notebook. SVR fit time
# grows faster than quadratically with the number of samples, so fitting on
# the ~74k-row training split may be impractical — confirm before running.
algo = SVR(kernel='linear', C=1e3)
assess_model(algo)

In [None]:
# Support vector regression with an RBF kernel (C=1e3, gamma=0.1).
# Relies on SVR having been imported by an earlier cell.
# NOTE(review): no recorded output for this cell; SVR training cost on a
# training split of this size may be prohibitive — confirm before running.
algo = SVR(kernel='rbf', C=1e3, gamma=0.1)
assess_model(algo)

In [None]:
# Support vector regression with a degree-2 polynomial kernel (C=1e3).
# Relies on SVR having been imported by an earlier cell.
# NOTE(review): no recorded output for this cell; SVR training cost on a
# training split of this size may be prohibitive — confirm before running.
algo = SVR(kernel='poly', C=1e3, degree=2)
assess_model(algo)

In [8]:
from sklearn.ensemble import AdaBoostRegressor

# Seeded generator handed to AdaBoost so the boosting run is repeatable.
rng = np.random.RandomState(1)

# Boost 300 depth-4 regression trees.
# NOTE(review): the recorded RMSE for this cell is far worse than the single
# depth-4 tree evaluated earlier — worth investigating before relying on it.
algo = AdaBoostRegressor(
    DecisionTreeRegressor(max_depth=4),
    n_estimators=300,
    random_state=rng,
)
assess_model(algo)

Training RMSE: 7.6424001107
Testing RMSE: 7.76636463966


In [9]:
from sklearn.ensemble import BaggingRegressor

# Bagging ensemble with default settings.
# The estimator draws random bootstrap samples, so it is seeded here to make
# the reported RMSE reproducible across runs (the RandomForestRegressor cell
# in this notebook is seeded the same way). Re-run the cell to refresh the
# recorded scores, which were produced by an unseeded fit.
algo = BaggingRegressor(random_state=42)
assess_model(algo)

Training RMSE: 0.873377920671
Testing RMSE: 2.12343943194


In [12]:
from sklearn.ensemble import ExtraTreesRegressor

# Extremely-randomized trees with default settings.
# Split thresholds are drawn at random, so the estimator is seeded to make the
# reported RMSE reproducible across runs (the RandomForestRegressor cell in
# this notebook is seeded the same way). Re-run the cell to refresh the
# recorded scores, which were produced by an unseeded fit.
algo = ExtraTreesRegressor(random_state=42)
assess_model(algo)

Training RMSE: 0.0
Testing RMSE: 2.34190381431


In [11]:
from sklearn.ensemble import GradientBoostingRegressor

# Gradient boosting with 100 depth-1 trees (stumps) and learning rate 0.1.
# The loss argument is deliberately omitted: the original loss='ls' spelling
# was deprecated in scikit-learn 1.0 and removed in 1.2 (renamed to
# 'squared_error'). Least squares is the default loss under either name, so
# leaving it out gives the same model on old and new scikit-learn versions.
algo = GradientBoostingRegressor(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=1,
    random_state=0,
)
assess_model(algo)

Training RMSE: 2.18374109222
Testing RMSE: 2.59514560408


In [13]:
from sklearn.ensemble import RandomForestRegressor

# Disabled alternative: a depth-limited forest (max_depth=10).
#learner = RandomForestRegressor(max_depth=10, random_state=42)
# Active configuration: default random forest with a fixed seed.
algo = RandomForestRegressor(random_state=42)
assess_model(algo)

Training RMSE: 0.876969664717
Testing RMSE: 2.09983634986


In [14]:
import xgboost as xgb
from sklearn.model_selection import KFold

# Disabled sketch: 2-fold cross-validation of XGBRegressor over the full X / y.
# X is a DataFrame, so fold rows must be selected positionally with .iloc
# (X[train_index] would be interpreted as a column lookup). `rng` is the
# RandomState created in the AdaBoost cell.
#kf = KFold(n_splits=2, shuffle=True, random_state=rng)
#for train_index, test_index in kf.split(X):
#    xgb_model = xgb.XGBRegressor().fit(X.iloc[train_index], y.iloc[train_index])
#    predictions = xgb_model.predict(X.iloc[test_index])
#    actuals = y.iloc[test_index]
#    print('Testing RMSE:', calc_RMSE(actuals, predictions))

# XGBoost regressor with default hyperparameters, scored via assess_model on
# the same train/test split as the other models.
algo = xgb.XGBRegressor()
assess_model(algo)

Training RMSE: 1.60150104512
Testing RMSE: 2.07593797624


In [15]:
from lightgbm.sklearn import LGBMRegressor

# LightGBM regressor with default hyperparameters, scored via assess_model.
algo = LGBMRegressor()
assess_model(algo)

Training RMSE: 1.58615573017
Testing RMSE: 2.13790224458
