In [1]:
import pandas as pd             
import numpy as np

import sklearn
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor

from aux_fun import evaluate

import json

In [2]:
df = pd.read_csv('./data/train_imputed.csv')

In [3]:
y = df['NumberOfSales']

In [4]:
X = df.drop(df[['NumberOfSales', 'WindDirDegrees']], axis=1)

# Train-Test Split

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [6]:
dates_test = X_test['Date']

In [7]:
X_train = X_train.drop(['Date'], axis=1)
X_test = X_test.drop(['Date'], axis=1)

# Model 

In [8]:
#parameters
params_dict = {'n_estimators' : 15}

In [9]:
rf = RandomForestRegressor(**params_dict)

In [10]:
rf.fit(X_train, y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=20, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [11]:
y_pred = rf.predict(X_test)

# Creation of the submission 

In [25]:
result = pd.DataFrame(X_test['StoreID'])
result['Month'] = pd.DatetimeIndex(dates_test).month
result['NumberOfSales'] = y_pred
#Group by Month
result =result.groupby(['StoreID','Month'], as_index=False)['NumberOfSales'].sum()

In [13]:
X_test['NumberOfSales']= y_test
X_test['Date'] = dates_test

In [14]:
evaluation =evaluate(X_test,result)
evaluation

0.013833048898238541

In [15]:
r2 = r2_score(y_test, y_pred)
r2

0.9831658223798222

In [26]:
#r2 on grouped data
test =X_test.groupby(['StoreID','Month'], as_index=False)['NumberOfSales'].sum()
r2grouped = r2_score(test['NumberOfSales'], result['NumberOfSales'])
r2grouped

0.99605231827628504

In [31]:
model_result={'method': 'Random Forest',
       'parameters':params_dict,
       'evaluation': evaluation, 
       'r2':r2,
       'r2_grouped':r2grouped,
       'notes': 'None'
    }

In [32]:
with open('results.json', 'a') as fp:
    json.dump(model_result, fp, indent=2)
    fp.write('\n')