# Random Forest Per Store

In [1]:
import pandas as pd
import numpy as np
from aux_fun import my_eval, r2_month
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.ensemble import RandomForestRegressor
import warnings
warnings.filterwarnings('ignore')

## Loading data, dropping problematic columns, setting variables

In [2]:
# Load the imputed training set and preview its first rows.
train_path = './data/train_imputed.csv'
train_data = pd.read_csv(train_path)
train_data.head(5)

Unnamed: 0,StoreID,IsHoliday,IsOpen,HasPromotions,NearestCompetitor,Region,NumberOfCustomers,NumberOfSales,Region_AreaKM2,Region_GDP,...,Day,Month,Year,Hyper_Market,Shopping_Center,Standard_Market,Super_Market,General,With_Fish_Department,With_Non-Food_Department
0,1000,0,1,0,326,7,495,5676,9643,17130,...,1,3,2016,1,0,0,0,1,0,0
1,1000,0,1,0,326,7,608,8111,9643,17130,...,2,3,2016,1,0,0,0,1,0,0
2,1000,0,1,0,326,7,665,8300,9643,17130,...,4,3,2016,1,0,0,0,1,0,0
3,1000,0,1,0,326,7,630,7154,9643,17130,...,5,3,2016,1,0,0,0,1,0,0
4,1000,0,0,0,326,7,0,0,9643,17130,...,6,3,2016,1,0,0,0,1,0,0


In [3]:
# Feature matrix: every column except the target and the two columns excluded
# from modelling. StoreID is kept because the per-store loop filters on it.
# NOTE(review): the reason WindDirDegrees / NumberOfCustomers are excluded is
# not visible in this notebook - confirm.
excluded_columns = ['NumberOfSales', 'WindDirDegrees', 'NumberOfCustomers']
X = train_data.drop(columns=excluded_columns)

# Target frame: StoreID is carried along so the target can be filtered per store.
Y = train_data.loc[:, ['StoreID', 'NumberOfSales']]

## Algorithm Evaluation

In [4]:
# Distinct store identifiers (as integers), in order of first appearance in X,
# plus their count for the progress messages of the evaluation loop.
store_ids = X['StoreID'].unique()
stores = store_ids.astype(int)
nstores = stores.size

In [5]:
# Keyword arguments forwarded to every per-store RandomForestRegressor.
ind_params = dict(n_estimators=30)

In [6]:
# Train and evaluate one random forest per store.
#
# For every store id in `stores`, the store's rows are split 80/20 (fixed split
# seed), the forest is refit on the training part, and the held-out rows,
# their true sales and the predictions are collected. After the loop:
#   X_test_history - held-out feature rows of all stores, with the true
#                    'NumberOfSales' attached as an extra column
#   y_test_history - held-out true sales of all stores
#   y_pred_history - predictions, in the same row order as the two frames
#
# The forest is seeded so that re-running the notebook reproduces the same
# metrics; a 'random_state' entry in ind_params still takes precedence.
model = RandomForestRegressor(**{'random_state': 42, **ind_params})

y_pred = []
X_test_parts = []
y_test_parts = []
for i, store in enumerate(stores):
    if i % 100 == 0:
        print("Eval: "+str(i)+" of "+str(nstores)+" completed")

    # Making sub-dataset: rows of the current store only
    y_store = Y[Y['StoreID'] == store].drop('StoreID', axis=1)
    X_store = X[X['StoreID'] == store]
    X_train, X_test, y_train, y_test = train_test_split(
        X_store, y_store, test_size=0.2, random_state=42
    )

    # Fitting the forest; fit() discards the previous store's trees, so the
    # same estimator object can be reused across stores.
    model.fit(X_train, np.ravel(y_train))
    y_pred.append(model.predict(X_test))

    # Attach the true sales only after predicting, and via assign() so a new
    # frame is built instead of writing into the slice returned by the split.
    X_test_parts.append(X_test.assign(NumberOfSales=y_test['NumberOfSales']))
    y_test_parts.append(y_test)

# Concatenate once at the end instead of growing the frames inside the loop
# (repeated concat copies all previously collected rows on every iteration).
X_test_history = pd.concat(X_test_parts, axis=0, ignore_index=True)
y_test_history = pd.concat(y_test_parts, axis=0, ignore_index=True)
y_pred_history = np.concatenate(y_pred, axis=0)

Eval: 0 of 749 completed
Eval: 100 of 749 completed
Eval: 200 of 749 completed
Eval: 300 of 749 completed
Eval: 400 of 749 completed
Eval: 500 of 749 completed
Eval: 600 of 749 completed
Eval: 700 of 749 completed


In [7]:
# Report the three evaluation metrics on the pooled held-out rows of all stores.
# my_eval and r2_month come from the project's aux_fun module; their exact
# definitions are not visible in this notebook.
prediction_error = my_eval(X_test_history, y_test_history, y_pred_history)
print("Prediction Error:", prediction_error)

overall_r2 = r2_score(y_test_history, y_pred_history)
print("R2:", overall_r2)

monthly_r2 = r2_month(X_test_history, y_test_history, y_pred_history)
print("R2 Month:", monthly_r2)

Prediction Error: 0.0472165202764
R2: 0.895671054977
R2 Month: 0.991614816546
