In [1]:
import pandas as pd             
import numpy as np

import sklearn
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.metrics import make_scorer

from aux_fun import my_eval, my_grid_search_cv

import json

In [20]:
df = pd.read_csv('./data/train_imputed.csv')

In [3]:
y = df['NumberOfSales']

In [4]:
X = df.drop(df[['NumberOfSales','NumberOfCustomers', 'WindDirDegrees']], axis=1)

# Train-Test Split

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [6]:
X_train = X_train.reset_index(drop =True)
y_train = y_train.reset_index(drop =True)

# Find the combination with best parameters

In [9]:
#parameters
params_array = [
    {'n_estimators' : 5},
    {'n_estimators' : 10},
    {'n_estimators' : 15},
    {'n_estimators' : 20},
    {'n_estimators' : 25},
    {'n_estimators' : 30},
    {'n_estimators' : 20,'max_features' : 1},
    {'n_estimators' : 20,'max_features' : 2},
    {'n_estimators' : 20,'max_features' : 5},
    {'n_estimators' : 20,'max_features' : 10},
    {'n_estimators' : 20,'max_features' : 20},
    {'n_estimators' : 20,'max_features' : 30},
    {'n_estimators' : 10,'max_depth':1},
    {'n_estimators' : 20,'max_depth':1},
    {'n_estimators' : 30,'max_depth':1},
    {'n_estimators' : 20,'max_depth':2},
    {'n_estimators' : 20,'max_depth':5},
    {'n_estimators' : 20,'max_depth':10},
    {'n_estimators' : 20,'max_depth':20},
    {'n_estimators' : 20,'max_depth':30},
    {'n_estimators' : 20,'max_depth':50},
    {'n_estimators' : 20,'min_samples_split':0.1},
    {'n_estimators' : 20,'min_samples_split':0.25},
    {'n_estimators' : 20,'min_samples_split':0.50},
    {'n_estimators' : 20,'min_impurity_split':0.20},
    {'n_estimators' : 20,'min_impurity_split':0.50},
    {'n_estimators' : 20,'min_impurity_split':0.80},
    {'n_estimators' : 15,'bootstrap':False},
    {'n_estimators' : 20,'bootstrap':False},
    {'n_estimators' : 25,'bootstrap':False},
]

In [10]:
results = my_grid_search_cv(RandomForestRegressor(), params_array, X_train, y_train, n_folds=5)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the document

In [11]:
results

Unnamed: 0,Method,Folds,Parameters,Eval_test,R2,R2_month
0,"(DecisionTreeRegressor(criterion='mse', max_de...",5,{'n_estimators': 5},inf,0.152649,0.152649
1,"(DecisionTreeRegressor(criterion='mse', max_de...",5,{'n_estimators': 10},inf,0.269654,0.269654
2,"(DecisionTreeRegressor(criterion='mse', max_de...",5,{'n_estimators': 15},inf,0.310221,0.310221
3,"(DecisionTreeRegressor(criterion='mse', max_de...",5,{'n_estimators': 20},inf,0.337022,0.337022
4,"(DecisionTreeRegressor(criterion='mse', max_de...",5,{'n_estimators': 25},inf,0.272371,0.272371
5,"(DecisionTreeRegressor(criterion='mse', max_de...",5,{'n_estimators': 30},inf,0.392939,0.392939
6,"(DecisionTreeRegressor(criterion='mse', max_de...",5,"{'max_features': 1, 'n_estimators': 20}",inf,-0.027404,-0.027404
7,"(DecisionTreeRegressor(criterion='mse', max_de...",5,"{'max_features': 2, 'n_estimators': 20}",inf,0.096857,0.096857
8,"(DecisionTreeRegressor(criterion='mse', max_de...",5,"{'max_features': 5, 'n_estimators': 20}",inf,0.224887,0.224887
9,"(DecisionTreeRegressor(criterion='mse', max_de...",5,"{'max_features': 10, 'n_estimators': 20}",inf,0.223691,0.223691


In [12]:
results.to_csv('random_forest_results.csv')

# Model Building

In [13]:
best_params = results[results['Eval_test']==min(results['Eval_test'])]['Parameters'].reset_index(drop =True)[0]

In [14]:
best_params

{'bootstrap': False, 'n_estimators': 15}

In [15]:
rf = RandomForestRegressor(**best_params)
rf.fit(X_train,y_train)

RandomForestRegressor(bootstrap=False, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=15, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [16]:
y_pred = rf.predict(X_test)

In [17]:
print(y_pred)

[  4742.66666667   6610.           5721.2          6894.33333333
   4800.06666667   4093.93333333   1673.8          5730.           3612.8
  12344.06666667   4002.4          1802.6          3124.66666667
   3470.93333333   7519.73333333   3420.4          3144.26666667      0.
      0.           3179.8       ]


In [18]:
print(y_test)

464703    2699
69545     5636
348274    1413
311327    5603
520309    3964
270408    6200
276681    3548
184234    4454
3143      3856
269389    9640
4929      3363
161757    3324
14076     4301
71707     8265
480136    5687
188974    5148
90400     2726
224842       0
14879        0
156691    4069
Name: NumberOfSales, dtype: int64


In [19]:
evaluation = my_eval(X_test, y_test, y_pred)
evaluation

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


0.33209866236362989

# Creation of the submission 

In [18]:
model_result={'method': 'Random Forest',
       'parameters':25,
       'evaluation': evaluation, 
       'r2':r2,
       'r2_grouped':r2grouped,
       'notes': 'None'
    }

In [19]:
with open('results.json', 'a') as fp:
    json.dump(model_result, fp, indent=2)
    fp.write('\n')