In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib
import matplotlib.pylab as pylab

%matplotlib inline
matplotlib.style.use('ggplot')
sns.set_style('white')
pylab.rcParams['figure.figsize'] = 8,6

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor

In [None]:
# Daily per-store sales records. low_memory=False makes pandas read the file
# in one pass so each column gets a single inferred dtype.
rossman_df = pd.read_csv(
    filepath_or_buffer="train.csv",
    low_memory=False,
)

In [None]:
# Per-store metadata; joined onto the sales records later via the 'Store' key.
store_df = pd.read_csv(
    filepath_or_buffer="store.csv",
    low_memory=False,
)

In [None]:
# Remove, in place, the rows recorded with Open == 0 ...
rossman_df.drop(index=rossman_df.index[rossman_df['Open'] == 0], inplace=True)
# ... and then any remaining rows with zero Sales.
rossman_df.drop(index=rossman_df.index[rossman_df['Sales'] == 0], inplace=True)

In [None]:
# Parse the date strings once, then derive the calendar features with the
# vectorised `.dt` accessor instead of a Python-level lambda per row.
rossman_df['Date'] = pd.to_datetime(rossman_df['Date'])
date_accessor = rossman_df['Date'].dt
rossman_df['Year'] = date_accessor.year
rossman_df['Month'] = date_accessor.month
rossman_df['Day'] = date_accessor.day
# ISO week number (same value Timestamp.weekofyear reports); isocalendar()
# yields a nullable unsigned-integer column.
rossman_df['WeekOfYear'] = date_accessor.isocalendar().week

In [None]:
# 'Open' was already used to filter rows above; neither column is kept as a
# model feature, so both are removed in place.
rossman_df.drop(['StateHoliday', 'Open'], axis=1, inplace=True)


In [None]:
# Discard the competition-opening and Promo2 scheduling columns; they are not
# used as features in this notebook.
store_df = store_df.drop(
    columns=[
        'CompetitionOpenSinceMonth',
        'CompetitionOpenSinceYear',
        'Promo2SinceWeek',
        'Promo2SinceYear',
        'PromoInterval',
    ]
)

In [None]:
# Impute missing competition distances with the most frequent value.
# Series.mode() returns a Series (there can be ties), and fillna() with a
# Series aligns on index labels, which would leave almost every NaN untouched.
# Take the first mode as a scalar and assign the result back to the column
# rather than relying on an in-place fill of an attribute-accessed column.
competition_distance_modes = store_df['CompetitionDistance'].mode()
if not competition_distance_modes.empty:  # empty only if the column is all-NaN
    store_df['CompetitionDistance'] = store_df['CompetitionDistance'].fillna(
        competition_distance_modes.iloc[0]
    )

In [None]:
# Attach the store metadata to every sales row; a left join keeps all sales
# rows even when a store has no metadata entry.
df = rossman_df.merge(store_df, on='Store', how='left')

In [None]:
def remove_outlier(df_in, col_name, whisker=1.5):
    """Return the rows of ``df_in`` whose ``col_name`` value lies within the IQR fences.

    The fences are ``Q1 - whisker * IQR`` and ``Q3 + whisker * IQR``, where
    Q1/Q3 are the 25th/75th percentiles of the column.

    Parameters
    ----------
    df_in : pandas.DataFrame
        Frame to filter; it is not modified.
    col_name : str
        Name of the numeric column the fences are computed on.
    whisker : float, default 1.5
        Multiplier applied to the interquartile range.

    Returns
    -------
    pandas.DataFrame
        The rows inside the fences, with their original index labels.
        Rows whose value is NaN are dropped, because every comparison against
        NaN is False.
    """
    values = df_in[col_name]
    q1, q3 = values.quantile([0.25, 0.75])
    iqr = q3 - q1  # interquartile range
    fence_low = q1 - whisker * iqr
    fence_high = q3 + whisker * iqr
    # Fences are inclusive: a value sitting exactly on a fence is not an
    # outlier. With strict comparisons a column whose IQR is 0 (e.g. a constant
    # column) would lose every row.
    inside_fences = (values >= fence_low) & (values <= fence_high)
    return df_in.loc[inside_fences]

In [None]:
# Apply the IQR filter one column at a time; each step computes its quartiles
# on the rows that survived the previous step, so the order matters.
df = (
    df.pipe(remove_outlier, 'Sales')
      .pipe(remove_outlier, 'Customers')
      .pipe(remove_outlier, 'CompetitionDistance')
)

In [None]:
# One-hot encode the two categorical store attributes; the original columns
# are replaced by one indicator column per category.
df = pd.get_dummies(
    data=df,
    columns=['StoreType', 'Assortment'],
)

In [None]:
# Drop the store identifier, the raw date, and the Year / WeekOfYear features;
# Month and Day are kept.
df = df.drop(columns=['Store', 'Date', 'Year', 'WeekOfYear'])

In [None]:
# Target is the daily sales figure; every other remaining column is a feature.
# NOTE(review): 'Customers' is still among the features here - confirm it is
# actually known at prediction time.
y = df['Sales']
X = df.drop(columns=['Sales'])

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Learn per-feature min/max on the full feature frame, then rescale it.
# After this step X is a NumPy array, no longer a DataFrame.
# NOTE(review): the scaler is fitted before any train/test split, so held-out
# rows contribute to the scaling range - confirm this is acceptable.
scaler = MinMaxScaler().fit(X)
X = scaler.transform(X)

In [None]:
from sklearn.model_selection import RandomizedSearchCV

# Candidate values sampled by the randomized hyperparameter search.
# Integer ranges are written explicitly so every value appears exactly once:
# np.linspace defaults to 50 points, and truncating those to int produced
# duplicate-heavy lists in which the upper bound was sampled far less often.
random_grid = {
    'n_estimators': list(range(50, 201, 10)),
    # 1.0 means "use all features" - what 'auto' meant for a regressor; the
    # string 'auto' is no longer accepted by current scikit-learn releases.
    'max_features': [1.0, 'sqrt', 'log2'],
    'max_depth': list(range(3, 21)),
    'min_samples_split': list(range(2, 21)),
    'min_samples_leaf': list(range(1, 21)),
    'bootstrap': [True, False],
}

In [None]:
# Set aside a 10% sample. The search below is fitted on that sample only
# (X_param_test / y_param_test), not on the 90% remainder.
X_train, X_param_test, y_train, y_param_test = train_test_split(
    X, y, test_size=0.1, shuffle=True, random_state=0
)

# Base model to tune; seeded so repeated runs of the search are comparable.
rf = RandomForestRegressor(random_state=0)
# Randomized search: 100 sampled parameter combinations, 5-fold cross
# validation, run on all available cores (n_jobs=-1).
# NOTE(review): rf_random.best_params_ is not used by the later cells visible
# in this notebook - confirm whether the evaluation should use the tuned values.
rf_random = RandomizedSearchCV(
    estimator=rf,
    param_distributions=random_grid,
    n_iter=100,
    cv=5,
    verbose=2,
    random_state=0,
    n_jobs=-1,
)
# Fit the random search model
rf_random.fit(X_param_test, y_param_test)

In [None]:
# Replace y's index with a fresh 0..n-1 range (old labels discarded) so that
# integer position arrays can be used to select rows from it.
y = y.reset_index(drop=True)

In [None]:
from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold

r2_scores = []
mae_scores = []
rmse_scores = []

# Repeated K-fold evaluation: 10 differently-seeded shuffles, each split with
# KFold's default number of folds. One score per fold is appended to each list.
for random_seed in range(10):
    kf = KFold(shuffle=True, random_state=random_seed)
    for train_index, test_index in kf.split(X):
        X_train, X_test = X[train_index], X[test_index]
        # kf.split yields positions, so index y positionally; this does not
        # depend on what y's index labels happen to be.
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        # Seeded for reproducibility; n_jobs=-1 builds the trees in parallel.
        rf = RandomForestRegressor(verbose=1, n_jobs=-1, random_state=random_seed)
        rf.fit(X_train, y_train)

        y_pred = rf.predict(X_test)

        r2 = r2_score(y_test, y_pred)
        r2_scores.append(r2)
        print(f'R2-score of this fold is {r2}')

        mae = mean_absolute_error(y_test, y_pred)
        mae_scores.append(mae)
        print(f'MAE of this fold is {mae}')

        # Take the square root explicitly: the `squared=False` argument of
        # mean_squared_error is deprecated/removed in recent scikit-learn.
        rmse = np.sqrt(mean_squared_error(y_test, y_pred))
        rmse_scores.append(rmse)
        print(f'RMSE of this fold is {rmse}')


In [None]:
# Average R2 over all evaluated folds.
np.array(r2_scores).mean()

0.9537615225358129

In [None]:
# Average mean-absolute-error over all evaluated folds.
np.array(mae_scores).mean()

347.972810763286

In [None]:
# Average root-mean-squared-error over all evaluated folds.
np.array(rmse_scores).mean()

491.2141247083989

In [None]:
import joblib

# Persist a model trained on the full dataset, rather than whichever fold
# model the cross-validation loop last left in `rf` (that one was fitted on a
# training subset only).
# NOTE(review): X was MinMax-scaled before training, so anything loading this
# file needs the same fitted `scaler`; it is not persisted here - confirm
# whether it should be saved alongside the model.
rf = RandomForestRegressor(n_jobs=-1, random_state=0)
rf.fit(X, y)
joblib.dump(rf, "random_forest.joblib", compress=3)

['random_forest.joblib']