In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib
import matplotlib.pylab as pylab

%matplotlib inline
# Global plot styling: load the 'ggplot' style sheet, apply seaborn's 'white'
# style on top of it, and make 8x6 the default figure size.
plt.style.use('ggplot')
sns.set_style('white')
plt.rcParams['figure.figsize'] = (8, 6)

from sklearn.model_selection import train_test_split

In [None]:
# Load the sales records. low_memory=False makes pandas read the file in one
# pass instead of chunk by chunk, so each column's dtype is inferred once.
train_csv_path = "train.csv"
rossman_df = pd.read_csv(train_csv_path, low_memory=False)

In [None]:
# Load the per-store table that is later joined to the sales records on 'Store'.
store_csv_path = "store.csv"
store_df = pd.read_csv(store_csv_path, low_memory=False)

In [None]:
# Remove, in place, every row where the store was closed (Open == 0) or where
# no sales were recorded (Sales == 0).
is_closed = rossman_df['Open'] == 0
has_no_sales = rossman_df['Sales'] == 0
rossman_df.drop(index=rossman_df.index[is_closed | has_no_sales], inplace=True)

In [None]:
# Parse the date column once, then derive calendar features with the
# vectorized `.dt` accessor instead of calling a Python lambda per row.
rossman_df['Date'] = pd.to_datetime(rossman_df['Date'])
date_parts = rossman_df['Date'].dt
rossman_df['Year'] = date_parts.year
rossman_df['Month'] = date_parts.month
rossman_df['Day'] = date_parts.day
# ISO week number (what Timestamp.weekofyear reports). isocalendar() yields a
# nullable UInt32 column, so cast back to a plain integer dtype.
# NOTE(review): the cast assumes no missing dates (NaT) -- confirm for this data.
rossman_df['WeekOfYear'] = date_parts.isocalendar().week.astype('int64')

In [None]:
# Remove the StateHoliday and Open columns from the sales frame, in place.
unused_sales_cols = ['StateHoliday', 'Open']
rossman_df.drop(labels=unused_sales_cols, axis=1, inplace=True)


In [None]:
# Store attributes that are removed before the join with the sales records.
dropped_store_cols = [
    'CompetitionOpenSinceMonth',
    'CompetitionOpenSinceYear',
    'Promo2SinceWeek',
    'Promo2SinceYear',
    'PromoInterval',
]
store_df = store_df.drop(columns=dropped_store_cols)

In [None]:
# Impute missing competition distances with the most frequent observed value.
# `Series.mode()` returns a Series (ties are possible), and `fillna` aligns a
# Series argument on index labels, so passing it directly only fills missing
# rows whose label matches one of the mode Series' labels (0, 1, ...).
# Take the first mode as a scalar so that every missing row is filled, and
# assign the result back instead of relying on a chained in-place call.
competition_modes = store_df['CompetitionDistance'].mode()
if not competition_modes.empty:  # empty only when the whole column is missing
    store_df['CompetitionDistance'] = store_df['CompetitionDistance'].fillna(
        competition_modes.iloc[0]
    )

In [None]:
# Left join: every sales row is kept and gains the attributes of its store.
df = rossman_df.merge(store_df, on='Store', how='left')

In [None]:
def remove_outlier(df_in, col_name, k=1.5):
    """Return the rows of ``df_in`` whose ``col_name`` value lies within the IQR fences.

    The fences are ``Q1 - k * IQR`` and ``Q3 + k * IQR``. Values exactly on a
    fence are kept: only points strictly beyond a fence count as outliers, and
    this also keeps a constant column (IQR == 0) from losing every row.

    Parameters
    ----------
    df_in : pandas.DataFrame
        Frame to filter; it is not modified.
    col_name : str
        Name of the numeric column the fences are computed on.
    k : float, default 1.5
        Multiplier applied to the interquartile range.

    Returns
    -------
    pandas.DataFrame
        The rows inside the fences, with their original index labels. Rows
        where ``col_name`` is missing are dropped because comparisons with
        NaN evaluate to False.
    """
    values = df_in[col_name]
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    iqr = q3 - q1  # interquartile range
    fence_low = q1 - k * iqr
    fence_high = q3 + k * iqr
    within_fences = (values >= fence_low) & (values <= fence_high)
    return df_in.loc[within_fences]

In [None]:
# Apply the IQR filter one column after another; each pass computes its
# quartiles on the rows that survived the previous pass.
for outlier_col in ('Sales', 'Customers', 'CompetitionDistance'):
    df = remove_outlier(df, outlier_col)

In [None]:
# One-hot encode the two categorical store attributes.
categorical_cols = ['StoreType', 'Assortment']
df = pd.get_dummies(data=df, columns=categorical_cols)

In [None]:
# Remove the store identifier, the raw date and two of the derived date parts.
df = df.drop(columns=['Store', 'Date', 'Year', 'WeekOfYear'])

In [None]:
# Target: the Sales column, re-indexed 0..n-1 so that the integer position
# arrays used on it later also work as index labels.
y = df['Sales'].reset_index(drop=True)
# Features: every remaining column.
X = df.drop(columns=['Sales'])

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Rescale every feature to the [0, 1] range (MinMaxScaler's default range);
# X is replaced by the transformer's output.
# NOTE(review): the scaler is fit on the complete feature matrix before any
# train/test split, so the held-out folds contribute to the min/max used for
# scaling -- consider fitting it on the training rows of each fold instead.
scaler = MinMaxScaler()
scaler.fit(X)
X = scaler.transform(X)

In [None]:
from sklearn.model_selection import KFold
from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from keras.models import Sequential
from keras.layers import Dense, Dropout
def build_model(n_features):
    """Build and compile a fresh, untrained regression network.

    Architecture: five fully connected hidden layers of 350 ReLU units each,
    followed by a single linear output unit. Compiled with the Adam optimizer,
    mean-squared-error loss and mean-absolute-error as a reported metric.

    Parameters
    ----------
    n_features : int
        Number of input columns the first layer expects.

    Returns
    -------
    keras.models.Sequential
        The compiled model.
    """
    network = Sequential()
    network.add(Dense(350, input_dim=n_features, activation="relu"))
    for _ in range(4):
        network.add(Dense(350, activation="relu"))
    network.add(Dense(1, activation="linear"))
    network.compile(optimizer='adam', loss="mean_squared_error", metrics=["mean_absolute_error"])
    return network


# One entry per evaluated fold, accumulated over all seeds.
r2_scores = []
mae_scores = []
rmse_scores = []

# Plain array view of the target so the fold indices are applied by position,
# independent of whatever index labels `y` carries.
y_values = np.asarray(y)

# Repeated K-fold cross-validation: a different shuffle for every seed.
# NOTE(review): only the fold shuffling is seeded; the network's weight
# initialisation and batch order are not, so scores vary between runs.
# NOTE(review): a new network is built for every fold; if memory grows over
# the run, consider releasing Keras' global state between folds.
for random_seed in range(10):
    kf = KFold(shuffle=True, random_state=random_seed)
    for train_index, test_index in kf.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y_values[train_index], y_values[test_index]

        # Size the input layer from the data instead of a hard-coded width.
        model = build_model(X.shape[1])
        model.fit(X_train, y_train, epochs=50, batch_size=32)

        # predict() returns shape (n, 1); flatten to match the 1-D target.
        y_pred = model.predict(X_test).ravel()

        r2 = r2_score(y_test, y_pred)
        r2_scores.append(r2)
        print(f'R2-score of this fold is {r2}')

        mae = mean_absolute_error(y_test, y_pred)
        mae_scores.append(mae)
        print(f'MAE of this fold is {mae}')

        # Take the square root explicitly: the `squared=False` flag of
        # mean_squared_error is deprecated in recent scikit-learn releases.
        rmse = np.sqrt(mean_squared_error(y_test, y_pred))
        rmse_scores.append(rmse)
        print(f'RMSE of this fold is {rmse}')


In [None]:
# Average R2 over every fold of every seed.
np.asarray(r2_scores).mean()

0.9096389642208643

In [None]:
# Average mean-absolute-error over every fold of every seed.
np.asarray(mae_scores).mean()

499.3843271323466

In [None]:
# Average root-mean-squared-error over every fold of every seed.
np.asarray(rmse_scores).mean()

686.510198140048

In [None]:
import joblib

# Write the network to disk with compression level 3.
# `model` is whichever network the last executed fold left bound, i.e. it was
# trained on that fold's training split only, not on the full data set.
# NOTE(review): the fitted `scaler` is not saved alongside it, and Keras
# provides its own model-saving API -- confirm joblib is the intended format.
joblib.dump(value=model, filename="nn.joblib", compress=3)

INFO:tensorflow:Assets written to: C:\Users\22300293\AppData\Local\Temp\tmpcrpw4a1z\assets


INFO:tensorflow:Assets written to: C:\Users\22300293\AppData\Local\Temp\tmpcrpw4a1z\assets


['nn.joblib']