In [1]:
import pandas as pd             
import numpy as np

import seaborn as sns
sns.set(style="white", color_codes=True)
sns.set_context(rc={"font.family":'sans',"font.size":24,"axes.titlesize":24,"axes.labelsize":24})   

import matplotlib.pyplot as plt
%matplotlib inline

import warnings 
# Blanket filter: silences every warning for the rest of the session,
# including library deprecation notices. Narrow it (category=/module=) when debugging.
warnings.filterwarnings("ignore")

import sklearn
from sklearn import linear_model
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2, f_regression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import RFE
from sklearn.metrics import mean_squared_error, r2_score

from aux_fun import evaluate

In [2]:
# Load train_imputed.csv; the path is relative to the notebook's working directory.
df = pd.read_csv(filepath_or_buffer='./data/train_imputed.csv')

In [3]:
# (rows, columns) of the loaded frame
df.shape

(523021, 45)

In [4]:
# Preview the first five rows.
# NOTE(review): in the saved preview, consecutive rows of one store advance the
# middle field of Date (2016-01-03, 2016-02-03, 2016-04-03, ...) — possibly a
# day/month swap in the CSV; confirm before deriving calendar features from Date.
df.head()

Unnamed: 0,StoreID,Date,IsHoliday,IsOpen,HasPromotions,NearestCompetitor,Region,NumberOfCustomers,NumberOfSales,Region_AreaKM2,...,Max_VisibilityKm,Min_VisibilitykM,Mean_VisibilityKm,Hyper_Market,Shopping_Center,Standard_Market,Super_Market,General,With_Fish_Department,With_Non-Food_Department
0,1000,2016-01-03,0,1,0,326,7,495,5676,9643,...,19.0,6.0,11.0,1,0,0,0,1,0,0
1,1000,2016-02-03,0,1,0,326,7,608,8111,9643,...,23.0,10.0,13.0,1,0,0,0,1,0,0
2,1000,2016-04-03,0,1,0,326,7,665,8300,9643,...,31.0,8.0,11.0,1,0,0,0,1,0,0
3,1000,2016-05-03,0,1,0,326,7,630,7154,9643,...,31.0,10.0,15.0,1,0,0,0,1,0,0
4,1000,2016-06-03,0,0,0,326,7,0,0,9643,...,31.0,5.0,12.0,1,0,0,0,1,0,0


In [5]:
# Column labels of the loaded frame
df.columns

Index(['StoreID', 'Date', 'IsHoliday', 'IsOpen', 'HasPromotions',
       'NearestCompetitor', 'Region', 'NumberOfCustomers', 'NumberOfSales',
       'Region_AreaKM2', 'Region_GDP', 'Region_PopulationK', 'Max_Dew_PointC',
       'Max_Gust_SpeedKm_h', 'Max_Humidity', 'Max_Sea_Level_PressurehPa',
       'Max_TemperatureC', 'Max_Wind_SpeedKm_h', 'Mean_Dew_PointC',
       'Mean_Humidity', 'Mean_Sea_Level_PressurehPa', 'Mean_TemperatureC',
       'Mean_Wind_SpeedKm_h', 'Min_Dew_PointC', 'Min_Humidity',
       'Min_Sea_Level_PressurehPa', 'Min_TemperatureC', 'Precipitationmm',
       'WindDirDegrees', 'CloudCover', 'Fog', 'Hail', 'Rain', 'Snow',
       'Thunderstorm', 'Max_VisibilityKm', 'Min_VisibilitykM',
       'Mean_VisibilityKm', 'Hyper_Market', 'Shopping_Center',
       'Standard_Market', 'Super_Market', 'General', 'With_Fish_Department',
       'With_Non-Food_Department'],
      dtype='object')

In [6]:
# Regression target: the NumberOfSales column.
y = df.loc[:, 'NumberOfSales']

In [7]:
# Feature matrix: every column except the target and WindDirDegrees
# ('Date' is still present here and is removed after the split).
# The labels are passed directly via `columns=` instead of handing `drop` a
# throwaway sub-frame whose only use was to be iterated for its column names.
X = df.drop(columns=['NumberOfSales', 'WindDirDegrees'])

# Train-Test Split

In [8]:
# Random 80/20 hold-out split; the fixed seed keeps the partition reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

In [9]:
# Keep the Date column of each partition aside before it is dropped from the features.
dates_train, dates_test = X_train['Date'], X_test['Date']

In [10]:
# Remove 'Date' from both feature matrices (it was saved separately just before).
X_train = X_train.drop(columns=['Date'])
X_test = X_test.drop(columns=['Date'])

# Feature Selection

In [14]:
# Recursive feature elimination wrapped around a plain OLS estimator, keeping
# the 3 top-ranked features. `n_features_to_select` is passed by keyword:
# recent scikit-learn releases accept it only as a keyword argument.
rfe = RFE(linear_model.LinearRegression(), n_features_to_select=3)
fit = rfe.fit(X_train, y_train)

In [60]:
# Boolean mask over X_train's columns; True marks a feature kept by the selector.
# NOTE(review): the saved output below has six True entries although the selector
# is configured for 3 features — the output is probably from an earlier run.
feature_mask = fit.get_support(indices=False)
feature_mask

array([False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False,  True,
        True,  True,  True,  True,  True, False], dtype=bool)

In [61]:
# Names of the columns retained by the selector.
feature_selected = X_train.columns[feature_mask]
feature_selected

Index(['Hyper_Market', 'Shopping_Center', 'Standard_Market', 'Super_Market',
       'General', 'With_Fish_Department'],
      dtype='object')

# Model Creation

## OLS

In [11]:
# Ordinary least squares with an intercept term (the estimator's default).
regr = linear_model.LinearRegression(fit_intercept=True)

In [12]:
# Fit on every column of X_train; the RFE selection computed earlier is not applied here.
regr.fit(X=X_train, y=y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [13]:
# Predictions for the held-out rows, in X_test row order.
y_pred = regr.predict(X=X_test)

In [14]:
# Fitted weights of the linear model.
coefficients = regr.coef_
print('Coefficients: \n', coefficients)

Coefficients: 
 [ -9.45355568e-02  -2.78753131e+02   4.90733902e+02   7.93713099e+02
   9.91099852e-03  -3.12129345e+01   1.31308787e+01   6.26788112e-03
  -7.50608794e-03   4.56153023e-02   1.82165875e+00   3.23014908e-01
  -1.09694516e+00   8.36379763e-01  -4.27036202e+00   2.25580316e-01
   5.91034052e-01   3.31180809e+00  -7.95379441e+00  -8.08435514e+00
   3.17042746e+00  -1.11846616e+01  -3.08566458e-01   1.23570070e+01
   1.25599696e+01  -4.04107457e-01   7.42459601e+00   7.08456280e+00
   6.73328092e+01   1.85602028e+01  -6.51508397e+01   2.53400828e+01
   2.17957314e+00   3.21578373e+00  -5.26503522e-01   3.16292527e+02
  -1.48425683e+03   9.41430309e+02   2.26533991e+02   1.29380672e+03
  -2.91080530e+03   1.61699858e+03]


In [15]:
# Hold-out error metrics for the OLS fit.
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)  # coefficient of determination; 1 is a perfect fit
print("Mean squared error: %.2f" % mse)
print('R2 score: %.2f' % r2)

Mean squared error: 668212.20
R2 score: 0.91


# Creation of the submission 

In [16]:
# Start the submission frame from an independent (deep) copy of the test features.
result = X_test.copy(deep=True)

In [17]:
# Keep only the store identifier; Month and the predictions are attached in the
# following cells. Selecting the single wanted column replaces a hard-coded
# 41-name drop list, which raised KeyError when this cell was re-run and had to
# be edited by hand whenever the feature set changed.
# `.copy()` makes the later column assignments act on an independent frame.
result = result[['StoreID']].copy()

In [18]:
# Calendar month of each test row, assigned positionally (dates_test and result
# both come from X_test, so the row order matches).
# NOTE(review): relies on pandas' default parsing of the Date strings — confirm
# the day/month order used in the CSV.
test_months = pd.DatetimeIndex(dates_test).month
result['Month'] = test_months

In [19]:
# Attach the model's predictions as the NumberOfSales column.
result = result.assign(NumberOfSales=y_pred)

In [20]:
# Preview the per-row predictions.
# NOTE(review): the saved preview contains a negative predicted NumberOfSales;
# nothing in the visible cells clips or zeroes such values.
result.head()

Unnamed: 0,StoreID,Month,NumberOfSales
199480,1284,2,4252.323338
492279,1705,4,-106.213763
500907,1717,6,4863.10498
63763,1091,1,5456.015387
74453,1107,10,717.238902


In [21]:
# (rows, columns) of the prediction frame — one row per test observation
result.shape

(104605, 3)

In [22]:
# Total predicted sales per (store, month) pair.
# NOTE(review): `target` is not referenced by any later visible cell.
monthly_keys = ['StoreID', 'Month']
target = result.groupby(monthly_keys, as_index=False)['NumberOfSales'].sum()

In [23]:
# TODO: set the prediction to 0 for days on which the shop is closed.
# Disabled: at this point `result` holds only StoreID, Month and NumberOfSales,
# so the 'pred' and 'IsOpen' columns used by the line below do not exist.
#result['pred'][result['IsOpen']==0]=0

In [24]:
# Re-attach the held-out target and the dates to the test features before scoring.
# The dates must be `dates_test`: column assignment aligns a Series on its index,
# and `dates_train` shares no row labels with X_test, which left the entire
# Date column NaN.
X_test['NumberOfSales'] = y_test
X_test['Date'] = dates_test

In [25]:
# Score the predictions with the project helper imported from aux_fun.
# NOTE(review): the per-row `result` is passed here, not the monthly `target`
# aggregated earlier — confirm which frame evaluate() expects.
evaluate(X_test,result)

nan