In [57]:
import pandas as pd             
import numpy as np

import seaborn as sns
sns.set(style="white", color_codes=True)
sns.set_context(rc={"font.family":'sans',"font.size":24,"axes.titlesize":24,"axes.labelsize":24})   

import matplotlib.pyplot as plt
%matplotlib inline

import warnings 
warnings.filterwarnings("ignore")

import sklearn
from sklearn import linear_model
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2, f_regression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import RFE
from sklearn.metrics import mean_squared_error, r2_score

from aux_fun import evaluate

import csv

In [58]:
df = pd.read_csv('./data/train_imputed.csv')

In [59]:
df.shape

(523021, 45)

In [60]:
df.head()

Unnamed: 0,StoreID,Date,IsHoliday,IsOpen,HasPromotions,NearestCompetitor,Region,NumberOfCustomers,NumberOfSales,Region_AreaKM2,...,Max_VisibilityKm,Min_VisibilitykM,Mean_VisibilityKm,Hyper_Market,Shopping_Center,Standard_Market,Super_Market,General,With_Fish_Department,With_Non-Food_Department
0,1000,2016-01-03,0,1,0,326,7,495,5676,9643,...,19.0,6.0,11.0,1,0,0,0,1,0,0
1,1000,2016-02-03,0,1,0,326,7,608,8111,9643,...,23.0,10.0,13.0,1,0,0,0,1,0,0
2,1000,2016-04-03,0,1,0,326,7,665,8300,9643,...,31.0,8.0,11.0,1,0,0,0,1,0,0
3,1000,2016-05-03,0,1,0,326,7,630,7154,9643,...,31.0,10.0,15.0,1,0,0,0,1,0,0
4,1000,2016-06-03,0,0,0,326,7,0,0,9643,...,31.0,5.0,12.0,1,0,0,0,1,0,0


In [61]:
df.columns

Index(['StoreID', 'Date', 'IsHoliday', 'IsOpen', 'HasPromotions',
       'NearestCompetitor', 'Region', 'NumberOfCustomers', 'NumberOfSales',
       'Region_AreaKM2', 'Region_GDP', 'Region_PopulationK', 'Max_Dew_PointC',
       'Max_Gust_SpeedKm_h', 'Max_Humidity', 'Max_Sea_Level_PressurehPa',
       'Max_TemperatureC', 'Max_Wind_SpeedKm_h', 'Mean_Dew_PointC',
       'Mean_Humidity', 'Mean_Sea_Level_PressurehPa', 'Mean_TemperatureC',
       'Mean_Wind_SpeedKm_h', 'Min_Dew_PointC', 'Min_Humidity',
       'Min_Sea_Level_PressurehPa', 'Min_TemperatureC', 'Precipitationmm',
       'WindDirDegrees', 'CloudCover', 'Fog', 'Hail', 'Rain', 'Snow',
       'Thunderstorm', 'Max_VisibilityKm', 'Min_VisibilitykM',
       'Mean_VisibilityKm', 'Hyper_Market', 'Shopping_Center',
       'Standard_Market', 'Super_Market', 'General', 'With_Fish_Department',
       'With_Non-Food_Department'],
      dtype='object')

In [62]:
y = df['NumberOfSales']

In [63]:
X = df.drop(df[['NumberOfSales', 'WindDirDegrees']], axis=1)

# Train-Test Split

In [64]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [65]:
dates_test = X_test['Date']

In [66]:
X_train = X_train.drop(['Date'], axis=1)
X_test = X_test.drop(['Date'], axis=1)

# Feature Selection

In [14]:
rfe = RFE(linear_model.LinearRegression(), 3) #10 is the number of features to consider
fit = rfe.fit(X_train, y_train)

In [60]:
feature_mask = fit.get_support() #mask of the chosen features
feature_mask

array([False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False,  True,
        True,  True,  True,  True,  True, False], dtype=bool)

In [61]:
feature_selected =X_train.columns[feature_mask]
feature_selected

Index(['Hyper_Market', 'Shopping_Center', 'Standard_Market', 'Super_Market',
       'General', 'With_Fish_Department'],
      dtype='object')

# Model Creation

## OLS

In [38]:
X_train.columns

Index(['StoreID', 'IsHoliday', 'IsOpen', 'HasPromotions', 'NearestCompetitor',
       'Region', 'NumberOfCustomers', 'Region_AreaKM2', 'Region_GDP',
       'Region_PopulationK', 'Max_Dew_PointC', 'Max_Gust_SpeedKm_h',
       'Max_Humidity', 'Max_Sea_Level_PressurehPa', 'Max_TemperatureC',
       'Max_Wind_SpeedKm_h', 'Mean_Dew_PointC', 'Mean_Humidity',
       'Mean_Sea_Level_PressurehPa', 'Mean_TemperatureC',
       'Mean_Wind_SpeedKm_h', 'Min_Dew_PointC', 'Min_Humidity',
       'Min_Sea_Level_PressurehPa', 'Min_TemperatureC', 'Precipitationmm',
       'CloudCover', 'Fog', 'Hail', 'Rain', 'Snow', 'Thunderstorm',
       'Max_VisibilityKm', 'Min_VisibilitykM', 'Mean_VisibilityKm',
       'Hyper_Market', 'Shopping_Center', 'Standard_Market', 'Super_Market',
       'General', 'With_Fish_Department', 'With_Non-Food_Department'],
      dtype='object')

In [39]:
regr = linear_model.LinearRegression()

In [41]:
regr.fit(X_train['NumberOfCustomers'].reshape(-1, 1), y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [43]:
y_pred = regr.predict(X_test['NumberOfCustomers'].reshape(-1, 1))

In [44]:
print('Coefficients: \n', regr.coef_)
#The mean squared error

Coefficients: 
 [ 13.3448609]


## Lasso

In [67]:
lasso = linear_model.Lasso(0.01, normalize=True)

In [68]:
lasso.fit(X_train, y_train)

Lasso(alpha=0.01, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=True, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False)

In [69]:
y_pred = lasso.predict(X_test)

In [70]:
y_pred

array([ 4188.84530665,   -66.9666144 ,  4806.47294474, ...,  6102.82701915,
        6395.80065768,  -409.50500737])

## Lasso CV

In [11]:
alphas = [0.00001, 0.0001, 0.001, 0.01, 0.1]

In [12]:
lassocv = linear_model.LassoCV(eps=0.001, n_alphas=100, alphas=alphas, fit_intercept=True, normalize=False, max_iter=10000,  cv=10)

In [13]:
lassocv.fit(X_train, y_train)

LassoCV(alphas=[1e-05, 0.0001, 0.001, 0.01, 0.1], copy_X=True, cv=10,
    eps=0.001, fit_intercept=True, max_iter=10000, n_alphas=100, n_jobs=1,
    normalize=False, positive=False, precompute='auto', random_state=None,
    selection='cyclic', tol=0.0001, verbose=False)

In [14]:
y_pred = lassocv.predict(X_test)

In [15]:
lassocv.alpha_

0.01

# Creation of the submission 

In [71]:
result = X_test.copy()

In [72]:
result = result.drop(result[['IsHoliday', 'IsOpen', 'HasPromotions', 'NearestCompetitor',\
       'Region', 'NumberOfCustomers', 'Region_AreaKM2', 'Region_GDP',\
       'Region_PopulationK', 'Max_Dew_PointC', 'Max_Gust_SpeedKm_h',\
       'Max_Humidity', 'Max_Sea_Level_PressurehPa', 'Max_TemperatureC',\
       'Max_Wind_SpeedKm_h', 'Mean_Dew_PointC', 'Mean_Humidity',\
       'Mean_Sea_Level_PressurehPa', 'Mean_TemperatureC',\
       'Mean_Wind_SpeedKm_h', 'Min_Dew_PointC', 'Min_Humidity',\
       'Min_Sea_Level_PressurehPa', 'Min_TemperatureC', 'Precipitationmm',\
       'CloudCover', 'Fog', 'Hail', 'Rain', 'Snow', 'Thunderstorm',\
       'Max_VisibilityKm', 'Min_VisibilitykM', 'Mean_VisibilityKm',\
       'Hyper_Market', 'Shopping_Center', 'Standard_Market', 'Super_Market',\
       'General', 'With_Fish_Department', 'With_Non-Food_Department']], axis=1)

In [73]:
result['Month'] = pd.DatetimeIndex(dates_test).month

In [74]:
result['NumberOfSales'] = y_pred

In [75]:
result.head()

Unnamed: 0,StoreID,Month,NumberOfSales
199480,1284,2,4188.845307
492279,1705,4,-66.966614
500907,1717,6,4806.472945
63763,1091,1,5415.749179
74453,1107,10,727.217203


In [76]:
result.shape

(104605, 3)

In [77]:
#Group by Month
target =result.groupby(['StoreID','Month'], as_index=False)['NumberOfSales'].sum()

In [78]:
#set to 0 the prediction for days in which the shop is closed
#result['pred'][result['IsOpen']==0]=0 

In [79]:
X_test['NumberOfSales']= y_test
X_test['Date'] = dates_test

In [80]:
X_test.head()

Unnamed: 0,StoreID,IsHoliday,IsOpen,HasPromotions,NearestCompetitor,Region,NumberOfCustomers,Region_AreaKM2,Region_GDP,Region_PopulationK,...,Mean_VisibilityKm,Hyper_Market,Shopping_Center,Standard_Market,Super_Market,General,With_Fish_Department,With_Non-Food_Department,NumberOfSales,Date
199480,1284,0,1,1,1145,6,245,16327,15931,4911,...,26.0,1,0,0,0,1,0,0,3452,2017-02-21
492279,1705,0,0,0,3482,2,0,32221,16186,5727,...,10.0,1,0,0,0,1,0,0,0,2016-04-17
500907,1717,0,1,1,3029,2,252,32221,16186,5727,...,9.0,1,0,0,0,0,0,1,4679,2017-06-23
63763,1091,0,1,0,7337,9,351,15566,15017,8146,...,8.0,1,0,0,0,0,0,1,5232,2016-01-03
74453,1107,0,0,0,22406,1,0,7385,9893,1018,...,10.0,0,0,1,0,0,0,1,0,2016-10-07


In [81]:
eval =evaluate(X_test,result)
eval

0.92128141678975584

In [82]:
print("Mean squared error: %.2f"
      % mean_squared_error(y_test,y_pred))
# Explained variance score: 1 is perfect prediction
print('R2 score: %.2f' % r2_score(y_test, y_pred))

Mean squared error: 669964.41
R2 score: 0.91


In [83]:
with open('evaluation.csv', 'wb') as csvfile:
    writer = csv.writer(csvfile, delimiter=',',
                            quotechar='|', quoting=csv.QUOTE_MINIMAL)
    writer.writerow('linear_regression,'+X_train.columns+','+eval)

TypeError: Can't convert 'float' object to str implicitly

In [None]:
# Regression result with all the features: 0.92127101142788959
# Regression on Number of customers: 0.9215451778994711
# Lasso alpha = 0.01, norm=True:   0.92128141678975584
# Lasso alpha = 0.0001, norm=True: 0.92127100850880839
# LassoCV(eps=0.001, n_alphas=100, alphas=alphas, \
    #fit_intercept=True, normalize=False, max_iter=10000,  cv=10): 0.9212710583672874