In [2]:
import pandas as pd             
import numpy as np

import seaborn as sns
sns.set(style="white", color_codes=True)
sns.set_context(rc={"font.family":'sans',"font.size":24,"axes.titlesize":24,"axes.labelsize":24})   

import matplotlib.pyplot as plt
%matplotlib inline

import warnings 
warnings.filterwarnings("ignore")

import sklearn
from sklearn import linear_model
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2, f_regression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import RFE
from sklearn.metrics import mean_squared_error, r2_score

#from aux_fun import evaluate

import csv

In [3]:
# Load the training data from the CSV stored next to the notebook.
train_csv = './data/train_imputed.csv'
df = pd.read_csv(train_csv)

In [4]:
# Sanity check: (rows, columns) of the loaded frame.
df.shape

(523021, 47)

In [5]:
# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0,StoreID,IsHoliday,IsOpen,HasPromotions,NearestCompetitor,Region,NumberOfCustomers,NumberOfSales,Region_AreaKM2,Region_GDP,...,Day,Month,Year,Hyper_Market,Shopping_Center,Standard_Market,Super_Market,General,With_Fish_Department,With_Non-Food_Department
0,1000,0,1,0,326,7,495,5676,9643,17130,...,3,1,2016,1,0,0,0,1,0,0
1,1000,0,1,0,326,7,608,8111,9643,17130,...,3,2,2016,1,0,0,0,1,0,0
2,1000,0,1,0,326,7,665,8300,9643,17130,...,3,4,2016,1,0,0,0,1,0,0
3,1000,0,1,0,326,7,630,7154,9643,17130,...,3,5,2016,1,0,0,0,1,0,0
4,1000,0,0,0,326,7,0,0,9643,17130,...,3,6,2016,1,0,0,0,1,0,0


In [6]:
# List every column name available for modelling.
df.columns

Index(['StoreID', 'IsHoliday', 'IsOpen', 'HasPromotions', 'NearestCompetitor',
       'Region', 'NumberOfCustomers', 'NumberOfSales', 'Region_AreaKM2',
       'Region_GDP', 'Region_PopulationK', 'Max_Dew_PointC',
       'Max_Gust_SpeedKm_h', 'Max_Humidity', 'Max_Sea_Level_PressurehPa',
       'Max_TemperatureC', 'Max_Wind_SpeedKm_h', 'Mean_Dew_PointC',
       'Mean_Humidity', 'Mean_Sea_Level_PressurehPa', 'Mean_TemperatureC',
       'Mean_Wind_SpeedKm_h', 'Min_Dew_PointC', 'Min_Humidity',
       'Min_Sea_Level_PressurehPa', 'Min_TemperatureC', 'Precipitationmm',
       'WindDirDegrees', 'CloudCover', 'Fog', 'Hail', 'Rain', 'Snow',
       'Thunderstorm', 'Max_VisibilityKm', 'Min_VisibilitykM',
       'Mean_VisibilityKm', 'Day', 'Month', 'Year', 'Hyper_Market',
       'Shopping_Center', 'Standard_Market', 'Super_Market', 'General',
       'With_Fish_Department', 'With_Non-Food_Department'],
      dtype='object')

In [7]:
# Regression target: the sales column.
y = df.loc[:, 'NumberOfSales']

In [8]:
# Feature matrix: everything except the target and the wind-direction column.
# The labels are passed directly; the previous form sliced a two-column
# DataFrame only to have `drop` iterate over its column names.
X = df.drop(['NumberOfSales', 'WindDirDegrees'], axis=1)

# Train-Test Split

In [9]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
split = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split

# Feature Selection

## Recursive Feature Elimination (NOT WORKING)

In [10]:
# Recursive feature elimination around a plain linear regression, keeping 3 features.
# The feature count is passed by keyword so the call does not depend on
# positional-argument order.
# NOTE(review): the features are not standardized before ranking, which is
# presumably why only 0/1 store-type columns are selected below — consider
# scaling X_train first.
rfe = RFE(linear_model.LinearRegression(), n_features_to_select=3)
fit = rfe.fit(X_train, y_train)

In [11]:
# Boolean mask over the training columns: True where the selector kept the feature.
feature_mask = fit.get_support(indices=False)
feature_mask

array([False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False,  True, False,  True,  True, False, False, False], dtype=bool)

In [12]:
# Translate the boolean mask into the names of the retained columns.
feature_selected = X_train.columns[feature_mask]
feature_selected

Index(['Hyper_Market', 'Standard_Market', 'Super_Market'], dtype='object')

# Model Creation

## OLS

In [13]:
# Columns available to the models (target and WindDirDegrees already dropped).
X_train.columns

Index(['StoreID', 'IsHoliday', 'IsOpen', 'HasPromotions', 'NearestCompetitor',
       'Region', 'NumberOfCustomers', 'Region_AreaKM2', 'Region_GDP',
       'Region_PopulationK', 'Max_Dew_PointC', 'Max_Gust_SpeedKm_h',
       'Max_Humidity', 'Max_Sea_Level_PressurehPa', 'Max_TemperatureC',
       'Max_Wind_SpeedKm_h', 'Mean_Dew_PointC', 'Mean_Humidity',
       'Mean_Sea_Level_PressurehPa', 'Mean_TemperatureC',
       'Mean_Wind_SpeedKm_h', 'Min_Dew_PointC', 'Min_Humidity',
       'Min_Sea_Level_PressurehPa', 'Min_TemperatureC', 'Precipitationmm',
       'CloudCover', 'Fog', 'Hail', 'Rain', 'Snow', 'Thunderstorm',
       'Max_VisibilityKm', 'Min_VisibilitykM', 'Mean_VisibilityKm', 'Day',
       'Month', 'Year', 'Hyper_Market', 'Shopping_Center', 'Standard_Market',
       'Super_Market', 'General', 'With_Fish_Department',
       'With_Non-Food_Department'],
      dtype='object')

In [14]:
# Ordinary least squares model; fitted in the next cell.
regr = linear_model.LinearRegression()

In [15]:
# Single-feature OLS: sales explained by the number of customers only.
# Reshape the underlying NumPy array (Series.reshape is deprecated in pandas)
# into the (n_samples, 1) layout scikit-learn expects.
regr.fit(X_train['NumberOfCustomers'].values.reshape(-1, 1), y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [16]:
# Predict held-out sales from the customer count alone.
# Reshape the underlying NumPy array (Series.reshape is deprecated in pandas)
# into the (n_samples, 1) layout scikit-learn expects.
y_pred = regr.predict(X_test['NumberOfCustomers'].values.reshape(-1, 1))

In [17]:
# Slope of the single-feature regression.
print('Coefficients: \n', regr.coef_)
# The mean squared error on the held-out rows
print('Mean squared error: %.2f' % mean_squared_error(y_test, y_pred))

Coefficients: 
 [ 13.3448609]


## Lasso

In [18]:
# L1-regularised regression with a fixed penalty strength.
# NOTE(review): `normalize` is not accepted by recent scikit-learn releases —
# confirm the installed version before upgrading.
lasso = linear_model.Lasso(alpha=0.01, normalize=True)

In [19]:
# Fit the Lasso model on all training features.
lasso.fit(X_train, y_train)

Lasso(alpha=0.01, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=True, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False)

In [20]:
# Predict sales for the held-out rows with the fitted Lasso model.
y_pred_lasso = lasso.predict(X_test)

In [21]:
# Inspect the raw predictions; they are not clipped, so negative values can appear.
y_pred_lasso

array([ 4154.18319127,   -82.18713212,  4772.11054264, ...,  6060.33915427,
        6333.42289979,  -353.03471754])

## Lasso CV

In [22]:
# Candidate penalty strengths for the cross-validated Lasso, one per decade.
alphas = [1e-5, 1e-4, 1e-3, 1e-2, 1e-1]


In [23]:
# Lasso with the penalty chosen by 10-fold cross-validation over `alphas`.
# NOTE(review): eps / n_alphas presumably only matter when the alpha grid is
# generated automatically; with an explicit `alphas` list they look redundant.
lassocv = linear_model.LassoCV(
    alphas=alphas,
    eps=0.001,
    n_alphas=100,
    fit_intercept=True,
    normalize=False,
    max_iter=10000,
    cv=10,
)

In [24]:
# Run the cross-validated fit on all training features.
lassocv.fit(X_train, y_train)

LassoCV(alphas=[1e-05, 0.0001, 0.001, 0.01, 0.1], copy_X=True, cv=10,
    eps=0.001, fit_intercept=True, max_iter=10000, n_alphas=100, n_jobs=1,
    normalize=False, positive=False, precompute='auto', random_state=None,
    selection='cyclic', tol=0.0001, verbose=False)

In [25]:
# Predict sales for the held-out rows with the cross-validated Lasso.
y_pred_lassocv = lassocv.predict(X_test)

In [26]:
# Penalty strength selected by cross-validation.
lassocv.alpha_

0.01

# Creation of the submission 

In [27]:
# Monthly sales per store predicted by the single-feature linear regression.
# NOTE(review): Year is not part of the grouping key, so the same month of
# different years is pooled — confirm this is what the evaluation expects.
result = (
    X_test[['StoreID', 'Month']]
    .assign(NumberOfSales=y_pred)
    .groupby(['StoreID', 'Month'], as_index=False)['NumberOfSales']
    .sum()
)

In [28]:
# Monthly sales per store predicted by the fixed-alpha Lasso.
# NOTE(review): Year is not part of the grouping key, so the same month of
# different years is pooled — confirm this is what the evaluation expects.
result_lasso = (
    X_test[['StoreID', 'Month']]
    .assign(NumberOfSales=y_pred_lasso)
    .groupby(['StoreID', 'Month'], as_index=False)['NumberOfSales']
    .sum()
)

In [29]:
# Monthly sales per store predicted by the cross-validated Lasso.
# NOTE(review): Year is not part of the grouping key, so the same month of
# different years is pooled — confirm this is what the evaluation expects.
result_lassocv = (
    X_test[['StoreID', 'Month']]
    .assign(NumberOfSales=y_pred_lassocv)
    .groupby(['StoreID', 'Month'], as_index=False)['NumberOfSales']
    .sum()
)

In [30]:
# Attach the true sales to the held-out rows for the evaluation step.
# From here on X_test carries the target column, so the predict cells above
# must not be re-run against it without redoing the train/test split.
X_test = X_test.assign(NumberOfSales=y_test)

In [31]:
# Confirm the true NumberOfSales column is now the last column of the held-out frame.
X_test.head()

Unnamed: 0,StoreID,IsHoliday,IsOpen,HasPromotions,NearestCompetitor,Region,NumberOfCustomers,Region_AreaKM2,Region_GDP,Region_PopulationK,...,Month,Year,Hyper_Market,Shopping_Center,Standard_Market,Super_Market,General,With_Fish_Department,With_Non-Food_Department,NumberOfSales
199480,1284,0,1,1,1145,6,245,16327,15931,4911,...,2,2017,1,0,0,0,1,0,0,3452
492279,1705,0,0,0,3482,2,0,32221,16186,5727,...,4,2016,1,0,0,0,1,0,0,0
500907,1717,0,1,1,3029,2,252,32221,16186,5727,...,6,2017,1,0,0,0,0,0,1,4679
63763,1091,0,1,0,7337,9,351,15566,15017,8146,...,1,2016,1,0,0,0,0,0,1,5232
74453,1107,0,0,0,22406,1,0,7385,9893,1018,...,10,2016,0,0,1,0,0,0,1,0


### Provided error function

In [32]:
# NOTE(review): this import raised ImportError when the notebook was last run
# (see the recorded output), so `evaluate` is undefined for the cells that call
# it — confirm that aux_fun.py actually exposes `evaluate`.
from aux_fun import evaluate

ImportError: cannot import name 'evaluate'

In [None]:
# Check which columns the held-out frame carries before passing it to evaluate().
X_test.columns

In [None]:
# Score each model's monthly aggregate against the held-out rows,
# in the order: linear regression, Lasso, Lasso CV.
eval_lr, eval_lasso, eval_lassocv = (
    evaluate(X_test, monthly)
    for monthly in (result, result_lasso, result_lassocv)
)

In [None]:
# Report the evaluation error of each model, one line per model.
for label, score in (
    ('Linear Regression Error on Number of Customers: ', eval_lr),
    ('Lasso Error: ', eval_lasso),
    ('Lasso CV Error: ', eval_lassocv),
):
    print(label, score)

### R2

In [None]:
# Coefficient of determination on the held-out rows,
# in the order: linear regression, Lasso, Lasso CV.
r2_lr, r2_lasso, r2_lassocv = (
    r2_score(y_test, predictions)
    for predictions in (y_pred, y_pred_lasso, y_pred_lassocv)
)

In [None]:
# Report the R2 of each model, one line per model.
for label, score in (
    ('Linear Regression R2: ', r2_lr),
    ('Lasso R2: ', r2_lasso),
    ('Lasso CV R2: ', r2_lassocv),
):
    print(label, score)