In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import holidays

In [2]:
dftrain = pd.read_csv("../data/train.csv")
dftest = pd.read_csv("../data/test.csv")
dftrain0 = dftrain
dftest0 = dftest

In [3]:
dftrain.columns

Index(['row_id', 'date', 'country', 'store', 'product', 'num_sold'], dtype='object')

In [4]:
print(dftrain.isna().sum())
print(dftest.isna().sum())

row_id      0
date        0
country     0
store       0
product     0
num_sold    0
dtype: int64
row_id     0
date       0
country    0
store      0
product    0
dtype: int64


In [5]:
dftrain.drop(['row_id'],axis=1,inplace=True)
dftest.drop(['row_id'],axis=1,inplace=True)

In [6]:
dftrain.head()

Unnamed: 0,date,country,store,product,num_sold
0,2015-01-01,Finland,KaggleMart,Kaggle Mug,329
1,2015-01-01,Finland,KaggleMart,Kaggle Hat,520
2,2015-01-01,Finland,KaggleMart,Kaggle Sticker,146
3,2015-01-01,Finland,KaggleRama,Kaggle Mug,572
4,2015-01-01,Finland,KaggleRama,Kaggle Hat,911


In [7]:
dftest.head()

Unnamed: 0,date,country,store,product
0,2019-01-01,Finland,KaggleMart,Kaggle Mug
1,2019-01-01,Finland,KaggleMart,Kaggle Hat
2,2019-01-01,Finland,KaggleMart,Kaggle Sticker
3,2019-01-01,Finland,KaggleRama,Kaggle Mug
4,2019-01-01,Finland,KaggleRama,Kaggle Hat


In [8]:
dftrain['date']=pd.to_datetime(dftrain['date'])
dftest['date']=pd.to_datetime(dftest['date'])

In [9]:
dftrain['weekday']=dftrain['date'].dt.day_name()
dftest['weekday']=dftest['date'].dt.day_name()

In [10]:
dftrain.head()

Unnamed: 0,date,country,store,product,num_sold,weekday
0,2015-01-01,Finland,KaggleMart,Kaggle Mug,329,Thursday
1,2015-01-01,Finland,KaggleMart,Kaggle Hat,520,Thursday
2,2015-01-01,Finland,KaggleMart,Kaggle Sticker,146,Thursday
3,2015-01-01,Finland,KaggleRama,Kaggle Mug,572,Thursday
4,2015-01-01,Finland,KaggleRama,Kaggle Hat,911,Thursday


In [11]:
dftrain['year']=dftrain['date'].dt.year
dftest['year']=dftest['date'].dt.year
dftrain['weekofyear']=dftrain['date'].dt.isocalendar().week
dftest['weekofyear']=dftest['date'].dt.isocalendar().week
dftrain['month']=dftrain['date'].dt.month
dftest['month']=dftest['date'].dt.month
dftrain['day_of_month']=dftrain['date'].dt.day
dftest['day_of_month']=dftest['date'].dt.day
dftrain['day_of_year']=dftrain['date'].dt.day_of_year
dftest['day_of_year']=dftest['date'].dt.day_of_year

In [12]:
dftest['is_weekend'] = dftest['weekday'].isin(['Saturday','Sunday'])
dftrain['is_weekend'] = dftrain['weekday'].isin(['Saturday','Sunday'])

In [13]:
traincountry = pd.get_dummies(dftrain['country'],prefix='country_')
testcountry = pd.get_dummies(dftest['country'],prefix='country_')
trainstore = pd.get_dummies(dftrain['store'],prefix='store_')
teststore = pd.get_dummies(dftest['store'],prefix='store_')
trainproduct = pd.get_dummies(dftrain['product'],prefix='product_')
testproduct = pd.get_dummies(dftest['product'],prefix='product_')
trainweekday = pd.get_dummies(dftrain['weekday'])
testweekday = pd.get_dummies(dftest['weekday'])
trainyear = pd.get_dummies(dftrain['year'],prefix='year_')
testyear = pd.get_dummies(dftest['year'],prefix='year_')
trainweekofyear = pd.get_dummies(dftrain['weekofyear'],prefix='weekofyear_')
testweekofyear = pd.get_dummies(dftest['weekofyear'],prefix='weekofyear_')
trainmonth = pd.get_dummies(dftrain['month'],prefix='month_')
testmonth = pd.get_dummies(dftest['month'],prefix='month_')
trainday_of_month = pd.get_dummies(dftrain['day_of_month'],prefix='day_of_month_')
testday_of_month = pd.get_dummies(dftest['day_of_month'],prefix='day_of_month_')
trainday_of_year = pd.get_dummies(dftrain['day_of_year'],prefix='day_of_year_')
testday_of_year = pd.get_dummies(dftest['day_of_year'],prefix='day_of_year_')


dftrain = pd.merge(left=dftrain,right=traincountry,
    left_index=True,right_index=True)
dftest = pd.merge(left=dftest,right=testcountry,
    left_index=True,right_index=True)

dftrain = pd.merge(left=dftrain,right=trainstore,
    left_index=True,right_index=True)
dftest = pd.merge(left=dftest,right=testcountry,
    left_index=True,right_index=True)

dftrain = pd.merge(left=dftrain,right=trainproduct,
    left_index=True,right_index=True)
dftest = pd.merge(left=dftest,right=testproduct,
    left_index=True,right_index=True)

dftrain = pd.merge(left=dftrain,right=trainweekday,
    left_index=True,right_index=True)
dftest = pd.merge(left=dftest,right=testweekday,
    left_index=True,right_index=True)

dftrain = pd.merge(left=dftrain,right=trainyear,
    left_index=True,right_index=True)
dftest = pd.merge(left=dftest,right=testyear,
    left_index=True,right_index=True)

dftrain = pd.merge(left=dftrain,right=trainweekofyear,
    left_index=True,right_index=True)
dftest = pd.merge(left=dftest,right=testweekofyear,
    left_index=True,right_index=True)

dftrain = pd.merge(left=dftrain,right=trainmonth,
    left_index=True,right_index=True)
dftest = pd.merge(left=dftest,right=testmonth,
    left_index=True,right_index=True)

dftrain = pd.merge(left=dftrain,right=trainday_of_month,
    left_index=True,right_index=True)
dftest = pd.merge(left=dftest,right=testday_of_month,
    left_index=True,right_index=True)

dftrain = pd.merge(left=dftrain,right=trainday_of_year,
    left_index=True,right_index=True)
dftest = pd.merge(left=dftest,right=testday_of_year,
    left_index=True,right_index=True)

In [14]:
dftrain.drop('country',axis=1,inplace=True)
dftest.drop('country',axis=1,inplace=True)
dftrain.drop('store',axis=1,inplace=True)
dftest.drop('store',axis=1,inplace=True)
dftrain.drop('product',axis=1,inplace=True)
dftest.drop('product',axis=1,inplace=True)
dftrain.drop('date',axis=1,inplace=True)
dftest.drop('date',axis=1,inplace=True)
dftrain.drop('weekday',axis=1,inplace=True)
dftest.drop('weekday',axis=1,inplace=True)
dftrain.drop('year',axis=1,inplace=True)
dftest.drop('year',axis=1,inplace=True)
dftrain.drop('weekofyear',axis=1,inplace=True)
dftest.drop('weekofyear',axis=1,inplace=True)
dftrain.drop('month',axis=1,inplace=True)
dftest.drop('month',axis=1,inplace=True)
dftrain.drop('day_of_month',axis=1,inplace=True)
dftest.drop('day_of_month',axis=1,inplace=True)
dftrain.drop('day_of_year',axis=1,inplace=True)
dftest.drop('day_of_year',axis=1,inplace=True)

In [15]:
dftrain.head()

Unnamed: 0,num_sold,is_weekend,country__Finland,country__Norway,country__Sweden,store__KaggleMart,store__KaggleRama,product__Kaggle Hat,product__Kaggle Mug,product__Kaggle Sticker,...,day_of_year__357,day_of_year__358,day_of_year__359,day_of_year__360,day_of_year__361,day_of_year__362,day_of_year__363,day_of_year__364,day_of_year__365,day_of_year__366
0,329,False,1,0,0,1,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
1,520,False,1,0,0,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,146,False,1,0,0,1,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
3,572,False,1,0,0,0,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4,911,False,1,0,0,0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [16]:
dftest.columns

Index(['is_weekend', 'country__Finland_x', 'country__Norway_x',
       'country__Sweden_x', 'country__Finland_y', 'country__Norway_y',
       'country__Sweden_y', 'product__Kaggle Hat', 'product__Kaggle Mug',
       'product__Kaggle Sticker',
       ...
       'day_of_year__356', 'day_of_year__357', 'day_of_year__358',
       'day_of_year__359', 'day_of_year__360', 'day_of_year__361',
       'day_of_year__362', 'day_of_year__363', 'day_of_year__364',
       'day_of_year__365'],
      dtype='object', length=478)

In [17]:
dftest.head()

Unnamed: 0,is_weekend,country__Finland_x,country__Norway_x,country__Sweden_x,country__Finland_y,country__Norway_y,country__Sweden_y,product__Kaggle Hat,product__Kaggle Mug,product__Kaggle Sticker,...,day_of_year__356,day_of_year__357,day_of_year__358,day_of_year__359,day_of_year__360,day_of_year__361,day_of_year__362,day_of_year__363,day_of_year__364,day_of_year__365
0,False,1,0,0,1,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
1,False,1,0,0,1,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,False,1,0,0,1,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
3,False,1,0,0,1,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4,False,1,0,0,1,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [18]:
Y = dftrain["num_sold"]
X = dftrain.drop("num_sold",axis=1)

In [19]:
from sklearn.model_selection import train_test_split

In [20]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

In [21]:
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge

In [22]:
lr = LinearRegression()
ridge = Ridge()

In [23]:
lr.fit(X_train,y_train)
ridge.fit(X_train,y_train)

Ridge()

In [24]:
print(lr.score(X_test,y_test))
print(ridge.score(X_test,y_test))

0.8337993604366821
0.834495679614659


In [25]:
from sklearn import linear_model
reg = linear_model.BayesianRidge()
reg.fit(X_train,y_train)
print(reg.score(X_test,y_test))

0.8349257327895001


In [26]:
from sklearn import svm
svmreg = svm.SVR()
svmreg.fit(X_train,y_train)
print(svmreg.score(X_test,y_test))

0.6603111730219622


In [27]:
from sklearn.linear_model import SGDClassifier
sgdclf = SGDClassifier()
sgdclf.fit(X_train,y_train)
print(sgdclf.score(X_test,y_test))

0.004752851711026616


In [28]:
from sklearn.neighbors import NearestCentroid
knnclfcent = NearestCentroid()
knnclfcent.fit(X_train,y_train)
print(knnclfcent.score(X_test,y_test))

0.007414448669201521


In [29]:
X_train.head()

Unnamed: 0,is_weekend,country__Finland,country__Norway,country__Sweden,store__KaggleMart,store__KaggleRama,product__Kaggle Hat,product__Kaggle Mug,product__Kaggle Sticker,Friday,...,day_of_year__357,day_of_year__358,day_of_year__359,day_of_year__360,day_of_year__361,day_of_year__362,day_of_year__363,day_of_year__364,day_of_year__365,day_of_year__366
25456,False,1,0,0,0,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
14983,False,0,1,0,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
18009,False,0,1,0,0,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
22326,False,0,1,0,1,0,0,1,0,1,...,0,0,0,0,0,0,0,0,0,0
10114,False,0,0,1,0,1,1,0,0,1,...,0,0,0,0,0,0,0,0,0,0


In [30]:
dftrain0.shape

(26298, 12)

In [31]:
dftrain.shape

(26298, 483)

In [32]:
X_train.shape

(21038, 482)

In [34]:
#dftrain.to_csv('featurehotencoded_train.csv', index=False)
#dftest.to_csv('featurehotencoded_test.csv', index=False)

**Grid Searches to be done**