In [1]:
%config Completer.use_jedi = False

In [2]:
import os
import pandas as pd

import pandas as pd
from collections import Counter
import matplotlib.pyplot as plt

from sklearn.compose import ColumnTransformer
from sklearn.datasets import fetch_openml
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV

In [3]:
data_path = './store-sales-time-series-forecasting/'
datas = os.listdir(data_path)
datas

['holidays_events.csv',
 'oil.csv',
 'sample_submission.csv',
 'stores.csv',
 'test.csv',
 'train.csv',
 'transactions.csv']

In [4]:
holidays = pd.read_csv(data_path +'holidays_events.csv').drop('transferred', axis = 1)

In [5]:
holidays

Unnamed: 0,date,type,locale,locale_name,description
0,2012-03-02,Holiday,Local,Manta,Fundacion de Manta
1,2012-04-01,Holiday,Regional,Cotopaxi,Provincializacion de Cotopaxi
2,2012-04-12,Holiday,Local,Cuenca,Fundacion de Cuenca
3,2012-04-14,Holiday,Local,Libertad,Cantonizacion de Libertad
4,2012-04-21,Holiday,Local,Riobamba,Cantonizacion de Riobamba
...,...,...,...,...,...
345,2017-12-22,Additional,National,Ecuador,Navidad-3
346,2017-12-23,Additional,National,Ecuador,Navidad-2
347,2017-12-24,Additional,National,Ecuador,Navidad-1
348,2017-12-25,Holiday,National,Ecuador,Navidad


In [6]:
train = pd.read_csv(data_path + 'train.csv', usecols=['date', 'store_nbr', 'family', 'sales', 'onpromotion'])

In [7]:
oil = pd.read_csv(data_path + 'oil.csv')

In [8]:
stores = pd.read_csv(data_path + 'stores.csv')

In [9]:
transactions = pd.read_csv(data_path + 'transactions.csv')

In [10]:
train

Unnamed: 0,date,store_nbr,family,sales,onpromotion
0,2013-01-01,1,AUTOMOTIVE,0.000,0
1,2013-01-01,1,BABY CARE,0.000,0
2,2013-01-01,1,BEAUTY,0.000,0
3,2013-01-01,1,BEVERAGES,0.000,0
4,2013-01-01,1,BOOKS,0.000,0
...,...,...,...,...,...
3000883,2017-08-15,9,POULTRY,438.133,0
3000884,2017-08-15,9,PREPARED FOODS,154.553,1
3000885,2017-08-15,9,PRODUCE,2419.729,148
3000886,2017-08-15,9,SCHOOL AND OFFICE SUPPLIES,121.000,8


In [11]:
oil.head()

Unnamed: 0,date,dcoilwtico
0,2013-01-01,
1,2013-01-02,93.14
2,2013-01-03,92.97
3,2013-01-04,93.12
4,2013-01-07,93.2


In [12]:
stores

Unnamed: 0,store_nbr,city,state,type,cluster
0,1,Quito,Pichincha,D,13
1,2,Quito,Pichincha,D,13
2,3,Quito,Pichincha,D,8
3,4,Quito,Pichincha,D,9
4,5,Santo Domingo,Santo Domingo de los Tsachilas,D,4
5,6,Quito,Pichincha,D,13
6,7,Quito,Pichincha,D,8
7,8,Quito,Pichincha,D,8
8,9,Quito,Pichincha,B,6
9,10,Quito,Pichincha,C,15


In [13]:
transactions.head()

Unnamed: 0,date,store_nbr,transactions
0,2013-01-01,25,770
1,2013-01-02,1,2111
2,2013-01-02,2,2358
3,2013-01-02,3,3487
4,2013-01-02,4,1922


In [14]:
pd.merge(oil, transactions)

Unnamed: 0,date,dcoilwtico,store_nbr,transactions
0,2013-01-01,,25,770
1,2013-01-02,93.14,1,2111
2,2013-01-02,93.14,2,2358
3,2013-01-02,93.14,3,3487
4,2013-01-02,93.14,4,1922
...,...,...,...,...
59646,2017-08-15,47.57,50,2804
59647,2017-08-15,47.57,51,1573
59648,2017-08-15,47.57,52,2255
59649,2017-08-15,47.57,53,932


In [15]:
X = pd.merge(pd.merge(pd.merge(train, oil, how='outer'), transactions, how='outer'), stores)

In [16]:
X.head()

Unnamed: 0,date,store_nbr,family,sales,onpromotion,dcoilwtico,transactions,city,state,type,cluster
0,2013-01-01,1.0,AUTOMOTIVE,0.0,0.0,,,Quito,Pichincha,D,13
1,2013-01-01,1.0,BABY CARE,0.0,0.0,,,Quito,Pichincha,D,13
2,2013-01-01,1.0,BEAUTY,0.0,0.0,,,Quito,Pichincha,D,13
3,2013-01-01,1.0,BEVERAGES,0.0,0.0,,,Quito,Pichincha,D,13
4,2013-01-01,1.0,BOOKS,0.0,0.0,,,Quito,Pichincha,D,13


In [45]:
categorical_clumns = ['store_nbr', 'family', 'city', 'state', 'type', 'cluster']
numerical_columns = ['sales', 'onpromotion']

In [46]:
categorical_encoder = OneHotEncoder(handle_unknown='ignore')
oil_pipe = Pipeline([
    ("imputer_dcoilwtico", SimpleImputer(strategy="constant", fill_value = 93.14)),
    ("scaler", StandardScaler())])
transactions_pipe = Pipeline([
    ("imputer_transactions", SimpleImputer(strategy="constant", fill_value = 0)),
    ("scaler", StandardScaler())])


In [48]:
preprocessing = ColumnTransformer(
    [
        ("oil_pipe", oil_pipe, ['dcoilwtico']),
        ("transactions_pipe", transactions_pipe, ['transactions']),
        ("scaler", StandardScaler(), numerical_columns),
        ("cat", categorical_encoder, categorical_clumns),

    ])

In [49]:
X_pre = preprocessing.fit_transform(X.iloc[:, 1:])

In [50]:
X_pre

<3000888x151 sparse matrix of type '<class 'numpy.float64'>'
	with 30008880 stored elements in Compressed Sparse Row format>

In [54]:
X_pre.toarray()[1000]

array([ 0.90197136,  0.14631015, -0.21486051, -0.21301217,  1.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  1.        ,  0.  