# 0.0 Imports

In [22]:
import pandas as pd
import numpy  as np

import inflection
import math
import datetime
import random

import seaborn as sns
import xgboost as xgb

#from boruta import BorutaPy
#from scipy import stats
from matplotlib import pyplot as plt
from matplotlib.gridspec import GridSpec
from IPython.core.display import HTML

from sklearn.preprocessing import RobustScaler, MinMaxScaler, LabelEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression, Lasso

## 0.1 Helper functions

In [23]:
def rename_columns(dataframe):
    """Return a copy of ``dataframe`` whose column labels are snake_case.

    Every label is passed through ``inflection.titleize``, has its spaces
    removed, and is finally passed to ``inflection.underscore``. The input
    frame itself is left untouched because the work is done on a copy.
    """
    def to_snake(label):
        # Same three steps the labels always went through, applied per label.
        titled = inflection.titleize(label)
        return inflection.underscore(titled.replace(" ", ""))

    renamed = dataframe.copy()
    renamed.columns = [to_snake(label) for label in renamed.columns]
    return renamed

## 0.2 Loading data

In [3]:
# Read both raw CSV files with the same options: first the train file, then the store file.
df_raw_1, df_raw_2 = (
    pd.read_csv(csv_path, low_memory=False)
    for csv_path in ('../data/train.csv', '../data/store.csv')
)

In [4]:
# Merge the two raw frames: df_raw_1 (train.csv) has the sales records per store
# and date, df_raw_2 (store.csv) has the information for each store. The left
# join on 'Store' keeps every row of df_raw_1.
# validate='many_to_one' makes pandas raise MergeError if 'Store' is not unique
# in df_raw_2, instead of silently duplicating sales rows in the result.
df_raw = pd.merge(df_raw_1, df_raw_2, how='left', on='Store', validate='many_to_one')

# 1.0 Data description

In [71]:
# Work on a copy so the changes made to df1 in this section do not touch df_raw
df1 = df_raw.copy()

## 1.1 Rename columns

In [72]:
# Normalise the column labels with rename_columns, the helper defined in section 0.1
df1 = df1.pipe(rename_columns)

## 1.2 Data dimensions

In [73]:
# df1.shape is (rows, cols); report each dimension on its own line
print(f'Number of rows: {df1.shape[0]}')
print(f'Number of cols: {df1.shape[1]}')

Number of rows: 1017209
Number of cols: 18


## 1.3 Data types

In [74]:
# Show the dtype pandas assigned to each column of df1
df1.dtypes

store                             int64
day_of_week                       int64
date                             object
sales                             int64
customers                         int64
open                              int64
promo                             int64
state_holiday                    object
school_holiday                    int64
store_type                       object
assortment                       object
competition_distance            float64
competition_open_since_month    float64
competition_open_since_year     float64
promo2                            int64
promo2_since_week               float64
promo2_since_year               float64
promo_interval                   object
dtype: object

In [75]:
# Convert the 'date' column (dtype object in the listing above) to datetime
df1['date'] = pd.to_datetime(df1['date'])

## 1.4 Check NA

In [76]:
# Number of missing values in each column
df1.isna().sum()

store                                0
day_of_week                          0
date                                 0
sales                                0
customers                            0
open                                 0
promo                                0
state_holiday                        0
school_holiday                       0
store_type                           0
assortment                           0
competition_distance              2642
competition_open_since_month    323348
competition_open_since_year     323348
promo2                               0
promo2_since_week               508031
promo2_since_year               508031
promo_interval                  508031
dtype: int64

In [81]:
# Fraction of missing values in each column
# NOTE(review): this cell's execution count (81) is higher than that of the
# imputation cell in section 1.5 (80), so the output below appears to have been
# captured after the competition_* columns were already filled. That would
# explain why they show 0.0 here although the counts in the previous output are
# non-zero. Restart the kernel and run all cells in order to refresh it.
df1.isna().mean()

store                           0.000000
day_of_week                     0.000000
date                            0.000000
sales                           0.000000
customers                       0.000000
open                            0.000000
promo                           0.000000
state_holiday                   0.000000
school_holiday                  0.000000
store_type                      0.000000
assortment                      0.000000
competition_distance            0.000000
competition_open_since_month    0.000000
competition_open_since_year     0.000000
promo2                          0.000000
promo2_since_week               0.499436
promo2_since_year               0.499436
promo_interval                  0.499436
dtype: float64

## 1.5 Fill out NA

In [80]:
# Impute the missing values of the competition columns.
# Each fill is a vectorised Series.fillna; a row-wise DataFrame.apply(axis=1)
# would call a Python function once per row of df1.

# competition_distance - NaN is replaced with a placeholder distance of 200000
# NOTE(review): presumably NaN means there is no known competitor - confirm
df1['competition_distance'] = df1['competition_distance'].fillna(200000)

# competition_open_since_month - NaN is replaced with the month of the row's own date
df1['competition_open_since_month'] = df1['competition_open_since_month'].fillna(df1['date'].dt.month)

# competition_open_since_year - NaN is replaced with the year of the row's own date
df1['competition_open_since_year'] = df1['competition_open_since_year'].fillna(df1['date'].dt.year)

# TODO: these columns have missing values in the NA check above and are not imputed yet
# promo2_since_week
# promo2_since_year
# promo_interval



In [None]:
# Scratch check: year of the 'date' value in the row labelled 0
df1.loc[0, 'date'].year

## 1.6 Change Types

## 1.7 Descriptive statistics

### 1.7.1 Numerical attributes

### 1.7.2 Categorical attributes

# 2.0 Feature engineering

## 2.1 Mapa mental de Hipóteses

## 2.2 Criação de Hipóteses

## 2.3 Lista final de Hipóteses

## 2.4 Feature engineering

# 3.0 Feature filtering

## 3.1 Filtragem das linhas

## 3.2 Seleção das colunas

# 4.0 EDA - Análise exploratória dos dados

## 4.1 Análise univariada

### 4.1.1 Response variable

### 4.1.2 Numerical variable

### 4.1.3 Categorical variable

## 4.2 Análise bivariada

### 4.2.x Hypothesis validation

### 4.2.y Hypothesis summary

## 4.3 Análise multivariada

### 4.3.1 Numerical attributes

### 4.3.2 Categorical attributes

# 5.0 Data Preparation

## 5.1 Normalization

## 5.2 Rescaling

## 5.3 Transformation

### 5.3.1 Encoding

### 5.3.2 Response variable transformation

### 5.3.3 Cyclical features encoding

# 6.0 Feature selection

## 6.1 Splitting dataframe

## 6.2 Boruta as Feature Selector

## 6.3 Manual feature selection

# 7.0 Machine Learning Modelling

## 7.1 Baseline model

## 7.2 Linear regression model

### 7.2.1 Linear regression model - Cross validation

## 7.3 Random Forest 

### 7.3.1 Random Forest - Cross validation

## 7.5 XGBoost Regressor

### 7.5.1 XGBoost Regressor - Cross Validation

## 7.6 Compare Model's Performance


### 7.6.1 Single performance

### 7.6.2 Real performance - Cross validation

# 8.0 Hyperparameter Fine Tuning

## 8.1 Random search

## 8.2 Final model