In [4]:
#!pip install sklearn
import pandas as pd
import inflection
import math
import numpy as np
import datetime
df_sales_raw = pd.read_csv('train.csv', low_memory=False)
df_store_raw = pd.read_csv('store.csv', low_memory=False)
df_raw = pd.merge(df_sales_raw, df_store_raw, how='left', on='Store')
df1 = df_raw.copy()
cols_old = ['Store', 'DayOfWeek', 'Date', 'Sales', 'Customers', 'Open', 'Promo',
            'StateHoliday', 'SchoolHoliday', 'StoreType', 'Assortment',
            'CompetitionDistance', 'CompetitionOpenSinceMonth',
            'CompetitionOpenSinceYear', 'Promo2', 'Promo2SinceWeek',
            'Promo2SinceYear', 'PromoInterval']
snakecase = lambda x: inflection.underscore(x)
cols_new = list(map(snakecase, cols_old))
df1.columns = cols_new
df1['date'] = pd.to_datetime(df1['date'])
df1['competition_distance'] = df1['competition_distance'].apply(lambda x: 200000.0 if math.isnan(x) else x)
df1['competition_open_since_month'] = df1.apply(lambda x: x['date'].month if math.isnan(x['competition_open_since_month']) else x['competition_open_since_month'], axis=1)
df1['competition_open_since_year'] = df1.apply(lambda x: x['date'].year if math.isnan(x['competition_open_since_year']) else x['competition_open_since_year'], axis=1)
df1['promo2_since_week'] = df1.apply(lambda x: x['date'].week if math.isnan(x['promo2_since_week']) else x['promo2_since_week'], axis=1)
df1['promo2_since_year'] = df1.apply(lambda x: x['date'].year if math.isnan(x['promo2_since_year']) else x['promo2_since_year'], axis=1)
month_map = {1:'Jan', 2:'Feb', 3:'Mar', 4:'Apr', 5:'May', 6:'Jun', 7:'Jul', 8:'Aug', 9:'Sep', 10:'Oct', 11:'Nov', 12:'Dec'}
df1['promo_interval'] = df1['promo_interval'].fillna(0)
df1['month_map'] = df1['date'].dt.month.map(month_map)
df1['is_promo'] = df1[['promo_interval', 'month_map']].apply(lambda x: 0 if x['promo_interval'] == 0 else 1 if x['month_map'] in x['promo_interval'].split(',') else 0, axis=1)
df1['competition_open_since_month'] = df1['competition_open_since_month'].astype(int)
df1['competition_open_since_year'] = df1['competition_open_since_year'].astype(int)
df1['promo2_since_week'] = df1['promo2_since_week'].astype(int)
df1['promo2_since_year'] = df1['promo2_since_year'].astype(int)
df2 = df1.copy()
df2['year'] = df2['date'].dt.year
df2['month'] = df2['date'].dt.month
df2['day'] = df2['date'].dt.day
df2['week_of_year'] = df2['date'].dt.isocalendar().week
df2['year_week'] = df2['date'].dt.strftime('%Y-%W')
df2['competition_since'] = df2.apply(lambda x: datetime.datetime(year=x['competition_open_since_year'], month=x['competition_open_since_month'], day=1), axis=1)
df2['competition_time_month'] = ((df2['date'] - df2['competition_since']) / 30).apply(lambda x: x.days).astype(int)
df2['competition_since'] = df2.apply( lambda x: datetime.datetime(year=x['competition_open_since_year'],month=x['competition_open_since_month'],day=1 ), axis=1 )
df2['competition_time_month'] = ( ( df2['date'] - df2['competition_since'] )/30).apply( lambda x: x.days ).astype( int )
df2['promo_since'] = df2['promo2_since_year'].astype( str ) + '-' + df2['promo2_since_week'].astype( str )
df2['promo_since'] = df2['promo_since'].apply( lambda x: datetime.datetime.strptime( x + '-1', '%Y-%W-%w' ) - datetime.timedelta( days=7 ) )
df2['promo_time_week'] = ( ( df2['date'] - df2['promo_since'] )/7 ).apply(lambda x: x.days ).astype( int )
df2['assortment'] = df2['assortment'].apply( lambda x: 'basic' if x == 'a' else 'extra' if x == 'b' else 'extended' )
df2['state_holiday'] = df2['state_holiday'].apply( lambda x: 'public_holiday' if x == 'a' else 'easter_holiday' if x == 'b' else 'christmas' if x == 'c' else 'regular_day' )
df3 = df2.copy()
df3
df3 = df3[(df3['open'] != 0) & (df3['sales'] > 0)]
cols_drop = ['customers','open','promo_interval','month_map']
df3 = df3.drop(cols_drop,axis=1)
df3.columns
num_attributes = df3.select_dtypes(include=['int64', 'float64'])
cat_attributes = df3.select_dtypes(exclude=['int64','float64','datetime64[ns]'])
df4 = df3.copy()
aux1 = df4.select_dtypes(include = ['int64','float64'])
from sklearn.preprocessing import RobustScaler,MinMaxScaler
rs = RobustScaler()
df4['competition_distance'] = rs.fit_transform(df4[['competition_distance']].values)
rs1 = RobustScaler()
df4['competition_time_month'] = rs1.fit_transform(df4[['competition_time_month']].values)
mms = MinMaxScaler()
df4['promo_time_week'] = mms.fit_transform(df4[['promo_time_week']].values)
mms1 = MinMaxScaler()
df4['year'] = mms1.fit_transform(df4[['year']].values)
df4 = pd.get_dummies(df4, prefix=['state_holiday'], columns=['state_holiday'], dtype=int)
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
df4['store_type'] = le.fit_transform(df4['store_type'])
assortment_dict = {'basic': 1, 'extra': 2, 'extended': 3}
df4['assortment'] = df4['assortment'].map(assortment_dict)
df4['month_sin'] = df4['month'].apply(lambda x: np.sin(x * (2. * np.pi/12)))
df4['sales'] = np.log1p(df4['sales'])
df4['month_cos'] = df4['month'].apply(lambda x: np.cos(x *(2. * np.pi/12)))
df4['day_sin'] = df4['day'].apply(lambda x: np.sin(x * (2. * np.pi/30)))
df4['day_cos'] = df4['day'].apply(lambda x: np.cos(x *(2. * np.pi/30)))
df4['week_of_year_sin'] = df4['week_of_year'].apply(lambda x: np.sin(x * (2. * np.pi/52)))
df4['week_of_year_cos'] = df4['week_of_year'].apply(lambda x: np.cos(x *(2. * np.pi/52)))
df4['day_of_week_sin'] = df4['day_of_week'].apply(lambda x: np.sin(x * (2. * np.pi/7)))
df4['day_of_week_cos'] = df4['day_of_week'].apply(lambda x: np.cos(x *(2. * np.pi/7)))
df6 = df4.copy()
cols_drop = ['week_of_year','day','month','day_of_week','promo_since','competition_since','year_week']
df6 = df6.drop(cols_drop,axis=1)
X_train = df6[df6['date'] < '2015-06-19']
y_train = X_train['sales']
X_test = df6[df6['date'] >= '2015-06-19']
y_test = X_test['sales']
from boruta import BorutaPy
from sklearn.ensemble import RandomForestClassifier
X_train_n = X_train.drop(['date', 'sales'], axis=1).values
y_train_n = y_train.values.ravel()
# Criar um classificador RandomForest simples com menos estimadores
#rf = RandomForestClassifier(n_estimators=10, max_depth=10, random_state=42)
# Passar o RandomForestClassifier para o BorutaPy
#boruta = BorutaPy(rf, n_estimators=10, random_state=20)

#boruta.fit(X_train_n, y_train_n)
cols_selected_borutaa = df6[['store','promo','store_type','assortment','competition_distance','competition_open_since_month','competition_open_since_year','promo2','promo2_since_week','promo2_since_year','competition_time_month','promo_time_week','day_of_week_sin','day_of_week_cos','month_cos','month_sin','day_sin','day_cos','week_of_year_cos','week_of_year_sin']]
cols_selected_borutaa
cols_selected_boruta = cols_selected_borutaa.columns.tolist()
#cols_selected_boruta.extend(['date', 'sales'])



# ML

In [None]:
# classificação e regressão e series temporais 
#deep leraning é classificação
quando quer fazer previsão isso se chama predição

In [3]:
# nao supervisionado - Clusterização
# agrupamento/clusterização
!pip install pandas



In [None]:
# semi-supervisionado
#ação     > <       ambiente

#ação faz ação e recebe uma recompensa
#e depois de um tempo quando ele só recebe recompensa positiva ele só faz aquela ação
EX: igual capa de serie de netflix
quando vc clica é a recompensa e ai acaba que vai aprendendo em qual vc mais clicka é a melhor


# machine learning moddeling

In [5]:
#selecionando as colunas mais relevantes para xtrain e xtest

x_train = X_train[ cols_selected_boruta ]
x_test = X_test[ cols_selected_boruta ]

## 1 Modelo de média

In [None]:
para ter uma base doq é bom ou ruim , se eu fazer um ML e for pior que a média o algortmo nao está bom


In [6]:
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_percentage_error


In [7]:
def ml_error(model_name,y,y_hat):
    mae = mean_absolute_error(y,y_hat)
    mape = mean_absolute_percentage_error(y,y_hat)
    rmse = np.sqrt(mean_squared_error(y,y_hat))
    return pd.DataFrame({'Model Name' : model_name,
                         'MAE' : mae,
                         'MAPE' : mape,
                         'rmse' : rmse }, index=[0])
    #index 0 n sei pq

In [16]:
aux1 = x_test.copy()
aux1['sales'] = y_test.copy()
#agora temos todas as predicoes medias por média por loja agora anexar no conjunto de dados original
aux2 = aux1[['store', 'sales']].groupby( 'store' ).mean().reset_index().rename(columns={'sales': 'predictions'} )
aux1 = pd.merge( aux1, aux2, how='left', on='store' )
#predição 
yhat_baseline = aux1['predictions']
#performace
#usamos logaritmo para modularizar a variavel(colocar na variavel certa)
#usamos log na variavel resposta sales(exponencial para voltar na variavel certa)(expm1)
baseline_result = ml_error( 'Average Model', np.expm1( y_test ), np.expm1(yhat_baseline ) )
baseline_result

Unnamed: 0,Model Name,MAE,MAPE,rmse
0,Average Model,1354.800353,0.2064,1835.135542


# 2 - regressão linear

In [17]:
from sklearn.linear_model import LinearRegression


#model 
lr = LinearRegression().fit(x_train, y_train)


#prediction
y_hat_lr = lr.predict(x_test)

#performace 
lr_result = ml_error('Linear Regressior', np.expm1(y_test) , np.expm1(y_hat_lr))
lr_result

Unnamed: 0,Model Name,MAE,MAPE,rmse
0,Linear Regressior,1867.089774,0.292694,2671.049215


# 3 - regressão linear regularização - Lasso

In [18]:
from sklearn.linear_model import LinearRegression,Lasso


#model 
lrr = Lasso(alpha = 0.01).fit(x_train, y_train)


#prediction
y_hat_lrr = lrr.predict(x_test)

#performace 
lrr_result = ml_error('Lasso Regressior', np.expm1(y_test) , np.expm1(y_hat_lrr))
lrr_result

Unnamed: 0,Model Name,MAE,MAPE,rmse
0,Lasso Regressior,1891.704881,0.289106,2744.451737


In [None]:
# notase que os modelos lieares tiveram performace pior doq a média entao isso quer dizer que nosso modelo
# nao é linear ele nao é simples ele é mais complexos , entao os proximos passo é testar modelos de regressao nao lineares


# 4 - random forest regressor

In [None]:
from sklearn.ensemble import RandomForestRegressor
# model 
#esti = quantas arvores aleatoria vai criar, criar em paralelo , random = origem dos numero aleatorio principamente na hora de escolher as features
#crio uma mesma origem aleatoria
rf = RandomForestRegressor(n_estimators=37,n_jobs=-1,random_state=42).fit(x_train,y_train)


#predict
yhat_rf = rf.predict(x_test)

#performace
rf_result = ml_error('Random Forest Regressor',np.expm1(y_test),np.expm1(yhat_rf))
rf_result

In [7]:
!pip install xgboost

Collecting xgboost
  Downloading xgboost-2.0.3-py3-none-manylinux2014_x86_64.whl.metadata (2.0 kB)
Downloading xgboost-2.0.3-py3-none-manylinux2014_x86_64.whl (297.1 MB)
[2K   [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m297.1/297.1 MB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0mm eta [36m0:00:01[0m[36m0:00:02[0m
[?25hInstalling collected packages: xgboost
Successfully installed xgboost-2.0.3


# 5 - XGBoost Regressor

In [23]:
import xgboost as xgb
# model 
#obj usar no problema de regressao
#eta quantas passagens usa pra fzer o aprendizado
#max maxima profundida da arvore
# sub quantas amostra ou % das variaveis que quero pega
# col varias arvores e quero pega varias arvores
#
#
model_xgb = xgb.XGBRegressor(objective='reg:squarederror',
                             n_estimators=100,
                             eta=0.01,
                             max_depth=10,
                             subsample=0.7,
                             colsample_bytee=0.9,
                                ).fit(x_train,y_train)


#predict
yhat_xgb = model_xgb.predict(x_test)

#performace
xgb_result = ml_error('XGBoost Regressor',np.expm1(y_test),np.expm1(yhat_xgb))
xgb_result

Parameters: { "colsample_bytee" } are not used.



Unnamed: 0,Model Name,MAE,MAPE,rmse
0,XGBoost Regressor,1695.767243,0.251781,2478.381381


# comparar models performace

In [24]:
modelling_result = pd.concat([baseline_result,lr_result,lrr_result,rf_result,xgb_result])

In [25]:
modelling_result.sort_values('rmse')

Unnamed: 0,Model Name,MAE,MAPE,rmse
0,Random Forest Regressor,693.652884,0.101926,1031.71697
0,Average Model,1354.800353,0.2064,1835.135542
0,XGBoost Regressor,1695.767243,0.251781,2478.381381
0,Linear Regressior,1867.089774,0.292694,2671.049215
0,Lasso Regressior,1891.704881,0.289106,2744.451737


In [None]:
# os modelos nao lineares estao com performace melhro que media
# esse erro nao pode usar pq o modelo foi treinado em vendas nas ultimas semanas
#tem que fazer a separação do treino e teste
#vamo aprender cross validatition