# Importando Bibliotecas

Modelagem

Passos:
* Pre-processamento dos dados, divisão train & test, preparar a pipeline.
* Testar Logistic Regression & Randome Forest, ajustar hyperparameters com GridSearchCV & StratifiedKFold.
* Selecionar recursos e modelos, prever a probabilidade no conjunto de testes.

In [1]:
import pandas as pd
import numpy as np
import datetime

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline, make_pipeline

In [4]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score, cross_val_predict, StratifiedKFold
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
import warnings

In [5]:
from sklearn.metrics import accuracy_score, mean_squared_error
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.metrics import confusion_matrix, plot_confusion_matrix, classification_report

In [6]:
from pycaret.regression import *

# Análise exploratoria DataFrame

In [66]:
df = pd.read_csv('Data/final_df.csv')

In [22]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100761 entries, 0 to 100760
Data columns (total 52 columns):
 #   Column                              Non-Null Count   Dtype  
---  ------                              --------------   -----  
 0   Unnamed: 0.1                        100761 non-null  int64  
 1   order_id                            100761 non-null  object 
 2   product_id                          100761 non-null  object 
 3   seller_id                           100761 non-null  object 
 4   price                               100761 non-null  float64
 5   freight_value                       100761 non-null  float64
 6   Unnamed: 0_x                        100761 non-null  int64  
 7   customer_id                         100761 non-null  object 
 8   order_status                        100761 non-null  object 
 9   order_purchase_timestamp            100761 non-null  object 
 10  estimativa_dias_entrega             100761 non-null  float64
 11  dias_reais_entrega        

## Processamento do DataFrame

In [23]:
df.head()

Unnamed: 0.2,Unnamed: 0.1,order_id,product_id,seller_id,price,freight_value,Unnamed: 0_x,customer_id,order_status,order_purchase_timestamp,estimativa_dias_entrega,dias_reais_entrega,ano_pedidos,antecipacao_entrega,Unnamed: 0_y,product_category_name,product_name_lenght,product_description_lenght,product_photos_qty,product_weight_g,product_length_cm,product_height_cm,product_width_cm,Unnamed: 0_x.1,review_score,tempo_resposta_avaliação,Unnamed: 0_y.1,customer_zip_code_prefix,customer_state,retorno_cliente,Unnamed: 0_x.2,seller_zip_code_prefix,Unnamed: 0_y.2,customer_lat,customer_lng,Unnamed: 0,payment_sequential,payment_type,payment_installments,payment_value,entrega_atrasada,delivered,erro_estimativa_dias_entrega,revisao_intervalo_atraso_resposta,revisão intervalo de atraso_0,revisão intervalo de atraso_1,revisão intervalo de atraso_2,revisão intervalo de atraso_3,revisão intervalo de atraso_3_mais,vendedor_popular,purchase_month,purchase_day
0,0,00010242fe8c5a6d1ba2dd792cb16214,4244733e06e7ecb4970a6e2683c13e61,48436dade18ac8b2bce089ec2a041202,58.9,13.29,85267,3ce436f183e68e07877b285a838db11a,delivered,2017-09-13,16.0,7.0,2017,9,25865,cool_stuff,58.0,598.0,4.0,650.0,28.0,9.0,14.0,51020,5,1,65557,28013,RJ,0,513,27277,7570.0,-21.758076,-41.312633,85283.0,1.0,credit_card,2.0,72.19,0,1,-9.0,1,0,1,0,0,0,1,9,13
1,1,00018f77f2f0320c557190d7a144bdd3,e5f2d52b802189ee658865ca93d83a8f,dd7ddc04e1b6c2c614352b383efe2d36,239.9,19.93,71853,f6dd3ec061db4e3987629fe6b26e5cce,delivered,2017-04-26,19.0,16.0,2017,3,27230,pet_shop,56.0,239.0,2.0,30000.0,50.0,30.0,40.0,27324,4,2,34265,15775,SP,0,471,3471,5620.0,-20.212393,-50.941471,2499.0,1.0,credit_card,3.0,259.83,0,1,-3.0,2,0,0,1,0,0,1,4,26
2,2,000229ec398224ef6ca0657da4fc703e,c777355d18b72b67abbeef9df44fd0fd,5b51032eddd242adc84c38acab88f23d,199.0,17.87,6298,6489ae5e4333f3693df5ad4372dab6d3,delivered,2018-01-14,22.0,8.0,2018,14,22624,moveis_decoracao,59.0,695.0,2.0,3050.0,33.0,13.0,33.0,4149,5,0,34955,35661,MG,0,1824,37564,8787.0,-19.860439,-44.597972,12393.0,1.0,credit_card,5.0,216.87,0,1,-14.0,0,1,0,0,0,0,0,1,14
3,3,00024acbcdf0a6daa1e931b038114c75,7634da152a4610f1595efa32f14722fc,9d7a1d34a5052409006425275ba1c2b4,12.99,12.79,22550,d4eb9395c8c0431ee92fce09860c5a06,delivered,2018-08-08,12.0,6.0,2018,6,15403,perfumaria,42.0,480.0,1.0,200.0,16.0,10.0,15.0,38151,4,0,51763,12952,SP,0,2023,14403,4798.0,-23.144923,-46.53983,32971.0,1.0,credit_card,2.0,25.78,0,1,-6.0,0,1,0,0,0,0,0,8,8
4,4,00042b26cf59d7ce69dfabb4e55b4fd9,ac6c3623068f30de03045865e4e10089,df560393f3a51e74553ab94004ba5c87,199.9,18.14,5247,58dbd0b2d70206bf40e62cd34e84d795,delivered,2017-02-04,41.0,25.0,2017,16,8862,ferramentas_jardim,59.0,409.0,1.0,3750.0,35.0,40.0,30.0,54668,5,1,7602,13226,SP,0,1597,87900,4967.0,-23.249008,-46.824961,98711.0,1.0,credit_card,3.0,218.04,0,1,-16.0,1,0,1,0,0,0,0,2,4


In [67]:
# drop colunas 
to_drop = ['product_category_name', 'order_id', 'seller_id', 'customer_id', 'product_id',  'order_status', 
           'order_purchase_timestamp', 'Unnamed: 0.1', 'Unnamed: 0_x', 'Unnamed: 0_y', 'Unnamed: 0_x.1', 
           'Unnamed: 0_y.1', 'Unnamed: 0', 'order_status', 'customer_state', 'payment_type',
           'revisão intervalo de atraso_0', 'revisão intervalo de atraso_1','revisão intervalo de atraso_2',
           'revisão intervalo de atraso_3', 'revisão intervalo de atraso_3_mais']

df = df.drop(columns=to_drop, axis=1)

In [76]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100761 entries, 0 to 100760
Data columns (total 32 columns):
 #   Column                             Non-Null Count   Dtype  
---  ------                             --------------   -----  
 0   price                              100761 non-null  float64
 1   freight_value                      100761 non-null  float64
 2   estimativa_dias_entrega            100761 non-null  float64
 3   dias_reais_entrega                 100761 non-null  float64
 4   ano_pedidos                        100761 non-null  int64  
 5   antecipacao_entrega                100761 non-null  int64  
 6   product_name_lenght                99302 non-null   float64
 7   product_description_lenght         99302 non-null   float64
 8   product_photos_qty                 99302 non-null   float64
 9   product_weight_g                   100761 non-null  float64
 10  product_length_cm                  100761 non-null  float64
 11  product_height_cm                  1007

In [68]:
df_model = df
X = df_model.drop(['review_score'], axis=1)
y = df_model['review_score']

A variável alvo (review_score) é bastante desequilibrada e vamos dividir essa variável em 2 categorias principais: clientes satisfeitos (pontuação da avaliação 4 e 5) e clientes insatisfeitos (pontuação da avaliação de 1 a 3). Isso ajudará a aumentar a representação das classes minoritárias e ajudará os modelos a detectá-las melhor, preservando os insights de negócios e a interpretabilidade.

In [69]:
y2 = y.apply(lambda x: 0 if x in [1,2,3] else 1)

### Dividindo os conjuntos em treino e teste

In [70]:
X_train, X_test, y_train, y_test = train_test_split(X, y2,
                                                    test_size=0.2,
                                                    stratify=y,
                                                    random_state=1)

In [71]:
print('Tamanho conjunto treino: ', X_train.shape, y_train.shape)
print('Tamanho conjunto teste:', X_test.shape, y_test.shape)

Tamanho conjunto treino:  (80608, 31) (80608,)
Tamanho conjunto teste: (20153, 31) (20153,)


In [72]:
imputer = SimpleImputer(strategy='median')
scaler = StandardScaler()

### Base para avaliar o desempenho de nossos modelos

In [73]:
y_train.value_counts(normalize=True)

1    0.779588
0    0.220412
Name: review_score, dtype: float64

### Logistic Regression

In [74]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y2,
                                                    test_size=0.25,
                                                    stratify=y,
                                                    random_state=0)

O GridSearchCV implementa um método “fit” e um método “score”. Também implementa “score_samples”, “predict”, “predict_proba”, “decision_function”, “transform” e “inverse_transform” se estiverem implementados no estimador utilizado

In [75]:
# modelo & pipeline para GridSearch
lr = LogisticRegression()

# adicionando pca
pca = PCA()
n_components = list(range(25,X.shape[1],2))

# pipeline
pipe_lr = make_pipeline(imputer, scaler, pca, lr)

# hiperparâmetros do modelo que será testado pelo GridSearchCV
parameters_lr = {
        'logisticregression__C': [0.01, 0.05, 0.1, 0.5, 1, 2],
        'logisticregression__penalty' : ['l2'],
        'logisticregression__solver' : ['newton-cg', 'sag', 'lbfgs'],
        'pca__n_components': n_components
        }

# definir GridSearchCV
grid_lr = GridSearchCV(pipe_lr, 
                       param_grid=parameters_lr, 
                       cv=StratifiedKFold(),
                       verbose=1,
                      n_jobs=-1)

warnings. filterwarnings("ignore")
grid_lr.fit(X_train, y_train)


Fitting 5 folds for each of 54 candidates, totalling 270 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    4.4s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:   18.5s
[Parallel(n_jobs=-1)]: Done 270 out of 270 | elapsed:   25.6s finished


ValueError: Cannot use median strategy with non-numeric data:
could not convert string to float: '3_mais'

In [None]:
print('Melhor hyperparameters:\n{}'.format(grid_lr.best_params_))
print('Melhor estimator: ', grid_lr.best_estimator_)
print('Pontuação de validação cruzada: \n Accuracy:', grid_lr.best_score_)

Com os melhores parâmetros, vamos prever o resultado do conjunto de testes e calculamos a pontuação.

In [None]:
y_train_pred_lr = cross_val_predict(grid_lr.best_estimator_, X_train, y_train, cv=5, method='predict', n_jobs=-1)
print('Logistic Regression - melhor score:')
print('F1-score:', f1_score(y_train, y_train_pred_lr))
print('Confusion matrix: \n', confusion_matrix(y_train, y_train_pred_lr))

### Utilizando Picaret Regression

In [None]:
regression2 = setup(data = data, remove_outliers=True, unknown_categorical_method='most_frequent', target = 'review_score', session_id=42, imputation_type='iterative', normalize=True, n_jobs=-3)