In [1]:
!pip install feature-engine lightgbm xgboost catboost==0.25.1

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [58]:
import pandas as pd
import numpy as np
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn import tree
from sklearn.model_selection import cross_validate
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.preprocessing import StandardScaler
from feature_engine.imputation import ArbitraryNumberImputer, MeanMedianImputer
from feature_engine.imputation import CategoricalImputer
from feature_engine.encoding import OneHotEncoder
from feature_engine.wrappers import SklearnTransformerWrapper
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score


# Reading ABT

In [46]:
# Mount Google Drive at /content/drive (Colab-only helper) so files stored
# there can be read through the local filesystem.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [47]:
# Load the modelling ABT from the mounted Drive; the first CSV column is used
# as the row index.
caminho_abt = '/content/drive/My Drive/TCC_FIA_IA/base_abt_modelagem.csv'
df = pd.read_csv(caminho_abt, index_col=0)
df

Unnamed: 0,Item,venfis,venfin,itens,itens_dist,datas,estoque,recencia,revendeu_prox_mes,ref_prev
0,-,3,127.50,1,1,28.0,0.0,156,0,2021-09-01
1,11ER030M D=3,3,27.52,1,1,28.0,0.0,113,0,2021-09-01
2,11ER040M D=4,10,27.52,1,1,28.0,0.0,113,0,2021-09-01
3,11ER060M D=6,10,27.52,1,1,28.0,0.0,113,0,2021-09-01
4,11IRAG60 (RT11 01NA60 YBG205),10,44.82,1,1,28.0,0.0,6,0,2021-09-01
...,...,...,...,...,...,...,...,...,...,...
1190,XDMT090308-HX PA120,130,51.00,2,1,28.0,0.0,154,0,2022-02-01
1191,XDMT090308-HX PA120,60,53.96,2,1,56.0,0.0,41,0,2022-02-01
1192,XDMT090308-HX PA120,50,51.90,2,1,56.0,0.0,95,0,2022-02-01
1193,XDMT090308-HX PA120,40,51.00,2,1,56.0,0.0,126,0,2022-02-01


# Define variable groups

In [48]:
# Column groups used by the modelling cells.

# Key columns: kept out of the feature list.
key_vars = ['ref_prev', 'Item']

# Label column to predict.
target = 'revendeu_prox_mes'

# Numeric input columns.
num_vars = [
    'venfis',
    'venfin',
    'itens',
    'itens_dist',
    'datas',
    'estoque',
    'recencia',
]

# Categorical input columns (none at the moment).
cat_vars = []

# Final feature list: numeric columns followed by categorical ones.
features = [*num_vars, *cat_vars]

In [49]:
# Separate the feature columns (X) from the label column (y).
X, y = df[features], df[target]

# Apply a RandomForestModel

In [50]:
# 80/20 hold-out split, stratified on the label, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.8,
    stratify=y,
    random_state=42,
)

In [51]:
# Baseline model: random forest with tree depth capped at 4, fitted on the
# training split only.
rf = RandomForestClassifier(max_depth=4, random_state=42)
rf.fit(X_train, y_train)

RandomForestClassifier(max_depth=4, random_state=42)

In [52]:
# Compare accuracy on the training and test splits for the baseline forest.
pred_train = rf.predict(X_train)
acc_train = accuracy_score(y_train, pred_train)

pred_test = rf.predict(X_test)
acc_test = accuracy_score(y_test, pred_test)

print(f'A Acuracia da base de treino é de {acc_train*100:.3f} e a da base de teste é de {acc_test*100:.3f}')

A Acuracia da base de treino é de 83.143 e a da base de teste é de 81.830


# Apply other algorithms

In [53]:
# Seed shared by every candidate model.
random_state = 42

# Logistic regression and SVM: evaluated behind a StandardScaler applied to
# the numeric columns.
modelos_lineares = [
    ('logistic_regression', LogisticRegression(random_state=random_state)),
    ('svm', SVC(random_state=random_state)),
]

steps_modelos_lineares = [
    (
        'numeric_scaler',
        SklearnTransformerWrapper(variables=num_vars, transformer=StandardScaler()),
    ),
]

# Tree-based models: evaluated with no preprocessing steps.
steps_modelos_arvores = []

modelos_arvores = [
    (nome, classe(random_state=random_state))
    for nome, classe in (
        ('decision_tree', DecisionTreeClassifier),
        ('random_forest', RandomForestClassifier),
        ('gb', GradientBoostingClassifier),
        ('xgb', XGBClassifier),
        ('lgbm', LGBMClassifier),
        ('catboost', CatBoostClassifier),
    )
]


In [54]:
# Empty results table with one column per metric; the evaluation loop adds
# one row per model.
colunas_metricas = ['acuracidade', 'precision', 'recall', 'f1', 'roc_auc']
df_resultados = pd.DataFrame(columns=colunas_metricas)
df_resultados

Unnamed: 0,acuracidade,precision,recall,f1,roc_auc


In [55]:
# Helper that scores one candidate model with stratified cross-validation.
def rodar_modelos(modelo, steps, X_train, y_train, random_state, n_splits=5, n_jobs=-1):
    """Cross-validate ``steps`` followed by ``modelo`` and average the scores.

    Parameters
    ----------
    modelo : tuple
        ``(name, estimator)`` pair appended as the last pipeline step.
    steps : list
        Pipeline steps placed before the model (may be empty).
    X_train, y_train
        Data handed to ``cross_validate``.
    random_state
        Seed for the shuffled ``StratifiedKFold`` splitter.
    n_splits : int, default 5
        Number of folds.
    n_jobs : int, default -1
        Forwarded to ``cross_validate``.

    Returns
    -------
    list
        Mean over the folds of the test accuracy, precision, recall, f1 and
        roc_auc, in that order.
    """
    # Single source of truth for both the scorers and the return order.
    metricas = ['accuracy', 'precision', 'recall', 'f1', 'roc_auc']

    pipeline = Pipeline(steps=steps + [modelo])
    folds = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    scores = cross_validate(
        estimator=pipeline,
        X=X_train,
        y=y_train,
        scoring=metricas,
        cv=folds,
        n_jobs=n_jobs,
    )

    # Column-wise mean of the per-fold results (timings included, but only the
    # test_* entries are returned).
    medias = pd.DataFrame(scores).mean()
    return [medias.loc[f'test_{metrica}'] for metrica in metricas]
    

In [56]:
# Score every candidate with the same cross-validation routine. Each model
# family is paired with its own preprocessing steps, so one loop serves both
# instead of two copy-pasted ones. The seed comes from the shared
# `random_state` variable rather than a repeated literal.
for modelos, steps in (
    (modelos_lineares, steps_modelos_lineares),
    (modelos_arvores, steps_modelos_arvores),
):
    for modelo in modelos:
        nome = modelo[0]
        print(f'Rodando o modelo {nome}')
        # One row per model, keyed by the model name; the value order matches
        # the column order of df_resultados.
        df_resultados.loc[nome] = rodar_modelos(
            modelo=modelo,
            steps=steps,
            X_train=X_train,
            y_train=y_train,
            random_state=random_state,
            n_splits=5,
            n_jobs=-1,
        )


Rodando o modelo logistic_regression
Rodando o modelo svm
Rodando o modelo decision_tree
Rodando o modelo random_forest
Rodando o modelo gb
Rodando o modelo xgb
Rodando o modelo lgbm
Rodando o modelo catboost


In [57]:
# Display the mean cross-validated metrics, one row per model.
df_resultados

Unnamed: 0,acuracidade,precision,recall,f1,roc_auc
logistic_regression,0.821669,0.611251,0.113933,0.191835,0.687942
svm,0.827053,0.67114,0.138323,0.228483,0.632134
decision_tree,0.779944,0.408517,0.402324,0.40517,0.637438
random_forest,0.831761,0.581999,0.3499,0.436011,0.775087
gb,0.83782,0.677301,0.245008,0.359391,0.768598
xgb,0.833782,0.656132,0.225131,0.334827,0.766119
lgbm,0.838157,0.642282,0.298357,0.406819,0.767987
catboost,0.838155,0.657867,0.272137,0.384723,0.770264


# Usando 5 meses na ABT para prever o sexto mês, os modelos ficaram muito ruins em recall e precision.