## Importação de bibliotecas

In [630]:
from mlflow.models.signature import infer_signature
from sklearn.metrics import plot_confusion_matrix, f1_score, accuracy_score, precision_score, recall_score
from datetime import datetime
import mlflow
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from imblearn.pipeline import Pipeline
from sklearn.neural_network import MLPClassifier
from imblearn.over_sampling import SMOTE
import datetime
import warnings
warnings.filterwarnings('ignore')

import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from unidecode import unidecode

from sklearn.preprocessing import RobustScaler, StandardScaler, Normalizer
from sklearn.model_selection import train_test_split, GridSearchCV, KFold


## Carregamento do dataset

In [631]:
df = pd.read_feather('../data/carteira_total.feather')


## Data Preparation

### Correção da nomenclatura das colunas

In [632]:
# Normalize every column label: transliterate to ASCII, lowercase, and collapse
# each run of non-alphanumeric characters into a single underscore.
newColumnsName = [
    re.sub('[^A-Za-z0-9]+', '_', unidecode(label).lower())
    for label in df
]
df.columns = newColumnsName


### Criação da variável target

In [633]:
# Build the binary target "churn" from the customer status column.
#
# The result of `replace` is assigned back instead of calling
# `df[col].replace(..., inplace=True)`: an in-place call on a column selection
# operates on a temporary object and, under pandas Copy-on-Write, never
# reaches `df` (the warning is hidden here by warnings.filterwarnings).
status_normalized = df["upsale_downsale"].replace(
    {"Churn": "churn", "Upsell": "upsell", "Downsell": "downsell", "Ok": "ok"})

# Only "churn" maps to the positive class "1"; every other known status is "0".
# Values outside the mapping are left untouched, exactly as before.
df["churn"] = status_normalized.replace(
    {"ok": "0", "upsell": "0", "downsell": "0", "churn": "1"})

# The source column is no longer needed once the target exists.
df = df.drop(columns=["upsale_downsale"])


### Criação da variável "quantidades mês" (feature engineering com a variável nativa "mês")

In [634]:
# Count the non-null 'mes' entries per customer (id_sap) and expose the count
# under the name 'quantidade_mes'.
df_grouped = (
    df[['mes', 'id_sap']]
    .groupby(['id_sap'])
    .count()
    .reset_index()
    .rename(columns={'mes': 'quantidade_mes'})
)

In [635]:
# Attach the per-customer count to each row of df, matching on id_sap.
quantidade_por_id = df_grouped.set_index('id_sap')
df = df.join(quantidade_por_id, on='id_sap')

### Criação da variável "status_pagamento" (feature engineering utilizando fonte de dados externa)

In [636]:
# Load every sheet of the quality-score workbook into one frame.
#
# Each sheet stores the payment status under a different header, so all known
# headers are renamed to the single name 'status_pagamento'. The sheet name is
# kept in the 'data' column (presumably a month label; it is later renamed to
# 'mes' and used as a join key).
status_column_aliases = {
    'Classificação Pagamento': 'status_pagamento',
    'Quality Score Cobrança': 'status_pagamento',
    'PFIN': 'status_pagamento',
    'PEFIN': 'status_pagamento',
}

# The context manager closes the workbook handle once all sheets are read.
with pd.ExcelFile('../data/quality_score.xlsx') as xls:
    sheet_frames = []
    for sheet_name in xls.sheet_names:
        sheet_frame = pd.read_excel(xls, sheet_name=sheet_name)
        sheet_frame['data'] = sheet_name
        sheet_frames.append(sheet_frame.rename(columns=status_column_aliases))

# Concatenate once at the end instead of growing the frame inside the loop.
dfQuality = pd.concat(sheet_frames)

In [637]:
# Canonicalize the payment-status labels: drop the numeric prefixes and the
# stray trailing spaces, and turn the non-status entries ('lançamentos', 0)
# into missing values.
#
# The result is assigned back rather than using `inplace=True` on the column
# selection, which does not update dfQuality under pandas Copy-on-Write.
dfQuality['status_pagamento'] = dfQuality['status_pagamento'].replace(
    {'4. Péssimo': 'Pessimo', '2. Regular ': 'Regular', '1. Bom': 'Bom', '3. Ruim': 'Ruim', '5. Novo': 'Novo',
     '2. Regular': 'Regular', '1. Bom ': 'Bom', 'lançamentos': np.nan, '5. novo': 'Novo', 0: np.nan})


In [638]:
# Discard every row that has a missing value in any column.
# NOTE(review): this checks all columns, not only 'status_pagamento' - confirm
# that is intended.
dfQuality = dfQuality.dropna()


In [639]:
# Matches a YYYY-MM-DD date; group 1 captures the four-digit year and group 2
# the two-digit month (the day is matched but not captured).
regex = r'([0-9]{4})-([0-9]{2})-[0-9]{2}'


def fun_replace(data):
    """Turn a matched YYYY-MM-DD date into a '<month abbreviation><yy>' label.

    Args:
        data: A regex match whose group 1 is the four-digit year and whose
            group 2 is the two-digit month.

    Returns:
        The lowercase month abbreviation produced by ``strftime('%b')``
        followed by the last two digits of the year, e.g. ``'mar21'``.
        NOTE(review): ``%b`` follows the active locale - confirm it matches
        the labels this value is later compared against.
    """
    month_digits = str(data.group(2)).lower()
    year_digits = data.group(1)
    month_label = datetime.datetime.strptime(month_digits, '%m').strftime('%b')
    return month_label.lower() + year_digits[-2:]


# Cast 'mes' to text and rewrite every YYYY-MM-DD occurrence through
# fun_replace; values that do not match the pattern are left unchanged.
mes_as_text = df['mes'].astype(str)
df['mes'] = mes_as_text.str.replace(regex, fun_replace, regex=True)


In [640]:
# Give the join keys the same names they have in df.
dfQuality = dfQuality.rename(columns={'ID SAP': 'id_sap', 'data': 'mes'})


In [641]:
# Bring the quality-score columns into df, matching on customer and month.
quality_by_customer_month = dfQuality.set_index(['id_sap', 'mes'])
df = df.join(quality_by_customer_month, on=['id_sap', 'mes'])


### Correção dos valores de colunas categóricas

In [642]:
def _normalize_category(value):
    """Return `value` as an ASCII, lowercase, underscore-separated token.

    Non-string values (for example missing values) are returned unchanged.
    """
    if not isinstance(value, str):
        return value
    return re.sub('[^A-Za-z0-9]+', '_', unidecode(value).lower())


# Apply the same normalization used for the column names to every value of
# the object-typed columns.
for column in df.select_dtypes(include=['object']):
    df[column] = df[column].apply(_normalize_category)


### Seleção das colunas mais significativas

In [643]:
# Keep only the columns used for modeling (features plus the 'churn' target).
selected_columns = [
    'pf_pj', 'contratado_ofertas_simples', 'utilizado_ofertas_simples',
    'leads_form', 'equipe', 'utilizado_destaque', 'valor_mensal',
    'quantidade_mes', 'status_pagamento', 'churn', 'regiao', 'oficina',
    'tipo_de_plano', 'frequencia_de_faturamento',
]
df = df[selected_columns]

## Modeling

### Pipeline

In [644]:
# Keep only the rows that have a payment status.
has_payment_status = df['status_pagamento'].notna()
df = df[has_payment_status]


In [647]:
# Separate the integer-encoded target from the feature columns.
y = df['churn'].astype(int)
X = df.drop(columns=['churn'])

# A manual KFold loop was tried here and abandoned: it indexed the frame with
# X[train_index], which pandas treats as column selection and therefore
# raises KeyError. Positional row selection needs X.iloc[train_index].

# Single hold-out split: 20% of the rows go to the test set, with a fixed
# seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=1)

KFold(n_splits=5, random_state=None, shuffle=False)
TRAIN: [ 7068  7069  7070 ... 35335 35336 35337] TEST: [   0    1    2 ... 7065 7066 7067]


KeyError: "None of [Int64Index([ 7068,  7069,  7070,  7071,  7072,  7073,  7074,  7075,  7076,\n             7077,\n            ...\n            35328, 35329, 35330, 35331, 35332, 35333, 35334, 35335, 35336,\n            35337],\n           dtype='int64', length=28270)] are in the [columns]"

### Aplicação do Smote (VERIFICAR)

In [None]:
# smote = SMOTE(sampling_strategy='minority', random_state=42)
# X_train, y_train = smote.fit_resample(X_train, y_train)


### Criação do Pipeline

In [None]:
# Numeric columns: fill missing values with the column median.
numeric_features = ['contratado_ofertas_simples', 'utilizado_ofertas_simples',
                    'leads_form', 'utilizado_destaque', 'valor_mensal', 'quantidade_mes']
numeric_transformer = Pipeline(
    steps=[("imputer", SimpleImputer(strategy="median"))]
)

# Categorical columns: impute FIRST, then one-hot encode.
# The previous order (encode, then impute) sent missing values straight to the
# encoder and left the imputer working on the already-encoded matrix, where
# there is nothing left to fill. With strategy="constant" and no fill_value,
# missing entries are replaced by SimpleImputer's default constant for
# non-numeric data, which then gets its own one-hot column.
categorical_features = ['pf_pj', 'equipe', 'status_pagamento',
                        'regiao', 'oficina', 'tipo_de_plano', 'frequencia_de_faturamento']
categorical_transformer = Pipeline(
    steps=[("imputer", SimpleImputer(strategy="constant")),
           ("onehot", OneHotEncoder(handle_unknown='ignore'))]
)

# Route each group of columns through its own transformer.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ]
)


# Full training pipeline: preprocess -> scale -> oversample -> classify.
# The SMOTE step sits inside the imblearn Pipeline, so resampling is applied
# only while fitting and not when predicting.
pipeline = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        # with_mean=False scales by the standard deviation without centering.
        ("scaler", StandardScaler(with_mean=False)),
        ('smote', SMOTE(random_state=0, sampling_strategy=0.75)),
        ("classifier", MLPClassifier(hidden_layer_sizes=(100, 300),
                      random_state=1,
                      learning_rate_init=0.01, solver='adam', activation='relu'))
    ]
)

# pipeline = Pipeline(
#     steps=[
#         ("onehot", LeaveOneOutEncoder()),
#         ("imputer", SimpleImputer(strategy="median")),
#         ("scaler", RobustScaler(with_centering=False)),
#         ('smote', SMOTE(random_state=0)),
#         ("classifier", MLPClassifier(hidden_layer_sizes=(6, 5),
#                                      random_state=1,
#                                      learning_rate_init=0.01))
#     ]
# )


In [None]:
# {'activation': 'relu', 'hidden_layer_sizes': (100,), 'learning_rate_init': 0.01, 'solver': 'adam'}

In [None]:
# Display the training features as a quick sanity check before fitting.
X_train

### Predict

In [None]:
# Fit the whole pipeline on the training split, then predict the hold-out set.
y_pred = pipeline.fit(X_train, y_train).predict(X_test)


## Monitoramento (MLflow)

### Criação do experimento

In [None]:
# Select the MLflow experiment that the logging calls below are recorded under.
mlflow.set_experiment('Churn Prediction')


### Registro da signature e do pipeline

In [None]:
# Infer the model's input/output schema from the test features and the
# predictions made on them, then log the fitted pipeline with that signature.
# NOTE(review): no explicit mlflow.start_run() is visible in this notebook;
# presumably this relies on MLflow starting a run on first log - confirm.
signature = infer_signature(X_test, y_pred)
mlflow.sklearn.log_model(pipeline, 'model_pipeline', signature=signature)


### Registro dos Parâmetros do modelo

In [None]:
# Log the hyper-parameters of the classifier step only; the preprocessing
# steps are not included.
classifier_step = pipeline.named_steps["classifier"]
params = classifier_step.get_params()
mlflow.log_params(params)

### Registro da matriz de confusão

In [None]:
# Plot the confusion matrix of the fitted pipeline on the hold-out set.
# The target is encoded as 0 (no churn) and 1 (churn), and display_labels are
# applied in sorted class order, so class 0 must be "negativo" and class 1
# "positivo". The labels were previously passed in the opposite order, which
# mislabelled both axes.
plot_confusion_matrix(pipeline, X_test, y_test, display_labels=[
    "negativo", "positivo"], values_format="d")

# Persist the figure to disk and attach it to the MLflow run as an artifact.
plt.savefig("mlruns/atual_model_confusion_matrix_.png")
mlflow.log_artifact("mlruns/atual_model_confusion_matrix_.png")

# Release the figure once it has been saved.
plt.close()


### Registro de métricas

In [None]:
# Evaluation metrics on the hold-out set, as percentages with two decimals.
# NOTE(review): f1 is macro-averaged while precision and recall use the
# default averaging - confirm the mix is intentional.
f1 = round(f1_score(y_test, y_pred, average='macro') * 100, 2)
accuracy = round(accuracy_score(y_test, y_pred) * 100, 2)
precision = round(precision_score(y_test, y_pred) * 100, 2)
recall = round(recall_score(y_test, y_pred) * 100, 2)

# Record each metric in the MLflow run.
for metric_name, metric_value in (
    ("f1", f1),
    ("accuracy", accuracy),
    ("precision", precision),
    ("recall", recall),
):
    mlflow.log_metric(metric_name, metric_value)

# Echo the same values in the notebook output.
print("f1: ", f1)
print("accuracy:", accuracy)
print("precision: ", precision)
print("recall: ", recall)

### Finalização do experimento

In [None]:
# Close the active MLflow run so later logging calls do not append to it.
mlflow.end_run()

In [None]:
# Candidate values for the MLPClassifier hyper-parameter search.
activation = ['identity', 'logistic', 'tanh', 'relu']
solver = ['lbfgs', 'sgd', 'adam']
# Despite its name, this list is used for `learning_rate_init` in the grid.
learning_rate = [0.01, 0.0001, 0.1]
# Each tuple is one architecture: a single hidden layer of 100 or 500 units.
hidden_layer_sizes = [(100,),(500,)]

In [None]:
# Search grid keyed by MLPClassifier constructor argument names.
parameters2 = dict(
    activation=activation,
    solver=solver,
    learning_rate_init=learning_rate,
    hidden_layer_sizes=hidden_layer_sizes,
)

In [None]:
# Alternative pipeline whose final step is an exhaustive grid search over the
# MLP hyper-parameters.
# NOTE(review): because the search is the last step, its internal
# cross-validation runs on data that the SMOTE step has already resampled;
# wrapping the whole pipeline in GridSearchCV would avoid that - confirm
# which is intended.
grid_search_step = GridSearchCV(MLPClassifier(), param_grid=parameters2)

pipeline2 = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        # The step is named "scaler" but applies Normalizer.
        ("scaler", Normalizer()),
        ('smote', SMOTE(random_state=0)),
        ('gridsearch', grid_search_step),
    ])

In [None]:
# grid_search = GridSearchCV(MLPClassifier(), param_grid=parameters)
# grid_search.fit(X_train, y_train)

In [None]:
# pipeline2.fit(X_train, y_train)
# print(pipeline2.best_params_)
# print(pipeline2.best_estimator_)
# params2 = pipeline2.named_steps["gridsearch"].best_params_
# print(params2)