In [1]:
from sklearn.utils import all_estimators
from sklearn.metrics import plot_confusion_matrix
import matplotlib.pyplot as plt
from imblearn.under_sampling import RandomUnderSampler

from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler
import pandas as pd
from mlflow.models.signature import infer_signature
import mlflow
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from imblearn.pipeline import Pipeline
import warnings
warnings.filterwarnings('ignore')
import numpy as np


In [2]:
# Load the input dataset from a Feather file; the path is relative to the
# notebook's working directory.
df = pd.read_feather('../data/carteira_total_with_quality_score.feather')


In [3]:
# Relative gap between the overall mean usage and each row's usage:
# (mean - value) / value. Rows whose usage is zero divide by zero here.
total_utilizado = df['total_utilizado']
df['total_utilizado_relativo'] = (total_utilizado.mean() - total_utilizado) / total_utilizado

In [4]:
# Turn every +inf / -inf in the frame into NaN (in place) so the values can be
# handled as ordinary missing data.
df.replace(to_replace=[np.inf, -np.inf], value=np.nan, inplace=True)

In [5]:
# Fill missing relative-usage values with the column's maximum.
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on the column selection: the chained in-place form acts
# on an intermediate object and is not guaranteed to update `df` (it does not
# under pandas Copy-on-Write).
df['total_utilizado_relativo'] = df['total_utilizado_relativo'].fillna(
    df['total_utilizado_relativo'].max()
)

In [6]:
# Contracted total minus used total.
# NOTE(review): this column is not in the column subset selected in the next
# cell, so it is discarded there — confirm whether it was meant to be a feature.
df['nao_utilizado_total'] = df['total_contratado'].sub(df['total_utilizado'])

In [7]:
# Restrict the frame to the modelling columns plus the 'churn' target.
selected_columns = [
    "pf_pj",
    "total_utilizado_relativo",
    "contratado_ofertas_simples",
    "utilizado_ofertas_simples",
    "leads_form",
    "equipe",
    "utilizado_destaque",
    "valor_mensal",
    "quantidade_mes",
    "status_pagamento",
    "regiao",
    "oficina",
    "tipo_de_plano",
    "frequencia_de_faturamento",
    'churn',
]
df = df[selected_columns]


In [8]:
# Numeric columns: missing values are replaced by the per-column median.
numeric_features = ['contratado_ofertas_simples', 'utilizado_ofertas_simples',
                    'leads_form', 'utilizado_destaque', 'valor_mensal', 'quantidade_mes']
numeric_transformer = Pipeline(
    steps=[("imputer", SimpleImputer(strategy="median"))]
)

# Categorical columns: impute FIRST, then one-hot encode. With the imputer
# placed after the encoder it only ever sees the encoded 0/1 matrix, which
# contains no missing values, so it never imputes anything.
# Assumes these columns hold string categories (the fill value is a string)
# — TODO confirm dtypes.
categorical_features = ['pf_pj', 'equipe',
                        'regiao', 'oficina', 'tipo_de_plano', 'frequencia_de_faturamento']
categorical_transformer = Pipeline(
    steps=[("imputer", SimpleImputer(strategy="constant", fill_value="missing")),
           ("onehot", OneHotEncoder(handle_unknown='ignore'))]
)

# NOTE(review): ColumnTransformer drops every column not listed below
# (default remainder='drop'). 'total_utilizado_relativo' and 'status_pagamento'
# are selected into `df` earlier but appear in neither feature list, so they
# never reach the model — confirm whether that is intended.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ]
)

# Shared preprocessing pipeline; a classifier step is added to it later.
# NOTE(review): the step named 'smote' is a RandomUnderSampler (majority-class
# under-sampling), not SMOTE; the name is kept so existing references still work.
pipeline = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("scaler2", RobustScaler(with_centering=False)),
        ('smote', RandomUnderSampler(sampling_strategy='majority', random_state=42))
    ]
)


In [9]:
# Features are every column except the target; the target is cast to int.
X = df.drop(columns=['churn']).copy()
y = df['churn'].astype(int).copy()

# Stratified hold-out split (33% test) with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=1,
    stratify=y,
)

In [10]:
def printAndSave(pipeline, X_test, y_test, y_pred):
    """Log a fitted pipeline and its evaluation to MLflow and print the report.

    Logs, under the 'Churn Prediction' experiment: the pipeline as a model
    (with an inferred signature), the parameters of its "classifier" step, a
    confusion-matrix image, the numeric scores of the classification report as
    individual metrics, and the text report as an artifact. The active MLflow
    run is always ended before returning, even if a step fails.

    Parameters
    ----------
    pipeline : fitted pipeline exposing ``named_steps["classifier"]``.
    X_test : test features passed to the pipeline for the confusion matrix.
    y_test : true labels for ``X_test``.
    y_pred : predictions of ``pipeline`` for ``X_test``.

    Raises
    ------
    Any exception raised by MLflow, scikit-learn or matplotlib is propagated
    after the run has been ended and the figure closed.
    """
    mlflow.set_experiment('Churn Prediction')
    try:
        signature = infer_signature(X_test, y_pred)
        mlflow.sklearn.log_model(pipeline, 'model_pipeline', signature=signature)
        params = pipeline.named_steps["classifier"].get_params()
        mlflow.log_params(params)

        # NOTE(review): display_labels are applied to the classes in sorted
        # order, so class 0 is shown as "positivo" and class 1 (churn) as
        # "negativo" — confirm this is the intended labelling.
        plot_confusion_matrix(pipeline, X_test, y_test, display_labels=[
            "positivo", "negativo"], values_format="d")
        try:
            plt.savefig("mlruns/confusion_matrix_.png")
            mlflow.log_artifact("mlruns/confusion_matrix_.png")
        finally:
            # Close the figure even when saving/logging fails so figures do
            # not accumulate across calls.
            plt.close()

        # MLflow metrics must be numeric, so the text report cannot be logged
        # as a metric. Log every score of the report individually instead and
        # keep the human-readable text as an artifact.
        report_text = classification_report(y_test, y_pred)
        report_scores = classification_report(y_test, y_pred, output_dict=True)
        metrics = {}
        for label, scores in report_scores.items():
            if isinstance(scores, dict):
                for metric_name, value in scores.items():
                    metrics[f"{label}_{metric_name}".replace(" ", "_")] = float(value)
            else:
                # e.g. the overall 'accuracy' entry, which is a bare float.
                metrics[label.replace(" ", "_")] = float(scores)
        mlflow.log_metrics(metrics)
        mlflow.log_text(report_text, "classification_report.txt")

        print(report_text)
    finally:
        # Always close the run; otherwise a failure would leave it active and
        # the next call would log into the same run.
        mlflow.end_run()

In [11]:
# Instantiate every scikit-learn classifier with its default constructor
# arguments. Classifiers that cannot be built that way are skipped, but they
# are recorded and reported instead of being dropped silently.
estimators = all_estimators(type_filter='classifier')

all_class = []
skipped_class = []  # (estimator name, exception) for classifiers that failed to build
for name, ClassificationClass in estimators:
    try:
        all_class.append(ClassificationClass())
    except Exception as e:
        skipped_class.append((name, e))

if skipped_class:
    print("Skipped classifiers (could not be built with default arguments): "
          + ", ".join(skipped_name for skipped_name, _ in skipped_class))

In [12]:
# Fit and evaluate every candidate classifier on top of the shared
# preprocessing steps.
for classifier in all_class:
    # Build a separate pipeline per candidate instead of appending to and
    # removing from the shared `pipeline.steps`: if the cell is interrupted
    # mid-iteration, the shared pipeline is left untouched and the cell can be
    # re-run safely.
    candidate = Pipeline(steps=[*pipeline.steps, ('classifier', classifier)])

    try:
        candidate.fit(X_train, y_train)
        y_pred = candidate.predict(X_test)

        print(classifier)
        printAndSave(candidate, X_test, y_test, y_pred)
    except Exception as e:
        # Best-effort sweep: report which classifier failed and carry on.
        print(f"{type(classifier).__name__} failed: {e}")
    finally:
        # Make sure no MLflow run stays active after a failure, so the next
        # classifier does not log into the previous one's run. This is a
        # no-op when no run is active.
        mlflow.end_run()


2022/06/02 23:30:16 INFO mlflow.tracking.fluent: Experiment with name 'Churn Prediction' does not exist. Creating a new experiment.


AdaBoostClassifier()
              precision    recall  f1-score   support

           0       0.98      0.79      0.88     44628
           1       0.29      0.82      0.43      4605

    accuracy                           0.80     49233
   macro avg       0.63      0.81      0.65     49233
weighted avg       0.91      0.80      0.83     49233

BaggingClassifier()
              precision    recall  f1-score   support

           0       0.97      0.82      0.89     44628
           1       0.30      0.77      0.44      4605

    accuracy                           0.81     49233
   macro avg       0.64      0.80      0.66     49233
weighted avg       0.91      0.81      0.85     49233

BernoulliNB()
              precision    recall  f1-score   support

           0       0.96      0.71      0.81     44628
           1       0.20      0.69      0.31      4605

    accuracy                           0.71     49233
   macro avg       0.58      0.70      0.56     49233
weighted avg       