# Lab 2 - CMC-13
Equipe:
* Emanuel Reinaldo Gomes Bezerra
* Pedro Pinheiro Borges
* Guilherme Alt Chagas Merklein

In [33]:
import joblib
import pandas as pd
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.tree import DecisionTreeClassifier

### 1. Preparação dos dados

In [34]:
# Load the raw dataset and discard every row that has at least one missing value.
# The read and the dropna are chained so that base_df is produced in a single
# assignment instead of being mutated in place after creation.
base_df = pd.read_csv("../data/lab2_2025_dataset.csv").dropna()
base_df.head()

Unnamed: 0,CustomerID,Age,Gender,Tenure,Usage Frequency,Support Calls,Payment Delay,Subscription Type,Contract Length,Total Spend,Last Interaction,Churn
0,2.0,30.0,Female,39.0,14.0,5.0,18.0,Standard,Annual,932.0,17.0,1.0
1,3.0,65.0,Female,49.0,1.0,10.0,8.0,Basic,Monthly,557.0,6.0,1.0
2,4.0,55.0,Female,14.0,4.0,6.0,18.0,Basic,Quarterly,185.0,3.0,1.0
3,5.0,58.0,Male,38.0,21.0,7.0,7.0,Standard,Monthly,396.0,29.0,1.0
4,6.0,23.0,Male,32.0,20.0,5.0,8.0,Basic,Monthly,617.0,20.0,1.0


In [35]:
# Inspect column dtypes and non-null counts of the loaded frame.
base_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 440832 entries, 0 to 440832
Data columns (total 12 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   CustomerID         440832 non-null  float64
 1   Age                440832 non-null  float64
 2   Gender             440832 non-null  object 
 3   Tenure             440832 non-null  float64
 4   Usage Frequency    440832 non-null  float64
 5   Support Calls      440832 non-null  float64
 6   Payment Delay      440832 non-null  float64
 7   Subscription Type  440832 non-null  object 
 8   Contract Length    440832 non-null  object 
 9   Total Spend        440832 non-null  float64
 10  Last Interaction   440832 non-null  float64
 11  Churn              440832 non-null  float64
dtypes: float64(9), object(3)
memory usage: 43.7+ MB


In [36]:
# Summary statistics (count, mean, std, quartiles, min/max) of the numeric columns.
base_df.describe()

Unnamed: 0,CustomerID,Age,Tenure,Usage Frequency,Support Calls,Payment Delay,Total Spend,Last Interaction,Churn
count,440832.0,440832.0,440832.0,440832.0,440832.0,440832.0,440832.0,440832.0,440832.0
mean,225398.667955,39.373153,31.256336,15.807494,3.604437,12.965722,631.616223,14.480868,0.567107
std,129531.91855,12.442369,17.255727,8.586242,3.070218,8.258063,240.803001,8.596208,0.495477
min,2.0,18.0,1.0,1.0,0.0,0.0,100.0,1.0,0.0
25%,113621.75,29.0,16.0,9.0,1.0,6.0,480.0,7.0,0.0
50%,226125.5,39.0,32.0,16.0,3.0,12.0,661.0,14.0,1.0
75%,337739.25,48.0,46.0,23.0,6.0,19.0,830.0,22.0,1.0
max,449999.0,65.0,60.0,30.0,10.0,30.0,1000.0,30.0,1.0


In [37]:
# Category frequencies for each of the three non-numeric columns, shown as a tuple of Series.
tuple(
    base_df[column_name].value_counts()
    for column_name in ("Subscription Type", "Gender", "Contract Length")
)

(Subscription Type
 Standard    149128
 Premium     148678
 Basic       143026
 Name: count, dtype: int64,
 Gender
 Male      250252
 Female    190580
 Name: count, dtype: int64,
 Contract Length
 Annual       177198
 Quarterly    176530
 Monthly       87104
 Name: count, dtype: int64)

In [38]:
# Remove CustomerID so it never reaches the feature matrix built from df
# (presumably a row identifier with no predictive meaning — confirm).
df = base_df.drop(columns=["CustomerID"])

In [39]:
# List the columns that remain in df after CustomerID was dropped.
df.columns

Index(['Age', 'Gender', 'Tenure', 'Usage Frequency', 'Support Calls',
       'Payment Delay', 'Subscription Type', 'Contract Length', 'Total Spend',
       'Last Interaction', 'Churn'],
      dtype='object')

#### Preparação dos Datasets
Estratégia: separar o dataset original em três conjuntos: Dataset de Treinamento, Dataset de Validação e Dataset de Teste.
1. Usamos os dados de treinamento para encontrar um bom modelo de cada um dos três tipos definidos: Árvore de Decisão, Rede Neural MLP e Random Forest.
2. Tendo o melhor modelo de cada um dos três tipos, aplicamos os três no Dataset de Validação e escolhemos o melhor com base no resultado desta aplicação.
3. Agora que temos o melhor modelo, retreinamos ele no conjunto Dataset de Treinamento + Dataset de Validação.
4. Aplicamos este modelo no Dataset de Teste e obtemos sua performance.

In [40]:
# "Churn" is the target; every other remaining column is a candidate feature.
y = df["Churn"]
X = df.drop(columns=["Churn"])

In [41]:
# First cut: hold out 20% of all rows as the final test set (stratified on the target).
X_temp, X_test, y_temp, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    stratify=y,
    random_state=42,
)

# Second cut: 25% of the remaining 80% becomes validation, i.e. 20% of the
# full dataset, leaving 60% for training.
X_train, X_val, y_train, y_val = train_test_split(
    X_temp,
    y_temp,
    test_size=0.25,
    stratify=y_temp,
    random_state=42,
)

In [42]:
# Report how many rows ended up in each split (one message per line).
print(
    f"Tamanho do conjunto de Treino: {len(X_train)} amostras",
    f"Tamanho do conjunto de Validação: {len(X_val)} amostras",
    f"Tamanho do conjunto de Teste Final: {len(X_test)} amostras\n",
    sep="\n",
)

Tamanho do conjunto de Treino: 264498 amostras
Tamanho do conjunto de Validação: 88167 amostras
Tamanho do conjunto de Teste Final: 88167 amostras



#### Pipeline de pré-processamento de dados
Nesta seção, fazemos one-hot encoding das features categóricas e fazemos transformação de escala para as variáveis numéricas.

In [43]:
# Columns that go through StandardScaler in the preprocessing step.
numerical_columns = [
    "Age",
    "Tenure",
    "Usage Frequency",
    "Support Calls",
    "Payment Delay",
    "Total Spend",
    "Last Interaction",
]

# Columns that go through OneHotEncoder in the preprocessing step.
categorical_columns = [
    "Subscription Type",
    "Contract Length",
    "Gender",
]

In [44]:
# Column-wise preprocessing shared by every model pipeline:
#   - "num": standardize the numeric features
#   - "cat": one-hot encode the categorical features; categories not seen
#            during fit are ignored at transform time instead of raising
# Any column not listed in either group is dropped.
preprocessor = ColumnTransformer(
    [
        ("num", StandardScaler(), numerical_columns),
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_columns),
    ],
    remainder="drop",
)

### 2. Criação dos três modelos

#### 2.a) Modelo baseado em kNN, Árvore de Decisão ou SVM
roda em 1min

In [47]:
# Base estimator whose hyperparameters are tuned below.
decision_tree_model = DecisionTreeClassifier(random_state=42)

# Grid keys carry the "classifier__" prefix because they target the pipeline
# step named "classifier".
decision_tree_config = dict(
    classifier__max_depth=[5, 10, 12],
    classifier__min_samples_leaf=[2, 5, 10],
)

decision_tree_pipeline = Pipeline(
    [
        ("preprocessor", preprocessor),
        ("classifier", decision_tree_model),
    ]
)

# Exhaustive search over the grid, scored by ROC AUC with 10-fold cross-validation.
decision_tree_grid_search = GridSearchCV(
    estimator=decision_tree_pipeline,
    param_grid=decision_tree_config,
    scoring="roc_auc",
    cv=10,
    n_jobs=-1,
)
decision_tree_grid_search.fit(X_train, y_train)

# Strip the pipeline-step prefix so the values can be passed straight to the classifier.
decision_tree_model_hyperparams = {
    param_name.replace("classifier__", ""): param_value
    for param_name, param_value in decision_tree_grid_search.best_params_.items()
}

print(f"Performance {decision_tree_grid_search.best_score_} obtida com os hiperparâmetros da DecisionTree: {decision_tree_model_hyperparams}")

Performance 0.9999220472012631 obtida com os hiperparâmetros da DecisionTree: {'max_depth': 12, 'min_samples_leaf': 2}


In [56]:
# Apply the selected hyperparameters to the classifier (set_params mutates it
# in place), then rebuild the pipeline around it and fit on the training split.
decision_tree_model.set_params(**decision_tree_model_hyperparams)
decision_tree_pipeline = Pipeline(
    [
        ("preprocessor", preprocessor),
        ("classifier", decision_tree_model),
    ]
)
decision_tree_pipeline.fit(X_train, y_train)

print("Decision Tree treinada com os melhores hiperparâmetros")

Decision Tree treinada com os melhores hiperparâmetros


#### 2.b) Modelo baseado em Redes Neurais do tipo MLP (MultiLayer Perceptron)
roda em 10min

In [None]:
# Base MLP: up to 1000 iterations, with early stopping enabled.
nn_mlp_model = MLPClassifier(
    random_state=42,
    max_iter=1000,
    early_stopping=True,
)

# Grid keys carry the "classifier__" prefix because they target the pipeline
# step named "classifier".
nn_mlp_config = dict(
    classifier__hidden_layer_sizes=[(50,), (100,)],
    classifier__alpha=[0.0001, 0.001],
)

nn_mlp_pipeline = Pipeline(
    [
        ("preprocessor", preprocessor),
        ("classifier", nn_mlp_model),
    ]
)

# Exhaustive search over the grid, scored by ROC AUC with 10-fold cross-validation.
nn_mlp_grid_search = GridSearchCV(
    estimator=nn_mlp_pipeline,
    param_grid=nn_mlp_config,
    scoring="roc_auc",
    cv=10,
    n_jobs=-1,
)
nn_mlp_grid_search.fit(X_train, y_train)

# Strip the pipeline-step prefix so the values can be passed straight to the classifier.
nn_mlp_model_hyperparams = {
    param_name.replace("classifier__", ""): param_value
    for param_name, param_value in nn_mlp_grid_search.best_params_.items()
}

print(f"Performance {nn_mlp_grid_search.best_score_} obtida com os hiperparâmetros da Rede Neural MLP: {nn_mlp_model_hyperparams}")

Performance 0.999952820948373 obtida com os hiperparâmetros da Rede Neural MLP: {'alpha': 0.0001, 'hidden_layer_sizes': (100,)}


In [57]:
# Apply the selected hyperparameters to the classifier (set_params mutates it
# in place), then rebuild the pipeline around it and fit on the training split.
nn_mlp_model.set_params(**nn_mlp_model_hyperparams)
nn_mlp_pipeline = Pipeline(
    [
        ("preprocessor", preprocessor),
        ("classifier", nn_mlp_model),
    ]
)
nn_mlp_pipeline.fit(X_train, y_train)

print("Rede Neural MLP treinada com os melhores hiperparâmetros")

Rede Neural MLP treinada com os melhores hiperparâmetros


#### 2.c) Modelo baseado em Comitês (Random Forests, AdaBoost, etc.)
roda em 10min

In [None]:
# Base estimator whose hyperparameters are tuned below.
random_forest_model = RandomForestClassifier(random_state=42)

# Grid keys carry the "classifier__" prefix because they target the pipeline
# step named "classifier".
random_forest_config = dict(
    classifier__n_estimators=[50, 100, 200],
    classifier__max_features=["sqrt", "log2"],
)

random_forest_pipeline = Pipeline(
    [
        ("preprocessor", preprocessor),
        ("classifier", random_forest_model),
    ]
)

# Exhaustive search over the grid, scored by ROC AUC with 10-fold cross-validation.
random_forest_grid_search = GridSearchCV(
    estimator=random_forest_pipeline,
    param_grid=random_forest_config,
    scoring="roc_auc",
    cv=10,
    n_jobs=-1,
)
random_forest_grid_search.fit(X_train, y_train)

# Strip the pipeline-step prefix so the values can be passed straight to the classifier.
random_forest_model_hyperparams = {
    param_name.replace("classifier__", ""): param_value
    for param_name, param_value in random_forest_grid_search.best_params_.items()
}

print(f"Performance {random_forest_grid_search.best_score_} obtida com os hiperparâmetros da Random Forest: {random_forest_model_hyperparams}")

Performance 0.9999992742312871 obtida com os hiperparâmetros da Random Forest: {'max_features': 'sqrt', 'n_estimators': 200}


In [58]:
# Apply the selected hyperparameters to the classifier (set_params mutates it
# in place), then rebuild the pipeline around it and fit on the training split.
random_forest_model.set_params(**random_forest_model_hyperparams)
random_forest_pipeline = Pipeline(
    [
        ("preprocessor", preprocessor),
        ("classifier", random_forest_model),
    ]
)
random_forest_pipeline.fit(X_train, y_train)

print("Random Forest treinada com os melhores hiperparâmetros")

Random Forest treinada com os melhores hiperparâmetros


### 3. Análise comparativa do desempenho dos modelos

In [68]:
# Fitted candidate pipelines, keyed by a display name.
pipelines = dict(
    DecisionTree=decision_tree_pipeline,
    MLP=nn_mlp_pipeline,
    RandomForest=random_forest_pipeline,
)

# Pipeline.score on a classifier reports mean accuracy, measured here on the
# validation split for every candidate.
validation_score = {
    model_name: candidate.score(X_val, y_val)
    for model_name, candidate in pipelines.items()
}

print(validation_score)

# The champion is the candidate with the highest validation accuracy.
champion_model_name = max(validation_score, key=lambda name: validation_score[name])
print(f"Modelo escolhido: {champion_model_name} com acurácia de {validation_score[champion_model_name]}")

{'DecisionTree': 0.9993308153844409, 'MLP': 0.997924393480554, 'RandomForest': 0.9989451835720848}
Modelo escolhido: DecisionTree com acurácia de 0.9993308153844409


#### Retreinando o melhor modelo com os dados de Treino e Validação

In [70]:
# Retrain the champion on train + validation data using an unfitted clone.
# Fitting pipelines[champion_model_name] directly would overwrite the pipeline
# that was just validated and would also re-fit the single `preprocessor`
# instance that all three candidate pipelines share, silently changing the
# scaling/encoding used by the other two. clone() copies the hyperparameters
# (including random_state) into fresh estimators, so the candidates stay as
# they were when validated.
final_pipeline = clone(pipelines[champion_model_name])
final_pipeline.fit(pd.concat([X_train, X_val]), pd.concat([y_train, y_val]))

### 4. Aplicação da predição do Modelo Desenvolvido

In [73]:
# Evaluate the retrained model once on the held-out test split.
y_pred_final = final_pipeline.predict(X_test)
final_accuracy = accuracy_score(y_true=y_test, y_pred=y_pred_final)

print(f"Acurácia Final do Modelo no Conjunto de Teste: {final_accuracy:.4f}")

Acurácia Final do Modelo no Conjunto de Teste: 0.9992


#### Exportando modelo

In [None]:
# Persist the whole fitted pipeline (preprocessing + classifier) to disk.
filename = "model.joblib"
joblib.dump(value=final_pipeline, filename=filename)

print(f"Modelo final salvo em '{filename}'")

Modelo final salvo em 'model.joblib'


### 5. Conclusões