# Decision Trees

In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

df = pd.read_csv(os.path.join('Customer-Churn-Records.csv'))

In [2]:
df = df.drop(columns=['Complain'])
df = df.drop(columns=['HasCrCard'])
df = df.drop(columns=['Surname'])
df = df.drop(columns=['CustomerId'])
df = df.drop(columns=['RowNumber'])

In [3]:
def remove_outliers(df, column):
    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    return df[(df[column] >= lower_bound) & (df[column] <= upper_bound)]

numerical_columns = ['CreditScore', 'Age']

df_no_outliers = df.copy()
for col in numerical_columns:
    df_no_outliers = remove_outliers(df_no_outliers, col)

rows_removed = df.shape[0] - df_no_outliers.shape[0]
print(f"Number of rows removed: {rows_removed}")
print(f"Shape of DataFrame before removing outliers: {df.shape}")
print(f"Shape of DataFrame after removing outliers: {df_no_outliers.shape}")

Number of rows removed: 374
Shape of DataFrame before removing outliers: (10000, 13)
Shape of DataFrame after removing outliers: (9626, 13)


In [4]:
# one-hot encoding
df = pd.get_dummies(df, columns=['Gender'], drop_first=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 13 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   CreditScore         10000 non-null  int64  
 1   Geography           10000 non-null  object 
 2   Age                 10000 non-null  int64  
 3   Tenure              10000 non-null  int64  
 4   Balance             10000 non-null  float64
 5   NumOfProducts       10000 non-null  int64  
 6   IsActiveMember      10000 non-null  int64  
 7   EstimatedSalary     10000 non-null  float64
 8   Exited              10000 non-null  int64  
 9   Satisfaction Score  10000 non-null  int64  
 10  Card Type           10000 non-null  object 
 11  Point Earned        10000 non-null  int64  
 12  Gender_Male         10000 non-null  bool   
dtypes: bool(1), float64(2), int64(8), object(2)
memory usage: 947.4+ KB


#### Para os modelos de Decision Tree e Random Forest, cremos que o Label Encoder para variáveis não binárias funcionará melhor.

In [6]:
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
categorical_columns = ['Geography', 'Card Type']

for col in categorical_columns:
    df[col] = label_encoder.fit_transform(df[col]) 

print("\nDataFrame after Label Encoding multiple columns:") 
print(df)


DataFrame after Label Encoding multiple columns:
      CreditScore  Geography  Age  Tenure    Balance  NumOfProducts  \
0             619          0   42       2       0.00              1   
1             608          2   41       1   83807.86              1   
2             502          0   42       8  159660.80              3   
3             699          0   39       1       0.00              2   
4             850          2   43       2  125510.82              1   
...           ...        ...  ...     ...        ...            ...   
9995          771          0   39       5       0.00              2   
9996          516          0   35      10   57369.61              1   
9997          709          0   36       7       0.00              1   
9998          772          1   42       3   75075.31              2   
9999          792          0   28       4  130142.79              1   

      IsActiveMember  EstimatedSalary  Exited  Satisfaction Score  Card Type  \
0                

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 13 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   CreditScore         10000 non-null  int64  
 1   Geography           10000 non-null  int32  
 2   Age                 10000 non-null  int64  
 3   Tenure              10000 non-null  int64  
 4   Balance             10000 non-null  float64
 5   NumOfProducts       10000 non-null  int64  
 6   IsActiveMember      10000 non-null  int64  
 7   EstimatedSalary     10000 non-null  float64
 8   Exited              10000 non-null  int64  
 9   Satisfaction Score  10000 non-null  int64  
 10  Card Type           10000 non-null  int32  
 11  Point Earned        10000 non-null  int64  
 12  Gender_Male         10000 non-null  bool   
dtypes: bool(1), float64(2), int32(2), int64(8)
memory usage: 869.3 KB


In [27]:
from sklearn.model_selection import train_test_split

X = df.drop(columns=['Exited'])
y = df['Exited']

# Dividir o dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y, random_state=42)

train_df = X_train.copy()
train_df['Exited'] = y_train

test_df = X_test.copy()
test_df['Exited'] = y_test

#Impressão
print("Training Set:")
print(train_df)
print("\nTesting Set:")
print(test_df)

Training Set:
      CreditScore  Geography  Age  Tenure    Balance  NumOfProducts  \
7680          808          2   25       7       0.00              2   
1837          561          0   56       7  152759.00              2   
2920          696          0   33       4       0.00              2   
2642          637          2   22       5   98800.00              1   
4800          690          0   39       6       0.00              2   
...           ...        ...  ...     ...        ...            ...   
8134          577          0   41       6       0.00              1   
4239          636          0   39       3  118336.14              1   
4486          528          0   35       3  156687.10              1   
541           622          0   26       9       0.00              2   
8505          592          0   28       5  137222.77              1   

      IsActiveMember  EstimatedSalary  Satisfaction Score  Card Type  \
7680               1         23180.37                   4    

In [12]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

In [28]:

# Modelo Decision Tree 
dtree = DecisionTreeClassifier(random_state=42)
dtree.fit(X_train, y_train)

# Prever no teste
y_pred = dtree.predict(X_test)
y_pred_prob = dtree.predict_proba(X_test)[:, 1]


# Avaliar o modelo
print("Classification Report:")
print(classification_report(y_test, y_pred))
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("\nAUC-ROC Score:")
print(roc_auc_score(y_test, y_pred_prob))



Classification Report:
              precision    recall  f1-score   support

           0       0.88      0.84      0.86      2389
           1       0.46      0.53      0.49       611

    accuracy                           0.78      3000
   macro avg       0.67      0.69      0.68      3000
weighted avg       0.79      0.78      0.78      3000


Confusion Matrix:
[[2010  379]
 [ 286  325]]

AUC-ROC Score:
0.6866355548034875


In [30]:
from imblearn.over_sampling import SMOTE
# Aplicar a mesma tecnica SMOTE para lidar com o facto de termos poucos 1s na target e tentar obter melhores resultados.
smote = SMOTE(random_state=42)

X_train_res, y_train_res = smote.fit_resample(X_train, y_train)


In [31]:

# Modelo Decision Tree
dtree = DecisionTreeClassifier(random_state=42)
dtree.fit(X_train_res, y_train_res)

# Prever no teste
y_pred = dtree.predict(X_test)
y_pred_prob = dtree.predict_proba(X_test)[:, 1]


# Avaliar
print("Classification Report:")
print(classification_report(y_test, y_pred))
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("\nAUC-ROC Score:")
print(roc_auc_score(y_test, y_pred_prob))



Classification Report:
              precision    recall  f1-score   support

           0       0.90      0.77      0.83      2389
           1       0.42      0.65      0.51       611

    accuracy                           0.74      3000
   macro avg       0.66      0.71      0.67      3000
weighted avg       0.80      0.74      0.76      3000


Confusion Matrix:
[[1833  556]
 [ 213  398]]

AUC-ROC Score:
0.7093289003952239


#### Aplicar a técnica de GridSearchCV para melhorar o modelo.

In [32]:
from sklearn.model_selection import StratifiedShuffleSplit, GridSearchCV

param_grid = {
    'max_depth': [None, 10, 20, 30, 40, 50],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

grid_search = GridSearchCV(estimator=dtree, param_grid=param_grid, cv=5, scoring='roc_auc', n_jobs=-1)
grid_search.fit(X_train_res, y_train_res)

print("Best parameters found: ", grid_search.best_params_)

# Prever com o parametro que melhor estima
y_pred_best = grid_search.best_estimator_.predict(X_test)
y_pred_prob_best = grid_search.best_estimator_.predict_proba(X_test)[:, 1]

# Avaliar
print("Classification Report (Best Model):")
print(classification_report(y_test, y_pred_best))
print("\nConfusion Matrix (Best Model):")
print(confusion_matrix(y_test, y_pred_best))
print("\nAUC-ROC Score (Best Model):")
print(roc_auc_score(y_test, y_pred_prob_best))


Best parameters found:  {'max_depth': 10, 'min_samples_leaf': 4, 'min_samples_split': 10}
Classification Report (Best Model):
              precision    recall  f1-score   support

           0       0.90      0.78      0.84      2389
           1       0.45      0.68      0.54       611

    accuracy                           0.76      3000
   macro avg       0.67      0.73      0.69      3000
weighted avg       0.81      0.76      0.78      3000


Confusion Matrix (Best Model):
[[1874  515]
 [ 198  413]]

AUC-ROC Score (Best Model):
0.8020191425649065


# Random Forest

In [21]:
from sklearn.ensemble import RandomForestClassifier

In [23]:
# Modelo Random Forest
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train_res, y_train_res)

# Prever no teste
y_pred = rf.predict(X_test)
y_pred_prob = rf.predict_proba(X_test)[:, 1]

# Avaliar
print("Classification Report:")
print(classification_report(y_test, y_pred))
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("\nAUC-ROC Score:")
print(roc_auc_score(y_test, y_pred_prob))


Classification Report:
              precision    recall  f1-score   support

           0       0.90      0.86      0.88      2389
           1       0.55      0.64      0.59       611

    accuracy                           0.82      3000
   macro avg       0.73      0.75      0.74      3000
weighted avg       0.83      0.82      0.82      3000


Confusion Matrix:
[[2063  326]
 [ 217  394]]

AUC-ROC Score:
0.8440927765625181


In [24]:

# Aplicar a mesma técnica de GridSearchCV
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}


grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5, scoring='roc_auc', n_jobs=-1)
grid_search.fit(X_train_res, y_train_res)

print("Best parameters found: ", grid_search.best_params_)

# Prever
y_pred_best = grid_search.best_estimator_.predict(X_test)
y_pred_prob_best = grid_search.best_estimator_.predict_proba(X_test)[:, 1]


# Avaliar
print("Classification Report (Best Model):")
print(classification_report(y_test, y_pred_best))
print("\nConfusion Matrix (Best Model):")
print(confusion_matrix(y_test, y_pred_best))
print("\nAUC-ROC Score (Best Model):")
print(roc_auc_score(y_test, y_pred_prob_best))

Best parameters found:  {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Classification Report (Best Model):
              precision    recall  f1-score   support

           0       0.91      0.87      0.89      2389
           1       0.56      0.65      0.60       611

    accuracy                           0.82      3000
   macro avg       0.73      0.76      0.74      3000
weighted avg       0.84      0.82      0.83      3000


Confusion Matrix (Best Model):
[[2072  317]
 [ 214  397]]

AUC-ROC Score (Best Model):
0.8470447954653043


### Considerações finais

Aparentemente o modelo Random Forest, com a aplicação da tecnica Grid Search, tem o melhor desempenho.<br>
Ainda assim, não é ideal.<br>
Resultado: Accuracy: 0.82 Precision (churn): 0.56 Recall (churn): 0.65 F1-score (churn): 0.60 AUC-ROC Score: 0.85 <br>
Apenas conseguimos prever 65% dos clientes que realmente abandonarão (recall). Ou seja, 35% dos clientes que abandonaram , nós não conseguimos prever ( e posteriormente evitar). Assim sendo, interessa-nos minimizar os falsos negativos ao máximo possivel, deveriamos investir em otimizar o modelo para aumentar o recall, sem comprometer demasiado os restantes indicadores.