<a href="https://colab.research.google.com/github/diegoeller/Churn-Prediction-Project/blob/main/Telco_Customer_Churn.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 1. Loading libraries and data

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
import shap

In [None]:
# Load the dataset (example: Telco Customer Churn) from a CSV in the working directory
csv_path = "WA_Fn-UseC_-Telco-Customer-Churn.csv"
df = pd.read_csv(csv_path)

# 3. Understanding the data

In [None]:
# df.shape is a (rows, columns) tuple; unpack both dimensions at once
num_rows, num_cols = df.shape

# Report the dataset dimensions in a readable sentence
print(f"The dataset has {num_rows} rows and {num_cols} columns.")

The dataset has 7043 rows and 21 columns.


In [None]:
# Preview the first five records
df.head(n=5)

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [None]:
# Summarize every column: name, non-null count and dtype
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 


# 4. Data Manipulation

In [None]:
# Convert the charge columns to numeric dtype.
# errors='coerce' turns any value that cannot be parsed into NaN instead of raising.
for charge_col in ('TotalCharges', 'MonthlyCharges'):
    df[charge_col] = pd.to_numeric(df[charge_col], errors='coerce')

## 4.1. Visualizing missing values

In [None]:
# Count missing values per column
df.isna().sum()

Unnamed: 0,0
gender,0
SeniorCitizen,0
Partner,0
Dependents,0
tenure,0
PhoneService,0
MultipleLines,0
InternetService,0
OnlineSecurity,0
OnlineBackup,0


Here we see that the `TotalCharges` column has 11 missing values. Let's inspect those rows.

In [None]:
# Show the rows whose TotalCharges is missing.
# Series.isna() is used instead of np.isnan because it works for any dtype,
# whereas np.isnan raises TypeError if the column is still stored as strings.
df[df['TotalCharges'].isna()]

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn


- Note that `tenure` (the number of months the customer has been with the company) is 0 for these entries, so it makes sense that `TotalCharges` is missing even though `MonthlyCharges` is not.
- Let's see if there are any other 0 values in the `tenure` column.

In [None]:
# Index labels of the rows where tenure equals 0
df.loc[df['tenure'].eq(0)].index

Index([], dtype='int64')

- There are no additional zero values in the `tenure` column beyond the rows found above.
- Let's delete the rows where `tenure` is 0: there are only 11 of them, so removing them will not meaningfully affect the data.

In [None]:
# Collect the index labels of customers with zero tenure, then remove them in place
zero_tenure_idx = df.loc[df['tenure'].eq(0)].index
df.drop(index=zero_tenure_idx, inplace=True)

# Sanity check: this should now be an empty Index
df.loc[df['tenure'].eq(0)].index

Index([], dtype='int64')

## 4.2. Removing the ID column

- Removing the ID column prepares the dataset for the analysis that follows.
- We can still use the index to count the number of customers.

In [None]:
# Remove the customer identifier; it carries no predictive information.
# errors='ignore' makes the cell safe to re-run: without it, a second execution
# raises KeyError because the column is already gone.
df = df.drop(columns=['customerID'], errors='ignore')
df.head()

KeyError: "['customerID'] not found in axis"

## 4.3. Converting to binary

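The target column `Churn` is stored as the strings `Yes`/`No`. We map it to `1`/`0` so that the model and the evaluation metrics can treat churn as the positive class.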

In [None]:
# Encode the target as 1 (Yes) / 0 (No).
# The dtype guard keeps the cell safe to re-run: mapping an already-numeric
# column through this dict would silently replace every value with NaN.
if not pd.api.types.is_numeric_dtype(df['Churn']):
    df['Churn'] = df['Churn'].map({'Yes': 1, 'No': 0})

# 5. Data Visualization

In [None]:
# Select numeric and categorical feature columns
df.info()
num_features = df.select_dtypes(include=['int64', 'float64']).columns.tolist()
# customerID is an identifier, not a feature. Filtering (instead of list.remove)
# avoids a ValueError when the column was already dropped in section 4.2.
cat_features = [
    col
    for col in df.select_dtypes(include=['object']).columns
    if col != 'customerID'
]

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 


In [None]:
# One-hot encode the categorical columns.
# drop_first=True omits the first level of each variable so the dummies are not redundant.
df = pd.get_dummies(data=df, columns=cat_features, drop_first=True)

In [None]:
# Preview the encoded frame; show only the first rows rather than dumping the whole table
df.head()

Unnamed: 0,customerID,SeniorCitizen,tenure,MonthlyCharges,Churn,gender_Male,Partner_Yes,Dependents_Yes,PhoneService_Yes,MultipleLines_No phone service,...,TotalCharges_995.35,TotalCharges_996.45,TotalCharges_996.85,TotalCharges_996.95,TotalCharges_997.65,TotalCharges_997.75,TotalCharges_998.1,TotalCharges_999.45,TotalCharges_999.8,TotalCharges_999.9
0,7590-VHVEG,0,1,29.85,0,False,True,False,False,True,...,False,False,False,False,False,False,False,False,False,False
1,5575-GNVDE,0,34,56.95,0,True,False,False,True,False,...,False,False,False,False,False,False,False,False,False,False
2,3668-QPYBK,0,2,53.85,1,True,False,False,True,False,...,False,False,False,False,False,False,False,False,False,False
3,7795-CFOCW,0,45,42.30,0,True,False,False,False,True,...,False,False,False,False,False,False,False,False,False,False
4,9237-HQITU,0,2,70.70,1,False,False,False,True,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,6840-RESVB,0,24,84.80,0,True,True,True,True,False,...,False,False,False,False,False,False,False,False,False,False
7039,2234-XADUH,0,72,103.20,0,False,True,True,True,False,...,False,False,False,False,False,False,False,False,False,False
7040,4801-JZAZL,0,11,29.60,0,False,True,True,False,True,...,False,False,False,False,False,False,False,False,False,False
7041,8361-LTMKD,1,4,74.40,1,True,True,False,True,False,...,False,False,False,False,False,False,False,False,False,False


In [None]:
# Separate the predictors from the target.
# customerID may already have been removed earlier in the notebook, so it is
# dropped with errors='ignore'; Churn must exist, so that drop stays strict.
X = df.drop(columns=['Churn']).drop(columns=['customerID'], errors='ignore')
y = df['Churn']

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
split_options = {'test_size': 0.2, 'random_state': 42}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [None]:
# Standardize the features: learn mean/std on the training split only,
# then apply the same transformation to both splits to avoid test-set leakage
scaler = StandardScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

In [None]:
# Train a Random Forest classifier (100 trees, fixed seed for reproducibility)
rf_params = {'n_estimators': 100, 'random_state': 42}
model = RandomForestClassifier(**rf_params)
model.fit(X_train, y_train)

In [None]:
# Evaluate the model
y_pred = model.predict(X_test)
# ROC-AUC must be computed from continuous scores, not hard 0/1 labels:
# with labels the ROC curve collapses to a single threshold and understates the model.
# Column 1 of predict_proba is the probability of the positive class (Churn == 1).
y_proba = model.predict_proba(X_test)[:, 1]
print(classification_report(y_test, y_pred))
print("AUC-ROC:", roc_auc_score(y_test, y_proba))

In [None]:
# Confusion matrix drawn on an explicit Axes object
fig, ax = plt.subplots()
cm = confusion_matrix(y_test, y_pred)
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax)
ax.set_xlabel('Predito')
ax.set_ylabel('Real')
ax.set_title('Matriz de Confusão')
plt.show()

In [None]:
# Model explainability
explainer = shap.Explainer(model, X_train)
shap_values = explainer(X_test)
# Depending on the shap version, a classifier explanation may carry one set of
# values per class (3-D); keep only the positive class (index 1) so the summary
# plot shows the contributions toward churn.
if shap_values.values.ndim == 3:
    shap_values = shap_values[:, :, 1]
# X_test is a plain NumPy array after scaling, so the column names are passed
# explicitly; otherwise the plot labels features as "Feature 0", "Feature 1", ...
shap.summary_plot(shap_values, X_test, feature_names=list(X.columns))