# 0.Import Libraries

In [2]:
import os
import warnings
from warnings import simplefilter
warnings.simplefilter(action='ignore', category=FutureWarning)

import numpy as np 
import pandas as pd 

import matplotlib.pyplot as plt
import seaborn as sns

sns.set(style = 'white')

import shap
from imblearn.over_sampling import SMOTE
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score
from mlxtend.plotting import plot_confusion_matrix
from sklearn.utils.class_weight import compute_class_weight
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

import xgboost as xgb
from xgboost import XGBClassifier

from scipy.cluster.vq import kmeans, vq, whiten
from scipy.cluster.hierarchy import linkage, fcluster, dendrogram

# 1.Load Dataset

In [8]:
import kagglehub

# Download the latest version of the Telco churn dataset; `path` is the local
# directory returned by kagglehub for the downloaded files.
path = kagglehub.dataset_download("blastchar/telco-customer-churn")

# Build the file path with os.path.join instead of concatenating a hard-coded
# '/' separator onto the directory string.
data = pd.read_csv(os.path.join(path, 'WA_Fn-UseC_-Telco-Customer-Churn.csv'))

Path to dataset files: C:\Users\esrag\.cache\kagglehub\datasets\blastchar\telco-customer-churn\versions\1


# 2.Data Analysis

In [9]:
def create_analysis_dataframe(dataset):
    """Build a one-row-per-column summary of ``dataset``.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame to profile.

    Returns
    -------
    pandas.DataFrame
        Columns ``Name``, ``Data_Type``, ``Unique_Size`` (distinct values as
        counted by ``Series.unique``), ``Null_Ratio`` (share of null rows),
        ``Most_Frequent_Value`` and ``Least_Frequent_Value``, sorted ascending
        by ``Unique_Size`` then ``Name``. For a column with no non-null
        values the two frequency fields are ``None`` instead of raising.
    """
    rows = []  # do not shadow the builtin `list`
    for column in dataset.columns:
        series = dataset[column]
        # value_counts() ignores nulls, so it is empty for an all-null column
        # (or an empty frame); idxmax/idxmin would raise on that.
        counts = series.value_counts()
        if counts.empty:
            most_frequent_value = None
            least_frequent_value = None
        else:
            most_frequent_value = counts.idxmax()
            least_frequent_value = counts.idxmin()
        rows.append([column,
                     series.dtype,
                     series.unique().size,
                     series.isnull().mean(),  # null count / row count
                     most_frequent_value,
                     least_frequent_value])
    dataset_info = pd.DataFrame(rows, columns=['Name',
                                               'Data_Type',
                                               'Unique_Size',
                                               'Null_Ratio',
                                               'Most_Frequent_Value',
                                               'Least_Frequent_Value'])
    return dataset_info.sort_values(by=['Unique_Size', 'Name'], ascending=True)

# Profile the freshly loaded frame; the bare last line renders the summary table.
dataset_info = create_analysis_dataframe(dataset=data)
dataset_info

Unnamed: 0,Name,Data_Type,Unique_Size,Null_Ratio,Most_Frequent_Value,Least_Frequent_Value
20,Churn,object,2,0.0,No,Yes
4,Dependents,object,2,0.0,No,Yes
16,PaperlessBilling,object,2,0.0,Yes,No
3,Partner,object,2,0.0,No,Yes
6,PhoneService,object,2,0.0,Yes,No
2,SeniorCitizen,int64,2,0.0,0,1
1,gender,object,2,0.0,Male,Female
15,Contract,object,3,0.0,Month-to-month,One year
11,DeviceProtection,object,3,0.0,No,No internet service
8,InternetService,object,3,0.0,Fiber optic,No


## 2.1 Total Charges
First I want to start by setting the index.

In the previous cell, I noticed that although **"TotalCharges"** holds numeric values, the type of the column is **'object'**.

Also, since its most frequent value is a **blank string (a single space)**, my second step will be to replace that value and convert the column to a numeric type.

In [None]:
# Use customerID as the row index. Guarded and non-inplace so that re-running
# this cell once the index is already set does not raise a KeyError.
if 'customerID' in data.columns:
    data = data.set_index('customerID')

In [None]:
# Display how often each raw TotalCharges value occurs (the column is still text here).
data['TotalCharges'].value_counts()

In [None]:
# Single-space entries are replaced by '0', then the whole column is cast to float.
total_charges_text = data['TotalCharges'].replace(' ', '0')
data['TotalCharges'] = total_charges_text.astype('float')

In [None]:
# Display only: the result is not assigned, so `data` is unchanged by this cell.
data['TotalCharges'].astype('float')

In [None]:
# Rebuild the column summary so it reflects the current state of `data`.
dataset_info = create_analysis_dataframe(dataset=data)
dataset_info

## 2.2 Visualization

In [None]:
# 2x2 grid: three numeric distributions plus the churn class counts.
fig, ((ax0, ax1), (ax2, ax3)) = plt.subplots(ncols=2,
                                             nrows=2,
                                             figsize=(16, 12))

# TotalCharges
sns.histplot(data['TotalCharges'], kde=True, ax=ax0, color='tomato')
ax0.set_title('TotalCharges Histogram Plot')
ax0.set_xlabel('TotalCharges Value')
# Was '# of Students' (copy-paste slip); the other panels all count rows.
ax0.set_ylabel('# of Rows')

# MonthlyCharges
sns.histplot(data['MonthlyCharges'], kde=True, ax=ax1)
ax1.set_title('MonthlyCharges Histogram Plot')
ax1.set_xlabel('MonthlyCharges Value')
ax1.set_ylabel('# of Rows')

# tenure
sns.histplot(data['tenure'], kde=True, ax=ax2, color='skyblue')
ax2.set_title('Tenure Histogram Plot')
ax2.set_xlabel('Tenure')
ax2.set_ylabel('# of Rows')

# Churn class counts (bar chart); count once and reuse for x and height.
churn_counts = data['Churn'].value_counts()
ax3.bar(x=churn_counts.index,
        height=churn_counts.values,
        color='skyblue')
ax3.set_title('Churn Histogram Plot')
ax3.set_xlabel('Churn Value')
ax3.set_ylabel('# of Rows')

# plt.show() matches the other plotting cells in this notebook; Figure.show()
# needs a GUI backend.
plt.show()

In [None]:
# MonthlyCharges against TotalCharges, points coloured by Churn.
sns.scatterplot(data=data,x='MonthlyCharges',y='TotalCharges',hue='Churn')

In [None]:
# TotalCharges against tenure, points coloured by Churn.
sns.scatterplot(data=data,x='TotalCharges',y='tenure',hue='Churn')

In [None]:
# 2x2 grid of box plots: each numeric column split by Churn.
fig, ((ax0, ax1), (ax2, ax3)) = plt.subplots(ncols=2,
                                             nrows=2,
                                             figsize=(16, 12))

# (column, axis, extra boxplot kwargs, title). MonthlyCharges passes no colour,
# so it keeps seaborn's default.
box_specs = [
    ('TotalCharges', ax0, {'color': 'tomato'}, 'TotalCharges - Churn Box Plot'),
    ('MonthlyCharges', ax1, {}, 'MonthlyCharges -Churn Box Plot'),
    ('tenure', ax2, {'color': 'skyblue'}, 'Tenure - Churn Box Plot'),
    ('SeniorCitizen', ax3, {'color': 'skyblue'}, 'SeniorCitizen - Churn Box Plot'),
]
for column, axis, style_kwargs, title in box_specs:
    sns.boxplot(x='Churn', y=column, data=data, ax=axis, **style_kwargs)
    axis.set_title(title)

# plt.show() matches the other plotting cells in this notebook; Figure.show()
# needs a GUI backend.
plt.show()

## 2.3 Statistical Analysis

In [None]:
# Names of the columns whose dtype is 'object', and how many there are.
object_frame = data.select_dtypes(include=['object'])
object_columns = list(object_frame.columns)
len(object_columns)

# 3.Data Preprocessing

## 3.1 Columns with only 2 unique values
My next step is to encode the columns that have only 2 unique labels, where those labels are 'No' and 'Yes'.

In [None]:
column_list = ['Churn',
               'Dependents',
               'PaperlessBilling',
               'Partner',
               'PhoneService']

# Explicit mapping instead of np.where(col == 'No', 0, 1): with np.where any
# value other than 'No' (missing, misspelled, ...) was silently coded as 1, and
# re-running the cell turned every column into all 1s. With map(), unexpected
# values surface as NaN, and already-numeric columns are left untouched.
yes_no_codes = {'No': 0, 'Yes': 1}
for each in column_list:
    if not pd.api.types.is_numeric_dtype(data[each]):
        data[each] = data[each].map(yes_no_codes)

dataset_info = create_analysis_dataframe(data)
dataset_info

## 3.2 Encoding the remaining Categorical columns
Because of **multicollinearity** in the dataset, I will drop the first dummy column of each feature when I apply one-hot encoding.

In [None]:
# One-hot encode the remaining categorical columns as 0/1 integers, dropping
# the first level of each so the dummies are not perfectly redundant.
data = pd.get_dummies(data, drop_first=True, dtype=int)

dataset_info = create_analysis_dataframe(dataset=data)
dataset_info

## 3.3 Multicollinearity
**We have multicollinearity in our data, so we should use models that are less sensitive to multicollinearity.**

In [None]:
def calc_vif(X):
    """Return the variance inflation factor of every column of ``X``.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature frame; its ``.values`` array is passed unchanged to
        ``variance_inflation_factor``.

    Returns
    -------
    pandas.DataFrame
        Columns ``variables`` and ``VIF``, sorted by ``VIF`` in descending order.
    """
    design = X.values
    scores = [variance_inflation_factor(design, position)
              for position in range(X.shape[1])]
    table = pd.DataFrame({"variables": X.columns, "VIF": scores})
    return table.sort_values(by=['VIF'], ascending=False)

# VIF of every predictor (all columns except the Churn target).
X = data.drop(columns='Churn')
calc_vif(X)

## 3.4 Correlation

In [None]:
# Spearman correlation of every column with Churn (Churn's own row removed),
# keeping only the columns whose absolute correlation exceeds 0.2.
spearman_corr = data.corr(method='spearman').drop(index='Churn')
strong_mask = spearman_corr['Churn'].abs() > 0.2
most_correlated_spearman = spearman_corr.loc[strong_mask, ['Churn']].sort_values(by='Churn')

fig = plt.figure(figsize=(10, 5))
# Bar chart of the retained correlations, weakest to strongest
plt.bar(x=most_correlated_spearman.index,
        height=most_correlated_spearman['Churn'],
        color='blue',
        width=0.4)
plt.xticks(rotation=90)
plt.xlabel("Column Names")
plt.ylabel("Correlation Values")
plt.title("The Most Correlated Columns (Method = 'Spearman')")
plt.show()

## 3.5 Train-Test Split

In [None]:
np.random.seed(123)

# Separate the target from the predictors; both get a fresh 0..n-1 index.
label = data['Churn'].reset_index(drop=True)
features = data.drop(columns=['Churn']).reset_index(drop=True)

# Stratified 67/33 split so both parts keep the same churn ratio.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    label,
    test_size=0.33,
    random_state=42,
    stratify=label,
)

In [None]:
# Preview the first rows of the predictor frame.
features.head()

## 3.6 Feature Scaling (Min-Max)

In [None]:
scaler = MinMaxScaler()

# The scaler is fitted on the training split only; the test split is
# transformed with those same training-derived parameters.
train_scaled = scaler.fit_transform(X_train)
test_scaled = scaler.transform(X_test)

# Wrap the arrays back into frames with the original column names and row labels.
X_train = pd.DataFrame(train_scaled, index=y_train.index, columns=features.columns)
X_test = pd.DataFrame(test_scaled, index=y_test.index, columns=features.columns)

# 4.Base Models
## 4.1 Logistic Regression

In [None]:
# Baseline: logistic regression with default settings, scored on the test split.
lg_model = LogisticRegression()
lg_model.fit(X_train, y_train)
lg_preds = lg_model.predict(X_test)

lg_accuracy = accuracy_score(y_test, lg_preds)
lg_f1 = f1_score(y_test, lg_preds)
print(f"Accuracy : {lg_accuracy} F1 Score : {lg_f1} ")

# Confusion matrix of true vs. predicted test labels.
result = confusion_matrix(y_test, lg_preds)
plot_confusion_matrix(conf_mat=result,
                      figsize=(5, 5),
                      class_names=np.unique(y_test))
plt.show()

## 4.2 Xgboost

In [None]:
# Class 0 is taken to be the negative class and class 1 the positive class.
# bincount yields the number of training rows per label, in label order.
class_counts = np.bincount(y_train)
neg, pos = class_counts
scale_pos_weight = neg / pos

In [None]:
# Class-weighted XGBoost model. The evaluation metric is set on the estimator:
# passing eval_metric to fit() is deprecated since XGBoost 1.6 and is rejected
# by newer releases.
xg_model = XGBClassifier(max_depth=4,
                         min_child_weight=3.50,
                         subsample=1,
                         colsample_bytree=0.73,
                         reg_alpha=0.0,
                         reg_lambda=0.5,
                         n_estimators=57,
                         scale_pos_weight=scale_pos_weight,
                         eval_metric="logloss")

# Record log loss on both splits after every boosting round to draw the
# learning curve. No early stopping is configured, so all n_estimators rounds run.
eval_set = [(X_train, y_train), (X_test, y_test)]
xg_model.fit(X_train, y_train, eval_set=eval_set, verbose=False)

# Extract results from evaluation history
# ('validation_0' is the first eval_set entry, 'validation_1' the second)
results = xg_model.evals_result()

# Plot learning curves
epochs = len(results['validation_0']['logloss'])
x_axis = range(0, epochs)

plt.figure(figsize=(10, 6))
plt.plot(x_axis, results['validation_0']['logloss'], label='Train')
plt.plot(x_axis, results['validation_1']['logloss'], label='Validation')
plt.xlabel('Epochs')
plt.ylabel('Log Loss')
plt.title('XGBoost Learning Curves')
plt.legend()
plt.show()

In [None]:
# Shallower, more regularised XGBoost model without class weighting. The
# evaluation metric is set on the estimator: passing eval_metric to fit() is
# deprecated since XGBoost 1.6 and is rejected by newer releases.
xgb_model = XGBClassifier(
    max_depth=3,
    min_child_weight=5,
    subsample=0.8,
    colsample_bytree=0.6,
    reg_alpha=0.1,
    reg_lambda=1.0,
    n_estimators=100,
    learning_rate=0.05,
    eval_metric="logloss",
#     scale_pos_weight=scale_pos_weight
)

# Record log loss on both splits after every boosting round.
eval_set = [(X_train, y_train), (X_test, y_test)]
xgb_model.fit(X_train, y_train, eval_set=eval_set, verbose=False)

# Extract results from evaluation history
# ('validation_0' is the first eval_set entry, 'validation_1' the second)
results = xgb_model.evals_result()

# Plot learning curves
epochs = len(results['validation_0']['logloss'])
x_axis = range(0, epochs)

plt.figure(figsize=(10, 6))
plt.plot(x_axis, results['validation_0']['logloss'], label='Train')
plt.plot(x_axis, results['validation_1']['logloss'], label='Validation')
plt.xlabel('Epochs')
plt.ylabel('Log Loss')
plt.title('XGBoost Learning Curves')
plt.legend()
plt.show()

In [None]:
# Score the model on the held-out test split.
prediction = xgb_model.predict(X_test)
# The labels previously said "train" although both metrics use y_test.
print(f"Accuracy test : {accuracy_score(y_test,prediction)} F1 Score test : {f1_score(y_test, prediction)} ")

# Confusion matrix of true vs. predicted test labels.
result=confusion_matrix(y_test,prediction)
plot_confusion_matrix(conf_mat=result,figsize=(5,5),class_names=np.unique(y_test))
plt.show()

In [None]:
# Explain the fitted XGBoost model: compute SHAP values for the test rows and
# draw the summary plot of per-feature contributions.
explainer = shap.Explainer(xgb_model)
shap_values = explainer.shap_values(X_test)
shap.summary_plot(shap_values=shap_values, features=X_test)

In [None]:
# Feature importance by 'weight' for the fitted XGBoost model, on a single 10x10 axes.
fig, ax0 = plt.subplots(figsize=(10, 10))
xgb.plot_importance(
    xgb_model,
    ax=ax0,
    importance_type='weight',
    xlabel='F-Score',
)
plt.show()

# 5.Clustering

In [None]:
# Fit the data into a hierarchical clustering algorithm (Ward linkage).
# 'cluster_labels' is excluded as well: a later cell adds that column to `data`,
# and re-running this cell afterwards would otherwise cluster on it.
# NOTE(review): the features are used unscaled here - confirm that is intended.
distance_matrix = linkage(data.drop(columns=['Churn', 'cluster_labels'], errors='ignore'), 'ward')

In [None]:
fig, ax = plt.subplots(figsize=(15,8))

# Create a dendrogram from the linkage result (leaf labels hidden)
dn = dendrogram(distance_matrix,ax=ax, no_labels=True)
# Display the dendrogram; plt.show() matches the other plotting cells in this
# notebook, whereas Figure.show() needs a GUI backend.
plt.show()

In [None]:
distortions = []
num_clusters = range(1, 10)

# Build the feature matrix once instead of on every iteration. 'cluster_labels'
# is excluded too: a later cell adds it to `data`, and re-running this cell
# afterwards would otherwise feed the labels back into k-means.
cluster_features = data.drop(columns=['Churn', 'cluster_labels'], errors='ignore')

# Create a list of distortions from the kmeans function, one per cluster count
for i in num_clusters:
    cluster_centers, distortion = kmeans(cluster_features, i)
    distortions.append(distortion)

# Create a DataFrame with two lists - num_clusters, distortions
elbow_plot = pd.DataFrame({'num_clusters': num_clusters, 'distortions': distortions})

# Create a line plot of num_clusters and distortions (the "elbow" plot)
sns.lineplot(x='num_clusters', y='distortions', data = elbow_plot)
plt.xticks(num_clusters)
plt.show()

In [None]:
# Feature matrix for clustering, built once and shared by kmeans and vq.
# 'cluster_labels' is excluded so that re-running this cell does not cluster on
# the labels it produced on the previous run.
# NOTE(review): the features are used unscaled here - confirm that is intended.
cluster_features = data.drop(columns=['Churn', 'cluster_labels'], errors='ignore')

# Generate cluster centers
cluster_centers, distortion = kmeans(cluster_features, 2)

# Assign cluster labels; vq also returns each row's distance to its assigned center
data['cluster_labels'], distortion_list = vq(cluster_features, cluster_centers)

In [None]:
# Plot clusters: x is distortion_list (the second value returned by vq for each
# row), y is tenure, coloured by assigned cluster.
sns.scatterplot(x=distortion_list, y='tenure', 
                hue='cluster_labels', data = data)
plt.show()

In [None]:
# Plot clusters: TotalCharges against tenure, coloured by assigned cluster.
sns.scatterplot(x='TotalCharges', y='tenure', 
                hue='cluster_labels', data = data)
plt.show()

In [None]:
# Mean of every column for each (cluster, churn) combination.
data.groupby(['cluster_labels','Churn']).mean()

In [None]:
# Share of rows that fall into each cluster.
data.cluster_labels.value_counts()/data.shape[0]

## 5.2 Models with Cluster Column 

In [None]:
np.random.seed(123)

# Same split recipe as the base models, but `data` now carries the
# 'cluster_labels' column, so it becomes one of the predictors.
label = data['Churn'].reset_index(drop=True)
features = data.drop(columns=['Churn']).reset_index(drop=True)

X_train, X_test, y_train, y_test = train_test_split(
    features,
    label,
    test_size=0.33,
    random_state=42,
    stratify=label,
)

In [None]:
# Preview the first rows of the training predictors.
X_train.head()

In [None]:
# Class-weighted XGBoost model trained on the features that include the cluster
# column. The evaluation metric is set on the estimator: passing eval_metric to
# fit() is deprecated since XGBoost 1.6 and is rejected by newer releases.
xgb_model = XGBClassifier(
    max_depth=3,
    min_child_weight=5,
    subsample=0.8,
    colsample_bytree=0.6,
    reg_alpha=0.1,
    reg_lambda=1.0,
    n_estimators=100,
    learning_rate=0.05,
    scale_pos_weight=scale_pos_weight,
    eval_metric="logloss",
)

# Record log loss on both splits after every boosting round.
eval_set = [(X_train, y_train), (X_test, y_test)]
xgb_model.fit(X_train, y_train, eval_set=eval_set, verbose=False)

# Extract results from evaluation history
# ('validation_0' is the first eval_set entry, 'validation_1' the second)
results = xgb_model.evals_result()

# Plot learning curves
epochs = len(results['validation_0']['logloss'])
x_axis = range(0, epochs)

plt.figure(figsize=(10, 6))
plt.plot(x_axis, results['validation_0']['logloss'], label='Train')
plt.plot(x_axis, results['validation_1']['logloss'], label='Validation')
plt.xlabel('Epochs')
plt.ylabel('Log Loss')
plt.title('XGBoost Learning Curves')
plt.legend()
plt.show()

In [None]:
# Score the cluster-augmented model on the held-out test split.
prediction = xgb_model.predict(X_test)
# The labels previously said "train" although both metrics use y_test.
print(f"Accuracy test : {accuracy_score(y_test,prediction)} F1 Score test : {f1_score(y_test, prediction)} ")

# Confusion matrix of true vs. predicted test labels.
result=confusion_matrix(y_test,prediction)
plot_confusion_matrix(conf_mat=result,figsize=(5,5),class_names=np.unique(y_test))
plt.show()