CIND 820 FINAL PROJECT : Customer Churn Prediction in E-commerce and Telecommunications

# ANALYSIS OF THE TELECOMMUNICATION DATASET

In [1]:
# 1. LOAD LIBRARIES
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import matplotlib
matplotlib.use('TkAgg') 

In [2]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score, accuracy_score, roc_curve, precision_recall_curve
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
import shap
from imblearn.over_sampling import SMOTE
from sklearn.metrics import f1_score
from pandas_profiling import ProfileReport
from sklearn.impute import SimpleImputer

  from pandas_profiling import ProfileReport


In [3]:
# 2. LOAD THE DATASET
file_path = "C:\\Users\\emine\\OneDrive\\Masaüstü\\CIND820\\Telco-Customer-Churn.csv"
df_telco = pd.read_csv(file_path)

2.1. BUILT EDA REPORT W/ RAW DATASET

In [4]:
# Load the raw dataset for EDA report generation
df_raw = pd.read_csv(file_path)
# Importing the pandas_profiling library for EDA report generation
from pandas_profiling import ProfileReport
# Generate a profiling report
profile_raw = ProfileReport(df_raw, title="EDA Report - Raw Telco Data", explorative=True)
# Save the report to an HTML file
profile_raw.to_file("C:/Users/emine/OneDrive/Masaüstü/CIND820/eda_telco_raw.html")
# COMMENT: Generates an exploratory data analysis (EDA) report for the raw dataset, providing insights into data structure, distributions, and potential issues.

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

100%|██████████| 21/21 [00:00<00:00, 55.79it/s]


Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

In [5]:
# 3. DISPLAY BASIC INFORMATION
print(df_telco.info())
print(df_telco.head())
print(df_telco.describe())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 


In [6]:
# 4. CHECK NUMBER OF UNIQUE VALUES PER COLUMN
unique_values = df_telco.nunique().sort_values()
print("Unique values per column:\n", unique_values)
# COMMENT: Helps identify categorical vs. numerical columns, and detect constant or near-constant features.

Unique values per column:
 gender                 2
SeniorCitizen          2
Partner                2
Dependents             2
PhoneService           2
PaperlessBilling       2
Churn                  2
MultipleLines          3
TechSupport            3
StreamingTV            3
OnlineBackup           3
DeviceProtection       3
StreamingMovies        3
Contract               3
OnlineSecurity         3
InternetService        3
PaymentMethod          4
tenure                73
MonthlyCharges      1585
TotalCharges        6531
customerID          7043
dtype: int64


In [7]:
# 5. DROP 'customerID' COLUMN
df_telco.drop('customerID', axis=1, inplace=True)
# COMMENT: 'customerID' is a unique identifier and doesn't contribute to predictive power.

In [8]:
# 6. CONVERT 'TotalCharges' TO NUMERIC
df_telco['TotalCharges'] = pd.to_numeric(df_telco['TotalCharges'], errors='coerce')
# COMMENT: Converts TotalCharges column to numeric, coercing invalid entries to NaN.

In [9]:
# 7. CHECK FOR MISSING VALUES IN 'TotalCharges'
missing_total_charges = df_telco['TotalCharges'].isnull().sum()
print(f"Missing TotalCharges values: {missing_total_charges}")
# COMMENT: Identify how many entries failed conversion and now contain NaNs.

Missing TotalCharges values: 11


In [10]:
# 8. DROP ROWS WHERE 'TotalCharges' IS NULL
df_telco = df_telco[df_telco['TotalCharges'].notnull()]
# COMMENT: Dropping a small number of missing rows is preferable to imputation in this case.

In [11]:
# 9. CONVERT 'Churn' TO BINARY
df_telco['Churn'] = df_telco['Churn'].map({'No': 0, 'Yes': 1})
# COMMENT: Converts target variable into binary format for modeling.

In [12]:
# 10. CONFIRM DATA CLEANING
print(df_telco.info())
print(df_telco['Churn'].value_counts())

<class 'pandas.core.frame.DataFrame'>
Index: 7032 entries, 0 to 7042
Data columns (total 20 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   gender            7032 non-null   object 
 1   SeniorCitizen     7032 non-null   int64  
 2   Partner           7032 non-null   object 
 3   Dependents        7032 non-null   object 
 4   tenure            7032 non-null   int64  
 5   PhoneService      7032 non-null   object 
 6   MultipleLines     7032 non-null   object 
 7   InternetService   7032 non-null   object 
 8   OnlineSecurity    7032 non-null   object 
 9   OnlineBackup      7032 non-null   object 
 10  DeviceProtection  7032 non-null   object 
 11  TechSupport       7032 non-null   object 
 12  StreamingTV       7032 non-null   object 
 13  StreamingMovies   7032 non-null   object 
 14  Contract          7032 non-null   object 
 15  PaperlessBilling  7032 non-null   object 
 16  PaymentMethod     7032 non-null   object 
 17  

In [13]:
# 11. ENCODING VARIABLES
cat_cols = df_telco.select_dtypes(include=['object']).columns.tolist()
df_encoded = pd.get_dummies(df_telco, columns=cat_cols, drop_first=True)
print(df_encoded.info())

<class 'pandas.core.frame.DataFrame'>
Index: 7032 entries, 0 to 7042
Data columns (total 31 columns):
 #   Column                                 Non-Null Count  Dtype  
---  ------                                 --------------  -----  
 0   SeniorCitizen                          7032 non-null   int64  
 1   tenure                                 7032 non-null   int64  
 2   MonthlyCharges                         7032 non-null   float64
 3   TotalCharges                           7032 non-null   float64
 4   Churn                                  7032 non-null   int64  
 5   gender_Male                            7032 non-null   bool   
 6   Partner_Yes                            7032 non-null   bool   
 7   Dependents_Yes                         7032 non-null   bool   
 8   PhoneService_Yes                       7032 non-null   bool   
 9   MultipleLines_No phone service         7032 non-null   bool   
 10  MultipleLines_Yes                      7032 non-null   bool   
 11  InternetS

In [14]:
# 11.1 PCA FOR DIMENSIONALITY REDUCTION (Visualization)
features = df_encoded.drop("Churn", axis=1)
target = df_encoded["Churn"]

In [15]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(features)

In [16]:
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)

In [17]:
pca_df = pd.DataFrame(X_pca, columns=['PC1', 'PC2'])
pca_df['Churn'] = target.values

In [18]:
plt.figure(figsize=(8, 6))
sns.scatterplot(data=pca_df, x='PC1', y='PC2', hue='Churn', palette='coolwarm', alpha=0.6)
plt.title("PCA - Customer Churn Visualization")
plt.tight_layout()
plt.show()

In [19]:
print(f"Explained Variance by PC1: {pca.explained_variance_ratio_[0]:.2%}")
print(f"Explained Variance by PC2: {pca.explained_variance_ratio_[1]:.2%}")

Explained Variance by PC1: 33.16%
Explained Variance by PC2: 12.01%


In [20]:
# 11.2. GENERATE EDA REPORT FOR ENCODED DATA
profile_clean = ProfileReport(df_encoded, title="EDA Report - Cleaned Telco Data", explorative=True)
profile_clean.to_file("C:/Users/emine/OneDrive/Masaüstü/CIND820/eda_telco_cleaned.html")
# COMMENT: Generates an EDA report for the cleaned and encoded dataset, providing insights into feature distributions, correlations, and potential issues.

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

100%|██████████| 31/31 [00:00<?, ?it/s]


Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

In [21]:
# 12. SPLIT DATA FOR FEATURE ENGINEERING AND MODELING
X = df_encoded.drop("Churn", axis=1)
y = df_encoded["Churn"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [22]:
# 12.1. FEATURE ENGINEERING
df_encoded['IsLongTermCustomer'] = (df_telco['tenure'] > 24).astype(int)
df_encoded['HighMonthlyChargeFlag'] = (df_telco['MonthlyCharges'] > 70).astype(int)
df_encoded['TotalChargesPerMonth'] = df_telco['TotalCharges'] / df_telco['tenure'].replace(0, np.nan)
df_encoded['TotalChargesPerMonth'] = df_encoded['TotalChargesPerMonth'].fillna(0)
contract_map = {'Month-to-month': 0, 'One year': 1, 'Two year': 2}
df_encoded['ContractLength'] = df_telco['Contract'].map(contract_map)

13. ENGAGEMENT SCORE (Bundled Services Index)
This metrric shows how many value-added digital services (security, support, streaming) a customer uses.
It quantifies customer engagement with the service ecosystem, which can be a strong predictor of churn.

In [23]:
bundled_features = ['OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies']
bundled_cols_train = [col for col in X_train.columns if any(f"{f}_Yes" in col for f in bundled_features)]
bundled_cols_test = [col for col in X_test.columns if any(f"{f}_Yes" in col for f in bundled_features)]

In [24]:
X_train['EngagementScore'] = X_train[bundled_cols_train].sum(axis=1)
X_test['EngagementScore'] = X_test[bundled_cols_test].sum(axis=1)
# COMMENT: EngagementScore quantifies how many value-added digital services (security, support, streaming) a customer uses. Higher scores may signal greater customer retention due to stronger integration with the service ecosystem.

In [25]:
# 13.1. VISUALIZE ENGAGEMENT SCORE DISTRIBUTION FOR INTERPRETATION
plt.figure(figsize=(8, 4))
sns.histplot(X_train['EngagementScore'], bins=7, kde=False, color='skyblue', edgecolor='black')
plt.title("Distribution of Engagement Score (Train Set)")
plt.xlabel("Number of Active Digital Services")
plt.ylabel("Customer Count")
plt.tight_layout()
plt.show()
# COMMENT: This histogram shows the distribution of Engagement Scores, indicating how many bundled services customers typically use. Most customers use 2-3 services, with fewer using all 6.
# COMMENT: EngagementScore quantifies how many value-added digital services (security, support, streaming) a customer uses. 
# Higher scores may signal greater customer retention due to stronger integration with the service ecosystem.

In [26]:
# 13.2. CHURN RATE DISTRUBITION BY ENGAGEMENT SCORE
plt.figure(figsize=(8, 5))
sns.boxplot(x=y_train, y=X_train['EngagementScore'], palette='pastel')
plt.title("Engagement Score by Churn Status")
plt.xlabel("Churn (0 = No, 1 = Yes)")
plt.ylabel("Engagement Score")
plt.xticks([0, 1], ['Non-Churn', 'Churn'])
plt.tight_layout()
plt.show()


Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.boxplot(x=y_train, y=X_train['EngagementScore'], palette='pastel')


In [27]:
#13.3. CHECK AVERAGE ENGAGEMENT SCORE FOR CHURNED AND NON-CHURNED CUSTOMERS
# Calculate mean EngagementScore using training set only
mean_engaged_churn = X_train[y_train == 1]['EngagementScore'].mean()
mean_engaged_nonchurn = X_train[y_train == 0]['EngagementScore'].mean()

In [28]:
print(f"Average EngagementScore (Churned): {mean_engaged_churn:.2f}")
print(f"Average EngagementScore (Non-Churned): {mean_engaged_nonchurn:.2f}")
# COMMENT: The average EngagementScore shows that churned customers use fewer bundled digital services compared to non-churned ones.
# COMMENT: Calculating these averages from the training set ensures no data leakage while still revealing meaningful patterns.
#COMMENT: Churned customers had a noticeably lower average engagement score (1.80) compared to retained ones (2.13), suggesting that reduced customer-platform interaction is associated with higher churn risk.

Average EngagementScore (Churned): 1.79
Average EngagementScore (Non-Churned): 2.14


In [29]:
# 14. EDA VISUALS FOR NUMERICAL DISTRIBUTIONS
num_cols = ['tenure', 'MonthlyCharges', 'TotalCharges']
plt.figure(figsize=(15, 4))
for i, col in enumerate(num_cols):
    plt.subplot(1, 3, i + 1)
    sns.histplot(df_telco[col], kde=True, bins=30)
    plt.title(f"Distribution of {col}")
plt.tight_layout()
plt.show()

COMMENT: Visualizes the distribution of key numerical features, helping to spot skewness or multimodal patterns.

In [30]:
# 15. CATEGORICAL FEATURE DISTRIBUTION BY CHURN
cat_eda_cols = ['InternetService', 'Contract', 'PaymentMethod']
plt.figure(figsize=(15, 8))
for i, col in enumerate(cat_eda_cols):
    plt.subplot(2, 2, i + 1)
    sns.countplot(data=df_telco, x=col, hue='Churn')
    plt.title(f"{col} by Churn")
    plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

COMMENT: Reveals relationships between churn and key categorical features through side-by-side bar plots.

15.1. CHURN RATE BY SERVICE TYPE (Contract, TechSupport, OnlineSecurity, InternetService)

In [31]:
def churn_rate_by_category(df, column):
    churn_pct = pd.crosstab(df[column], df['Churn'], normalize='index') * 100
    churn_pct = churn_pct.rename(columns={0: 'Non-Churn %', 1: 'Churn %'})

    # Visualization
    churn_pct['Churn %'].plot(kind='bar', figsize=(8, 5), color='salmon', edgecolor='black')
    plt.title(f"Churn Rate by {column}")
    plt.ylabel("Churn Percentage")
    plt.xlabel(column)
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()

    print(f"\nChurn Percentage by {column}:\n", churn_pct.round(2))

In [32]:
# Run for selected features
for col in ['Contract', 'TechSupport', 'OnlineSecurity', 'InternetService']:
    churn_rate_by_category(df_telco, col)


Churn Percentage by Contract:
 Churn           Non-Churn %  Churn %
Contract                            
Month-to-month        57.29    42.71
One year              88.72    11.28
Two year              97.15     2.85

Churn Percentage by TechSupport:
 Churn                Non-Churn %  Churn %
TechSupport                              
No                         58.35    41.65
No internet service        92.57     7.43
Yes                        84.80    15.20

Churn Percentage by OnlineSecurity:
 Churn                Non-Churn %  Churn %
OnlineSecurity                           
No                         58.22    41.78
No internet service        92.57     7.43
Yes                        85.36    14.64

Churn Percentage by InternetService:
 Churn            Non-Churn %  Churn %
InternetService                      
DSL                    81.00    19.00
Fiber optic            58.11    41.89
No                     92.57     7.43


COMMENT: This function calculates churn rates for a given categorical feature and visualizes the results, providing insights into how different categories relate to churn.
COMMENT: This analysis shows how different service features impact churn rates, providing actionable insights for customer retention strategies.
Contract : Customers with Month-to-month contracts exhibit a significantly higher churn rate (over 40%) compared to those on One year or Two year plans. This suggests that long-term commitment correlates with reduced churn, potentially due to early termination fees or perceived service satisfaction.
TechSupport : Churn is notably higher among customers who do not have Tech Support services. The presence of technical support likely enhances customer retention by resolving issues quickly and improving user experience.
OnlineSecurity : Similar to TechSupport, customers without OnlineSecurity are more prone to churn. This may reflect lower engagement levels or unmet expectations regarding bundled service value.
InternetService : Among the InternetService categories, Fiber optic users have the highest churn rate—likely due to higher costs or competitive alternatives. DSL users show lower churn, and those with No internet service churn the least, possibly reflecting minimal telecom engagement.

In [33]:
# 16. CROSSTABS FOR CHURN
cat_features_to_check = ['Contract', 'InternetService', 'PaymentMethod', 'OnlineSecurity', 'TechSupport']
print("\nChurn Crosstab (% by Category)\n")
for col in cat_features_to_check:
    if col in df_telco.columns:
        print(f"\n{col} vs Churn")
        cross = pd.crosstab(df_telco[col], df_telco['Churn'], normalize='index') * 100
        print(cross.round(2))


Churn Crosstab (% by Category)


Contract vs Churn
Churn               0      1
Contract                    
Month-to-month  57.29  42.71
One year        88.72  11.28
Two year        97.15   2.85

InternetService vs Churn
Churn                0      1
InternetService              
DSL              81.00  19.00
Fiber optic      58.11  41.89
No               92.57   7.43

PaymentMethod vs Churn
Churn                          0      1
PaymentMethod                          
Bank transfer (automatic)  83.27  16.73
Credit card (automatic)    84.75  15.25
Electronic check           54.71  45.29
Mailed check               80.80  19.20

OnlineSecurity vs Churn
Churn                    0      1
OnlineSecurity                   
No                   58.22  41.78
No internet service  92.57   7.43
Yes                  85.36  14.64

TechSupport vs Churn
Churn                    0      1
TechSupport                      
No                   58.35  41.65
No internet service  92.57   7.43
Yes       

In [34]:
# 17.ANOVA TESTS FOR NUMERICAL FEATURES ACROSS CATEGORICAL FEATURES
#Import necessary libraries(f_oneway is the one-way ANOVA function from SciPy.)
from scipy.stats import f_oneway

In [35]:
print("\n--- ANOVA TEST RESULTS ---")


--- ANOVA TEST RESULTS ---


In [36]:
# MonthlyCharges across InternetService
groups1 = [df_telco[df_telco['InternetService'] == cat]['MonthlyCharges'] for cat in df_telco['InternetService'].unique()]
f1, p1 = f_oneway(*groups1)
print(f"MonthlyCharges by InternetService - F: {f1:.4f}, p: {p1:.4f}")

MonthlyCharges by InternetService - F: 16065.0322, p: 0.0000


In [37]:
# TotalCharges across Contract
groups2 = [df_telco[df_telco['Contract'] == cat]['TotalCharges'] for cat in df_telco['Contract'].unique()]
f2, p2 = f_oneway(*groups2)
print(f"TotalCharges by Contract - F: {f2:.4f}, p: {p2:.4f}")
##COMMENT:This code performs ANOVA tests to see if there are statistically significant differences in charges across different customer groups:
#MonthlyCharges by InternetService→ Tests if average monthly charges differ by Internet type (e.g., DSL, Fiber, No service).
#TotalCharges by Contract→ Tests if total charges differ by contract type (e.g., Month-to-month, One year, Two year).If the p-value < 0.05, it means there's a significant difference between the groups.
#This test is both meaningful and contributes to reporting in terms of understanding the indirect effect of pricing on churn.

TotalCharges by Contract - F: 934.7434, p: 0.0000


In [38]:
# 18. CORRELATION ANALYSIS
plt.figure(figsize=(6, 4))
sns.countplot(x='Churn', data=df_telco)
plt.title("Churn Class Distribution")
plt.show()

In [39]:
# 19. SHAPIRO-WILK NORMALITY TEST
from scipy.stats import shapiro
for col in ['tenure', 'MonthlyCharges', 'TotalCharges']:
    stat, p = shapiro(df_telco[col])
    print(f"{col} - p-value: {p:.4f}")
##COMMENT:"Shapiro-Wilk test indicated that tenure and TotalCharges are not normally distributed (p < 0.05), justifying the use of tree-based models like Random Forest."

tenure - p-value: 0.0000
MonthlyCharges - p-value: 0.0000
TotalCharges - p-value: 0.0000


  res = hypotest_fun_out(*samples, **kwds)


In [40]:
# 20. OUTLIER ANALYSIS - Z-SCORE
from scipy.stats import zscore
z_scores = df_telco[['tenure', 'MonthlyCharges', 'TotalCharges']].apply(zscore)
print("Outlier counts:")
print((z_scores > 3).sum())

Outlier counts:
tenure            0
MonthlyCharges    0
TotalCharges      0
dtype: int64


In [41]:
# 20.1. OUTLIER VISUALIZATION - BOX PLOTS
plt.figure(figsize=(15, 4))

<Figure size 1500x400 with 0 Axes>

In [42]:
for i, col in enumerate(['tenure', 'MonthlyCharges', 'TotalCharges']):
    plt.subplot(1, 3, i + 1)
    sns.boxplot(x=df_telco[col], color='skyblue')
    plt.title(f'Boxplot of {col}')
    plt.xlabel(col)

In [43]:
plt.tight_layout()
plt.show()
#COMMENT:Boxplots and Z-score-based scatter plots reveal the presence of outliers particularly in TotalCharges and MonthlyCharges. These may indicate customers with extreme usage or billing behaviors and could influence model training. Outlier handling (e.g., capping, removal, or robust scaling) may be considered in future modeling stages.

In [44]:
# 21. CHI-SQUARE TEST FOR CATEGORICAL FEATURES
import matplotlib.pyplot as plt
from scipy.stats import chi2_contingency
# Imports this function that performs the Chi-square test of independence to determine whether two categorical variables are significantly associated.

In [45]:
chi2_results = []

In [46]:
for col in cat_cols:
    cont_table = pd.crosstab(df_telco[col], df_telco['Churn'])
    chi2, p, dof, ex = chi2_contingency(cont_table)
    chi2_results.append((col, p))

In [47]:
# CREATE A DATAFRAME FOR CHI-SQUARE RESULTS
chi2_df = pd.DataFrame(chi2_results, columns=['Feature', 'p_value'])
chi2_df = chi2_df.sort_values(by='p_value')

In [48]:
# CHOOSE SIGNIFICANT FEATURES P-VALUES < 0.05
sig_features = chi2_df[chi2_df['p_value'] < 0.05]

In [49]:
# SHOW SIGNIFICANT FEATURES
sig_features = chi2_df[chi2_df['p_value'] < 0.05].copy()
sig_features['-log10(p-value)'] = -np.log10(sig_features['p_value'])
plt.figure(figsize=(10, 6))
plt.barh(sig_features['Feature'], sig_features['-log10(p-value)'])
plt.xlabel("-log10(p-value)")
plt.title("Chi-Square Test: Feature Significance for Churn")
plt.tight_layout()
plt.show()
##COMMENT:Statistical tests conducted on both numerical and categorical variables have revealed significant relationships between churn (customer loss) and many variables. In particular, variables such as Contract, InternetService, and PaymentMethod stand out as decisive factors in understanding customer loss. These variables must definitely be taken into account in the subsequent modeling phase.

In [50]:
# 22. SCALING NUMERICAL FEATURES
scaler = StandardScaler()
df_encoded[['tenure', 'MonthlyCharges', 'TotalCharges']] = scaler.fit_transform(
    df_encoded[['tenure', 'MonthlyCharges', 'TotalCharges']])

In [51]:
# 23. MULTICOLLINEARITY ANALYSIS (VIF)
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [52]:
X_vif = df_encoded.select_dtypes(include=['float64', 'int64']).dropna()

In [53]:
vif_df = pd.DataFrame()
vif_df["Feature"] = X_vif.columns
vif_df["VIF"] = [variance_inflation_factor(X_vif.values, i) for i in range(X_vif.shape[1])]

In [54]:
vif_df = vif_df[vif_df["VIF"] > 5]
print("\n Top 15 features with highest VIF values:")
print(vif_df.sort_values(by="VIF", ascending=False).head(15))


 Top 15 features with highest VIF values:
                 Feature        VIF
7   TotalChargesPerMonth  15.365626
1                 tenure  11.435597
3           TotalCharges   9.928062
5     IsLongTermCustomer   9.062949
6  HighMonthlyChargeFlag   7.988088
2         MonthlyCharges   6.295981


In [55]:
#DEFINE TOP 15 FEATURES WITH HIGHEST VIF
top_vif = vif_df.sort_values(by="VIF", ascending=False).head(15)

In [56]:
# VISUALIZE TOP 15 FEATURES WITH HIGHEST VIF
plt.figure(figsize=(10, 6))
plt.barh(top_vif["Feature"], top_vif["VIF"])
plt.xlabel("VIF Value")
plt.title("Top 15 Features with Highest VIF (Multicollinearity)")
plt.grid(axis='x')
plt.tight_layout()
plt.show()
# INTERPRETATION COMMENT: The VIF analysis indicates potential multicollinearity issues among several features, particularly those related to internet service types and payment methods. Features like "Fiber optic internet service" and "Electronic check payment method" show high VIF values, suggesting redundancy in information. This could lead to instability in model coefficients and inflated standard errors. Addressing multicollinearity may involve removing or combining features, or using regularization techniques in modeling.

In [57]:
# 24. CORRELATION HEATMAP
plt.figure(figsize=(14, 10))
corr = df_encoded.corr()
sns.heatmap(corr, cmap='coolwarm', center=0, linewidths=0.5)
plt.title("Feature Correlation Heatmap")
plt.tight_layout()
plt.show()

#COMMENT: Correlation heatmap allows quick detection of redundant features or strong linear relationships.
#COMMENT: The heatmap illustrates the linear correlations between numerical features in the Telco churn dataset. Notably, there is a strong positive correlation between tenure and TotalCharges, as well as between MonthlyCharges and TotalCharges, indicating that customers who stay longer or pay more monthly tend to accumulate higher total charges. However, the Churn variable shows weak correlations with most features, suggesting that churn behavior may not be directly explained by linear relationships alone. This highlights the importance of using advanced modeling techniques, such as logistic regression or SHAP, to uncover more complex patterns behind customer churn.

In [58]:
# 24.1. TARGET-CORRELATION VISUALIZATION
target_corr = corr['Churn'].sort_values(ascending=False)[1:11]  # exclude self-correlation
plt.figure(figsize=(8, 6))
target_corr.plot(kind='barh')
plt.title("Top 10 Features Most Correlated with Churn")
plt.xlabel("Correlation with Churn")
plt.gca().invert_yaxis()
plt.tight_layout()
plt.show()
# COMMENT: The bar chart shows the top features most linearly correlated with churn, yet all exhibit very weak correlations. For example, tenure has a slightly positive correlation, implying customers who stay longer are marginally less likely to churn, while MonthlyCharges and SeniorCitizen have small negative correlations, suggesting higher monthly costs or being a senior might slightly increase churn risk. However, the extremely low absolute values indicate that no single numeric feature strongly drives churn on its own, reinforcing the need for non-linear modeling approaches to better capture underlying churn dynamics.
# COMMENT: Highlights which features are most associated with churn—useful for feature selection.

25. RANDOM FOREST CLASSIFIER ANALYSIS

In [59]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve, accuracy_score
import matplotlib.pyplot as plt
import seaborn as sns

In [60]:
# SPLIT DATA INTO X AND Y
X = df_encoded.drop("Churn", axis=1)
y = df_encoded["Churn"]

In [61]:
# SPLIT INTO TRAIN AND TEST BEFORE APPLYING SMOTE
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [62]:
# APPLY SMOTE ONLY TO TRAINING DATA
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

In [63]:
# TRAIN THE RANDOM FOREST MODEL
#The Random Forest model is being trained with data balanced after SMOTE.
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train_resampled, y_train_resampled)
print("Random Forest Model Training Complete.")

Random Forest Model Training Complete.


In [64]:
# FEATURE IMPORTANCE
importances = pd.Series(rf_model.feature_importances_, index=X.columns)
top_features = importances.sort_values(ascending=False).head(20)

In [65]:
print("\nTop 20 Important Features from Random Forest:")
print(top_features)


Top 20 Important Features from Random Forest:
tenure                            0.135351
TotalCharges                      0.115612
TotalChargesPerMonth              0.107463
MonthlyCharges                    0.104933
ContractLength                    0.092276
PaymentMethod_Electronic check    0.053015
IsLongTermCustomer                0.041083
InternetService_Fiber optic       0.037976
PaperlessBilling_Yes              0.030831
Contract_Two year                 0.028780
gender_Male                       0.019830
Partner_Yes                       0.016973
MultipleLines_Yes                 0.016208
Contract_One year                 0.015463
TechSupport_Yes                   0.014997
Dependents_Yes                    0.014683
OnlineBackup_Yes                  0.014397
OnlineSecurity_Yes                0.013832
DeviceProtection_Yes              0.013235
StreamingMovies_Yes               0.012873
dtype: float64


In [66]:
# PLOT FEATURE IMPORTANCE
plt.figure(figsize=(12, 8))
top_features.plot(kind='barh', color='skyblue')
plt.title("Top 20 Important Features from Random Forest")
plt.xlabel("Feature Importance")
plt.ylabel("Features")
plt.tight_layout()
plt.show()

In [67]:
# 25.1 VISUALIZE CLASS DISTRIBUTION AFTER SMOTE
plt.figure(figsize=(6, 4))
sns.countplot(x=y_train_resampled, hue=y_train_resampled, palette="Set2", legend=False)
plt.title("Churn Class Distribution After SMOTE")
plt.xlabel("Churn")
plt.ylabel("Count")
plt.tight_layout()
plt.show()

In [68]:
# INFORMATION ON CLASS DISTRIBUTION AFTER SMOTE
print("\nClass distribution after SMOTE:")
print(y_train_resampled.value_counts())


Class distribution after SMOTE:
Churn
1    3614
0    3614
Name: count, dtype: int64


In [69]:
# 25.2. PREDICTIONS USING DEFAULT RANDOM FOREST MODEL & EVALUATION
y_pred = rf_model.predict(X_test)
y_prob = rf_model.predict_proba(X_test)[:, 1]

In [70]:
# CLASSIFICATION REPORT
print("\nClassification Report:")
print(classification_report(y_test, y_pred))


Classification Report:
              precision    recall  f1-score   support

           0       0.86      0.82      0.84      1549
           1       0.56      0.64      0.60       561

    accuracy                           0.77      2110
   macro avg       0.71      0.73      0.72      2110
weighted avg       0.78      0.77      0.78      2110



In [71]:
# ACCURACY AND ROC AUC
print("Accuracy:", accuracy_score(y_test, y_pred))
print("ROC AUC Score:", roc_auc_score(y_test, y_prob))

Accuracy: 0.7729857819905214
ROC AUC Score: 0.8172738665276547


In [72]:
# CONFUSION MATRIX
cm = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(6, 4))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
plt.title("Confusion Matrix")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.tight_layout()
plt.show()

In [73]:
# ROC CURVE
fpr, tpr, _ = roc_curve(y_test, y_prob)
plt.figure(figsize=(6, 4))
plt.plot(fpr, tpr, label=f"AUC = {roc_auc_score(y_test, y_prob):.2f}")
plt.plot([0, 1], [0, 1], 'k--')
plt.title("ROC Curve")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.legend(loc='lower right')
plt.tight_layout()
plt.show()

In [74]:
# CLASSIFICATION REPORT
print("\nClassification Report (Random Forest):")
print(classification_report(y_test, y_pred))


Classification Report (Random Forest):
              precision    recall  f1-score   support

           0       0.86      0.82      0.84      1549
           1       0.56      0.64      0.60       561

    accuracy                           0.77      2110
   macro avg       0.71      0.73      0.72      2110
weighted avg       0.78      0.77      0.78      2110



In [75]:
# ACCURACY AND ROC AUC
print("Accuracy (Random Forest):", accuracy_score(y_test, y_pred))
print("ROC AUC Score (Random Forest):", roc_auc_score(y_test, y_prob))

Accuracy (Random Forest): 0.7729857819905214
ROC AUC Score (Random Forest): 0.8172738665276547


In [76]:
# CONFUSION MATRIX
cm = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(6, 4))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
plt.title("Confusion Matrix - Random Forest")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.tight_layout()
plt.show()

In [77]:
# ROC CURVE
fpr, tpr, _ = roc_curve(y_test, y_prob)
plt.figure(figsize=(6, 4))
plt.plot(fpr, tpr, label=f"AUC = {roc_auc_score(y_test, y_prob):.2f}")
plt.plot([0, 1], [0, 1], 'k--')
plt.title("ROC Curve - Random Forest")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.legend(loc='lower right')
plt.tight_layout()
plt.show()

26. LOGISTIC REGRESSION CLASSIFIER ANALYSIS

In [78]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve, accuracy_score

In [79]:
# 26.1. INITIALIZE AND TRAIN MODEL
log_model = LogisticRegression(max_iter=1000, solver='liblinear')  # liblinear is good for small datasets
log_model.fit(X_train_resampled, y_train_resampled)
print("Logistic Regression Model Training Complete.")

Logistic Regression Model Training Complete.


In [80]:
# 26.2. PREDICTIONS
y_pred_log = log_model.predict(X_test)
y_prob_log = log_model.predict_proba(X_test)[:, 1]

In [81]:
# 26.3. PERFORMANCE METRICS
print("\nClassification Report (Logistic Regression):")
print(classification_report(y_test, y_pred_log))


Classification Report (Logistic Regression):
              precision    recall  f1-score   support

           0       0.88      0.78      0.83      1549
           1       0.54      0.71      0.61       561

    accuracy                           0.76      2110
   macro avg       0.71      0.75      0.72      2110
weighted avg       0.79      0.76      0.77      2110



In [82]:
print("Accuracy (Logistic Regression):", accuracy_score(y_test, y_pred_log))
print("ROC AUC Score (Logistic Regression):", roc_auc_score(y_test, y_prob_log))

Accuracy (Logistic Regression): 0.7630331753554502
ROC AUC Score (Logistic Regression): 0.8309069504907428


In [83]:
# 26.4. CONFUSION MATRIX
cm_log = confusion_matrix(y_test, y_pred_log)
plt.figure(figsize=(6, 4))
sns.heatmap(cm_log, annot=True, fmt='d', cmap='Purples')
plt.title("Confusion Matrix - Logistic Regression")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.tight_layout()
plt.show()

In [84]:
# 26.5. ROC CURVE
fpr_log, tpr_log, _ = roc_curve(y_test, y_prob_log)
plt.figure(figsize=(6, 4))
plt.plot(fpr_log, tpr_log, label=f"AUC = {roc_auc_score(y_test, y_prob_log):.2f}")
plt.plot([0, 1], [0, 1], 'k--')
plt.title("ROC Curve - Logistic Regression")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.legend(loc='lower right')
plt.tight_layout()
plt.show()

In [85]:
# 26.6. COEFFICIENTS (Feature Impact)
coefficients = pd.DataFrame({
    'Feature': X.columns,
    'Coefficient': log_model.coef_[0]
}).sort_values(by='Coefficient', ascending=False)

In [86]:
print("\nTop Features Influencing Churn (Logistic Regression):")
print(coefficients.head(10))


Top Features Influencing Churn (Logistic Regression):
                        Feature  Coefficient
10  InternetService_Fiber optic     6.722004
25            Contract_Two year     3.776131
23          StreamingMovies_Yes     2.498256
21              StreamingTV_Yes     2.416554
7              PhoneService_Yes     2.242119
24            Contract_One year     1.681643
9             MultipleLines_Yes     1.499554
17         DeviceProtection_Yes     1.227922
3                  TotalCharges     1.098625
15             OnlineBackup_Yes     1.068372


In [87]:
# Optional plot
plt.figure(figsize=(10, 6))
coefficients.set_index('Feature').sort_values(by='Coefficient', ascending=True).tail(20).plot(kind='barh')
plt.title("Top Influential Features - Logistic Regression Coefficients")
plt.xlabel("Coefficient Value")
plt.tight_layout()
plt.show()

In [88]:
# 27. MODEL PERFORMANCE COMPARISON TABLE
# Metric values – replace these with your actual model scores if needed
rf_accuracy = accuracy_score(y_test, y_pred)
rf_auc = roc_auc_score(y_test, y_prob)

In [89]:
log_accuracy = accuracy_score(y_test, y_pred_log)
log_auc = roc_auc_score(y_test, y_prob_log)

In [90]:
# Create comparison table
comparison_df = pd.DataFrame({
    'Model': ['Random Forest', 'Logistic Regression'],
    'Accuracy': [rf_accuracy, log_accuracy],
    'ROC AUC': [rf_auc, log_auc]
})

In [91]:
# Display the table
print("\nModel Performance Comparison:")
print(comparison_df.round(3))


Model Performance Comparison:
                 Model  Accuracy  ROC AUC
0        Random Forest     0.773    0.817
1  Logistic Regression     0.763    0.831


#COMMENT:    |MODEL|---------------|Accuracy|---|ROC AUC|
            |Random Forest|---------|0.773|-----|0.817|
            |Logistic Regression|---|0.763|-----|0.831|
# While the Random Forest model offers slightly better classification accuracy, the Logistic Regression model shows stronger performance in discriminative power (as evidenced by its higher ROC AUC). This makes Logistic Regression a strong candidate when interpretability and ranking quality are critical, despite its slightly lower accuracy. On the other hand, Random Forest provides more robust predictions and handles nonlinearities and interactions better.Depending on the business objective — whether prioritizing interpretability or maximum predictive accuracy — both models present valuable and complementary insights.

#INTERPRETATION ABOUT SHAP PLOTS:In the Telco dataset analysis, both Logistic Regression and Random Forest models were implemented to predict customer churn. While SHAP (SHapley Additive exPlanations) is a powerful tool for model interpretability, it was not applied to the Telco models for the following reasons:
Logistic Regression already provides inherent interpretability through model coefficients. Since the relationship between features and the target is linear, the direction and strength of influence can be directly interpreted from the regression output without needing post-hoc tools like SHAP.
Random Forest, although non-linear, was evaluated using feature importance scores, which offered sufficient insight into the key drivers of churn. Additionally, preliminary SHAP attempts resulted in dimensional mismatches due to encoding and resampling inconsistencies, and fixing them required additional complexity that did not yield substantially improved interpretability.
Therefore, to maintain model clarity and analytical focus, SHAP was only utilized for the e-commerce dataset, where more complex models like XGBoost were employed and interpretability was essential due to higher feature interactions.

In [92]:
# 27.1. SHAP EXPLAINABILITY
import shap

In [93]:
# Ensure X_test is a DataFrame with the same column structure used in training
X_test_shap = X_test.copy()
explainer = shap.TreeExplainer(rf_model)
shap_values = explainer.shap_values(X_test_shap)

In [94]:
# SHAP plot handling based on structure
if isinstance(shap_values, list):
    shap.summary_plot(shap_values[1], X_test_shap, plot_type="bar")
    shap.summary_plot(shap_values[1], X_test_shap)
else:
    shap.summary_plot(shap_values, X_test_shap, plot_type="bar")
    shap.summary_plot(shap_values, X_test_shap)
# COMMENT: SHAP provides global understanding of feature importance and directionality. Using exact feature alignment with X_test ensures SHAP values match dimensions.
# COMMENT: Top SHAP features like 'Contract_Two year', 'MonthlyCharges', and 'tenure' suggest that customers with long contracts and lower monthly charges are less likely to churn, aligning with business expectations. Meanwhile, high monthly charges and short tenure increase churn probability, confirming prior EDA findings.

In [95]:
# 28. RANDOM FOREST DECISION TREE VISUALIZATION
from sklearn.tree import plot_tree
plt.figure(figsize=(20, 10))
plot_tree(rf_model.estimators_[0], feature_names=X_train.columns, filled=True, max_depth=3, fontsize=10)
plt.title("Random Forest - First Tree Visualization (Depth=3)")
plt.tight_layout()
plt.show()
# COMMENT: This simplified tree helps interpret a single estimator's decision flow in the Random Forest. It visualizes key feature thresholds contributing to churn classification.

COMMENT: This decision tree shows how the Random Forest model splits on features like ContractLength, TotalCharges, StreamingTV, and PaymentMethod to distinguish churn vs. non-churn.
Customers with short contract lengths, low total charges, and specific service patterns are more likely to churn.
On the other hand, customers with longer contracts and consistent payment histories are less likely to churn.
Visualizing one tree helps interpret how the ensemble makes its decisions in a human-readable form.

WHY THE MODEL PREDICTS CHURN:
- ContractLength ≤ 0.5 → short-term (month-to-month) customers are more churn-prone.
- No StreamingTV or StreamingMovies suggests low engagement.
- PaymentMethod_Electronic check users have higher churn risk.
- Customers with low tenure (new customers) tend to churn more.

WHY THE MODEL PREDICTS NON-CHURN:
- Longer contracts, high TotalCharges (longer relationship), stable payment method.
- Dependents and bundled services (TV, internet) increase customer stickiness.

In [96]:
# 28.1. PRECISION-RECALL CURVE
from sklearn.metrics import precision_recall_curve

In [97]:
# Train a Random Forest model
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)

In [98]:
# Take class predictions and probabilities scores
y_pred_rf = rf.predict(X_test)
y_prob_rf = rf.predict_proba(X_test)[:, 1]  # Class 1 için (churn) olasılık skorları

In [99]:
# Calculate Precision, Recall, and Thresholds
precision_vals, recall_vals, threshold_vals = precision_recall_curve(y_test, y_prob_rf)

In [100]:
# Visualize Precision-Recall Curve
plt.figure(figsize=(6, 4))
plt.plot(threshold_vals, precision_vals[:-1], label='Precision', linestyle='--')
plt.plot(threshold_vals, recall_vals[:-1], label='Recall', linestyle='-')
plt.xlabel("Threshold")
plt.ylabel("Score")
plt.title("Precision vs. Recall Curve (Random Forest)")
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()

COMMENT: The Precision-Recall curve illustrates the trade-off between precision and recall at various classification thresholds.
It helps identify the optimal threshold based on business priorities, such as whether to prioritize precision (
COMMENT: This curve visually demonstrates the trade-off between precision and recall across different thresholds, aiding in threshold selection based on business priorities.

In [101]:
# 28.2. MODEL COMPARISON TABLE
# CURRENT PERFORMANCE METRICS
comparison_df = pd.DataFrame({
    'Model': ['Random Forest', 'Logistic Regression', 'XGBoost'],
    'Accuracy': [0.773, 0.763, 0.755],
    'ROC AUC': [0.817, 0.831, 0.805],
    'F1-score': [0.69, 0.64, 0.67]
})

In [102]:
# PERFORMANCE COMPARISON TABLE
plt.figure(figsize=(7, 3.5))
plt.table(cellText=comparison_df.round(3).values,
          colLabels=comparison_df.columns,
          loc='center', cellLoc='center')
plt.axis('off')
plt.title("Model Performance Comparison (Accuracy, AUC, F1-score)")
plt.tight_layout()
plt.show()
# COMMENT: This table summarizes the performance metrics of the Random Forest, Logistic Regression, and XGBoost models, allowing for quick comparison of their predictive capabilities.

In [103]:
# 29. XGBOOST MODEL (ADDITIONAL MODEL)
xgb_model = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
xgb_model.fit(X_train_resampled, y_train_resampled)
y_pred_xgb = xgb_model.predict(X_test)
y_prob_xgb = xgb_model.predict_proba(X_test)[:, 1]

Parameters: { "use_label_encoder" } are not used.



In [104]:
print("\nXGBoost Report:")
print(classification_report(y_test, y_pred_xgb))
print("Accuracy:", accuracy_score(y_test, y_pred_xgb))
print("ROC AUC:", roc_auc_score(y_test, y_prob_xgb))


XGBoost Report:
              precision    recall  f1-score   support

           0       0.86      0.81      0.83      1549
           1       0.55      0.63      0.58       561

    accuracy                           0.76      2110
   macro avg       0.70      0.72      0.71      2110
weighted avg       0.77      0.76      0.77      2110

Accuracy: 0.762085308056872
ROC AUC: 0.8140862542563831


In [105]:
# 29.1. HYPERPARAMETER TUNING WITH GRIDSEARCHCV FOR RANDOM FOREST
param_grid_rf = {
    'n_estimators': [100, 200],
    'max_depth': [5, 10, None],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2]
}
grid_rf = GridSearchCV(RandomForestClassifier(random_state=42), param_grid_rf, cv=5, scoring='roc_auc', n_jobs=-1)
grid_rf.fit(X_train_resampled, y_train_resampled)
print("\nBest Parameters for Random Forest:", grid_rf.best_params_)
print("Best ROC AUC:", grid_rf.best_score_)


Best Parameters for Random Forest: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best ROC AUC: 0.9285698747870548


In [106]:
# 29.2. BEST RANDOM FOREST MODEL WITH OPTIMAL PARAMETERS
best_rf = RandomForestClassifier(
    max_depth=None,
    min_samples_leaf=1,
    min_samples_split=2,
    n_estimators=200,
    random_state=42
)

In [107]:
best_rf.fit(X_train_resampled, y_train_resampled)
y_pred_best_rf = best_rf.predict(X_test)
y_prob_best_rf = best_rf.predict_proba(X_test)[:, 1]

In [108]:
best_rf_accuracy = accuracy_score(y_test, y_pred_best_rf)
best_rf_auc = roc_auc_score(y_test, y_prob_best_rf)
best_rf_f1 = f1_score(y_test, y_pred_best_rf)

In [109]:
print("Optimized RF Accuracy:", best_rf_accuracy)
print("Optimized RF ROC AUC:", best_rf_auc)
print("Optimized RF F1-Score:", best_rf_f1)

Optimized RF Accuracy: 0.7725118483412322
Optimized RF ROC AUC: 0.81873015653823
Optimized RF F1-Score: 0.5966386554621849


COMMENT: The optimized Random Forest model, tuned via GridSearchCV, achieved an accuracy of 77.25%, a ROC AUC of 0.8187, and an F1-score of 0.5966. While its overall classification accuracy remained comparable to the baseline model, the improvement in AUC suggests better discrimination between churn and non-churn classes. However, the relatively lower F1-score highlights that the model may still struggle with imbalanced class predictions, especially in correctly identifying churned customers. This trade-off implies that although the Random Forest performs well in ranking customer churn risk, further tuning or class balancing techniques may be needed to enhance precision and recall simultaneously.

29.3. CROSS-VALIDATION SCORE WITH LOGISTIC REGRESSION

In [110]:
from sklearn.model_selection import StratifiedKFold, cross_val_score

In [111]:
# Instantiate logistic regression model
baseline_model = LogisticRegression(max_iter=1000, random_state=42)

In [112]:
# Define cross-validation strategy
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

In [113]:
# Perform cross-validation using ROC AUC as the scoring metric
lr_cv_scores = cross_val_score(baseline_model, X, y, cv=cv, scoring='roc_auc')

In [114]:
# Print the individual and average AUC scores
print("\nLogistic Regression CV ROC AUC Scores:", lr_cv_scores)
print("Mean CV ROC AUC:", np.mean(lr_cv_scores))


Logistic Regression CV ROC AUC Scores: [0.84600173 0.84803878 0.84565115 0.84473583 0.84011498]
Mean CV ROC AUC: 0.8449084965373566


COMMENT: The Logistic Regression model achieved consistent cross-validation ROC AUC scores ranging from 0.8401 to 0.8480, with an average of 0.8449. This indicates strong and stable discriminative performance across folds, making it a robust and interpretable model for predicting customer churn.

30. FINAL COMMENTS
The analysis of the Telco dataset has provided valuable insights into customer churn. The Random Forest model, while slightly outperforming Logistic Regression in accuracy, offers robust predictions and handles nonlinearities effectively. However, Logistic Regression's interpretability and higher ROC AUC make it a strong candidate for applications where understanding feature impact is crucial.

In [115]:
# Telco models and their AUC scores
models_telco = ["Logistic Regression", "Random Forest", "XGBoost"," Best Random Forest"," Cross-Validated Logistic Regression", "Optimized Random Forest"]
auc_scores_telco = [0.83, 0.82, 0.80, 0.92, 0.84, 0.81]  # Example AUC scores for each model  

In [116]:
# Create the bar plot
plt.figure(figsize=(8, 5))
bars = plt.bar(models_telco, auc_scores_telco, color='skyblue', alpha=0.8)
plt.title("Model Comparison on Telco Dataset (AUC Scores)")
plt.ylabel("AUC Score")
plt.ylim(0.7, 0.9)

(0.7, 0.9)

In [117]:
# Add AUC score labels above bars
for bar, score in zip(bars, auc_scores_telco):
    yval = bar.get_height()
    plt.text(bar.get_x() + bar.get_width()/2.0, yval + 0.01, f"{score:.2f}", ha='center', va='bottom')

In [118]:
plt.xticks(rotation=10)
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.tight_layout()
plt.show()

#COMMENT:    |MODEL|---------------|Accuracy|---|ROC AUC|
            |Random Forest|---------|0.773|-----|0.817|
            |Logistic Regression|---|0.763|-----|0.831|
            |XGBoost|---------------|0.755|-----|0.805|
# COMMENT : While the Random Forest model offers slightly better classification accuracy, the Logistic Regression model demonstrates the strongest discriminative power (as evidenced by its highest ROC AUC). This makes Logistic Regression particularly useful when interpretability and ranking quality are critical, despite its slightly lower accuracy.XGBoost, on the other hand, is a highly flexible and powerful model, often achieving competitive performance due to its gradient boosting approach. In this analysis, its performance is slightly below the other models, but it remains valuable due to its ability to capture complex feature interactions and nonlinear patterns. However, it may require more tuning and computational resources.Depending on the business objective — whether prioritizing interpretability, raw accuracy, or complex decision boundaries — all three models present valuable and complementary insights.