In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE

In [2]:
data = pd.read_csv('Telco-Customer-Churn.csv')


In [3]:
data['TotalCharges'] = pd.to_numeric(data['TotalCharges'], errors='coerce')
data['TotalCharges'].fillna(data['TotalCharges'].mean(), inplace=True)

for column in data.select_dtypes(include=['object']):
    if column != 'customerID':
        data[column] = LabelEncoder().fit_transform(data[column])


In [4]:
X = data.drop(['customerID', 'Churn'], axis=1)
y = data['Churn']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)


In [5]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [6]:
logreg = LogisticRegression()
logreg.fit(X_train, y_train)
logreg_preds = logreg.predict(X_test)

rf = RandomForestClassifier()
rf.fit(X_train, y_train)
rf_preds = rf.predict(X_test)


In [7]:
logreg_accuracy = accuracy_score(y_test, logreg_preds)
logreg_precision = precision_score(y_test, logreg_preds)
logreg_recall = recall_score(y_test, logreg_preds)
logreg_f1 = f1_score(y_test, logreg_preds)

rf_accuracy = accuracy_score(y_test, rf_preds)
rf_precision = precision_score(y_test, rf_preds)
rf_recall = recall_score(y_test, rf_preds)
rf_f1 = f1_score(y_test, rf_preds)

In [8]:
print(f"Logistic Regression - Accuracy: {logreg_accuracy}, Precision: {logreg_precision}, Recall: {logreg_recall}, F1: {logreg_f1}")
print(f"Random Forest - Accuracy: {rf_accuracy}, Precision: {rf_precision}, Recall: {rf_recall}, F1: {rf_f1}")


Logistic Regression - Accuracy: 0.8106956933270232, Precision: 0.68125, Recall: 0.5696864111498258, F1: 0.6204933586337761
Random Forest - Accuracy: 0.795551348793185, Precision: 0.678391959798995, Recall: 0.47038327526132406, F1: 0.5555555555555556


In [9]:
param_grid = {'n_estimators': [10, 50, 100], 'max_depth': [None, 10, 20]}
grid_search = GridSearchCV(RandomForestClassifier(), param_grid, cv=5)
grid_search.fit(X_train, y_train)
best_params = grid_search.best_params_

In [10]:
rfe = RFE(RandomForestClassifier(**best_params), n_features_to_select=10)
X_train_rfe = rfe.fit_transform(X_train, y_train)
X_test_rfe = rfe.transform(X_test)

rf_optimized = RandomForestClassifier(**best_params)
rf_optimized.fit(X_train_rfe, y_train)
rf_optimized_preds = rf_optimized.predict(X_test_rfe)

In [11]:
rf_optimized_accuracy = accuracy_score(y_test, rf_optimized_preds)
rf_optimized_precision = precision_score(y_test, rf_optimized_preds)
rf_optimized_recall = recall_score(y_test, rf_optimized_preds)
rf_optimized_f1 = f1_score(y_test, rf_optimized_preds)


In [12]:
print(f"Optimized Random Forest - Accuracy: {rf_optimized_accuracy}, Precision: {rf_optimized_precision}, Recall: {rf_optimized_recall}, F1: {rf_optimized_f1}")

Optimized Random Forest - Accuracy: 0.7974443918599148, Precision: 0.6622222222222223, Recall: 0.519163763066202, F1: 0.58203125


In [13]:
important_features = pd.Series(rf_optimized.feature_importances_, index=X.columns[rfe.support_])
important_features = important_features.sort_values(ascending=False)


In [14]:
print("\nImportant Features:")
print(important_features)



Important Features:
TotalCharges        0.193716
tenure              0.190013
MonthlyCharges      0.187506
Contract            0.147998
OnlineSecurity      0.072222
TechSupport         0.070567
PaymentMethod       0.050447
InternetService     0.032488
OnlineBackup        0.027922
PaperlessBilling    0.027121
dtype: float64


In [15]:
print("\nBased on the evaluation metrics, the Optimized Random Forest model performs the best.")
print("The top features contributing to customer churn prediction are:")
print(important_features.head(5))


Based on the evaluation metrics, the Optimized Random Forest model performs the best.
The top features contributing to customer churn prediction are:
TotalCharges      0.193716
tenure            0.190013
MonthlyCharges    0.187506
Contract          0.147998
OnlineSecurity    0.072222
dtype: float64
