In [38]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

In [40]:
df = pd.read_csv(r"C:\Users\user\Downloads\telecom_churn_expanded.csv") 

df.drop(["customerID"], axis=1, inplace=True)  

categorical_cols = df.select_dtypes(include=["object"]).columns
for col in categorical_cols:
    df[col] = LabelEncoder().fit_transform(df[col])

df.fillna(df.median(numeric_only=True), inplace=True)  
df.fillna(df.mode().iloc[0], inplace=True)  

X = df.drop("Churn", axis=1)  
y = df["Churn"]  # Target

# Balance data using SMOTE
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

# Split dataset
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.1, random_state=42)

# Scale data for better performance
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Train model
model = RandomForestClassifier(n_estimators=200, random_state=42)
model.fit(X_train, y_train)

# Predictions
y_pred = model.predict(X_test)

# Evaluate model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")
print(classification_report(y_test, y_pred))

Accuracy: 0.88
              precision    recall  f1-score   support

           0       0.90      0.85      0.87       505
           1       0.86      0.91      0.89       530

    accuracy                           0.88      1035
   macro avg       0.88      0.88      0.88      1035
weighted avg       0.88      0.88      0.88      1035



In [54]:
from xgboost import XGBClassifier
model2 = XGBClassifier(n_estimators=200, learning_rate=0.1, random_state=42, use_label_encoder=False, eval_metric="logloss")
model2.fit(X_train, y_train)
y_pred_xgb = model2.predict(X_test)
print("\n🔹 XGBoost Performance:")
print(f"Accuracy: {accuracy_score(y_test, y_pred_xgb):.2f}")
print(classification_report(y_test, y_pred_xgb))

Parameters: { "use_label_encoder" } are not used.




🔹 XGBoost Performance:
Accuracy: 0.84
              precision    recall  f1-score   support

           0       0.84      0.82      0.83       505
           1       0.83      0.85      0.84       530

    accuracy                           0.84      1035
   macro avg       0.84      0.84      0.84      1035
weighted avg       0.84      0.84      0.84      1035



In [42]:
import shap
import matplotlib.pyplot as plt


In [49]:
explainer = shap.TreeExplainer(model)

# Compute SHAP values for test data
shap_values = explainer.shap_values(X_test)

# --- ✅ 1️⃣ Global Feature Importance (Top 5 factors affecting churn) ---
# If SHAP returns a list (multi-class case), select the second class (Churn = 1)
if isinstance(shap_values, list):
    shap_values = shap_values[1]  

# Convert SHAP values to a DataFrame
shap_df = pd.DataFrame(shap_values, columns=X_test.columns)  

# Get the top 5 features by mean absolute SHAP value
top_5_features = shap_df.abs().mean().sort_values(ascending=False).head(5)

# ✅ Plot Global SHAP Feature Importance
plt.figure(figsize=(8, 5))
top_5_features.plot(kind='bar', color='skyblue', edgecolor='black')
plt.title("Top 5 Features Influencing Churn Prediction")
plt.ylabel("Mean |SHAP Value|")
plt.xlabel("Feature")
plt.xticks(rotation=45)
plt.grid(axis="y", linestyle="--", alpha=0.7)
plt.show()


AttributeError: 'numpy.ndarray' object has no attribute 'columns'

In [44]:
import numpy as np

# Get user input
print("Enter customer details:")

gender = int(input("Gender (1 for Female, 0 for Male): "))
senior_citizen = int(input("Senior Citizen (1 for Yes, 0 for No): "))
partner = int(input("Partner (1 for Yes, 0 for No): "))
dependents = int(input("Dependents (1 for Yes, 0 for No): "))
tenure = float(input("Tenure (in months, e.g., 12): "))
phone_service = int(input("Phone Service (1 for Yes, 0 for No): "))
paperless_billing = int(input("Paperless Billing (1 for Yes, 0 for No): "))
monthly_charges = float(input("Monthly Charges (e.g., 70): "))
total_charges = float(input("Total Charges (e.g., 840): "))

multiple_lines = int(input("Multiple Lines (0: No, 1: Yes, 2: No phone service): "))
internet_service = int(input("Internet Service (0: DSL, 1: Fiber optic, 2: No): "))
online_security = int(input("Online Security (0: No, 1: Yes, 2: No internet service): "))
online_backup = int(input("Online Backup (0: No, 1: Yes, 2: No internet service): "))
device_protection = int(input("Device Protection (0: No, 1: Yes, 2: No internet service): "))
tech_support = int(input("Tech Support (0: No, 1: Yes, 2: No internet service): "))
streaming_tv = int(input("Streaming TV (0: No, 1: Yes, 2: No internet service): "))
streaming_movies = int(input("Streaming Movies (0: No, 1: Yes, 2: No internet service): "))

contract = int(input("Contract (0: Month-to-month, 1: One year, 2: Two year): "))
payment_method = int(input("Payment Method (0: Electronic check, 1: Mailed check, 2: Bank transfer (automatic), 3: Credit card (automatic)): "))

# Create input array
input_data = np.array([[gender, senior_citizen, partner, dependents, tenure, phone_service,
                        paperless_billing, monthly_charges, total_charges, multiple_lines,
                        internet_service, online_security, online_backup, device_protection,
                        tech_support, streaming_tv, streaming_movies, contract, payment_method]])

# Scale input data using the same scaler from training
input_data_scaled = scaler.transform(input_data)

# Make prediction
prediction = model.predict(input_data_scaled)

# Output result
if prediction[0] == 1:
    print("\nPrediction: The customer is likely to CHURN.")
else:
    print("\nPrediction: The customer is likely to STAY.")


Enter customer details:


Gender (1 for Female, 0 for Male):  0
Senior Citizen (1 for Yes, 0 for No):  0
Partner (1 for Yes, 0 for No):  0
Dependents (1 for Yes, 0 for No):  0
Tenure (in months, e.g., 12):  2
Phone Service (1 for Yes, 0 for No):  0
Paperless Billing (1 for Yes, 0 for No):  1
Monthly Charges (e.g., 70):  53.85
Total Charges (e.g., 840):  108.15
Multiple Lines (0: No, 1: Yes, 2: No phone service):  0
Internet Service (0: DSL, 1: Fiber optic, 2: No):  0
Online Security (0: No, 1: Yes, 2: No internet service):  1
Online Backup (0: No, 1: Yes, 2: No internet service):  1
Device Protection (0: No, 1: Yes, 2: No internet service):  0
Tech Support (0: No, 1: Yes, 2: No internet service):  0
Streaming TV (0: No, 1: Yes, 2: No internet service):  0
Streaming Movies (0: No, 1: Yes, 2: No internet service):  0
Contract (0: Month-to-month, 1: One year, 2: Two year):  0
Payment Method (0: Electronic check, 1: Mailed check, 2: Bank transfer (automatic), 3: Credit card (automatic)):  1



Prediction: The customer is likely to CHURN.




In [None]:
3668-QPYBK	Male	0	              No	No	           2	Yes	             No	               DSL	           Yes	                Yes	         No	               No	      No	       No   	Month-to-month	Yes            	Mailed check	    53.85         	108.15	Yes
customerID	gender	SeniorCitizen	Partner	Dependents	tenure	PhoneService	MultipleLines	InternetService	OnlineSecurity	OnlineBackup	DeviceProtection	TechSupport	StreamingTV	StreamingMovies	Contract	PaperlessBilling	PaymentMethod	MonthlyCharges	TotalCharges	Churn


In [None]:
shap_input = scaler.transform(input_data.reshape(1, -1))  

# Compute SHAP values for this single input
shap_values_single = explainer.shap_values(shap_input)

# If SHAP returns a list, select the second class (Churn = 1)
if isinstance(shap_values_single, list):
    shap_values_single = shap_values_single[1][0]  # Take SHAP values for class 1 (Churn)

# ✅ Create SHAP Explanation Object for the specific prediction
shap_explanation = shap.Explanation(
    values=shap_values_single,  
    base_values=explainer.expected_value[1],  # Expected value for class 1 (Churn)
    data=shap_input[0],  
    feature_names=X_test.columns.tolist()  
)

# ✅ Waterfall Plot (Top 5 Reasons for this Customer's Churn Prediction)
shap.plots.waterfall(shap_explanation, max_display=5)