In [3]:
# Step 1: Load Required Libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

# Step 2: Read the raw Telco customer-churn CSV
df = pd.read_csv(r"C:\Users\DELL\Downloads\archive (1)\WA_Fn-UseC_-Telco-Customer-Churn.csv")

# Steps 3-5: Clean the frame in a single non-mutating chain.
#   * TotalCharges is coerced to numeric; values that cannot be parsed
#     (e.g. whitespace-only strings) become NaN via errors="coerce".
#   * Every row containing a missing value is then dropped.
#   * customerID is removed before encoding.
df = (
    df.assign(TotalCharges=pd.to_numeric(df["TotalCharges"], errors="coerce"))
    .dropna()
    .drop(columns="customerID")
)

# Step 6: Recode the target column: "Yes" -> 1, "No" -> 0
df = df.assign(Churn=df["Churn"].map({"Yes": 1, "No": 0}))

# Step 7: One-hot encode the remaining categorical columns, dropping the
# first level of each
df_encoded = pd.get_dummies(df, drop_first=True)

# Step 8: Separate features from target and hold out 20% of rows for testing
X = df_encoded.drop("Churn", axis=1)
y = df_encoded["Churn"]

print("Data shape after cleaning:", df_encoded.shape)

# The split is seeded so the same rows land in train/test on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 9: Train model
# random_state is fixed here as well: a random forest bootstraps rows and
# samples features at random, so without a seed the metrics below change on
# every "Restart & Run All" even though the split itself is deterministic.
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# Step 10: Evaluate on the held-out test set
y_pred = model.predict(X_test)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))


Data shape after cleaning: (7032, 31)
[[930 103]
 [195 179]]
              precision    recall  f1-score   support

           0       0.83      0.90      0.86      1033
           1       0.63      0.48      0.55       374

    accuracy                           0.79      1407
   macro avg       0.73      0.69      0.70      1407
weighted avg       0.78      0.79      0.78      1407



In [4]:
import pickle

# Persist the trained model. The file is opened in a `with` block so the
# handle is flushed and closed deterministically instead of relying on
# garbage collection of an anonymous file object.
with open("churn_model.pkl", "wb") as model_file:
    pickle.dump(model, model_file)

# Persist the encoded feature columns (the pandas Index of X_train) so that
# inference code can rebuild inputs with the same columns in the same order.
with open("churn_features.pkl", "wb") as features_file:
    pickle.dump(X_train.columns, features_file)
