<a href="https://colab.research.google.com/github/ejajjindran-gif/FUTURE_ML_01/blob/main/Task2_Churn_Prediction/Ejaz_Jindran_Task2_Churn_Prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:

import pandas as pd

# Read the Telco customer-churn CSV from the /content directory
dataset_path = "/content/WA_Fn-UseC_-Telco-Customer-Churn.csv"
df = pd.read_csv(dataset_path)

# Preview the first 5 rows as the cell output
df.head()

In [None]:
from google.colab import drive

# Attach Google Drive to the Colab filesystem at the path below
drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

In [None]:
# --- DATA CLEANING ---
# Turns the raw Telco frame `df` into an all-numeric frame ready for modelling.
# Every step is guarded so this cell can be re-run on an already-cleaned `df`
# without raising KeyError or silently turning the 1/0 flags into NaN.

# 1. Remove customerID (not useful for prediction); no-op if already dropped
df = df.drop(columns="customerID", errors="ignore")

# 2. Convert "TotalCharges" to numeric (it has some empty strings, which
#    errors="coerce" turns into NaN)
df["TotalCharges"] = pd.to_numeric(df["TotalCharges"], errors="coerce")

# 3. Fill missing values with the column median
#    NOTE(review): the median is computed on the full dataset, i.e. before the
#    train/test split - confirm this small amount of leakage is acceptable.
df["TotalCharges"] = df["TotalCharges"].fillna(df["TotalCharges"].median())

# 4. Convert Yes/No columns to 1/0
yes_no_cols = ["Partner", "Dependents", "PhoneService", "PaperlessBilling", "Churn"]
yes_no_map = {"Yes": 1, "No": 0}
for col in yes_no_cols:
    if pd.api.types.is_numeric_dtype(df[col]):
        # Already encoded by an earlier run; mapping again would produce NaN.
        continue
    unexpected = set(df[col].dropna().unique()) - set(yes_no_map)
    if unexpected:
        # Series.map would silently turn these labels into NaN; fail loudly.
        raise ValueError(
            f"Column {col!r} contains values other than 'Yes'/'No': "
            f"{sorted(map(str, unexpected))}"
        )
    df[col] = df[col].map(yes_no_map)

# 5. One-hot encoding for remaining categorical variables
#    (drop_first=True drops one level per variable to avoid redundant columns)
df = pd.get_dummies(df, drop_first=True)

# 6. Show cleaned dataset
df.head()

In [None]:
# --- MODEL TRAINING ---

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# The "Churn" column is the target; every other column is a feature.
X = df.drop(columns="Churn")
y = df["Churn"]

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit a logistic-regression classifier on the training portion.
# NOTE(review): features are passed to the model as-is (no scaling step is
# visible in this cell) - keep that in mind when comparing coefficients.
model = LogisticRegression(max_iter=1000).fit(X_train, y_train)

# Predict the held-out rows and score them.
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

# Display the test-set accuracy as the cell output.
accuracy

In [None]:
from sklearn.metrics import confusion_matrix, classification_report
import seaborn as sns
import matplotlib.pyplot as plt

# Confusion matrix of the true labels (y_test) against the predictions (y_pred)
cm = confusion_matrix(y_test, y_pred)

# Draw the matrix as an annotated heatmap on an explicit 6x4 figure
fig, ax = plt.subplots(figsize=(6, 4))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax)
ax.set_title("Confusion Matrix")
ax.set_xlabel("Predicted")
ax.set_ylabel("Actual")
plt.show()

# Text report of per-class precision, recall and F1
print(classification_report(y_test, y_pred))

In [None]:
# Feature Importance
# One coefficient per feature column, taken from the first row of coef_
importance = model.coef_[0]

# Pair every feature name with its coefficient, then order largest-first
feature_importance = pd.DataFrame({'Feature': X.columns, 'Importance': importance})
feature_importance = feature_importance.sort_values(by='Importance', ascending=False)

# Show the 15 rows with the largest (most positive) coefficients
feature_importance.head(15)

In [None]:
# Sort feature importance properly
# A logistic-regression coefficient can be strongly negative (lowers the
# predicted odds of Churn == 1) as well as strongly positive, so the table is
# ranked by absolute magnitude. The signed value is kept in the 'Importance'
# column so the direction of each effect stays visible.
# NOTE(review): coefficients reflect the scale of each feature as it was fed
# to the model - confirm whether features were standardised before comparing
# magnitudes across columns with different units.
importance = model.coef_[0]

feature_importance = pd.DataFrame({
    'Feature': X.columns,
    'Importance': importance
}).sort_values(by='Importance', key=lambda coef: coef.abs(), ascending=False)

feature_importance

In [None]:
from sklearn.metrics import confusion_matrix, classification_report
import seaborn as sns
import matplotlib.pyplot as plt

# Counts of actual vs. predicted labels for the held-out predictions
cm = confusion_matrix(y_test, y_pred)

# Annotated heatmap; title and axis labels are set on the axes seaborn returns
plt.figure(figsize=(6, 4))
heatmap_ax = sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
heatmap_ax.set(title="Confusion Matrix", xlabel="Predicted", ylabel="Actual")
plt.show()

# Precision / recall / F1 per class, printed as plain text
report = classification_report(y_test, y_pred)
print(report)

In [None]:
# Feature importance from Logistic Regression
# (coefficients of the fitted model, first row of coef_)
importance = model.coef_[0]

# Build the name/coefficient table, then order it largest coefficient first
importance_table = pd.DataFrame({'Feature': X.columns, 'Importance': importance})
feature_importance = importance_table.sort_values('Importance', ascending=False)

# Display the full table as the cell output
feature_importance

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Bar chart of how many rows fall into each value of the 'Churn' column
fig, ax = plt.subplots()
sns.countplot(data=df, x='Churn', ax=ax)
plt.show()

In [None]:
# Write the current `df` to a CSV file in the working directory, leaving out
# the row index
cleaned_csv_path = "cleaned_churn_data.csv"
df.to_csv(cleaned_csv_path, index=False)

### 📌 Summary — Customer Churn Prediction

**Model Accuracy:** 82%

**Top Drivers of Churn:**
- InternetService_Fiber optic
- PaperlessBilling
- Electronic Check Payment
- Contract type (One-year/Two-year reduces churn)

**Insights:**
- Customers using fiber optic churn more — improve service quality.
- Long-term contract customers churn less — promote yearly plans.
- Electronic check users churn more — offer smoother payment options.