#IMPORTS AND INSTALLATION

In [None]:
# Install required packages
!pip install imbalanced-learn -q
!pip install tensorflow -q

# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.metrics import (accuracy_score, precision_score, recall_score,
                             f1_score, roc_auc_score, confusion_matrix,
                             classification_report, roc_curve)
from sklearn.utils.class_weight import compute_class_weight
from imblearn.over_sampling import SMOTE
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, callbacks
import warnings
warnings.filterwarnings('ignore')
import joblib
import os
from google.colab import drive, files
from google.colab import drive
# Mount Google Drive into the Colab filesystem; the dataset is later read
# from a path below this mount point.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

# Seed NumPy and TensorFlow with one shared value for reproducibility
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
tf.random.set_seed(RANDOM_SEED)


#DATA LOADING & INITIAL EXPLORATION

In [None]:
print("=" * 60)
print("TELCO CUSTOMER CHURN PREDICTION - COURSEWORK IMPLEMENTATION")
print("=" * 60)

# Step 1: obtain the Telco churn CSV as the DataFrame `df`.
print("\n1. DOWNLOADING DATASET...")

try:
    # Preferred source: the copy stored on the mounted Google Drive.
    df = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/Telco-Customer-Churn.csv")
except FileNotFoundError:
    # Fallback source: ask the user to upload the CSV through the browser.
    print("   File not found at Google Drive path. Trying alternative method...")
    print("\n   Please upload the 'Telco-Customer-Churn.csv' file when prompted")
    from google.colab import files
    uploaded = files.upload()

    # Guard clause: nothing uploaded means there is no dataset to work with.
    if not uploaded:
        print("    No file uploaded. Please try again.")
        raise FileNotFoundError("Telco-Customer-Churn.csv file not found")

    # Use the first uploaded entry as the dataset file.
    filename = next(iter(uploaded))
    print(f"   Uploaded file: {filename}")

    df = pd.read_csv(filename)
    load_message = "   Dataset loaded successfully from uploaded file!"
else:
    load_message = "   Dataset loaded successfully from Google Drive!"

# Shared success report for whichever source provided the data.
print(load_message)
print(f"   Shape: {df.shape}")
print(f"   Columns: {df.columns.tolist()}")

# Preview the first rows (display() is provided by the notebook environment).
print("\nFirst 5 rows of the dataset:")
display(df.head())

print("\n Data loading completed!")

#EXPLORATORY DATA ANALYSIS (EDA)

In [None]:
print("\n" + "="*60)
print("EXPLORATORY DATA ANALYSIS (EDA)")
print("="*60)

# Folder that receives the figures saved by this notebook (created if absent).
DRIVE_PATH = './colab_output'
os.makedirs(DRIVE_PATH, exist_ok=True)

# 2.1 Basic information
print("\n2.1 DATASET INFORMATION:")
print(f"   Total samples: {len(df)}")
print(f"   Total features: {len(df.columns)}")

# 2.2 Data types, printed on the frame as loaded so the raw dtype of every
# column (including TotalCharges before conversion) is visible.
print("\n2.2 DATA TYPES:")
print(df.dtypes)

# Convert TotalCharges to numeric BEFORE counting missing values: entries that
# cannot be parsed only become NaN through this coercion, so running the
# missing-value report first would not show them.
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')

# 2.3 Missing values (only columns that actually contain NaN are listed)
print("\n2.3 MISSING VALUES:")
missing = df.isnull().sum()
print(missing[missing > 0])

# Impute the NaN values with the column median. The result is assigned back to
# the frame instead of calling fillna(inplace=True) on the selected column:
# the in-place form works on an intermediate object, is deprecated, and does
# not update `df` when pandas Copy-on-Write is active.
df['TotalCharges'] = df['TotalCharges'].fillna(df['TotalCharges'].median())

# 2.4 Class Distribution - BEFORE BALANCING
# Churn is still the raw 'Yes'/'No' label at this point.
print("\n2.4 CLASS DISTRIBUTION (BEFORE BALANCING):")
churn_counts = df['Churn'].value_counts()
churn_percent = df['Churn'].value_counts(normalize=True) * 100
print(f"   No Churn:  {churn_counts['No']} ({churn_percent['No']:.1f}%)")
print(f"   Churn:     {churn_counts['Yes']} ({churn_percent['Yes']:.1f}%)")
print(f"   IMBALANCE RATIO: {churn_counts['No']/churn_counts['Yes']:.2f}:1")

# Keep the original class balance so it can be reported later.
class_dist_before = {
    'No': churn_counts['No'],
    'Yes': churn_counts['Yes'],
    'No_percent': churn_percent['No'],
    'Yes_percent': churn_percent['Yes']
}

# 2.5 Numerical features statistics
print("\n2.5 NUMERICAL FEATURES STATISTICS:")
numerical_cols = ['tenure', 'MonthlyCharges', 'TotalCharges']
print(df[numerical_cols].describe())

# 2.6 Visualizations
# Builds one 3x3 figure. Each plt.subplot(3, 3, k) call selects the panel that
# the pyplot / seaborn / pandas plotting calls following it draw into.
print("\n2.6 CREATING EDA VISUALIZATIONS...")
plt.figure(figsize=(20, 15))

# Subplot 1: Churn distribution
plt.subplot(3, 3, 1)
sns.countplot(data=df, x='Churn', palette='Set2')
plt.title('Churn Distribution (Original)', fontsize=14, fontweight='bold')
plt.ylabel('Count', fontsize=12)
plt.xlabel('Churn Status', fontsize=12)

# Add percentage labels: each bar is annotated with its share of all rows,
# placed slightly above the top of the bar.
total = len(df)
for p in plt.gca().patches:
    height = p.get_height()
    plt.gca().text(p.get_x() + p.get_width()/2., height + 30,
                  f'{height/total*100:.1f}%', ha='center', fontsize=11)

# Subplot 2: Tenure distribution
plt.subplot(3, 3, 2)
sns.histplot(df['tenure'], bins=30, kde=True, color='skyblue')
plt.title('Customer Tenure Distribution', fontsize=14, fontweight='bold')
plt.xlabel('Tenure (months)', fontsize=12)
plt.ylabel('Frequency', fontsize=12)

# Subplot 3: Monthly charges vs Churn
plt.subplot(3, 3, 3)
sns.boxplot(data=df, x='Churn', y='MonthlyCharges', palette='Set2')
plt.title('Monthly Charges by Churn Status', fontsize=14, fontweight='bold')
plt.xlabel('Churn Status', fontsize=12)
plt.ylabel('Monthly Charges ($)', fontsize=12)

# Subplot 4: Contract type impact
# normalize='index' turns each row of the crosstab into shares, so the stacked
# bars show the churn / no-churn split within every contract type.
plt.subplot(3, 3, 4)
contract_churn = pd.crosstab(df['Contract'], df['Churn'], normalize='index') * 100
contract_churn.plot(kind='bar', stacked=True, colormap='coolwarm', ax=plt.gca())
plt.title('Churn Rate by Contract Type', fontsize=14, fontweight='bold')
plt.ylabel('Percentage (%)', fontsize=12)
plt.xlabel('Contract Type', fontsize=12)
plt.legend(title='Churn', bbox_to_anchor=(1.05, 1), loc='upper left')

# Subplot 5: Correlation heatmap of the numerical columns
plt.subplot(3, 3, 5)
corr_matrix = df[numerical_cols].corr()
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', center=0,
            square=True, fmt='.2f', cbar_kws={"shrink": 0.8})
plt.title('Numerical Features Correlation', fontsize=14, fontweight='bold')

# Subplot 6: Payment method impact
plt.subplot(3, 3, 6)
payment_churn = pd.crosstab(df['PaymentMethod'], df['Churn'], normalize='index') * 100
payment_churn.plot(kind='bar', stacked=True, colormap='viridis', ax=plt.gca())
plt.title('Churn Rate by Payment Method', fontsize=14, fontweight='bold')
plt.ylabel('Percentage (%)', fontsize=12)
plt.xlabel('Payment Method', fontsize=12)
plt.xticks(rotation=45, ha='right')
plt.legend(title='Churn', bbox_to_anchor=(1.05, 1), loc='upper left')

# Subplot 7: Internet service impact
plt.subplot(3, 3, 7)
internet_churn = pd.crosstab(df['InternetService'], df['Churn'], normalize='index') * 100
internet_churn.plot(kind='bar', stacked=True, colormap='summer', ax=plt.gca())
plt.title('Churn Rate by Internet Service', fontsize=14, fontweight='bold')
plt.ylabel('Percentage (%)', fontsize=12)
plt.xlabel('Internet Service', fontsize=12)
plt.legend(title='Churn', bbox_to_anchor=(1.05, 1), loc='upper left')

# Subplot 8: Gender distribution (absolute counts, not normalised)
plt.subplot(3, 3, 8)
gender_churn = pd.crosstab(df['gender'], df['Churn'])
gender_churn.plot(kind='bar', stacked=True, colormap='Pastel1', ax=plt.gca())
plt.title('Churn Distribution by Gender', fontsize=14, fontweight='bold')
plt.ylabel('Count', fontsize=12)
plt.xlabel('Gender', fontsize=12)
plt.legend(title='Churn', bbox_to_anchor=(1.05, 1), loc='upper left')

# Subplot 9: Senior citizen impact
plt.subplot(3, 3, 9)
senior_churn = pd.crosstab(df['SeniorCitizen'], df['Churn'], normalize='index') * 100
# Relabel the two crosstab rows for the axis.
# NOTE(review): assumes SeniorCitizen holds exactly two values that sort as
# non-senior first - confirm against the data.
senior_churn.index = ['Non-Senior', 'Senior']
senior_churn.plot(kind='bar', stacked=True, colormap='RdYlBu_r', ax=plt.gca())
plt.title('Churn Rate by Senior Citizen Status', fontsize=14, fontweight='bold')
plt.ylabel('Percentage (%)', fontsize=12)
plt.xlabel('Senior Citizen Status', fontsize=12)
plt.legend(title='Churn', bbox_to_anchor=(1.05, 1), loc='upper left')

plt.tight_layout()
eda_path = os.path.join(DRIVE_PATH, 'eda_visualizations.png')
plt.savefig(eda_path, dpi=300, bbox_inches='tight')
# Show the figure after saving, as the Decision Tree cell does for its chart.
plt.show()
print(f"\n EDA visualizations saved as '{eda_path}'")

# Show EDA insights
print("\n" + "="*60)
print("KEY EDA INSIGHTS:")
print("="*60)
# The class shares are taken from churn_percent (computed in section 2.4)
# rather than typed in by hand, so this line always matches the loaded data.
print(f"1. Class Imbalance: Significant imbalance "
      f"({churn_percent['No']:.0f}% No Churn vs {churn_percent['Yes']:.0f}% Churn)")
print("2. Monthly Charges: Churned customers tend to have higher monthly charges")
print("3. Contract Type: Month-to-month contracts have highest churn rate")
print("4. Tenure: Long-term customers less likely to churn")
print("5. Payment Method: Electronic check has highest churn rate")
print("6. Internet Service: Fiber optic customers more likely to churn")
print("7. Senior Citizens: Slightly higher churn rate among seniors")

print("\n EDA completed!")

#DATA PREPROCESSING & FEATURE ENGINEERING

In [None]:
print("\n" + "=" * 60)
print("DATA PREPROCESSING & FEATURE ENGINEERING")
print("=" * 60)


def _print_split_distribution(split_name, class_counts, n_rows):
    """Print count and percentage per class for one data split."""
    print(f"\n   Class distribution in {split_name} set:")
    for val, count in class_counts.items():
        label = "Churn" if val == 1 else "No Churn"
        print(f"     {label}: {count} ({count/n_rows*100:.1f}%)")


# 3.1 Encode categorical variables
print("\n3.1 ENCODING CATEGORICAL VARIABLES...")

# Yes/No columns become 1/0.
yes_no_codes = {'Yes': 1, 'No': 0}
binary_cols = ['Partner', 'Dependents', 'PhoneService', 'PaperlessBilling']
for col in binary_cols:
    df[col] = df[col].map(yes_no_codes)

# Remaining categoricals get one LabelEncoder each; the fitted encoders are
# kept in label_encoders, keyed by column name.
categorical_cols = ['gender', 'MultipleLines', 'InternetService', 'OnlineSecurity',
                    'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV',
                    'StreamingMovies', 'Contract', 'PaymentMethod']
label_encoders = {col: LabelEncoder() for col in categorical_cols}
for col, le in label_encoders.items():
    df[col] = le.fit_transform(df[col].astype(str))

# The target uses the same Yes/No -> 1/0 mapping.
df['Churn'] = df['Churn'].map(yes_no_codes)

print(f"   Encoded {len(binary_cols) + len(categorical_cols)} categorical features")

# 3.2 Feature selection: everything except the identifier and the target.
print("\n3.2 SELECTING FEATURES...")
y = df['Churn']
X = df.drop(columns=['customerID', 'Churn'])

print(f"   Features selected: {X.shape[1]}")
print(f"   Feature names: {X.columns.tolist()}")

# 3.3 Stratified 80/20 split, done before any resampling so the test set
# contains only original rows.
print("\n3.3 TRAIN-TEST SPLIT (80-20)...")
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
print(f"   Training set: {X_train.shape} ({len(X_train)/len(X)*100:.1f}%)")
print(f"   Test set:     {X_test.shape} ({len(X_test)/len(X)*100:.1f}%)")

# Class balance of both splits.
train_counts = y_train.value_counts()
_print_split_distribution("Training", train_counts, len(y_train))

test_counts = y_test.value_counts()
_print_split_distribution("Test", test_counts, len(y_test))

# 3.4 Feature scaling: statistics are learned from the training split only
# and then applied to both splits.
print("\n3.4 SCALING FEATURES...")
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
print("   Features scaled using StandardScaler")

print("\n Data preprocessing completed!")

#HANDLING CLASS IMBALANCE WITH SMOTE

In [None]:
print("\n" + "=" * 60)
print("HANDLING CLASS IMBALANCE WITH SMOTE")
print("=" * 60)


def _print_class_shares(class_values, class_counts, n_rows):
    """Print one 'label: count (percent)' line per class."""
    for val, count in zip(class_values, class_counts):
        percentage = count / n_rows * 100
        label = "Churn" if val == 1 else "No Churn"
        print(f"   {label}: {count} ({percentage:.1f}%)")


# Folder that receives the figure saved below (created if absent).
DRIVE_PATH = './colab_output' # Local directory for demonstration
os.makedirs(DRIVE_PATH, exist_ok=True)

# 4.1 Class balance of the training labels before resampling
print("\n4.1 CLASS DISTRIBUTION BEFORE SMOTE (Training set):")
unique_before, counts_before = np.unique(y_train, return_counts=True)
_print_class_shares(unique_before, counts_before, len(y_train))

# 4.2 Oversample the training split only; the test split is left untouched.
# NOTE(review): SMOTE builds synthetic rows by interpolating between
# neighbours, so label-encoded categorical columns may receive in-between
# values - consider SMOTENC and confirm whether this matters for the models.
print("\n4.2 APPLYING SMOTE TO TRAINING DATA...")
smote = SMOTE(random_state=42)
X_train_bal, y_train_bal = smote.fit_resample(X_train_scaled, y_train)

# 4.3 Class balance after resampling
print("4.3 CLASS DISTRIBUTION AFTER SMOTE:")
unique_after, counts_after = np.unique(y_train_bal, return_counts=True)
_print_class_shares(unique_after, counts_after, len(y_train_bal))

# np.unique returns sorted class values, so index 0 is class 0 ("No") and
# index 1 is class 1 ("Yes").
class_dist_after = {
    'No': counts_after[0],
    'Yes': counts_after[1],
    'No_percent': counts_after[0]/len(y_train_bal)*100,
    'Yes_percent': counts_after[1]/len(y_train_bal)*100
}

# 4.4 Side-by-side pie charts: before vs. after resampling
labels = ['No Churn', 'Churn']
colors = ['lightblue', 'salmon']

plt.figure(figsize=(12, 5))
pie_panels = [(counts_before, 'Before'), (counts_after, 'After')]
for panel_no, (panel_counts, stage) in enumerate(pie_panels, start=1):
    plt.subplot(1, 2, panel_no)
    plt.pie(panel_counts, labels=labels, colors=colors, autopct='%1.1f%%', startangle=90)
    plt.title(f'Class Distribution\n({stage} SMOTE)', fontsize=14, fontweight='bold')

plt.tight_layout()
smote_path = os.path.join(DRIVE_PATH, 'smote_comparison.png')
plt.savefig(smote_path, dpi=300, bbox_inches='tight')
print(f"\n✅ SMOTE comparison visualization saved as '{smote_path}'")

print("\n✅ SMOTE applied successfully. Training data is now balanced.")

#DECISION TREE MODEL

In [None]:
print("\n" + "="*60)
print("DECISION TREE MODEL")
print("="*60)

# Cell-local import: the imbalanced-learn Pipeline runs a sampler as a step,
# applying it when the pipeline is fitted and skipping it when predicting.
from imblearn.pipeline import Pipeline as ImbPipeline

# 5.1 Hyperparameter tuning with GridSearchCV
print("\n5.1 PERFORMING HYPERPARAMETER TUNING...")
print("   This may take a few minutes...")

dt_model = DecisionTreeClassifier(random_state=42)

param_grid = {
    'max_depth': [3, 4, 5, 6, 7, 8],
    'min_samples_split': [10, 20, 30, 40],
    'min_samples_leaf': [5, 10, 20, 30],
    'max_features': ['sqrt', 'log2'],
    'class_weight': ['balanced'],
    'criterion': ['gini', 'entropy']
}

# Cross-validate on the ORIGINAL (scaled, un-resampled) training data and let
# SMOTE run inside every CV split. Searching on data that was oversampled
# beforehand puts synthetic points, interpolated from rows of the validation
# fold, into the training folds; that inflates the CV score and biases the
# choice of hyperparameters.
dt_pipeline = ImbPipeline(steps=[
    ('smote', SMOTE(random_state=42)),
    ('dt', dt_model),
])
# Pipeline parameters are addressed as '<step name>__<parameter>'.
pipeline_param_grid = {f'dt__{name}': values for name, values in param_grid.items()}

grid_search = GridSearchCV(
    dt_pipeline, pipeline_param_grid, cv=5, scoring='f1', n_jobs=-1, verbose=1
)
grid_search.fit(X_train_scaled, y_train)

# With GridSearchCV's default refit=True the winning pipeline has already been
# refitted on the whole training split (SMOTE step included), so the tree
# inside it is the final, fitted classifier.
best_dt = grid_search.best_estimator_.named_steps['dt']

# Report the parameters under their plain DecisionTreeClassifier names.
best_dt_params = {
    name.split('__', 1)[1]: value
    for name, value in grid_search.best_params_.items()
}
print(f"\n Hyperparameter tuning completed!")
print(f"   Best parameters: {best_dt_params}")
print(f"   Best CV F1-score: {grid_search.best_score_:.4f}")

# 5.2 Final model
# No separate fit call is needed here: the refit performed by GridSearchCV
# above already trained best_dt on the SMOTE-balanced training data.
print("\n5.2 TRAINING FINAL DECISION TREE...")
print("   Model training completed!")

# 5.3 Feature importance of the fitted decision tree
print("\n5.3 FEATURE IMPORTANCE ANALYSIS...")

# Pair every input column with the tree's importance score, highest first.
dt_importance = (
    pd.DataFrame({'feature': X.columns, 'importance': best_dt.feature_importances_})
    .sort_values('importance', ascending=False)
)

top10_importance = dt_importance.head(10)
print("\n   Top 10 Most Important Features:")
print(top10_importance.to_string(index=False))

# Horizontal bars are drawn from the bottom up, so the rows are reversed to
# place the most important feature at the top of the chart.
top10_bottom_up = top10_importance.iloc[::-1]
importance_fig, importance_ax = plt.subplots(figsize=(12, 6))
importance_ax.barh(top10_bottom_up['feature'], top10_bottom_up['importance'],
                   color='steelblue')
importance_ax.set_xlabel('Importance Score', fontsize=12)
importance_ax.set_title('Top 10 Feature Importances - Decision Tree',
                        fontsize=14, fontweight='bold')
importance_ax.grid(axis='x', alpha=0.3)
importance_fig.tight_layout()
importance_fig.savefig('decision_tree_feature_importance.png', dpi=300)
plt.show()

print("   Feature importance chart saved as 'decision_tree_feature_importance.png'")

# 5.4 Predictions on the held-out test split: hard labels plus the predicted
# probability of the positive class (column 1).
print("\n5.4 MAKING PREDICTIONS...")
y_pred_dt = best_dt.predict(X_test_scaled)
y_pred_proba_dt = best_dt.predict_proba(X_test_scaled)[:, 1]

# 5.5 Evaluation: label-based metrics use the hard predictions, AUC uses the
# positive-class probabilities.
print("\n5.5 EVALUATING DECISION TREE...")
dt_accuracy, dt_precision, dt_recall, dt_f1 = (
    metric_fn(y_test, y_pred_dt)
    for metric_fn in (accuracy_score, precision_score, recall_score, f1_score)
)
dt_auc = roc_auc_score(y_test, y_pred_proba_dt)

# Labels are left-aligned in an 11-character field so the values line up.
dt_metric_rows = (
    ("Accuracy:", dt_accuracy),
    ("Precision:", dt_precision),
    ("Recall:", dt_recall),
    ("F1-Score:", dt_f1),
    ("AUC:", dt_auc),
)
for metric_label, metric_value in dt_metric_rows:
    print(f"    {metric_label:<11}{metric_value:.4f}")

# 5.6 Confusion Matrix (rows: actual class, columns: predicted class)
cm_dt = confusion_matrix(y_test, y_pred_dt)
(dt_tn, dt_fp), (dt_fn, dt_tp) = cm_dt
print("\n   Confusion Matrix:")
print(f"   [[TN: {dt_tn}, FP: {dt_fp}]")
print(f"    [FN: {dt_fn}, TP: {dt_tp}]]")

print("\n" + "=" * 60)
print("Decision Tree model completed!")
print("=" * 60)

#NEURAL NETWORK MODEL