#IMPORT AND INSTALLATION

In [None]:
# Install required packages.
# The leading "!" is notebook (IPython/Colab) syntax that runs the rest of the
# line as a shell command; it is not valid in a plain .py script.
# "-q" asks pip for quieter output.
!pip install imbalanced-learn -q
!pip install tensorflow -q

# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.metrics import (accuracy_score, precision_score, recall_score,
                             f1_score, roc_auc_score, confusion_matrix,
                             classification_report, roc_curve)
from sklearn.utils.class_weight import compute_class_weight
from imblearn.over_sampling import SMOTE
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, callbacks
import warnings
warnings.filterwarnings('ignore')
import joblib
import os
from google.colab import drive, files
from google.colab import drive
# Mount Google Drive so files under /content/drive become readable.
drive.mount('/content/drive')

# Seed NumPy and TensorFlow with one shared value for reproducibility.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
tf.random.set_seed(RANDOM_SEED)


#DATA LOADING & INITIAL EXPLORATION

In [None]:
# Load the Telco churn CSV into `df`.
# Strategy: read it from the mounted Google Drive path; if that file is
# missing, ask the user to upload it through the Colab file picker instead.
print("=" * 60)
print("TELCO CUSTOMER CHURN PREDICTION - COURSEWORK IMPLEMENTATION")
print("=" * 60)

print("\n1. DOWNLOADING DATASET...")

try:
    # Method 1: the copy stored on Google Drive
    df = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/Telco-Customer-Churn.csv")
except FileNotFoundError:
    # Method 2: manual upload from the local machine
    print("   File not found at Google Drive path. Trying alternative method...")
    print("\n   Please upload the 'Telco-Customer-Churn.csv' file when prompted")
    uploaded = files.upload()

    # Nothing was uploaded: report it and abort, there is no data to work on
    if not uploaded:
        print("    No file uploaded. Please try again.")
        raise FileNotFoundError("Telco-Customer-Churn.csv file not found")

    # The upload result is keyed by file name; use the first entry
    filename = next(iter(uploaded))
    print(f"   Uploaded file: {filename}")
    df = pd.read_csv(filename)
    load_origin = "uploaded file"
else:
    load_origin = "Google Drive"

# Same summary regardless of where the data came from
print(f"   Dataset loaded successfully from {load_origin}!")
print(f"   Shape: {df.shape}")
print(f"   Columns: {df.columns.tolist()}")

# Preview the first rows (`display` is provided by the notebook environment)
print("\nFirst 5 rows of the dataset:")
display(df.head())

print("\n Data loading completed!")

#EXPLORATORY DATA ANALYSIS (EDA)

In [None]:
print("\n" + "="*60)
print("EXPLORATORY DATA ANALYSIS (EDA)")
print("="*60)

# Directory where EDA figures are written.
# NOTE(review): despite the name, this is a relative local folder, not a
# location under the mounted '/content/drive' - confirm this is intended.
DRIVE_PATH = './colab_output'
os.makedirs(DRIVE_PATH, exist_ok=True)

# 2.1 Basic information
print("\n2.1 DATASET INFORMATION:")
print(f"   Total samples: {len(df)}")
print(f"   Total features: {len(df.columns)}")

# 2.2 Data types (as loaded, before any conversion)
print("\n2.2 DATA TYPES:")
print(df.dtypes)

# Convert TotalCharges to numeric BEFORE counting missing values: entries that
# cannot be parsed become NaN here, so doing this first lets the report below
# include them instead of silently showing no missing data for this column.
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')

# 2.3 Missing values (only columns that actually have some)
print("\n2.3 MISSING VALUES:")
missing = df.isnull().sum()
print(missing[missing > 0])

# Impute missing TotalCharges with the column median.
# Assign the result back rather than calling fillna(inplace=True) on the
# selected column: the in-place form acts on an intermediate object, which
# newer pandas warns about and which does not update `df` under Copy-on-Write.
df['TotalCharges'] = df['TotalCharges'].fillna(df['TotalCharges'].median())

# 2.4 Target distribution prior to any resampling
print("\n2.4 CLASS DISTRIBUTION (BEFORE BALANCING):")
churn_column = df['Churn']
churn_counts = churn_column.value_counts()
churn_percent = churn_column.value_counts(normalize=True) * 100

# Pull the per-class figures out once so the prints and the saved summary agree
n_retained, n_churned = churn_counts['No'], churn_counts['Yes']
pct_retained, pct_churned = churn_percent['No'], churn_percent['Yes']

print(f"   No Churn:  {n_retained} ({pct_retained:.1f}%)")
print(f"   Churn:     {n_churned} ({pct_churned:.1f}%)")
print(f"   IMBALANCE RATIO: {n_retained/n_churned:.2f}:1")

# Snapshot of the original class balance, kept for the report
class_dist_before = dict(zip(
    ('No', 'Yes', 'No_percent', 'Yes_percent'),
    (n_retained, n_churned, pct_retained, pct_churned),
))

# 2.5 Summary statistics for the numeric columns
print("\n2.5 NUMERICAL FEATURES STATISTICS:")
numerical_cols = ['tenure', 'MonthlyCharges', 'TotalCharges']
numeric_summary = df[numerical_cols].describe()
print(numeric_summary)

# 2.6 Visualizations: a 3x3 grid of EDA panels saved as a single PNG
print("\n2.6 CREATING EDA VISUALIZATIONS...")


def _label_panel(ax, title, xlabel, ylabel):
    """Apply the shared title / axis-label styling to one panel."""
    ax.set_title(title, fontsize=14, fontweight='bold')
    ax.set_ylabel(ylabel, fontsize=12)
    ax.set_xlabel(xlabel, fontsize=12)


def _churn_share_by(column):
    """Return the row-normalised (percentage) crosstab of `column` vs Churn."""
    return pd.crosstab(df[column], df['Churn'], normalize='index') * 100


def _stacked_bar_panel(ax, table, colormap, title, xlabel, ylabel='Percentage (%)'):
    """Draw `table` as stacked bars on `ax` with the legend placed outside."""
    table.plot(kind='bar', stacked=True, colormap=colormap, ax=ax)
    _label_panel(ax, title, xlabel, ylabel)
    ax.legend(title='Churn', bbox_to_anchor=(1.05, 1), loc='upper left')


fig = plt.figure(figsize=(20, 15))

# Panel 1: churn counts, annotated with each bar's share of all rows
# NOTE(review): newer seaborn releases warn when `palette` is passed without
# `hue`; confirm against the installed version.
ax = plt.subplot(3, 3, 1)
sns.countplot(data=df, x='Churn', palette='Set2', ax=ax)
_label_panel(ax, 'Churn Distribution (Original)', 'Churn Status', 'Count')

total = len(df)
for bar in ax.patches:
    bar_height = bar.get_height()
    # Place the label slightly above the top-centre of the bar
    ax.text(bar.get_x() + bar.get_width()/2., bar_height + 30,
            f'{bar_height/total*100:.1f}%', ha='center', fontsize=11)

# Panel 2: histogram of tenure with a KDE overlay
ax = plt.subplot(3, 3, 2)
sns.histplot(df['tenure'], bins=30, kde=True, color='skyblue', ax=ax)
_label_panel(ax, 'Customer Tenure Distribution', 'Tenure (months)', 'Frequency')

# Panel 3: monthly charges split by churn status
ax = plt.subplot(3, 3, 3)
sns.boxplot(data=df, x='Churn', y='MonthlyCharges', palette='Set2', ax=ax)
_label_panel(ax, 'Monthly Charges by Churn Status', 'Churn Status',
             'Monthly Charges ($)')

# Panel 4: churn share per contract type
ax = plt.subplot(3, 3, 4)
contract_churn = _churn_share_by('Contract')
_stacked_bar_panel(ax, contract_churn, 'coolwarm',
                   'Churn Rate by Contract Type', 'Contract Type')

# Panel 5: correlation between the numeric columns
ax = plt.subplot(3, 3, 5)
corr_matrix = df[numerical_cols].corr()
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', center=0,
            square=True, fmt='.2f', cbar_kws={"shrink": 0.8}, ax=ax)
ax.set_title('Numerical Features Correlation', fontsize=14, fontweight='bold')

# Panel 6: churn share per payment method (tick labels tilted)
ax = plt.subplot(3, 3, 6)
payment_churn = _churn_share_by('PaymentMethod')
_stacked_bar_panel(ax, payment_churn, 'viridis',
                   'Churn Rate by Payment Method', 'Payment Method')
plt.xticks(rotation=45, ha='right')  # acts on the current axes, i.e. panel 6

# Panel 7: churn share per internet service
ax = plt.subplot(3, 3, 7)
internet_churn = _churn_share_by('InternetService')
_stacked_bar_panel(ax, internet_churn, 'summer',
                   'Churn Rate by Internet Service', 'Internet Service')

# Panel 8: raw churn counts per gender (not normalised)
ax = plt.subplot(3, 3, 8)
gender_churn = pd.crosstab(df['gender'], df['Churn'])
_stacked_bar_panel(ax, gender_churn, 'Pastel1',
                   'Churn Distribution by Gender', 'Gender', ylabel='Count')

# Panel 9: churn share for seniors vs non-seniors
ax = plt.subplot(3, 3, 9)
senior_churn = _churn_share_by('SeniorCitizen')
# Relabels the two rows by position - presumably 0 then 1; verify the encoding
senior_churn.index = ['Non-Senior', 'Senior']
_stacked_bar_panel(ax, senior_churn, 'RdYlBu_r',
                   'Churn Rate by Senior Citizen Status', 'Senior Citizen Status')

# Lay out the grid and write it to disk
plt.tight_layout()
eda_path = os.path.join(DRIVE_PATH, 'eda_visualizations.png')
plt.savefig(eda_path, dpi=300, bbox_inches='tight')
print(f"\n EDA visualizations saved as '{eda_path}'")

# Show EDA insights
print("\n" + "="*60)
print("KEY EDA INSIGHTS:")
print("="*60)
# Insight 1 takes its percentages from `churn_percent` (computed in the class
# distribution step) so the text cannot drift from the data actually loaded.
print("1. Class Imbalance: Significant imbalance "
      f"({churn_percent['No']:.0f}% No Churn vs {churn_percent['Yes']:.0f}% Churn)")
# NOTE(review): insights 2-7 are fixed statements, not derived from `df` here;
# re-check them against the plots if the dataset changes.
print("2. Monthly Charges: Churned customers tend to have higher monthly charges")
print("3. Contract Type: Month-to-month contracts have highest churn rate")
print("4. Tenure: Long-term customers less likely to churn")
print("5. Payment Method: Electronic check has highest churn rate")
print("6. Internet Service: Fiber optic customers more likely to churn")
print("7. Senior Citizens: Slightly higher churn rate among seniors")

print("\n EDA completed!")