# Case Study 1: Customer Churn Prediction (Classification)

**Student Name:** [Your Name]  
**SRN:** [Your SRN]  
**Dataset:** customer_churn_data.csv  
**Objective:** Predict customer churn using classification algorithms

## Task 1: Data Loading and Exploration

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
import warnings
warnings.filterwarnings('ignore')

# Set style for better plots
# Use matplotlib's 'default' style sheet and make seaborn's "husl" palette the active colour palette.
plt.style.use('default')
sns.set_palette("husl")

In [None]:
# Load the dataset
# The CSV is referenced by a relative path, so it is resolved against the current working directory.
df = pd.read_csv('customer_churn_data.csv')

# Basic information about the dataset
# df.info() writes its own report, which is why it is not wrapped in print().
print("Dataset Shape:", df.shape)
print("\nDataset Info:")
df.info()

In [None]:
# Display first few rows
print("First 5 rows:")
# Left as the final bare expression — presumably so the notebook renders the returned frame as the cell output.
df.head()

In [None]:
# Check for missing values
# Count nulls per column and keep only the columns that actually have some.
missing_values = df.isnull().sum()
columns_with_missing = missing_values[missing_values > 0]

print("Missing Values:")
if columns_with_missing.empty:
    # Print the message instead of an empty Series, which would read as noise.
    print("No missing values found!")
else:
    print(columns_with_missing)

# NOTE(review): isnull() only sees real NaN/None values. Blank strings in a
# text-typed column (e.g. total_charges, if it was read as text) are not
# counted here — confirm against the raw CSV.

In [None]:
# Summary statistics
print("Summary Statistics:")
# Left as the final bare expression — presumably so the notebook renders the summary table as the cell output.
df.describe()

In [None]:
# Check data types
# Prints the dtype pandas assigned to each column.
print("Data Types:")
print(df.dtypes)

## Task 2: Exploratory Data Analysis (EDA)

In [None]:
# Analyze churn distribution: class shares as a pie chart next to raw counts as bars
churn_counts = df['churn'].value_counts()

fig, (pie_ax, count_ax) = plt.subplots(1, 2, figsize=(12, 5))

# Left panel: percentage share of each churn label
pie_ax.pie(churn_counts.values, labels=churn_counts.index, autopct='%1.1f%%', startangle=90)
pie_ax.set_title('Churn Distribution')

# Right panel: absolute number of rows per churn label
sns.countplot(data=df, x='churn', ax=count_ax)
count_ax.set_title('Churn Count')

plt.tight_layout()
plt.show()

# Calculate churn rate as the percentage of rows whose label equals 'Yes'
# NOTE(review): assumes churned customers are labelled 'Yes' — confirm against the CSV
is_churned = df['churn'] == 'Yes'
churn_rate = is_churned.mean() * 100
print(f"Churn Rate: {churn_rate:.2f}%")

In [None]:
# Demographic patterns analysis
# Each entry describes one panel of the 2x3 grid, in drawing order:
# (plot kind, column to analyse, panel title).
# 'box'   -> distribution of a numeric column split by churn
# 'count' -> category counts with bars coloured by churn
demographic_panels = [
    ('box', 'age', 'Age Distribution by Churn'),
    ('count', 'gender', 'Gender vs Churn'),
    ('count', 'senior_citizen', 'Senior Citizen vs Churn'),
    ('count', 'partner', 'Partner vs Churn'),
    ('count', 'dependents', 'Dependents vs Churn'),
    ('box', 'tenure', 'Tenure vs Churn'),
]

plt.figure(figsize=(15, 10))

for slot, (plot_kind, column, heading) in enumerate(demographic_panels, start=1):
    plt.subplot(2, 3, slot)
    if plot_kind == 'box':
        sns.boxplot(data=df, x='churn', y=column)
    else:
        sns.countplot(data=df, x=column, hue='churn')
    plt.title(heading)

plt.tight_layout()
plt.show()

In [None]:
# Service usage impact analysis
# One count plot per service attribute, bars coloured by churn.
# Each entry: (column, panel title, whether to rotate the x tick labels by 45 degrees).
service_panels = [
    ('internet_service', 'Internet Service vs Churn', True),
    ('contract', 'Contract Type vs Churn', True),
    ('payment_method', 'Payment Method vs Churn', True),
    ('phone_service', 'Phone Service vs Churn', False),
    ('paperless_billing', 'Paperless Billing vs Churn', False),
    ('multiple_lines', 'Multiple Lines vs Churn', True),
]

plt.figure(figsize=(15, 10))

for slot, (column, heading, rotate_labels) in enumerate(service_panels, start=1):
    plt.subplot(2, 3, slot)
    sns.countplot(data=df, x=column, hue='churn')
    plt.title(heading)
    if rotate_labels:
        plt.xticks(rotation=45)

plt.tight_layout()
plt.show()

In [None]:
# Financial factors analysis
# Convert total_charges to numeric (it might be stored as string); entries that
# cannot be parsed become NaN. This overwrites the column on df itself.
df['total_charges'] = pd.to_numeric(df['total_charges'], errors='coerce')

# One box plot per financial/satisfaction measure, split by churn: (column, panel title)
financial_panels = [
    ('monthly_charges', 'Monthly Charges vs Churn'),
    ('total_charges', 'Total Charges vs Churn'),
    ('customer_satisfaction', 'Customer Satisfaction vs Churn'),
]

plt.figure(figsize=(15, 5))

for slot, (measure, heading) in enumerate(financial_panels, start=1):
    plt.subplot(1, 3, slot)
    sns.boxplot(data=df, x='churn', y=measure)
    plt.title(heading)

plt.tight_layout()
plt.show()

In [None]:
# Correlation analysis for numerical features
# Restrict to numeric dtypes, then compute the pairwise correlation table.
numerical_cols = df.select_dtypes(include=[np.number]).columns
correlation_matrix = df[numerical_cols].corr()

# Annotated heatmap with the colour scale centred on zero
fig, heatmap_ax = plt.subplots(figsize=(12, 8))
sns.heatmap(
    correlation_matrix,
    annot=True,
    cmap='coolwarm',
    center=0,
    square=True,
    ax=heatmap_ax,
)
heatmap_ax.set_title('Correlation Matrix of Numerical Features')
plt.show()

## Task 3: Data Preprocessing

In [None]:
# Create a copy for preprocessing so the frame used for EDA is left untouched
df_processed = df.copy()

# Handle any missing values in total_charges.
# The numeric conversion is repeated here so this cell does not depend on an
# earlier plotting cell having run; a text-typed total_charges would otherwise
# be label-encoded as a category below. Unparseable entries become NaN and are
# then filled with 0.
# The result is assigned back instead of using `Series.fillna(..., inplace=True)`
# on a column selection, which is deprecated and has no effect under pandas
# copy-on-write.
df_processed['total_charges'] = pd.to_numeric(
    df_processed['total_charges'], errors='coerce'
).fillna(0)

# Encode categorical variables
# Each object-typed column gets its own LabelEncoder, which is kept in
# `label_encoders` so the integer codes can be mapped back to labels later.
label_encoders = {}
categorical_columns = df_processed.select_dtypes(include=['object']).columns

for col in categorical_columns:
    if col == 'customer_id':  # Don't encode customer_id
        continue
    le = LabelEncoder()
    df_processed[col] = le.fit_transform(df_processed[col])
    label_encoders[col] = le

print("Categorical variables encoded:")
print(list(label_encoders.keys()))

In [None]:
# Feature engineering - derive two extra columns from total_charges

# Average charge per month of tenure; the +1 keeps the divisor non-zero when tenure is 0
months_plus_one = df_processed['tenure'] + 1
df_processed['avg_monthly_charges'] = df_processed['total_charges'] / months_plus_one

# Binary flag: 1 when total_charges is strictly above the median, else 0
median_charges = df_processed['total_charges'].median()
is_above_median = df_processed['total_charges'] > median_charges
df_processed['high_value_customer'] = is_above_median.astype(int)

print("New features created:")
print("- avg_monthly_charges")
print("- high_value_customer")

In [None]:
# Prepare features and target
# The target is the churn column; the features are everything except the
# target and customer_id, which is an identifier rather than a feature.
y = df_processed['churn']
X = df_processed.drop(columns=['customer_id', 'churn'])

print("Features shape:", X.shape)
print("Target shape:", y.shape)
print("\nFeatures:", X.columns.tolist())

In [None]:
# Split the data into train / validation / test partitions
# Both splits hold out 20%, use the same seed, and stratify on the target.
split_options = {'test_size': 0.2, 'random_state': 42}

# Step 1: hold out the test set from the full data
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, **split_options)

# Step 2: hold out the validation set from what remains
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, stratify=y_train, **split_options
)

for label, part in (("Training", X_train), ("Validation", X_val), ("Test", X_test)):
    print(f"{label} set size:", part.shape)

In [None]:
# Scale numerical features
# The scaler is fitted on the training partition only; the same fitted
# scaler is then applied to all three partitions.
scaler = StandardScaler()
scaler.fit(X_train)

X_train_scaled, X_val_scaled, X_test_scaled = (
    scaler.transform(partition) for partition in (X_train, X_val, X_test)
)

print("Features scaled successfully!")

## Task 4: Model Development

In [None]:
# Initialize models
# All estimators share random_state=42. max_iter is set to 1000 for
# logistic regression, presumably to avoid convergence warnings.
models = {
    'Logistic Regression': LogisticRegression(random_state=42, max_iter=1000),
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'Random Forest': RandomForestClassifier(random_state=42),
    'SVM': SVC(random_state=42)
}

# Hyperparameter grids
# Keys match `models`; each value is passed as-is to GridSearchCV's param_grid.
param_grids = {
    'Logistic Regression': {
        'C': [0.1, 1, 10],
        'solver': ['liblinear', 'lbfgs']
    },
    'Decision Tree': {
        'max_depth': [5, 10, 15, None],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4]
    },
    'Random Forest': {
        'n_estimators': [50, 100, 200],
        'max_depth': [5, 10, 15, None],
        'min_samples_split': [2, 5, 10]
    },
    # gamma is only used by the rbf kernel (the linear kernel ignores it), so
    # the SVM grid is a list of per-kernel sub-grids. Crossing gamma with
    # 'linear' would fit every linear candidate twice with identical results.
    'SVM': [
        {
            'kernel': ['linear'],
            'C': [0.1, 1, 10]
        },
        {
            'kernel': ['rbf'],
            'C': [0.1, 1, 10],
            'gamma': ['scale', 'auto']
        }
    ]
}

In [None]:
# Train models with hyperparameter tuning
# For each candidate model, run a 5-fold grid search scored by F1 on the
# scaled training data, keep the best estimator, and record its parameters
# and cross-validation score.
best_models = {}
training_results = {}

for name, model in models.items():
    print(f"Training {name}...")

    grid_search = GridSearchCV(
        estimator=model,
        param_grid=param_grids[name],
        scoring='f1',
        cv=5,
        n_jobs=-1,  # use all available cores
    )
    grid_search.fit(X_train_scaled, y_train)

    tuned_params = grid_search.best_params_
    cv_score = grid_search.best_score_

    best_models[name] = grid_search.best_estimator_
    training_results[name] = {'best_params': tuned_params, 'best_score': cv_score}

    print(f"Best parameters: {tuned_params}")
    print(f"Best cross-validation score: {cv_score:.4f}")
    print("-" * 50)

## Task 5: Model Evaluation

In [None]:
# Evaluate models on validation set
def evaluate_model(model, X, y, model_name):
    """Predict with ``model`` on ``X`` and score the predictions against ``y``.

    Args:
        model: Fitted estimator exposing ``predict``.
        X: Feature matrix to predict on.
        y: True labels for ``X``.
        model_name: Display name of the model; accepted but not used here.

    Returns:
        A ``(metrics, y_pred)`` tuple, where ``metrics`` is a dict with the
        keys 'Accuracy', 'Precision', 'Recall' and 'F1-Score' (in that order)
        and ``y_pred`` is the output of ``model.predict(X)``.
    """
    y_pred = model.predict(X)

    # Metric label -> scoring function, listed in the order the results are reported
    scorers = (
        ('Accuracy', accuracy_score),
        ('Precision', precision_score),
        ('Recall', recall_score),
        ('F1-Score', f1_score),
    )
    metrics = {label: scorer(y, y_pred) for label, scorer in scorers}

    return metrics, y_pred

# Evaluate all models
# Score every tuned model on the scaled validation set, keeping both the
# metric dict and the raw predictions under the model's name.
validation_results = {}
predictions = {}

for name, model in best_models.items():
    validation_results[name], predictions[name] = evaluate_model(
        model, X_val_scaled, y_val, name
    )

    print(f"{name} - Validation Results:")
    for metric, value in validation_results[name].items():
        print(f"  {metric}: {value:.4f}")
    print("-" * 40)

In [None]:
# Create comparison dataframe
# validation_results maps model name -> metric dict; transposing puts one model per row and one metric per column.
results_df = pd.DataFrame(validation_results).T
print("Model Comparison (Validation Set):")
print(results_df)

In [None]:
# Visualize model performance on a 2x2 grid
plt.figure(figsize=(15, 10))

# Panel 1: grouped bars of every validation metric per model
plt.subplot(2, 2, 1)
results_df.plot(kind='bar', ax=plt.gca())
plt.title('Model Performance Comparison')
plt.ylabel('Score')
plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
plt.xticks(rotation=45)

# Panels 2-4: confusion matrices. Only three panels remain on the grid,
# so only the first three models are shown.
shown_predictions = list(predictions.items())[:3]
for panel, (name, y_pred) in enumerate(shown_predictions, start=2):
    plt.subplot(2, 2, panel)
    cm = confusion_matrix(y_val, y_pred)
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
    plt.title(f'Confusion Matrix - {name}')
    plt.ylabel('Actual')
    plt.xlabel('Predicted')

plt.tight_layout()
plt.show()

In [None]:
# Select best model based on F1-score
# The winner is the row of the validation comparison table with the highest F1-Score.
best_model_name = results_df['F1-Score'].idxmax()
best_model = best_models[best_model_name]
best_validation_f1 = results_df.loc[best_model_name, 'F1-Score']

print(f"Best Model: {best_model_name}")
print(f"Best F1-Score: {best_validation_f1:.4f}")

# Test on test set
# Only the selected model is scored on the held-out test partition.
test_metrics, test_predictions = evaluate_model(
    best_model, X_test_scaled, y_test, best_model_name
)

print(f"\n{best_model_name} - Test Set Results:")
for metric_name, metric_value in test_metrics.items():
    print(f"  {metric_name}: {metric_value:.4f}")

In [None]:
# Final confusion matrix and classification report
fig, (cm_ax, importance_ax) = plt.subplots(1, 2, figsize=(12, 5))

# Left panel: confusion matrix of the selected model on the test set
cm_test = confusion_matrix(y_test, test_predictions)
sns.heatmap(cm_test, annot=True, fmt='d', cmap='Blues', ax=cm_ax)
cm_ax.set_title(f'Test Set Confusion Matrix - {best_model_name}')
cm_ax.set_ylabel('Actual')
cm_ax.set_xlabel('Predicted')

# Right panel: feature importance for tree-based models.
# Models without feature_importances_ get a placeholder message instead.
if not hasattr(best_model, 'feature_importances_'):
    importance_ax.text(
        0.5,
        0.5,
        f'{best_model_name}\ndoes not have\nfeature_importances_',
        ha='center',
        va='center',
        transform=importance_ax.transAxes,
        fontsize=12,
    )
    importance_ax.set_title('Feature Importance Not Available')
else:
    importance_table = pd.DataFrame({
        'feature': X.columns,
        'importance': best_model.feature_importances_
    })
    # Keep the ten largest importances, highest first
    feature_importance = importance_table.sort_values('importance', ascending=False).head(10)

    sns.barplot(data=feature_importance, x='importance', y='feature', ax=importance_ax)
    importance_ax.set_title(f'Top 10 Feature Importances - {best_model_name}')
    importance_ax.set_xlabel('Importance')

plt.tight_layout()
plt.show()

# Detailed classification report
print(f"\nDetailed Classification Report - {best_model_name}:")
print(classification_report(y_test, test_predictions))

## Summary and Conclusions

### Key Findings:
1. **Churn Rate**: [Add your analysis]
2. **Best Model**: [Add best model name and performance]
3. **Important Features**: [Add key features that influence churn]
4. **Business Insights**: [Add actionable insights for reducing churn]

### Model Performance:
- **Best Model**: [Model name]
- **Test Accuracy**: [Value]
- **Test F1-Score**: [Value]
- **Test Precision**: [Value]
- **Test Recall**: [Value]

### Recommendations:
1. [Add recommendation 1]
2. [Add recommendation 2]
3. [Add recommendation 3]