In [1]:
# Customer Churn Prediction - Exploratory Data Analysis
# CodSoft ML Internship - Task 3
# Author: Chandan Kumar

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, StandardScaler
import warnings
warnings.filterwarnings('ignore')

print("="*70)
print("CUSTOMER CHURN PREDICTION - EXPLORATORY DATA ANALYSIS")
print("="*70)

CUSTOMER CHURN PREDICTION - EXPLORATORY DATA ANALYSIS


In [2]:
# 1. LOAD DATASET

print("\nüìÇ Loading dataset...")
df = pd.read_csv('../data/Churn_Modelling.csv')

print(f"‚úÖ Dataset loaded successfully!")
print(f"   Shape: {df.shape}")
print(f"   Rows: {df.shape[0]:,}")
print(f"   Columns: {df.shape[1]}")


üìÇ Loading dataset...
‚úÖ Dataset loaded successfully!
   Shape: (10000, 14)
   Rows: 10,000
   Columns: 14


In [3]:
# 2. INITIAL DATA INSPECTION

print("\n" + "="*70)
print("DATA OVERVIEW")
print("="*70)

print("\nüìä First 5 rows:")
print(df.head())

print("\nüìã Dataset Info:")
print(df.info())

print("\nüìà Statistical Summary:")
print(df.describe())

print("\nüîç Column Names:")
print(df.columns.tolist())


DATA OVERVIEW

üìä First 5 rows:
   RowNumber  CustomerId   Surname  CreditScore Geography  Gender  Age  \
0          1    15634602  Hargrave          619    France  Female   42   
1          2    15647311      Hill          608     Spain  Female   41   
2          3    15619304      Onio          502    France  Female   42   
3          4    15701354      Boni          699    France  Female   39   
4          5    15737888  Mitchell          850     Spain  Female   43   

   Tenure    Balance  NumOfProducts  HasCrCard  IsActiveMember  \
0       2       0.00              1          1               1   
1       1   83807.86              1          0               1   
2       8  159660.80              3          1               0   
3       1       0.00              2          0               0   
4       2  125510.82              1          1               1   

   EstimatedSalary  Exited  
0        101348.88       1  
1        112542.58       0  
2        113931.57       1  
3      

In [4]:
# 3. MISSING VALUES CHECK

print("\n" + "="*70)
print("MISSING VALUES ANALYSIS")
print("="*70)

missing_values = df.isnull().sum()
missing_percent = (missing_values / len(df)) * 100

missing_df = pd.DataFrame({
    'Missing_Count': missing_values,
    'Percentage': missing_percent
})

print(missing_df[missing_df['Missing_Count'] > 0])

if missing_values.sum() == 0:
    print("‚úÖ No missing values found!")
else:
    print(f"‚ö†Ô∏è  Total missing values: {missing_values.sum()}")


MISSING VALUES ANALYSIS
Empty DataFrame
Columns: [Missing_Count, Percentage]
Index: []
‚úÖ No missing values found!


In [5]:
# 4. CHURN DISTRIBUTION

print("\n" + "="*70)
print("CHURN DISTRIBUTION ANALYSIS")
print("="*70)

churn_counts = df['Exited'].value_counts()
churn_percentages = df['Exited'].value_counts(normalize=True) * 100

print("\nüìä Customer Status:")
print(f"   Retained (0): {churn_counts[0]:,} ({churn_percentages[0]:.2f}%)")
print(f"   Churned (1): {churn_counts[1]:,} ({churn_percentages[1]:.2f}%)")

churn_ratio = churn_counts[1] / churn_counts[0]
print(f"\nüìà Churn Rate: {churn_percentages[1]:.2f}%")
print(f"   Imbalance Ratio: 1:{1/churn_ratio:.2f}")

# Visualization - Churn Distribution
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Bar plot
axes[0].bar(['Retained', 'Churned'], churn_counts.values, 
            color=['green', 'red'], alpha=0.7)
axes[0].set_ylabel('Count')
axes[0].set_title('Customer Churn Distribution')
for i, v in enumerate(churn_counts.values):
    axes[0].text(i, v, f'{v:,}', ha='center', va='bottom')

# Pie chart
colors = ['lightgreen', 'lightcoral']
axes[1].pie(churn_counts.values, labels=['Retained', 'Churned'], 
            autopct='%1.2f%%', colors=colors, startangle=90)
axes[1].set_title('Customer Churn Percentage')

plt.tight_layout()
plt.savefig('../images/churn_distribution.png', dpi=300, bbox_inches='tight')
print("\n‚úÖ Visualization saved: ../images/churn_distribution.png")
plt.close()


CHURN DISTRIBUTION ANALYSIS

üìä Customer Status:
   Retained (0): 7,963 (79.63%)
   Churned (1): 2,037 (20.37%)

üìà Churn Rate: 20.37%
   Imbalance Ratio: 1:3.91

‚úÖ Visualization saved: ../images/churn_distribution.png


In [6]:
# 5. DEMOGRAPHIC ANALYSIS

print("\n" + "="*70)
print("DEMOGRAPHIC ANALYSIS")
print("="*70)

# Geography
print("\nüåç Geography Distribution:")
print(df['Geography'].value_counts())

# Gender
print("\nüë• Gender Distribution:")
print(df['Gender'].value_counts())

# Churn by Geography
print("\nüåç Churn Rate by Geography:")
geo_churn = df.groupby('Geography')['Exited'].agg(['sum', 'count', 'mean'])
geo_churn.columns = ['Churned', 'Total', 'Churn_Rate']
geo_churn['Churn_Rate'] = geo_churn['Churn_Rate'] * 100
print(geo_churn)

# Churn by Gender
print("\nüë• Churn Rate by Gender:")
gender_churn = df.groupby('Gender')['Exited'].agg(['sum', 'count', 'mean'])
gender_churn.columns = ['Churned', 'Total', 'Churn_Rate']
gender_churn['Churn_Rate'] = gender_churn['Churn_Rate'] * 100
print(gender_churn)

# Visualization - Demographics
fig, axes = plt.subplots(2, 2, figsize=(14, 10))

# Geography distribution
df['Geography'].value_counts().plot(kind='bar', ax=axes[0, 0], color='skyblue')
axes[0, 0].set_title('Geography Distribution')
axes[0, 0].set_ylabel('Count')

# Gender distribution
df['Gender'].value_counts().plot(kind='bar', ax=axes[0, 1], color='lightcoral')
axes[0, 1].set_title('Gender Distribution')
axes[0, 1].set_ylabel('Count')

# Churn by Geography
geo_churn_pct = df.groupby(['Geography', 'Exited']).size().unstack()
geo_churn_pct.plot(kind='bar', ax=axes[1, 0], color=['green', 'red'])
axes[1, 0].set_title('Churn by Geography')
axes[1, 0].set_ylabel('Count')
axes[1, 0].legend(['Retained', 'Churned'])

# Churn by Gender
gender_churn_pct = df.groupby(['Gender', 'Exited']).size().unstack()
gender_churn_pct.plot(kind='bar', ax=axes[1, 1], color=['green', 'red'])
axes[1, 1].set_title('Churn by Gender')
axes[1, 1].set_ylabel('Count')
axes[1, 1].legend(['Retained', 'Churned'])

plt.tight_layout()
plt.savefig('../images/demographic_analysis.png', dpi=300, bbox_inches='tight')
print("\n‚úÖ Demographic analysis saved: ../images/demographic_analysis.png")
plt.close()


DEMOGRAPHIC ANALYSIS

üåç Geography Distribution:
Geography
France     5014
Germany    2509
Spain      2477
Name: count, dtype: int64

üë• Gender Distribution:
Gender
Male      5457
Female    4543
Name: count, dtype: int64

üåç Churn Rate by Geography:
           Churned  Total  Churn_Rate
Geography                            
France         810   5014   16.154767
Germany        814   2509   32.443204
Spain          413   2477   16.673395

üë• Churn Rate by Gender:
        Churned  Total  Churn_Rate
Gender                            
Female     1139   4543   25.071539
Male        898   5457   16.455928

‚úÖ Demographic analysis saved: ../images/demographic_analysis.png


In [7]:
# 6. NUMERICAL FEATURES ANALYSIS

print("\n" + "="*70)
print("NUMERICAL FEATURES ANALYSIS")
print("="*70)

numerical_cols = ['CreditScore', 'Age', 'Tenure', 'Balance', 
                  'NumOfProducts', 'EstimatedSalary']

print("\nüìä Numerical Features Summary:")
print(df[numerical_cols].describe())

# Age Analysis
print("\nüìä Age Statistics:")
print(f"   Mean Age: {df['Age'].mean():.2f} years")
print(f"   Median Age: {df['Age'].median():.0f} years")
print(f"   Age Range: {df['Age'].min():.0f} - {df['Age'].max():.0f} years")

# Balance Analysis
print("\nüí∞ Balance Statistics:")
print(f"   Mean Balance: ${df['Balance'].mean():,.2f}")
print(f"   Median Balance: ${df['Balance'].median():,.2f}")
print(f"   Zero Balance Customers: {(df['Balance'] == 0).sum():,}")

# Tenure Analysis
print("\n‚è±Ô∏è  Tenure Statistics:")
print(f"   Mean Tenure: {df['Tenure'].mean():.2f} years")
print(f"   Median Tenure: {df['Tenure'].median():.0f} years")

# Visualization - Numerical Features
fig, axes = plt.subplots(2, 3, figsize=(16, 10))

for idx, col in enumerate(numerical_cols):
    row = idx // 3
    col_idx = idx % 3
    
    # Distribution for churned vs retained
    df[df['Exited'] == 0][col].hist(bins=30, alpha=0.7, 
                                     label='Retained', color='green', ax=axes[row, col_idx])
    df[df['Exited'] == 1][col].hist(bins=30, alpha=0.7, 
                                     label='Churned', color='red', ax=axes[row, col_idx])
    axes[row, col_idx].set_title(f'{col} Distribution')
    axes[row, col_idx].legend()
    axes[row, col_idx].set_xlabel(col)
    axes[row, col_idx].set_ylabel('Frequency')

plt.tight_layout()
plt.savefig('../images/numerical_features.png', dpi=300, bbox_inches='tight')
print("\n‚úÖ Numerical features analysis saved: ../images/numerical_features.png")
plt.close()


NUMERICAL FEATURES ANALYSIS

üìä Numerical Features Summary:
        CreditScore           Age        Tenure        Balance  NumOfProducts  \
count  10000.000000  10000.000000  10000.000000   10000.000000   10000.000000   
mean     650.528800     38.921800      5.012800   76485.889288       1.530200   
std       96.653299     10.487806      2.892174   62397.405202       0.581654   
min      350.000000     18.000000      0.000000       0.000000       1.000000   
25%      584.000000     32.000000      3.000000       0.000000       1.000000   
50%      652.000000     37.000000      5.000000   97198.540000       1.000000   
75%      718.000000     44.000000      7.000000  127644.240000       2.000000   
max      850.000000     92.000000     10.000000  250898.090000       4.000000   

       EstimatedSalary  
count     10000.000000  
mean     100090.239881  
std       57510.492818  
min          11.580000  
25%       51002.110000  
50%      100193.915000  
75%      149388.247500  
max    

In [8]:
# 7. CORRELATION ANALYSIS

print("\n" + "="*70)
print("CORRELATION ANALYSIS")
print("="*70)

# Select numerical columns for correlation
corr_data = df[numerical_cols + ['Exited']].copy()

# Calculate correlation
correlation = corr_data.corr()

# Correlation with Churn
print("\nüî• Features Correlation with Churn:")
churn_corr = correlation['Exited'].sort_values(ascending=False)
print(churn_corr)

# Visualization - Correlation Matrix
plt.figure(figsize=(10, 8))
sns.heatmap(correlation, annot=True, fmt='.2f', cmap='coolwarm', 
            center=0, square=True, linewidths=1)
plt.title('Feature Correlation Matrix')
plt.tight_layout()
plt.savefig('../images/correlation_matrix.png', dpi=300, bbox_inches='tight')
print("\n‚úÖ Correlation matrix saved: ../images/correlation_matrix.png")
plt.close()


CORRELATION ANALYSIS

üî• Features Correlation with Churn:
Exited             1.000000
Age                0.285323
Balance            0.118533
EstimatedSalary    0.012097
Tenure            -0.014001
CreditScore       -0.027094
NumOfProducts     -0.047820
Name: Exited, dtype: float64

‚úÖ Correlation matrix saved: ../images/correlation_matrix.png


In [10]:
# 8. PRODUCT & CARD ANALYSIS

print("\n" + "="*70)
print("PRODUCT & CARD ANALYSIS")
print("="*70)

# Number of Products
print("\nüì¶ Number of Products Distribution:")
print(df['NumOfProducts'].value_counts().sort_index())

# HasCrCard
print("\nüí≥ Credit Card Holders:")
card_dist = df['HasCrCard'].value_counts()
print(f"   Without Card: {card_dist[0]:,}")
print(f"   With Card: {card_dist[1]:,}")

# IsActiveMember
print("\n‚ú® Active Members:")
active_dist = df['IsActiveMember'].value_counts()
print(f"   Inactive: {active_dist[0]:,}")
print(f"   Active: {active_dist[1]:,}")

# Churn by Products
print("\nüì¶ Churn Rate by Number of Products:")
products_churn = df.groupby('NumOfProducts')['Exited'].agg(['sum', 'count', 'mean'])
products_churn.columns = ['Churned', 'Total', 'Churn_Rate']
products_churn['Churn_Rate'] = products_churn['Churn_Rate'] * 100
print(products_churn)

# Visualization
fig, axes = plt.subplots(2, 2, figsize=(14, 10))

# Products distribution
df.groupby(['NumOfProducts', 'Exited']).size().unstack().plot(kind='bar', 
                                                                ax=axes[0, 0], 
                                                                color=['green', 'red'])
axes[0, 0].set_title('Churn by Number of Products')
axes[0, 0].legend(['Retained', 'Churned'])

# Credit Card
df.groupby(['HasCrCard', 'Exited']).size().unstack().plot(kind='bar', 
                                                            ax=axes[0, 1], 
                                                            color=['green', 'red'])
axes[0, 1].set_title('Churn by Credit Card Status')
axes[0, 1].set_xticklabels(['No Card', 'Has Card'], rotation=0)
axes[0, 1].legend(['Retained', 'Churned'])

# Active Member
df.groupby(['IsActiveMember', 'Exited']).size().unstack().plot(kind='bar', 
                                                                 ax=axes[1, 0], 
                                                                 color=['green', 'red'])
axes[1, 0].set_title('Churn by Active Member Status')
axes[1, 0].set_xticklabels(['Inactive', 'Active'], rotation=0)
axes[1, 0].legend(['Retained', 'Churned'])

# Churn Rate by Products
products_churn['Churn_Rate'].plot(kind='bar', ax=axes[1, 1], color='orange')
axes[1, 1].set_title('Churn Rate by Number of Products')
axes[1, 1].set_ylabel('Churn Rate (%)')
axes[1, 1].set_xlabel('Number of Products')

plt.tight_layout()
plt.savefig('../images/product_card_analysis.png', dpi=300, bbox_inches='tight')
print("\n‚úÖ Product & card analysis saved: ../imaages/product_card_analysis.png")
plt.close()


PRODUCT & CARD ANALYSIS

üì¶ Number of Products Distribution:
NumOfProducts
1    5084
2    4590
3     266
4      60
Name: count, dtype: int64

üí≥ Credit Card Holders:
   Without Card: 2,945
   With Card: 7,055

‚ú® Active Members:
   Inactive: 4,849
   Active: 5,151

üì¶ Churn Rate by Number of Products:
               Churned  Total  Churn_Rate
NumOfProducts                            
1                 1409   5084   27.714398
2                  348   4590    7.581699
3                  220    266   82.706767
4                   60     60  100.000000

‚úÖ Product & card analysis saved: ../imaages/product_card_analysis.png


In [None]:
# 9. DATA PREPROCESSING

print("\n" + "="*70)
print("DATA PREPROCESSING")
print("="*70)

# Create a copy for preprocessing
df_processed = df.copy()

# Remove unnecessary columns
columns_to_drop = ['RowNumber', 'CustomerId', 'Surname']
df_processed = df_processed.drop(columns=columns_to_drop)
print(f"\n‚úÖ Dropped columns: {columns_to_drop}")

# Encode categorical variables
print("\nüîÑ Encoding categorical variables...")

# Label Encoding for Gender
le_gender = LabelEncoder()
df_processed['Gender'] = le_gender.fit_transform(df_processed['Gender'])
print(f"   Gender: {dict(zip(le_gender.classes_, le_gender.transform(le_gender.classes_)))}")

# One-Hot Encoding for Geography
df_processed = pd.get_dummies(df_processed, columns=['Geography'], drop_first=True)
print(f"   Geography: One-hot encoded")

# Scale numerical features
print("\n‚öñÔ∏è  Scaling numerical features...")
scaler = StandardScaler()
features_to_scale = ['CreditScore', 'Age', 'Tenure', 'Balance', 'EstimatedSalary']
df_processed[features_to_scale] = scaler.fit_transform(df_processed[features_to_scale])
print("   Features scaled: CreditScore, Age, Tenure, Balance, EstimatedSalary")

# Save preprocessed data
df_processed.to_csv('../data/Churn_Modelling_processed.csv', index=False)
print("\n‚úÖ Processed data saved: ../data/Churn_Modelling_processed.csv")

# Save encoders and scaler
import joblib
joblib.dump(le_gender, '../artifacts/label_encoder_gender.pkl')
joblib.dump(scaler, '../artifacts/scaler.pkl')
print("‚úÖ Encoders and scaler saved")


DATA PREPROCESSING

‚úÖ Dropped columns: ['RowNumber', 'CustomerId', 'Surname']

üîÑ Encoding categorical variables...
   Gender: {'Female': np.int64(0), 'Male': np.int64(1)}
   Geography: One-hot encoded

‚öñÔ∏è  Scaling numerical features...
   Features scaled: CreditScore, Age, Tenure, Balance, EstimatedSalary

‚úÖ Processed data saved: ../artifacts/Churn_Modelling_processed.csv
‚úÖ Encoders and scaler saved


In [12]:
# 10. KEY INSIGHTS

print("\n" + "="*70)
print("KEY INSIGHTS")
print("="*70)

print(f"""
üîç Dataset Overview:
   - Total Customers: {len(df):,}
   - Churned Customers: {churn_counts[1]:,} ({churn_percentages[1]:.2f}%)
   - Churn Rate: {churn_percentages[1]:.2f}%

üìä Key Findings:

1. CHURN DISTRIBUTION
   - Moderate imbalance: ~{churn_percentages[1]:.0f}% churn rate
   - Most customers are retained

2. DEMOGRAPHICS
   - Geography: {geo_churn['Churn_Rate'].idxmax()} has highest churn rate ({geo_churn['Churn_Rate'].max():.2f}%)
   - Gender: {gender_churn['Churn_Rate'].idxmax()} customers churn more ({gender_churn['Churn_Rate'].max():.2f}%)

3. AGE FACTOR
   - Mean age: {df['Age'].mean():.0f} years
   - Age appears to be correlated with churn

4. PRODUCT ENGAGEMENT
   - Most customers have {df['NumOfProducts'].mode()[0]} product(s)
   - Product count affects churn rate

5. ACTIVITY STATUS
   - Active members: {(df['IsActiveMember']==1).sum():,}
   - Inactive members show higher churn

üí° RECOMMENDATIONS:
   - Focus on {geo_churn['Churn_Rate'].idxmax()} geography
   - Target {gender_churn['Churn_Rate'].idxmax()} customers
   - Engage customers with multiple products
   - Activate inactive members
   - Monitor customers aged {df[df['Exited']==1]['Age'].mean():.0f}+ years
""")

# Summary statistics
summary = {
    'total_customers': int(len(df)),
    'churned_customers': int(churn_counts[1]),
    'retained_customers': int(churn_counts[0]),
    'churn_rate': float(churn_percentages[1]),
    'mean_age': float(df['Age'].mean()),
    'mean_balance': float(df['Balance'].mean()),
    'geography_highest_churn': geo_churn['Churn_Rate'].idxmax(),
    'gender_highest_churn': gender_churn['Churn_Rate'].idxmax()
}

import json
with open('../artifacts/eda_summary.json', 'w') as f:
    json.dump(summary, f, indent=4)
print("\n‚úÖ Summary saved: ../artifacts/eda_summary.json")

print("\n" + "="*70)
print("‚úÖ EXPLORATORY DATA ANALYSIS COMPLETED!")
print("="*70)

print("\nüìÅ Generated Files:")
print("   ‚úÖ ../artifacts/churn_distribution.png")
print("   ‚úÖ ../artifacts/demographic_analysis.png")
print("   ‚úÖ ../artifacts/numerical_features.png")
print("   ‚úÖ ../artifacts/correlation_matrix.png")
print("   ‚úÖ ../artifacts/product_card_analysis.png")
print("   ‚úÖ ../artifacts/label_encoder_gender.pkl")
print("   ‚úÖ ../artifacts/scaler.pkl")
print("   ‚úÖ ../artifacts/eda_summary.json")
print("   ‚úÖ ../artifacts/Churn_Modelling_processed.csv")

print("\nüöÄ Next Steps:")
print("   1. Run model_training.ipynb to build churn prediction models")
print("   2. Focus on key features identified")
print("   3. Handle moderate class imbalance")


KEY INSIGHTS

üîç Dataset Overview:
   - Total Customers: 10,000
   - Churned Customers: 2,037 (20.37%)
   - Churn Rate: 20.37%

üìä Key Findings:

1. CHURN DISTRIBUTION
   - Moderate imbalance: ~20% churn rate
   - Most customers are retained

2. DEMOGRAPHICS
   - Geography: Germany has highest churn rate (32.44%)
   - Gender: Female customers churn more (25.07%)

3. AGE FACTOR
   - Mean age: 39 years
   - Age appears to be correlated with churn

4. PRODUCT ENGAGEMENT
   - Most customers have 1 product(s)
   - Product count affects churn rate

5. ACTIVITY STATUS
   - Active members: 5,151
   - Inactive members show higher churn

üí° RECOMMENDATIONS:
   - Focus on Germany geography
   - Target Female customers
   - Engage customers with multiple products
   - Activate inactive members
   - Monitor customers aged 45+ years


‚úÖ Summary saved: ../artifacts/eda_summary.json

‚úÖ EXPLORATORY DATA ANALYSIS COMPLETED!

üìÅ Generated Files:
   ‚úÖ ../artifacts/churn_distribution.png
   