In [8]:
!pip install Faker



In [15]:
from google.colab import drive
import os
# Do NOT clear /content/drive before mounting. When Drive is already mounted
# there (e.g. this cell is re-run), `rm -rf /content/drive/*` operates on the
# mounted Drive itself and deletes the user's real files.
# drive.mount() detects an existing mount on its own, so mounting directly is
# safe to repeat. If the directory holds stale local files from a failed mount,
# inspect and remove them by hand rather than with a blanket recursive delete.
drive.mount('/content/drive')

Mounted at /content/drive


🔍 Key Features in the New Credit Card Dataset:

**Financial Behavior Features:**

Credit_Limit: Total credit line on the card (the unused portion is Avg_Open_To_Buy)

Total_Revolving_Bal: Outstanding balance

Avg_Utilization_Ratio: Share of the credit limit in use (Total_Revolving_Bal ÷ Credit_Limit, stored as a 0–1 ratio)

Total_Trans_Amt: Total transaction amount

Total_Trans_Ct: Number of transactions

**Customer Relationship Features:**

Months_on_book: Length of relationship

Total_Relationship_Count: Number of products with bank

Months_Inactive_12_mon: Recent inactivity

Contacts_Count_12_mon: Customer service contacts

**Risk Indicators:**

Card_Category: Blue/Silver/Gold/Platinum

Income_Category: Income brackets

Total_Amt_Chng_Q4_Q1: Spending pattern changes

Total_Ct_Chng_Q4_Q1: Transaction frequency changes


**💡 Why this dataset works well for an Amazon interview:**
This credit card churn dataset lets you demonstrate:


Financial risk modeling (default prediction, credit risk)

Customer segmentation (by card type, income, usage patterns)

Behavioral analytics (transaction patterns, utilization trends)

Regulatory compliance insights (risk management, portfolio health)

Revenue optimization (identifying high-value customers to retain)

In [16]:
import pandas as pd
import numpy as np
from faker import Faker
import random
from datetime import datetime
import warnings
import os
import random
import numpy as np
import pandas as pd
# Blanket filter: hide every warning category for the rest of the session.
warnings.filterwarnings(action='ignore')

# Reproducibility: pin each random source used by the generator to seed 42.
random.seed(42)
np.random.seed(42)
# The Faker instance is created first and the class-level seed applied after,
# matching the order the dataset was originally generated with.
fake = Faker()
Faker.seed(42)

def _churn_probability(age, months_inactive, contacts_count, total_trans_ct,
                       utilization_ratio, total_relationship_count,
                       months_on_book, card_category):
    """Return one customer's churn probability from additive risk adjustments.

    Starts at a 0.16 base rate, adds or subtracts a fixed amount per risk
    factor, and clamps the result to [0.02, 0.90] so that no customer is a
    certain churner or a certain stayer. Draws no random numbers.
    """
    # Base churn rate for credit cards (~16%)
    churn_prob = 0.16

    # Age factor (younger customers more likely to churn)
    if age < 35:
        churn_prob += 0.08
    elif age > 55:
        churn_prob -= 0.05

    # Activity factors
    if months_inactive >= 3:
        churn_prob += 0.25  # High inactivity = high churn
    elif months_inactive == 0:
        churn_prob -= 0.08  # Active customers less likely to churn

    # Contact frequency (too many contacts = churn risk)
    if contacts_count >= 4:
        churn_prob += 0.20
    elif contacts_count == 0:
        churn_prob += 0.05  # No contact might indicate disengagement

    # Transaction patterns
    if total_trans_ct < 30:
        churn_prob += 0.15  # Low transaction count
    elif total_trans_ct > 100:
        churn_prob -= 0.10  # High transaction count

    # Utilization patterns
    if utilization_ratio == 0:
        churn_prob += 0.20  # Not using card = likely to cancel
    elif utilization_ratio > 0.8:
        # NOTE(review): this branch cannot fire with the current generator,
        # because the revolving balance is drawn from at most 80% of the
        # credit limit. Kept as-is so the generated data does not change.
        churn_prob += 0.10  # High utilization might indicate stress
    elif 0.1 <= utilization_ratio <= 0.3:
        churn_prob -= 0.05  # Healthy utilization

    # Relationship depth
    if total_relationship_count == 1:
        churn_prob += 0.12  # Single product relationship
    elif total_relationship_count >= 4:
        churn_prob -= 0.15  # Deep relationship

    # Tenure effect
    if months_on_book < 20:
        churn_prob += 0.08  # New relationships more volatile
    elif months_on_book > 40:
        churn_prob -= 0.08  # Long tenure = loyalty

    # Card category effect
    if card_category == 'Blue':
        churn_prob += 0.03  # Basic cards higher churn
    elif card_category in ['Gold', 'Platinum']:
        churn_prob -= 0.05  # Premium cards lower churn

    # Clamp to [0.02, 0.90]
    return max(0.02, min(0.90, churn_prob))


def generate_credit_card_churn_dataset(n_records=100000, seed=None):
    """
    Generate a synthetic credit card customer churn dataset.

    Each record gets demographics, relationship/activity counters, credit and
    transaction figures, a last-transaction date, and an `Attrition_Flag`
    sampled from a rule-based churn probability (see `_churn_probability`).
    Progress and a final churn-rate summary are printed to stdout.

    Parameters
    ----------
    n_records : int, default 100000
        Number of customer rows to generate. `0` yields an empty DataFrame
        that still has all 22 columns.
    seed : int or None, default None
        When given, re-seeds NumPy's global RNG, the stdlib `random` module
        and Faker before generating, so repeated calls return the same
        NumPy-drawn values regardless of what ran earlier in the kernel.
        When None, the current global RNG state is used unchanged.

    Returns
    -------
    pandas.DataFrame
        One row per customer, columns in the order listed in `columns` below.

    Notes
    -----
    Relies on the module-level `fake` Faker instance for
    `Last_Transaction_Date`. That date is requested with the relative bounds
    '-3y'/'-1y'; presumably Faker resolves these against the current date, so
    the column may differ between runs on different days even with a fixed
    seed — verify before relying on it.
    NOTE(review): a last transaction 1-3 years ago is inconsistent with
    `Months_Inactive_12_mon` values of 0-6; left unchanged here.
    """
    if seed is not None:
        # Make the call self-contained instead of depending on a seeding cell
        # having been executed immediately before.
        np.random.seed(seed)
        random.seed(seed)
        Faker.seed(seed)

    print(f"Generating {n_records:,} credit card customer records...")

    # Output columns, in final DataFrame order
    columns = [
        'CLIENTNUM',
        'Customer_Age',
        'Gender',
        'Dependent_count',
        'Education_Level',
        'Marital_Status',
        'Income_Category',
        'Card_Category',
        'Months_on_book',
        'Total_Relationship_Count',
        'Months_Inactive_12_mon',
        'Contacts_Count_12_mon',
        'Credit_Limit',
        'Total_Revolving_Bal',
        'Avg_Open_To_Buy',
        'Total_Amt_Chng_Q4_Q1',
        'Total_Trans_Amt',
        'Total_Trans_Ct',
        'Total_Ct_Chng_Q4_Q1',
        'Avg_Utilization_Ratio',
        'Attrition_Flag',
        'Last_Transaction_Date',
    ]
    data = {column: [] for column in columns}

    # Define realistic distributions for credit card customers
    genders = ['M', 'F']
    gender_weights = [0.47, 0.53]  # Slightly more females

    education_levels = ['High School', 'College', 'Graduate', 'Post-Graduate', 'Doctorate', 'Uneducated', 'Unknown']
    education_weights = [0.20, 0.25, 0.30, 0.15, 0.05, 0.03, 0.02]

    marital_status = ['Married', 'Single', 'Divorced', 'Unknown']
    marital_weights = [0.46, 0.39, 0.12, 0.03]

    income_categories = ['Less than $40K', '$40K - $60K', '$60K - $80K', '$80K - $120K', '$120K +', 'Unknown']
    income_weights = [0.17, 0.24, 0.20, 0.21, 0.15, 0.03]

    card_categories = ['Blue', 'Silver', 'Gold', 'Platinum']
    card_weights = [0.80, 0.12, 0.05, 0.03]  # Most customers have Blue cards

    print("Generating credit card customer data...")

    # NOTE: the order of the np.random draws below determines the generated
    # values for a given seed; do not reorder them.
    for i in range(n_records):
        if i % 10000 == 0:
            print(f"Progress: {i:,}/{n_records:,} ({i/n_records*100:.1f}%)")

        # Basic customer info
        client_num = 700000000 + i  # Sequential 9-digit client number

        # Demographics
        age = int(np.random.normal(46, 8))
        age = max(18, min(73, age))  # Clamp to 18-73

        gender = np.random.choice(genders, p=gender_weights)

        dependent_count = np.random.choice([0, 1, 2, 3, 4, 5],
                                           p=[0.30, 0.25, 0.20, 0.15, 0.07, 0.03])

        education_level = np.random.choice(education_levels, p=education_weights)
        marital_status_val = np.random.choice(marital_status, p=marital_weights)
        income_category = np.random.choice(income_categories, p=income_weights)
        card_category = np.random.choice(card_categories, p=card_weights)

        # Relationship with bank
        months_on_book = int(np.random.normal(36, 8))
        months_on_book = max(13, min(56, months_on_book))  # 13-56 months range

        # Total relationship count (1-6 products)
        total_relationship_count = np.random.choice([1, 2, 3, 4, 5, 6],
                                                    p=[0.05, 0.30, 0.25, 0.20, 0.15, 0.05])

        # Months inactive (0-6 months)
        months_inactive = np.random.choice([0, 1, 2, 3, 4, 5, 6],
                                           p=[0.40, 0.20, 0.15, 0.10, 0.08, 0.05, 0.02])

        # Contacts count (0-6 times)
        contacts_count = np.random.choice([0, 1, 2, 3, 4, 5, 6],
                                          p=[0.35, 0.25, 0.20, 0.12, 0.05, 0.02, 0.01])

        # Credit limit (based on income and card category).
        # 'Unknown' income keeps the fixed 3000 default and draws no number.
        base_limit = 3000
        if income_category == 'Less than $40K':
            base_limit = np.random.uniform(1500, 4000)
        elif income_category == '$40K - $60K':
            base_limit = np.random.uniform(3000, 8000)
        elif income_category == '$60K - $80K':
            base_limit = np.random.uniform(6000, 15000)
        elif income_category == '$80K - $120K':
            base_limit = np.random.uniform(10000, 25000)
        elif income_category == '$120K +':
            base_limit = np.random.uniform(15000, 35000)

        # Adjust for card category (Blue keeps the base limit)
        if card_category == 'Silver':
            base_limit *= 1.2
        elif card_category == 'Gold':
            base_limit *= 1.5
        elif card_category == 'Platinum':
            base_limit *= 2.0

        credit_limit = round(base_limit, 0)

        # Revolving balance (0 to 80% of credit limit)
        revolving_balance = np.random.uniform(0, credit_limit * 0.8)
        revolving_balance = round(revolving_balance, 0)

        # Average open to buy = unused part of the credit line
        avg_open_to_buy = credit_limit - revolving_balance

        # Total transaction amount: log-normal draw, capped at 2x credit limit
        base_trans_amt = np.random.lognormal(8.5, 1.2)
        base_trans_amt = min(base_trans_amt, credit_limit * 2)

        total_trans_amt = round(base_trans_amt, 0)

        # Transaction count derived from the amount and a random divisor;
        # the per-transaction size is floored at 50 before dividing.
        avg_trans_size = total_trans_amt / max(1, np.random.uniform(20, 120))
        total_trans_ct = int(total_trans_amt / max(avg_trans_size, 50))
        total_trans_ct = max(10, min(total_trans_ct, 139))  # Clamp to 10-139

        # Quarter over quarter changes (clamped, 3 decimals)
        total_amt_chng_q4_q1 = np.random.normal(0.76, 0.4)
        total_amt_chng_q4_q1 = max(0.0, min(total_amt_chng_q4_q1, 3.4))
        total_amt_chng_q4_q1 = round(total_amt_chng_q4_q1, 3)

        total_ct_chng_q4_q1 = np.random.normal(0.72, 0.4)
        total_ct_chng_q4_q1 = max(0.0, min(total_ct_chng_q4_q1, 3.7))
        total_ct_chng_q4_q1 = round(total_ct_chng_q4_q1, 3)

        # Utilization ratio (guarded against a zero credit limit)
        if credit_limit > 0:
            utilization_ratio = revolving_balance / credit_limit
        else:
            utilization_ratio = 0
        utilization_ratio = round(utilization_ratio, 3)

        # Churn probability from the rule-based score, then a Bernoulli draw
        churn_prob = _churn_probability(
            age=age,
            months_inactive=months_inactive,
            contacts_count=contacts_count,
            total_trans_ct=total_trans_ct,
            utilization_ratio=utilization_ratio,
            total_relationship_count=total_relationship_count,
            months_on_book=months_on_book,
            card_category=card_category,
        )
        attrition_flag = 'Attrited Customer' if np.random.random() < churn_prob else 'Existing Customer'

        # Last transaction date comes from Faker, not from np.random
        last_transaction_date = fake.date_between(start_date='-3y', end_date='-1y')

        # Store all data
        data['CLIENTNUM'].append(int(client_num))
        data['Customer_Age'].append(age)
        data['Gender'].append(gender)
        data['Dependent_count'].append(dependent_count)
        data['Education_Level'].append(education_level)
        data['Marital_Status'].append(marital_status_val)
        data['Income_Category'].append(income_category)
        data['Card_Category'].append(card_category)
        data['Months_on_book'].append(months_on_book)
        data['Total_Relationship_Count'].append(total_relationship_count)
        data['Months_Inactive_12_mon'].append(months_inactive)
        data['Contacts_Count_12_mon'].append(contacts_count)
        data['Credit_Limit'].append(credit_limit)
        data['Total_Revolving_Bal'].append(revolving_balance)
        data['Avg_Open_To_Buy'].append(avg_open_to_buy)
        data['Total_Amt_Chng_Q4_Q1'].append(total_amt_chng_q4_q1)
        data['Total_Trans_Amt'].append(total_trans_amt)
        data['Total_Trans_Ct'].append(total_trans_ct)
        data['Total_Ct_Chng_Q4_Q1'].append(total_ct_chng_q4_q1)
        data['Avg_Utilization_Ratio'].append(utilization_ratio)
        data['Attrition_Flag'].append(attrition_flag)
        data['Last_Transaction_Date'].append(last_transaction_date)

    # Create DataFrame
    df = pd.DataFrame(data)

    print("\nCredit Card Churn Dataset generation complete!")
    print(f"Total records: {len(df):,}")
    # The mean of an empty column is NaN; report 0 for an empty dataset.
    churn_rate = (df['Attrition_Flag'] == 'Attrited Customer').mean() if len(df) else 0.0
    print(f"Churn rate: {churn_rate:.2%}")

    return df

def save_dataset(df, output_path='/content/drive/MyDrive/Customer_Churn/dataset/', filename='credit_card_churn_100k.csv'):
    """Write `df` as a CSV (no index column) into `output_path` and report it.

    The target directory is created when missing. After writing, the file
    location and its size in MB are printed.

    Args:
        df: DataFrame to persist.
        output_path: Directory that receives the file; defaults to a folder
            under the mounted Google Drive.
        filename: Name of the CSV file inside `output_path`.

    Returns:
        The joined path of the written file.
    """
    full_path = os.path.join(output_path, filename)

    # Ensure the destination folder is present before writing.
    os.makedirs(output_path, exist_ok=True)
    df.to_csv(full_path, index=False)

    size_mb = os.path.getsize(full_path) / 1024 / 1024
    print("\nDataset saved to Google Drive!")
    print(f"Location: {full_path}")
    print(f"File size: {round(size_mb, 2)} MB")

    return full_path

In [17]:
# Main execution
if __name__ == "__main__":
    # Google Drive has to be mounted at /content/drive before this cell runs
    # (the notebook's mount cell does that), because the CSV is written under
    # MyDrive.

    # Banner
    print(
        "💳 CREDIT CARD CUSTOMER CHURN DATASET GENERATOR",
        "=" * 60,
        "This script generates a realistic 100k record dataset",
        "for credit card churn analysis and machine learning.\n",
        sep="\n",
    )

    # Build the synthetic customers, then persist them to Drive as CSV.
    df = generate_credit_card_churn_dataset(100000)

    output_path = '/content/drive/MyDrive/Customer_Churn/dataset/'
    saved_path = save_dataset(df, output_path, 'credit_card_churn_100k.csv')

    # Summary and suggested follow-up work
    print(
        "\n✅ Credit Card Churn Dataset generation complete!",
        f"📁 Saved to: {saved_path}",
        "\nNext steps:",
        "1. Load the CSV file for analysis",
        "2. Perform EDA on credit card behavior patterns",
        "3. Build ML models for churn prediction",
        "4. Analyze financial risk patterns",
        "5. Create retention strategy dashboards",
        sep="\n",
    )

    # Preview of the generated rows as plain text
    print(f"\n📋 FIRST 5 ROWS OF GENERATED DATA:")
    print(df.head().to_string())

💳 CREDIT CARD CUSTOMER CHURN DATASET GENERATOR
This script generates a realistic 100k record dataset
for credit card churn analysis and machine learning.

Generating 100,000 credit card customer records...
Generating credit card customer data...
Progress: 0/100,000 (0.0%)
Progress: 10,000/100,000 (10.0%)
Progress: 20,000/100,000 (20.0%)
Progress: 30,000/100,000 (30.0%)
Progress: 40,000/100,000 (40.0%)
Progress: 50,000/100,000 (50.0%)
Progress: 60,000/100,000 (60.0%)
Progress: 70,000/100,000 (70.0%)
Progress: 80,000/100,000 (80.0%)
Progress: 90,000/100,000 (90.0%)

Credit Card Churn Dataset generation complete!
Total records: 100,000
Churn rate: 20.29%

Dataset saved to Google Drive!
Location: /content/drive/MyDrive/Customer_Churn/dataset/credit_card_churn_100k.csv
File size: 13.32 MB

✅ Credit Card Churn Dataset generation complete!
📁 Saved to: /content/drive/MyDrive/Customer_Churn/dataset/credit_card_churn_100k.csv

Next steps:
1. Load the CSV file for analysis
2. Perform EDA on credi