# Retail Dataset - Data Exploration

Updated exploration of the retail customer churn dataset for a churn-prediction (classification) task.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# Keep notebook output clean.
# NOTE: this silences every warning category, deprecation notices included.
warnings.filterwarnings(action='ignore')

# Display options: no cap on the number of columns shown, and no fixed
# output width.
for display_option in ('display.max_columns', 'display.width'):
    pd.set_option(display_option, None)

# Start from matplotlib's stock style sheet
plt.style.use('default')

In [None]:
# Load the training data; the path is relative to the notebook's location
train_csv_path = '../data/train_data.csv'
df = pd.read_csv(train_csv_path)

# deep=True also counts the contents of object columns;
# 1024**2 converts bytes to megabytes.
memory_mb = df.memory_usage(deep=True).sum() / 1024**2

print(f"Dataset shape: {df.shape}")
print(f"Memory usage: {memory_mb:.2f} MB")

In [None]:
# Dataset overview: column schema first, then a preview of the rows
print("Dataset Info:")
df.info()  # writes its report directly to stdout

print("\nFirst few rows:")
df.head(n=5)  # last expression of the cell, so the notebook renders it

In [None]:
# Target variable analysis
# Count each label once and reuse the result for both the printout and the plot.
churn_counts = df['churned'].value_counts()
print("Target Variable Analysis - 'churned':")
print(churn_counts)

# value_counts(normalize=True) gives each label's share of the non-null rows.
# .get() returns 0.0 instead of raising KeyError when no row is labelled 'Yes'
# (e.g. on a filtered subset of the data).
churn_rate = df['churned'].value_counts(normalize=True).get('Yes', 0.0)
print(f"\nChurn rate: {churn_rate:.2%}")

# Visualize target distribution
# NOTE: bars are ordered by frequency, so the colors follow bar position
# (most frequent label first), not a specific label.
plt.figure(figsize=(8, 5))
churn_counts.plot(kind='bar', color=['skyblue', 'salmon'])
plt.title('Customer Churn Distribution')
plt.xlabel('Churned')
plt.ylabel('Count')
plt.xticks(rotation=0)
plt.show()

In [None]:
# Missing values analysis: per-column null count and percentage of all rows
print("Missing Values Analysis:")
missing_values = df.isnull().sum()
missing_percent = missing_values.div(len(df)).mul(100)

missing_df = pd.DataFrame({
    'Missing Count': missing_values,
    'Missing Percentage': missing_percent,
})

# Keep only the columns that actually have gaps, worst offenders first
has_missing = missing_df['Missing Count'] > 0
missing_df = missing_df.loc[has_missing].sort_values(by='Missing Count', ascending=False)

if missing_df.empty:
    print("No missing values found!")
else:
    print(missing_df)

In [None]:
# Data types summary: how many columns there are of each dtype
dtype_counts = df.dtypes.value_counts()
print("Data Types Summary:")
print(dtype_counts)

# Split the column names into numeric and object (string-like) groups
numeric_frame = df.select_dtypes(include=np.number)
object_frame = df.select_dtypes(include='object')
numerical_cols = numeric_frame.columns.tolist()
categorical_cols = object_frame.columns.tolist()

# Only the first ten names of each group are echoed to keep the output short
print(f"\nNumerical columns ({len(numerical_cols)}): {numerical_cols[:10]}...")
print(f"Categorical columns ({len(categorical_cols)}): {categorical_cols[:10]}...")

In [None]:
# Statistical summary for numerical features (count, mean, std, quantiles, ...)
print("Statistical Summary - Numerical Features:")
numeric_view = df.loc[:, numerical_cols]
numeric_view.describe()  # last expression of the cell, rendered by the notebook

In [None]:
# Key categorical features analysis
key_categorical = [
    'gender',
    'income_bracket',
    'loyalty_program',
    'marital_status',
    'education_level',
    'product_category',
    'payment_method',
]

print("Key Categorical Features Analysis:")
for feature in key_categorical:
    # Columns absent from this dataset are skipped silently
    if feature not in df.columns:
        continue

    feature_values = df[feature]
    print(f"\n{feature.upper()}:")
    # Top five categories by frequency
    print(feature_values.value_counts().head())
    print(f"Unique values: {feature_values.nunique()}")

In [None]:
# Correlation analysis for key numerical features
key_numerical = [
    'age',
    'membership_years',
    'avg_purchase_value',
    'purchase_frequency_encoded',
    'total_sales',
    'customer_support_calls',
    'days_since_last_purchase',
]

# Decide which columns to correlate: the full key list when every key column
# is numeric and present, otherwise the first 10 numerical columns.
# NOTE(review): a single missing key column discards the whole key list, even
# the columns that are present - confirm this all-or-nothing fallback is intended.
missing_keys = [name for name in key_numerical if name not in numerical_cols]
if missing_keys:
    available_numerical = numerical_cols[:10]
else:
    available_numerical = list(key_numerical)

print(f"Correlation Analysis for: {available_numerical}")

# Pairwise correlation matrix of the selected columns
corr_matrix = df[available_numerical].corr()

# Annotated heatmap with the diverging colormap centered on zero correlation
heatmap_options = {
    'annot': True,
    'cmap': 'coolwarm',
    'center': 0,
    'square': True,
    'fmt': '.2f',
    'cbar_kws': {'shrink': 0.8},
}
plt.figure(figsize=(12, 8))
sns.heatmap(corr_matrix, **heatmap_options)
plt.title('Correlation Matrix - Key Numerical Features')
plt.tight_layout()
plt.show()

In [None]:
# Sample data for quick analysis (to avoid memory issues)
sample_size = 10000
# Sampling without replacement raises ValueError when n exceeds the number of
# rows, so cap the request at the dataset size.
df_sample = df.sample(n=min(sample_size, len(df)), random_state=42)
# Report the number of rows actually drawn, which may be below sample_size
print(f"Working with sample of {len(df_sample)} rows for visualizations")

# Age distribution by churn status: overlaid histograms (left) and boxplot (right)
plt.figure(figsize=(12, 4))

# Left panel: one semi-transparent histogram per churn label
plt.subplot(1, 2, 1)
for churn_status in df_sample['churned'].unique():
    subset = df_sample[df_sample['churned'] == churn_status]
    plt.hist(subset['age'], alpha=0.7, label=f'Churned: {churn_status}', bins=30)
plt.xlabel('Age')
plt.ylabel('Frequency')
plt.title('Age Distribution by Churn Status')
plt.legend()

# Right panel: boxplot of age grouped by churn label, drawn on the current axes
plt.subplot(1, 2, 2)
df_sample.boxplot(column='age', by='churned', ax=plt.gca())
plt.title('Age Distribution by Churn Status (Boxplot)')
plt.suptitle('')  # Remove the automatic "grouped by" figure title pandas adds

plt.tight_layout()
plt.show()

In [None]:
# Summary insights: recompute the headline figures, then print the report
banner = "=" * 60
summary_memory_mb = df.memory_usage(deep=True).sum() / 1024**2
total_missing = df.isnull().sum().sum()
# Show the word 'None' rather than 0 when the dataset has no gaps
missing_label = 'None' if total_missing == 0 else total_missing

print(banner)
print("DATASET SUMMARY INSIGHTS")
print(banner)
print(f"• Dataset size: {df.shape[0]:,} rows × {df.shape[1]} columns")
print(f"• Target variable: 'churned' with {df['churned'].nunique()} unique values")
print(f"• Churn rate: {(df['churned'] == 'Yes').mean():.1%}")
print(f"• Memory usage: {summary_memory_mb:.1f} MB")
print(f"• Data types: {len(numerical_cols)} numerical, {len(categorical_cols)} categorical")
print(f"• Missing values: {missing_label}")

# Static notes: feature groups worth modeling and follow-up recommendations
static_lines = (
    "\n• Key features for modeling:",
    "  - Customer demographics: age, gender, income_bracket",
    "  - Purchase behavior: avg_purchase_value, total_sales, purchase_frequency",
    "  - Engagement: customer_support_calls, days_since_last_purchase",
    "  - Product preferences: product_category, payment_method",
    "\n• Recommendations:",
    "  - Use sampling for faster model training (large dataset)",
    "  - Apply feature engineering for categorical variables",
    "  - Consider feature selection due to high dimensionality",
    "  - Monitor for class imbalance in target variable",
)
for static_line in static_lines:
    print(static_line)