# In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Load data
df = pd.read_csv('../data/Telco-Customer-Churn.csv')  # Adjust path if needed

# Basic info
print(df.shape)  # Should be (7043, 21)
# DataFrame.info() writes its summary to stdout itself and returns None, so
# wrapping it in print() would emit a stray "None" line after the summary.
df.info()
print(df.describe())

# Handle TotalCharges
# Coerce the column to numeric (entries that cannot be parsed become NaN),
# then remove, in place, the rows left without a usable value.
total_charges = pd.to_numeric(df['TotalCharges'], errors='coerce')
df['TotalCharges'] = total_charges
missing_rows = df.index[df['TotalCharges'].isna()]
df.drop(index=missing_rows, inplace=True)

# Target distribution
# countplot returns the Axes it drew on; title it directly, then save and
# display the current figure.
churn_ax = sns.countplot(data=df, x='Churn')
churn_ax.set_title('Churn Distribution')
plt.savefig('../diagrams/churn_dist.png')  # Save plot
plt.show()

# Numerical pairplot
# Pairwise relationships between the numeric columns, coloured by Churn.
numerical_cols = ['tenure', 'MonthlyCharges', 'TotalCharges']
pairplot_frame = df[[*numerical_cols, 'Churn']]
sns.pairplot(data=pairplot_frame, hue='Churn')
plt.savefig('../diagrams/pairplot.png')
plt.show()

# Categorical vs Churn
categorical_cols = ['gender', 'SeniorCitizen', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines',
                    'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport',
                    'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod']
for col in categorical_cols:
    # Give every column its own figure/axes. Drawing on the implicit
    # "current" axes would stack the bars of successive columns on top of
    # each other whenever plt.show() does not close the figure (e.g. under a
    # non-interactive backend), and the first plot would land in whatever
    # figure was left open before this loop.
    fig, ax = plt.subplots()
    sns.countplot(x=col, hue='Churn', data=df, ax=ax)
    ax.set_title(f'{col} vs Churn')
    # NOTE: savefig does not create missing directories; '../diagrams' must
    # already exist.
    fig.savefig(f'../diagrams/{col}_vs_churn.png')
    plt.show()
    # Release the figure explicitly so 16 figures do not accumulate in memory.
    plt.close(fig)