In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import zscore



In [None]:
# Use seaborn's "darkgrid" look for every figure drawn later in the notebook
sns.set_style(style="darkgrid")

In [None]:
# Read the generated dataset into a DataFrame.
# The path is relative to the notebook's working directory; adjust if necessary.
data_path = "../data/generated_data.csv"
df = pd.read_csv(filepath_or_buffer=data_path)

In [None]:
# Preview the leading rows of the dataset (5 is the DataFrame.head default)
display(df.head(n=5))

In [None]:
# Check dataset information.
# DataFrame.info() writes its report straight to stdout and returns None, so it
# is called on its own: wrapping it in display() would render a stray "None"
# underneath the report.
df.info()


In [None]:
# Descriptive statistics as produced by DataFrame.describe()
summary_stats = df.describe()
display(summary_stats)

In [None]:
# Tally the null entries in each column and print the per-column counts
missing_values = df.isna().sum()
print("Missing Values:\n", missing_values)

In [None]:
# Heatmap of the null mask: each cell of the plot corresponds to one DataFrame
# entry, coloured by whether that entry is missing
fig, ax = plt.subplots(figsize=(10, 5))
sns.heatmap(df.isnull(), cmap="viridis", cbar=False, ax=ax)
ax.set_title("Missing Values Heatmap")
plt.show()

In [None]:
# One 30-bin histogram per plottable column, all drawn on a single 12x8 figure
df.hist(bins=30, figsize=(12, 8), edgecolor="black")
# df.hist() created the current figure, so title that figure directly
plt.gcf().suptitle("Feature Distributions", fontsize=16)
plt.show()

In [None]:
# Side-by-side boxplots of every numeric column, used to eyeball outliers
numeric_features = df.select_dtypes(include=[np.number])
fig, ax = plt.subplots(figsize=(12, 6))
sns.boxplot(data=numeric_features, ax=ax)
ax.set_title("Boxplots of Numerical Features")
# Rotate the x tick labels to vertical
plt.xticks(rotation=90)
plt.show()

In [None]:
# Compute the pairwise correlation matrix.
# numeric_only=True restricts the computation to numeric columns. Since
# pandas 2.0 the default is numeric_only=False, which raises as soon as the
# frame contains a non-numeric (e.g. string) column instead of silently
# dropping it.
corr_matrix = df.corr(numeric_only=True)


In [None]:
# Annotated heatmap of the correlation matrix, coefficients shown to 2 decimals
fig, ax = plt.subplots(figsize=(10, 6))
sns.heatmap(
    corr_matrix,
    annot=True,
    fmt=".2f",
    cmap="coolwarm",
    linewidths=0.5,
    ax=ax,
)
ax.set_title("Feature Correlation Matrix")
plt.show()

In [None]:
# Pairwise scatter/distribution grid over a hand-picked subset of columns
pairplot_columns = ['Account_Balance', 'Loan_Amount', 'Credit_Score', 'Default']
sns.pairplot(df[pairplot_columns])
plt.show()


In [None]:
# Outlier detection using Z-score: count, per numeric column, the values whose
# absolute z-score exceeds 3.
numeric_features = df.select_dtypes(include=[np.number])

# nan_policy="omit" makes zscore ignore NaNs when computing each column's mean
# and standard deviation. With the default ("propagate") a single NaN turns the
# whole column's z-scores into NaN, and since NaN > 3 is False that column
# would silently report 0 outliers.
# zscore returns a plain array, so the result is wrapped back into a DataFrame
# to keep the column names attached to the counts printed below.
z_scores = pd.DataFrame(
    np.abs(zscore(numeric_features, nan_policy="omit")),
    index=numeric_features.index,
    columns=numeric_features.columns,
)

# Entries that are NaN in the data stay NaN here and are never counted.
outliers = (z_scores > 3).sum(axis=0)
print("Number of outliers per feature:\n", outliers)

In [None]:
# Bar chart of how many rows fall into each value of the 'Default' column
fig, ax = plt.subplots(figsize=(6, 4))
sns.countplot(x=df['Default'], palette='Set2', ax=ax)
ax.set(
    title="Class Distribution of Target Variable (Default)",
    xlabel="Default (0: No, 1: Yes)",
    ylabel="Count",
)
plt.show()

# Final marker so a full run visibly reports that it reached the end
print("✅ EDA Completed!")