# Credit Score Classification – Exploratory Data Analysis (EDA)

In [None]:

import findspark
findspark.init()

from pyspark.sql import SparkSession

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Global plotting defaults: seaborn "whitegrid" style, then the default figure size.
# The figure size is assigned after the seaborn call, same order as before.
sns.set(style="whitegrid")
plt.rcParams.update({'figure.figsize': (10, 6)})


In [None]:
# Build the Spark session used by this notebook (named "Credit Score EDA");
# getOrCreate() hands back an already-running session when there is one.
spark = (
    SparkSession.builder
    .appName("Credit Score EDA")
    .getOrCreate()
)


In [None]:
# Read the training CSV through Spark: the first row supplies the column
# names and Spark infers each column's type from the data.
df = spark.read.csv(
    path="./archive/train.csv",
    inferSchema=True,
    header=True,
)

# Pull the full Spark DataFrame into a pandas DataFrame for the EDA below
df_pd = df.toPandas()

# Show the first 5 rows as the cell output
df_pd.head()


In [None]:
# Data types and non-null counts.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would add a stray "None" line.
df_pd.info()

# Missing values summary: number of null entries in each column
print("\nMissing values per column:")
print(df_pd.isnull().sum())


In [None]:
# Identify numeric vs categorical columns.
# np.number matches every numeric width (int32, int64, float32, float64, ...),
# not just int64/float64. Columns converted from Spark can arrive as narrower
# types (e.g. int32), which a hard-coded int64/float64 list would silently drop.
numeric_cols = df_pd.select_dtypes(include=np.number).columns.tolist()
categorical_cols = df_pd.select_dtypes(include=['object', 'category']).columns.tolist()

print("Numeric columns:", numeric_cols)
print("Categorical columns:", categorical_cols)


In [None]:
# One histogram (with KDE curve) per numeric column, each on its own figure.
# Missing values are dropped before plotting.
for feature in numeric_cols:
    plt.figure()
    ax = sns.histplot(df_pd[feature].dropna(), kde=True)
    ax.set_title(f'Distribution of {feature}')
    ax.set_xlabel(feature)
    ax.set_ylabel('Frequency')
    plt.show()


In [None]:
# One boxplot per numeric column, split by the Credit_Score class on the x-axis
for feature in numeric_cols:
    plt.figure()
    ax = sns.boxplot(data=df_pd, x='Credit_Score', y=feature)
    ax.set_title(f'{feature} by Credit Score')
    ax.set_xlabel('Credit Score')
    ax.set_ylabel(feature)
    plt.show()


In [None]:

import plotly.express as px

# Column every chart is broken down by.
target_col = 'Credit_Score'

# Optional cap on categories per chart: set to an integer N to keep only the
# N most frequent values of each column; None plots every category.
top_n = None

for col in categorical_cols:
    # The target can itself be listed in categorical_cols (it is when it holds
    # strings). Grouping it by itself would use the same name for both group
    # keys, and reset_index cannot insert a duplicate column, so skip it.
    if col == target_col:
        continue

    # 1) Build tidy counts DataFrame: one row per (category, credit score) pair
    tidy = (
        df_pd
        .groupby([col, target_col])
        .size()
        .reset_index(name='count')
    )

    # Keep only the most frequent categories when a cap is configured
    if top_n is not None:
        top = tidy.groupby(col)['count'].sum().nlargest(top_n).index
        tidy = tidy[tidy[col].isin(top)]

    # 2) Plot interactive grouped bar chart
    fig = px.bar(
        tidy,
        x=col,
        y='count',
        color=target_col,
        barmode='group',
        title=f'{col} counts by Credit Score'
    )
    fig.update_layout(xaxis_tickangle=45, margin=dict(l=40, r=40, t=60, b=40))
    fig.show()


In [None]:
# Pairwise correlations between the numeric columns, drawn as an annotated heatmap
corr_matrix = df_pd.loc[:, numeric_cols].corr()

plt.figure(figsize=(12, 10))
sns.heatmap(
    corr_matrix,
    cmap='coolwarm',
    annot=True,   # write the coefficient in each cell ...
    fmt=".2f",    # ... rounded to two decimals
    square=True,
).set_title('Correlation Matrix of Numeric Features')
plt.show()


In [None]:
# Pairplot of a subset of numeric features, colored by credit score.
# The label column is named 'Credit_Score' (underscore), as used by the other
# cells; the previous 'Credit Score' (space) is not a column and raised KeyError.
selected = numeric_cols[:5]  # choose first 5 for readability
sns.pairplot(df_pd[selected + ['Credit_Score']], hue='Credit_Score', diag_kind='kde', corner=True)
plt.suptitle('Pairplot of Selected Numeric Features', y=1.02)
plt.show()


In [None]:
# Stop Spark if no further processing is needed
# spark.stop()
