In [None]:
# notebooks/01_eda.ipynb

# Imports
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os

# Load Data
# The raw extract is a pipe-delimited text file stored relative to the
# notebooks/ directory.
DATA_PATH = "../data/raw/MachineLearningRating_v3.txt"
df = pd.read_csv(DATA_PATH, delimiter="|")

# Initial overview: (rows, columns) followed by the first five records.
n_rows_cols = df.shape
first_rows = df.head(5)
print("Shape:", n_rows_cols)
print(first_rows)




In [None]:
# Missing values check
# Count nulls per column (largest first), then attach the share of rows that
# each count represents.
missing_summary = df.isna().sum().sort_values(ascending=False)
missing_df = missing_summary.to_frame(name="Missing Count")
missing_df["Missing %"] = (missing_summary / len(df)) * 100

# Only columns that actually contain nulls are worth reporting.
has_missing = missing_df["Missing Count"] > 0
print(missing_df.loc[has_missing])

In [None]:
# Descriptive stats
# Transposed so each summarized column becomes a row, which keeps a wide
# frame readable in plain-text output.
numeric_summary = df.describe().transpose()
print(numeric_summary)

# Categorical variable uniqueness
# Distinct-value count for every object-dtype column, highest cardinality first.
object_columns = df.select_dtypes(include=["object"])
cat_uniques = object_columns.nunique().sort_values(ascending=False)
print(cat_uniques)

In [None]:
# Visual: TotalPremium distribution
# Make sure the figure output folder exists before anything is written to it.
os.makedirs("../reports/figures", exist_ok=True)

# 50-bin histogram with a KDE overlay, drawn on an explicit Axes.
fig, ax = plt.subplots(figsize=(10, 4))
sns.histplot(df["TotalPremium"], bins=50, kde=True, ax=ax)
ax.set_title("Distribution of Total Premium")
fig.savefig("../reports/figures/premium_distribution.png")
plt.show()

In [None]:
# Visual: TotalClaims distribution
# 50-bin histogram with a KDE overlay, drawn in red on an explicit Axes.
fig, ax = plt.subplots(figsize=(10, 4))
sns.histplot(df["TotalClaims"], bins=50, kde=True, color="r", ax=ax)
ax.set_title("Distribution of Total Claims")
fig.savefig("../reports/figures/claims_distribution.png")
plt.show()

In [None]:
# Countplot: Province
# value_counts() sorts by frequency, so its index gives the bar order from the
# most to the least common province.
province_order = df["Province"].value_counts().index

fig, ax = plt.subplots(figsize=(10, 4))
sns.countplot(data=df, x="Province", order=province_order, ax=ax)
plt.xticks(rotation=45)  # slant the province names
ax.set_title("Policies by Province")
fig.savefig("../reports/figures/province_distribution.png")
plt.show()

In [None]:
# Loss Ratio analysis
# Per-row loss ratio = TotalClaims / TotalPremium.
# A premium of exactly zero has no defined ratio: the raw division yields
# +/-inf (or NaN for 0/0), and a single inf makes that province's mean infinite
# and its bar undrawable. Mask zero premiums to NaN so the ratio for those rows
# is NaN, which the groupby mean below skips.
nonzero_premium = df["TotalPremium"].where(df["TotalPremium"] != 0)
df["LossRatio"] = df["TotalClaims"] / nonzero_premium

# Unweighted mean of the per-row ratios within each province, highest first.
# NOTE(review): a premium-weighted ratio (sum of claims / sum of premium per
# province) may be what the business expects here; confirm before reporting.
province_loss = df.groupby("Province")["LossRatio"].mean().sort_values(ascending=False)

plt.figure(figsize=(10, 4))
province_loss.plot(kind="bar", color="orange")
plt.title("Average Loss Ratio by Province")
plt.ylabel("Loss Ratio")
plt.xticks(rotation=45)
plt.savefig("../reports/figures/province_loss_ratio.png")
plt.show()

In [None]:
# Correlation heatmap
# Pairwise correlation across the numeric columns only; non-numeric columns
# are excluded by numeric_only=True.
corr_matrix = df.corr(numeric_only=True)

fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(corr_matrix, annot=True, fmt=".2f", cmap="coolwarm", ax=ax)
ax.set_title("Correlation Matrix")
fig.savefig("../reports/figures/correlation_matrix.png")
plt.show()