In [None]:
# notebooks/04_modeling.ipynb

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

# Read the cleaned dataset from the processed-data folder.
df = pd.read_csv("../data/processed/cleaned_data.csv")

# Binary classification target: 1 when TotalClaims is strictly positive,
# otherwise 0.
df["ClaimMade"] = df["TotalClaims"].gt(0).astype(int)



In [None]:
# Target vector: the binary ClaimMade flag.
y = df["ClaimMade"]

# Feature matrix: every column except the target itself, the raw TotalClaims
# amount, and the columns treated here as identifiers / irrelevant.
non_feature_cols = [
    "TotalClaims",
    "ClaimMade",
    "PolicyID",
    "UnderwrittenCoverID",
    "TransactionMonth",
]
X = df.drop(columns=non_feature_cols)

In [None]:
# Split the feature columns by dtype so each group can be routed to its own
# transformer.
#
# Numeric: select every numeric dtype rather than only int64/float64, so
# narrower or unsigned integer/float columns are not left out.
numeric_cols = X.select_dtypes(include="number").columns.tolist()

# Categorical: plain object columns, pandas string / category dtypes, and
# booleans.
categorical_cols = X.select_dtypes(
    include=["object", "string", "category", "bool"]
).columns.tolist()

# A column that is in neither list is not passed to any transformer, so it
# would be discarded without notice when the lists are used with a
# ColumnTransformer left at its default `remainder`. Report such columns
# instead of losing them silently.
_assigned_cols = set(numeric_cols) | set(categorical_cols)
_unassigned_cols = [col for col in X.columns if col not in _assigned_cols]
if _unassigned_cols:
    print(f"Columns not assigned to a transformer: {_unassigned_cols}")

In [None]:
# Column-wise preprocessing:
#   "num" -> StandardScaler applied to the numeric columns
#   "cat" -> OneHotEncoder applied to the categorical columns; with
#            handle_unknown="ignore" a category not seen during fit does not
#            raise at transform time.
numeric_step = ("num", StandardScaler(), numeric_cols)
categorical_step = ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_cols)
preprocessor = ColumnTransformer(transformers=[numeric_step, categorical_step])


In [None]:
# Chain preprocessing and the estimator into one object so both are fitted
# together. The estimator is a logistic regression with max_iter raised to 1000.
log_reg = LogisticRegression(max_iter=1000)
pipeline = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("classifier", log_reg),
    ]
)

In [None]:
# Train/test split: hold out 20% of the rows for evaluation, with a fixed
# seed so the split is reproducible.
# stratify=y keeps the proportion of each ClaimMade class the same in the
# training and test sets, so the evaluation is not distorted by a split that
# happens to under- or over-sample one class.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [None]:
# Fit the pipeline on the training split, then predict the held-out rows.
y_pred = pipeline.fit(X_train, y_train).predict(X_test)

# Text summary of the predictions against the true test labels.
print("Logistic Regression Report:\n")
print(classification_report(y_test, y_pred))

# Confusion matrix drawn as an annotated heatmap (integer cell counts).
print("Confusion Matrix:")
lr_cm = confusion_matrix(y_test, y_pred)
fig, ax = plt.subplots()
sns.heatmap(lr_cm, annot=True, fmt="d", cmap="Blues", ax=ax)
ax.set_xlabel("Predicted")
ax.set_ylabel("Actual")
ax.set_title("Logistic Regression Confusion Matrix")
plt.show()

In [None]:
# Try Random Forest: swap the classifier step and refit on the same split.
# NOTE: set_params changes `pipeline` in place, so from this cell onwards
# `pipeline` holds the random forest; re-running an earlier cell that fits
# `pipeline` would fit the forest, not the logistic regression.
rf_clf = RandomForestClassifier(random_state=42)
pipeline.set_params(classifier=rf_clf)
y_pred_rf = pipeline.fit(X_train, y_train).predict(X_test)

# Text summary of the predictions against the true test labels.
print("\nRandom Forest Report:\n")
print(classification_report(y_test, y_pred_rf))

# Confusion matrix drawn as an annotated heatmap (integer cell counts).
rf_cm = confusion_matrix(y_test, y_pred_rf)
fig, ax = plt.subplots()
sns.heatmap(rf_cm, annot=True, fmt="d", cmap="Greens", ax=ax)
ax.set_xlabel("Predicted")
ax.set_ylabel("Actual")
ax.set_title("Random Forest Confusion Matrix")
plt.show()