In [2]:
# notebooks/04_modeling.ipynb

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

# Load preprocessed data
df = pd.read_csv("../data/processed/cleaned_insurance_data.csv")

# Create binary classification target: Claim made or not
df["ClaimMade"] = (df["TotalClaims"] > 0).astype(int)



  df = pd.read_csv("../data/processed/cleaned_insurance_data.csv")


In [3]:
# Drop irrelevant or ID columns
X = df.drop(columns=["TotalClaims", "ClaimMade", "PolicyID", "UnderwrittenCoverID", "TransactionMonth"])
y = df["ClaimMade"]

In [4]:
# Identify categorical and numeric columns
categorical_cols = X.select_dtypes(include=["object", "bool"]).columns.tolist()
numeric_cols = X.select_dtypes(include=["int64", "float64"]).columns.tolist()

In [5]:
# Preprocessing pipeline
preprocessor = ColumnTransformer([
    ("num", StandardScaler(), numeric_cols),
    ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_cols)
])


In [6]:
# Model pipeline
pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("classifier", LogisticRegression(max_iter=1000))
])

In [7]:
# Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [10]:
# Replace inf/-inf with NaN
X_train = X_train.replace([np.inf, -np.inf], np.nan)
X_test = X_test.replace([np.inf, -np.inf], np.nan)

# Option 1: Drop rows with NaN values (simplest fix)
X_train = X_train.dropna()
y_train = y_train.loc[X_train.index]  # Also drop corresponding targets
X_test = X_test.dropna()
y_test = y_test.loc[X_test.index]

# Option 2 (alternative): Fill NaNs with mean or median (for numerical columns)
# X_train = X_train.fillna(X_train.mean())
# X_test = X_test.fillna(X_test.mean())

In [11]:
# Fit and evaluate
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)

print("Logistic Regression Report:\n")
print(classification_report(y_test, y_pred))
print("Confusion Matrix:")
sns.heatmap(confusion_matrix(y_test, y_pred), annot=True, fmt="d", cmap="Blues")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.title("Logistic Regression Confusion Matrix")
plt.show()

ValueError: Found array with 0 sample(s) (shape=(0, 13)) while a minimum of 1 is required by StandardScaler.

In [None]:
# Try Random Forest
pipeline.set_params(classifier=RandomForestClassifier(random_state=42))
pipeline.fit(X_train, y_train)
y_pred_rf = pipeline.predict(X_test)

print("\nRandom Forest Report:\n")
print(classification_report(y_test, y_pred_rf))
sns.heatmap(confusion_matrix(y_test, y_pred_rf), annot=True, fmt="d", cmap="Greens")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.title("Random Forest Confusion Matrix")
plt.show()