In [118]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from dotenv import load_dotenv
from sqlalchemy import create_engine,  text
from sklearn.model_selection import train_test_split
import json
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import joblib
from xgboost import XGBClassifier
import os

In [4]:
load_dotenv()  # load variables from .env

user = os.getenv("POSTGRES_USER")
password = os.getenv("POSTGRES_PASSWORD")
host = os.getenv("POSTGRES_HOST")
port = os.getenv("POSTGRES_PORT")
database = os.getenv("POSTGRES_DB")

In [6]:
conn_str = f"postgresql+psycopg2://{user}:{password}@{host}:{port}/{database}"
engine = create_engine(conn_str)

In [13]:
query = text("SELECT * FROM claims")  

with engine.connect() as connection:
    df = pd.read_sql(query, con=connection)

df.head()

Unnamed: 0,claim_id,policy_start,incident_date,claim_amount,state,vehicle_type,incident_type,fact_of_loss,reported_by,policy_limit,insured_age,prior_claims_count,fraud_flag,severity_score,days_to_incident
0,XAJI0Y6DP,2021-05-09,2021-12-17,551.74,WV,Truck,Weather Damage,Hailstorm caused significant roof and hood dam...,claimant,55174,47,1,0,0.01,222
1,BHSAHXTHV,2020-05-09,2020-05-29,62223.2,CO,Sedan,Weather Damage,V2 rear-ended V1 while stopped at a red light,claimant,95728,67,0,0,0.65,20
2,3A3ZMF8MD,2018-08-08,2019-08-08,15000.0,IN,Van,Head-on Collision,V2 crossed centerline and hit V1 head-on,insured,15000,50,3,0,1.0,365
3,D4V30T9NT,2019-12-30,2020-08-03,15805.4,KY,Sedan,Theft,V2 was behind V1 when it began backing up,insured,24316,75,1,0,0.65,217
4,3W5UZBIKC,2020-12-16,2021-07-19,88000.0,WA,Coupe,Side-impact,V2 ran a stop sign and hit the side of V1,insured,100000,34,5,0,0.88,215


# Features and target

In [55]:
df_model = df.copy()

In [61]:
X = df_model.drop(columns=['fraud_flag', 'claim_id', 'severity_score', 'claim_amount', 'fact_of_loss', 'policy_start'])  # drop outputs
y = df_model['fraud_flag']

In [65]:
X['incident_month'] = X['incident_date'].dt.month
X['incident_dayofweek'] = X['incident_date'].dt.dayofweek
X = X.drop(columns=['incident_date'])

In [67]:
X.columns

Index(['state', 'vehicle_type', 'incident_type', 'reported_by', 'policy_limit',
       'insured_age', 'prior_claims_count', 'days_to_incident',
       'incident_month', 'incident_dayofweek'],
      dtype='object')

In [39]:
y.value_counts()

0    7954
1    2046
Name: fraud_flag, dtype: int64

In [69]:
categorical_cols = ['state', 'vehicle_type', 'incident_type', 'reported_by']

In [71]:
X[categorical_cols].dtypes

state            object
vehicle_type     object
incident_type    object
reported_by      object
dtype: object

In [75]:
#we drop the first category in each to avoid redundancy
X_encoded = pd.get_dummies(X, columns=categorical_cols, drop_first=True)

In [77]:
print(X_encoded.shape)
X_encoded.head()

(10000, 48)


Unnamed: 0,policy_limit,insured_age,prior_claims_count,days_to_incident,incident_month,incident_dayofweek,state_AL,state_AS,state_AZ,state_CO,...,vehicle_type_SUV,vehicle_type_Sedan,vehicle_type_Truck,vehicle_type_Van,incident_type_Head-on Collision,incident_type_Rear-end Collision,incident_type_Side-impact,incident_type_Theft,incident_type_Weather Damage,reported_by_insured
0,55174,47,1,222,12,4,0,0,0,0,...,0,0,1,0,0,0,0,0,1,0
1,95728,67,0,20,5,4,0,0,0,1,...,0,1,0,0,0,0,0,0,1,0
2,15000,50,3,365,8,3,0,0,0,0,...,0,0,0,1,1,0,0,0,0,1
3,24316,75,1,217,8,0,0,0,0,0,...,0,1,0,0,0,0,0,1,0,1
4,100000,34,5,215,7,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,1


# Split the Data Into Train/Test Sets

In [80]:
X_train, X_test, y_train, y_test = train_test_split(
    X_encoded, y, test_size=0.2, stratify=y, random_state=42
)

# Train the Model

In [87]:
model = XGBClassifier(
        n_estimators=100,        # Number of trees
        max_depth=5,             # Tree depth
        learning_rate=0.1,       # Shrinkage rate
        use_label_encoder=False, # Avoid warning
        eval_metric='logloss',   # Proper loss for binary classification
        random_state=42
)

In [89]:
model.fit(X_train, y_train)

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [91]:
y_pred = model.predict(X_test)

In [95]:
print(confusion_matrix(y_test, y_pred))

[[1589    2]
 [ 102  307]]


In [97]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.94      1.00      0.97      1591
           1       0.99      0.75      0.86       409

    accuracy                           0.95      2000
   macro avg       0.97      0.87      0.91      2000
weighted avg       0.95      0.95      0.95      2000



In [99]:
print("Accuracy:", accuracy_score(y_test, y_pred))

Accuracy: 0.948


In [None]:
"""
model correctly identified 1589 non-fraud claims

and caught 307 frauds

But missed 102 actual frauds 

Class 0 (Not Fraud):
- Precision: 0.94 → 94% of predicted non-fraud were correct
- Recall:    1.00 → You caught all non-fraud (almost no false positives)

Class 1 (Fraud):
- Precision: 0.99 → Nearly every time the model says "fraud", it’s correct
- Recall:    0.75 → Model caught 75% of actual frauds (missed 25%)
- F1-score:  0.86 → Strong balance between precision and recall

Overall Accuracy: **0.95 (95%)**

"""

# Improve Recall for Fraud 

In [102]:
y_proba = model.predict_proba(X_test)[:, 1]

In [104]:
for threshold in [0.5, 0.4, 0.3, 0.25]:
    y_pred = (y_proba >= threshold).astype(int)
    print(f"\nThreshold: {threshold}")
    print(confusion_matrix(y_test, y_pred))
    print(classification_report(y_test, y_pred))


Threshold: 0.5
[[1589    2]
 [ 102  307]]
              precision    recall  f1-score   support

           0       0.94      1.00      0.97      1591
           1       0.99      0.75      0.86       409

    accuracy                           0.95      2000
   macro avg       0.97      0.87      0.91      2000
weighted avg       0.95      0.95      0.95      2000


Threshold: 0.4
[[1585    6]
 [ 102  307]]
              precision    recall  f1-score   support

           0       0.94      1.00      0.97      1591
           1       0.98      0.75      0.85       409

    accuracy                           0.95      2000
   macro avg       0.96      0.87      0.91      2000
weighted avg       0.95      0.95      0.94      2000


Threshold: 0.3
[[1561   30]
 [  99  310]]
              precision    recall  f1-score   support

           0       0.94      0.98      0.96      1591
           1       0.91      0.76      0.83       409

    accuracy                           0.94      2000

In [106]:
fraud_threshold = 0.3
y_pred_final = (y_proba >= fraud_threshold).astype(int)

In [116]:
#save model
joblib.dump(model, 'fraud_model_XGB.pkl')

['fraud_model_XGB.pkl']

In [120]:
#save Threshold 
with open("fraud_model_config.json", "w") as f:
    json.dump({"fraud_threshold": 0.3}, f)