In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, f1_score, classification_report, ConfusionMatrixDisplay
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.utils import resample # Used for manual oversampling
import matplotlib.pyplot as plt

In [2]:
file_name = "data/Training_TriGuard.csv"

In [3]:
df = pd.read_csv(file_name)

df_cleaned = df.dropna()

target_col = 'subrogation'

numerical_features = ['safety_rating', 'annual_income', 'claim_est_payout', 'vehicle_price', 'age_of_DL']
categorical_features = ['gender', 'living_status', 'vehicle_category', 'accident_type']
    
X = df_cleaned[numerical_features + categorical_features]
y = df_cleaned[target_col]

In [4]:
numerical_transformer = StandardScaler()

categorical_transformer = OneHotEncoder(handle_unknown='ignore')

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ])

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

In [6]:
df_train = X_train.copy()
df_train[target_col] = y_train

df_majority = df_train[df_train[target_col] == 0]
df_minority = df_train[df_train[target_col] == 1]

df_minority_upsampled = resample(df_minority, 
                                     replace=True,
                                     n_samples=len(df_majority),
                                     random_state=42)

df_train_balanced = pd.concat([df_majority, df_minority_upsampled])

X_train_balanced = df_train_balanced.drop(target_col, axis=1)
y_train_balanced = df_train_balanced[target_col]

In [7]:
knn_model = KNeighborsClassifier(n_neighbors=5)

pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                               ('classifier', knn_model)])

In [9]:
param_grid = {
    'classifier__n_neighbors': [3, 5, 7, 9, 11]
}

grid_search = GridSearchCV(estimator=pipeline, 
                           param_grid=param_grid, 
                           scoring='f1',
                           cv=5, 
                           n_jobs=-1)
    
print("\nStarting GridSearchCV on balanced data...")

grid_search.fit(X_train_balanced, y_train_balanced)
    
print(f"\nBest K (n_neighbors): {grid_search.best_params_['classifier__n_neighbors']}")
print(f"Best cross-validation F1 score (on balanced data): {grid_search.best_score_:.4f}")

best_model = grid_search.best_estimator_

y_pred = best_model.predict(X_test)

    # --- 8. Show Final Results ---
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, pos_label=1)
report = classification_report(y_test, y_pred)

print(f"\n--- Improved KNN Model Evaluation (with Manual Oversampling) ---")
print(f"Accuracy: {accuracy:.4f}")
print(f"F1 Score: {f1:.4f}")
print("\nTest Set Classification Report:")
print(report)


Starting GridSearchCV on balanced data...

Best K (n_neighbors): 3
Best cross-validation F1 score (on balanced data): 0.7904

--- Improved KNN Model Evaluation (with Manual Oversampling) ---
Accuracy: 0.6176
F1 Score: 0.3471

Test Set Classification Report:
              precision    recall  f1-score   support

         0.0       0.80      0.67      0.73      4165
         1.0       0.28      0.44      0.35      1235

    accuracy                           0.62      5400
   macro avg       0.54      0.56      0.54      5400
weighted avg       0.68      0.62      0.64      5400



This result is a joke. It is suggested that I apply aggressive variable selection. 

In [None]:
numerical_features = ['liab_prct', 'vehicle_weight', 'claim_est_payout', 'age_of_DL']
categorical_features = ['witness_present_ind', 'in_network_bodyshop']



In [11]:
print(f"--- Final KNN Attempt ---")
print(f"Using Aggressive Feature Selection.")
print(f"Numerical Features: {numerical_features}")
print(f"Categorical Features: {categorical_features}\n")
    
X = df_cleaned[numerical_features + categorical_features]
y = df_cleaned[target_col]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

df_train = X_train.copy()
df_train[target_col] = y_train

df_majority = df_train[df_train[target_col] == 0]
df_minority = df_train[df_train[target_col] == 1]

df_minority_upsampled = resample(df_minority, 
                                 replace=True,     
                                 n_samples=len(df_majority),    
                                 random_state=42) 

df_train_balanced = pd.concat([df_majority, df_minority_upsampled])

X_train_balanced = df_train_balanced.drop(target_col, axis=1)
y_train_balanced = df_train_balanced[target_col]
    
numerical_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
        ])
    
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('classifier', KNeighborsClassifier(metric='manhattan'))])

param_grid = {
    'classifier__n_neighbors': [5, 7, 9, 11, 13, 15, 17, 19]
}

grid_search = GridSearchCV(estimator=pipeline, 
                           param_grid=param_grid, 
                           scoring='f1', # Still optimizing for F1
                           cv=5, 
                           n_jobs=-1)
    
print("Starting GridSearchCV with selected features and Manhattan distance...")
grid_search.fit(X_train_balanced, y_train_balanced)
    
print(f"\nBest K (n_neighbors): {grid_search.best_params_['classifier__n_neighbors']}")
print(f"Best cross-validation F1 score (on balanced data): {grid_search.best_score_:.4f}")

best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, pos_label = 1)
report = classification_report(y_test, y_pred)

print(f"\n--- Final KNN Model Evaluation (Feature Selection + Manhattan) ---")
print(f"Accuracy: {accuracy:.4f}")
print(f"F1 Score: {f1:.4f}")
print("\nClassification Report:")
print(report)

--- Final KNN Attempt ---
Using Aggressive Feature Selection.
Numerical Features: ['safety_rating', 'annual_income', 'claim_est_payout', 'vehicle_price', 'age_of_DL']
Categorical Features: ['gender', 'living_status', 'vehicle_category', 'accident_type']

Starting GridSearchCV with selected features and Manhattan distance...

Best K (n_neighbors): 5
Best cross-validation F1 score (on balanced data): 0.7474

--- Final KNN Model Evaluation (Feature Selection + Manhattan) ---
Accuracy: 0.5926
F1 Score: 0.3675

Classification Report:
              precision    recall  f1-score   support

         0.0       0.81      0.61      0.70      4165
         1.0       0.28      0.52      0.37      1235

    accuracy                           0.59      5400
   macro avg       0.55      0.57      0.53      5400
weighted avg       0.69      0.59      0.62      5400

