In [1]:
# Libraries for data loading, data manipulation and data visualisation
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import io # For string operations
%matplotlib inline

# Libraries for data preparation and model building

import string
import datetime
from sklearn.preprocessing import LabelEncoder

from sklearn import metrics
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn import feature_selection
from sklearn.feature_selection import SelectPercentile, SelectKBest, f_classif, mutual_info_classif,chi2, VarianceThreshold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC



from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier

from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.metrics import classification_report, accuracy_score, log_loss
from sklearn.model_selection import GridSearchCV, KFold, StratifiedKFold
from sklearn.model_selection import cross_val_predict
from imblearn.over_sampling import SMOTE
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score, make_scorer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer
from sklearn.decomposition import TruncatedSVD
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB

from sklearn.metrics import classification_report



In [2]:
from google.colab import drive

# Mount Google Drive in the Colab runtime so the project files are reachable.
drive.mount('/content/drive')

# Location of the claims CSV on the mounted Drive (adjacent literals are
# concatenated into one path string).
csv_file = (
    '/content/drive/MyDrive/Professionele ontwikkeling/Data Science/'
    'Explore Data Science Course/Integrated Project/Github/'
    'explore-integrated-project/data/Advanced Features Claims Data.csv'
)

Mounted at /content/drive


In [3]:
# Load the claims data, using the first CSV column as the DataFrame index.
df = pd.read_csv(filepath_or_buffer=csv_file, index_col=0)

In [4]:
# Summarise the loaded frame: column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1000 entries, 0 to 999
Data columns (total 45 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   months_as_customer            1000 non-null   int64  
 1   age                           1000 non-null   int64  
 2   policy_number                 1000 non-null   int64  
 3   policy_bind_date              1000 non-null   object 
 4   policy_state                  1000 non-null   object 
 5   policy_csl                    1000 non-null   object 
 6   policy_deductable             1000 non-null   int64  
 7   policy_annual_premium         1000 non-null   float64
 8   umbrella_limit                1000 non-null   int64  
 9   insured_zip                   1000 non-null   int64  
 10  insured_sex                   1000 non-null   object 
 11  insured_education_level       1000 non-null   object 
 12  insured_occupation            1000 non-null   object 
 13  insured_h

In [5]:
# Drop the columns that are not used for modelling.
to_drop = ['policy_number', 'policy_bind_date', 'insured_zip', 'incident_location', 'incident_date']
df = df.drop(columns=to_drop)

# Handle missing values: per the original author's note, the CSV holds the
# literal word "None" in these cells, which shows up as missing after loading,
# so the label 'None' is restored here.
df['authorities_contacted'] = df['authorities_contacted'].fillna('None')

# One-hot encode every object-typed column except the target. drop_first=True
# drops the first category of each feature, so that category becomes the
# all-zeros baseline.
object_columns = df.select_dtypes(include=['object']).columns
categorical_features = [col for col in object_columns if col != 'fraud_reported']
print(categorical_features)
dummies = pd.get_dummies(df[categorical_features], drop_first=True, dtype=int)
df = pd.concat([df.drop(columns=categorical_features), dummies], axis=1)
df.info()

['policy_state', 'policy_csl', 'insured_sex', 'insured_education_level', 'insured_occupation', 'insured_hobbies', 'insured_relationship', 'incident_type', 'collision_type', 'incident_severity', 'authorities_contacted', 'incident_state', 'incident_city', 'property_damage', 'police_report_available', 'auto_make', 'auto_model', 'customer_category', 'incident_time_of_day']
<class 'pandas.core.frame.DataFrame'>
Index: 1000 entries, 0 to 999
Columns: 154 entries, months_as_customer to incident_time_of_day_night
dtypes: float64(4), int64(149), object(1)
memory usage: 1.2+ MB


In [6]:
# Separate the predictors from the target, then hold out 20% of the rows as a
# test set, stratified on the target so both splits keep the class ratio.
X = df.drop(columns='fraud_reported').copy()
y = df['fraud_reported'].copy()
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 800 entries, 887 to 594
Columns: 153 entries, months_as_customer to incident_time_of_day_night
dtypes: float64(4), int64(149)
memory usage: 962.5 KB


In [7]:
# Handle class imbalance: oversample the training split with SMOTE.
# Only the training data is resampled; X_test / y_test are left untouched.
smote = SMOTE(random_state=42)
X_train, y_train = smote.fit_resample(X_train, y_train)

# Re-wrap the resampled features so they carry the original column names
# regardless of the container type fit_resample returned.
X_train = pd.DataFrame(X_train, columns=X.columns)

# Keep the target one-dimensional. Wrapping it in a single-column DataFrame
# turns it into a column vector, which makes scikit-learn emit a
# DataConversionWarning ("y = column_or_1d(y, warn=True)") on every fit.
y_train = pd.Series(np.ravel(y_train), name='fraud_reported')

In [8]:
# Scale the data
scaler = StandardScaler()

# The columns to scale are the contiguous run from 'months_as_customer' through
# 'vehicle_age_at_incident_date' (inclusive, hence the +1). This relies on the
# column order of the frame — presumably these are the numeric features that
# precede the one-hot dummy columns.
first_scaled = X_train.columns.get_loc('months_as_customer')
last_scaled = X_train.columns.get_loc('vehicle_age_at_incident_date')
columns_to_scale = X_train.columns[first_scaled:last_scaled + 1]

# Work on an explicit copy of the test split so the column replacement below
# never writes into (or warns about) data shared with the frame it was split from.
X_test = X_test.copy()

# Replace whole columns (df[cols] = ...) instead of writing through
# .loc[:, cols]: the scaled values are floats while several source columns are
# int64, and setting values of an incompatible dtype in place is deprecated in
# recent pandas releases.
# The scaler is fitted on the training split only and then applied to the test split.
X_train[columns_to_scale] = scaler.fit_transform(X_train[columns_to_scale])
X_test[columns_to_scale] = scaler.transform(X_test[columns_to_scale])


In [9]:
# Select best features: keep the 10 features scoring highest under f_classif.
# The selector is fitted on the training split and re-used for the test split.
kbest = SelectKBest(score_func=f_classif, k=10)
X_train_selected = kbest.fit_transform(X_train, y_train)
X_test_selected = kbest.transform(X_test)

# Boolean mask of the retained columns, used to recover their names before the
# frames are rebuilt from the selector's array output.
support_mask = kbest.get_support()
train_feature_names = X_train.columns[support_mask]
test_feature_names = X_test.columns[support_mask]

X_train = pd.DataFrame(X_train_selected, columns=train_feature_names)
X_test = pd.DataFrame(X_test_selected, columns=test_feature_names)
print("Selected features:", X_train.columns)
print("Selected features:", X_test.columns)


Selected features: Index(['policy_csl_500/1000', 'incident_severity_Minor Damage',
       'incident_severity_Total Loss', 'incident_severity_Trivial Damage',
       'authorities_contacted_None', 'authorities_contacted_Police',
       'incident_state_NY', 'incident_state_WV', 'property_damage_NO',
       'police_report_available_YES'],
      dtype='object')
Selected features: Index(['policy_csl_500/1000', 'incident_severity_Minor Damage',
       'incident_severity_Total Loss', 'incident_severity_Trivial Damage',
       'authorities_contacted_None', 'authorities_contacted_Police',
       'incident_state_NY', 'incident_state_WV', 'property_damage_NO',
       'police_report_available_YES'],
      dtype='object')


  y = column_or_1d(y, warn=True)


In [10]:
# Define models: candidate classifiers at their default hyperparameters,
# seeded where the estimator is given a random_state.
models = dict([
    ('Logistic Regression', LogisticRegression(random_state=42)),
    ('KNN', KNeighborsClassifier()),
    ('SVM', SVC(random_state=42)),
    ('Decision Tree', DecisionTreeClassifier(random_state=42)),
    ('Random Forest', RandomForestClassifier(random_state=42)),
    ('Naive Bayes', GaussianNB()),
])

# Flatten both targets to 1-D arrays.
y_train = np.ravel(y_train)
y_test = np.ravel(y_test)

# Compare the candidates with 10-fold cross-validated predictions on the
# training data and print one classification report per model.
# NOTE(review): X_train was oversampled with SMOTE before this step, so the
# validation folds contain synthetic rows and the scores may be optimistic —
# consider resampling inside each fold instead.
for model_name, estimator in models.items():
    y_pred_cv = cross_val_predict(estimator, X_train, y_train, cv=10)
    report = classification_report(y_train, y_pred_cv)
    print(f"Classification Report for {model_name}:\n", report)

Classification Report for Logistic Regression:
               precision    recall  f1-score   support

           N       0.83      0.87      0.85       602
           Y       0.86      0.82      0.84       602

    accuracy                           0.85      1204
   macro avg       0.85      0.85      0.85      1204
weighted avg       0.85      0.85      0.85      1204

Classification Report for KNN:
               precision    recall  f1-score   support

           N       0.57      0.90      0.70       602
           Y       0.76      0.32      0.46       602

    accuracy                           0.61      1204
   macro avg       0.67      0.61      0.58      1204
weighted avg       0.67      0.61      0.58      1204

Classification Report for SVM:
               precision    recall  f1-score   support

           N       0.83      0.88      0.85       602
           Y       0.87      0.82      0.84       602

    accuracy                           0.85      1204
   macro avg    

In [11]:
# NOTE: the RandomForest grid search below is wrapped in a triple-quoted string,
# so none of it is executed; the cell only evaluates the string and echoes it
# as its output. Remove the surrounding ''' to actually run the search.
'''
# Tuning RandomForest

clf = RandomForestClassifier()
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 5]
}

grid_search = GridSearchCV(estimator=clf, param_grid=param_grid, cv=5, scoring='f1_macro', verbose=2)
grid_search.fit(X_train, y_train)
best_params = grid_search.best_params_
best_score = grid_search.best_score_
print("Best Parameters:", best_params)
print("Best Score:", best_score)

best_rf_classifier = grid_search.best_estimator_

# Evaluate the best model on the test set
y_pred = best_rf_classifier.predict(X_test)
report = classification_report(y_test, y_pred)
print("Classification report for tuned model:\n", report)
'''

'\n# Tuning RandomForest\n\nclf = RandomForestClassifier()\nparam_grid = {\n    \'n_estimators\': [100, 200, 300],\n    \'max_depth\': [None, 10, 20],\n    \'min_samples_split\': [2, 5, 10],\n    \'min_samples_leaf\': [1, 2, 5]\n}\n\ngrid_search = GridSearchCV(estimator=clf, param_grid=param_grid, cv=5, scoring=\'f1_macro\', verbose=2)\ngrid_search.fit(X_train, y_train)\nbest_params = grid_search.best_params_\nbest_score = grid_search.best_score_\nprint("Best Parameters:", best_params)\nprint("Best Score:", best_score)\n\nbest_rf_classifier = grid_search.best_estimator_\n\n# Evaluate the best model on the test set\ny_pred = best_rf_classifier.predict(X_test)\nreport = classification_report(y_test, y_pred)\nprint("Classification report for tuned model:\n", report)\n'

In [12]:
# NOTE: the LogisticRegression grid search below is wrapped in a triple-quoted
# string, so none of it is executed; the cell only evaluates the string and
# echoes it as its output. Remove the surrounding ''' to actually run the search.
# NOTE(review): inside the disabled code the tuned estimator is stored as
# `best_rf_classifier` although it is a LogisticRegression — looks like a
# copy-paste leftover from the RandomForest cell; rename if this is re-enabled.
'''
# Tuning LogisticRegression

clf = LogisticRegression()
param_grid = {
    'penalty': ['l1', 'l2'],                # Penalty term (L1 or L2 regularization)
    'C': [0.001, 0.01, 0.1, 1, 10, 100],    # Inverse of regularization strength
    'solver': ['liblinear', 'saga'],
    'max_iter': [10000]
}

grid_search = GridSearchCV(estimator=clf, param_grid=param_grid, cv=5, scoring='f1_macro', verbose=2)
grid_search.fit(X_train, y_train)
best_params = grid_search.best_params_
best_score = grid_search.best_score_
print("Best Parameters:", best_params)
print("Best Score:", best_score)

best_rf_classifier = grid_search.best_estimator_

# Evaluate the best model on the test set
y_pred = best_rf_classifier.predict(X_test)
report = classification_report(y_test, y_pred)
print("Classification report for tuned model:\n", report)
'''

'\n# Tuning LogisticRegression\n\nclf = LogisticRegression()\nparam_grid = {\n    \'penalty\': [\'l1\', \'l2\'],                # Penalty term (L1 or L2 regularization)\n    \'C\': [0.001, 0.01, 0.1, 1, 10, 100],    # Inverse of regularization strength\n    \'solver\': [\'liblinear\', \'saga\'],\n    \'max_iter\': [10000]\n}\n\ngrid_search = GridSearchCV(estimator=clf, param_grid=param_grid, cv=5, scoring=\'f1_macro\', verbose=2)\ngrid_search.fit(X_train, y_train)\nbest_params = grid_search.best_params_\nbest_score = grid_search.best_score_\nprint("Best Parameters:", best_params)\nprint("Best Score:", best_score)\n\nbest_rf_classifier = grid_search.best_estimator_\n\n# Evaluate the best model on the test set\ny_pred = best_rf_classifier.predict(X_test)\nreport = classification_report(y_test, y_pred)\nprint("Classification report for tuned model:\n", report)\n'

In [13]:
# Train model on full dataset
#
# The final classifier is fitted on the resampled training rows plus the
# held-out test rows, so no untouched data remains for evaluation after this
# cell. Concatenating yields plain NumPy arrays, i.e. the model is trained
# without feature names and must be fed features in the same column order.
X = np.concatenate([X_train, X_test], axis=0)
y = np.concatenate([y_train, y_test], axis=0)

# random_state is pinned because the 'saga' solver shuffles the data while
# fitting; without a seed the coefficients (and the pickled model) can differ
# from run to run. 42 matches the seed used elsewhere in this notebook.
clf = LogisticRegression(
    C=10,
    max_iter=10000,
    penalty='l2',
    solver='saga',
    random_state=42,
)
clf.fit(X, y)

In [14]:
# Save the model

import pickle

# Destination of the serialised classifier on the mounted Drive (adjacent
# literals are concatenated into one path string).
model_save_path = (
    '/content/drive/MyDrive/Professionele ontwikkeling/Data Science/'
    'Explore Data Science Course/Integrated Project/Github/'
    'explore-integrated-project/ml_model/resources/model.pkl'
)

# Write the fitted classifier to disk in binary mode.
with open(model_save_path, mode='wb') as model_file:
    pickle.dump(clf, model_file)

In [15]:
# Example claim to score with the trained classifier.
new_data = {
    'csl': '500/1000',
    'incident_severity': 'Major Damage',
    'incident_state': 'SC',
    'authorities_contacted': 'Police',
    'property_damage': 'NO',
    'police_report_available': 'YES'
}

# One entry per model input, in the exact column order the classifier was
# fitted on: (feature name, key in the claim dict, category that sets the flag).
# Categories that are not listed (e.g. 'Major Damage', 'SC') are the baseline
# and encode as all zeros for their feature group.
ONE_HOT_FEATURES = [
    ('policy_csl_500/1000', 'csl', '500/1000'),
    ('incident_severity_Minor Damage', 'incident_severity', 'Minor Damage'),
    ('incident_severity_Total Loss', 'incident_severity', 'Total Loss'),
    ('incident_severity_Trivial Damage', 'incident_severity', 'Trivial Damage'),
    ('authorities_contacted_None', 'authorities_contacted', 'None'),
    ('authorities_contacted_Police', 'authorities_contacted', 'Police'),
    ('incident_state_NY', 'incident_state', 'NY'),
    ('incident_state_WV', 'incident_state', 'WV'),
    ('property_damage_NO', 'property_damage', 'NO'),
    ('police_report_available_YES', 'police_report_available', 'YES'),
]


def encode_claim(claim):
    """Turn a raw claim dict into the one-row feature matrix the model expects.

    Parameters
    ----------
    claim : dict
        Must contain every key referenced in ONE_HOT_FEATURES ('csl',
        'incident_severity', 'authorities_contacted', 'incident_state',
        'property_damage', 'police_report_available').

    Returns
    -------
    numpy.ndarray
        Integer array of shape (1, len(ONE_HOT_FEATURES)); each entry is 1 when
        the claim's value equals the feature's category and 0 otherwise.

    Raises
    ------
    KeyError
        If the claim lacks one of the required keys.
    """
    flags = [int(claim[field] == category) for _, field, category in ONE_HOT_FEATURES]
    return np.array(flags).reshape(1, -1)


X_unseen = encode_claim(new_data)
y_pred = clf.predict(X_unseen)
print(y_pred)

['N']
