In [37]:
# Libraries for data loading, data manipulation and data visualisation
import numpy as np
import pandas as pd

# Libraries for data preparation and model building
import string
import datetime
from sklearn import metrics
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn import feature_selection
from sklearn.feature_selection import SelectPercentile, SelectKBest, f_classif, mutual_info_classif,chi2, VarianceThreshold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.metrics import classification_report, accuracy_score, log_loss
from sklearn.model_selection import GridSearchCV, KFold, StratifiedKFold
from sklearn.model_selection import cross_val_predict
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import GridSearchCV
from sklearn.decomposition import TruncatedSVD
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
import pickle

In [13]:
# Read the claims CSV straight from the project's GitHub repository into `df`,
# then print the column/dtype summary so the schema is visible in the notebook.
url = 'https://raw.githubusercontent.com/dawieloots/explore-integrated-project/main/data/Advanced%20Features%20Claims%20Data.csv'
df = pd.read_csv(filepath_or_buffer=url)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 46 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Unnamed: 0                    1000 non-null   int64  
 1   months_as_customer            1000 non-null   int64  
 2   age                           1000 non-null   int64  
 3   policy_number                 1000 non-null   int64  
 4   policy_bind_date              1000 non-null   object 
 5   policy_state                  1000 non-null   object 
 6   policy_csl                    1000 non-null   object 
 7   policy_deductable             1000 non-null   int64  
 8   policy_annual_premium         1000 non-null   float64
 9   umbrella_limit                1000 non-null   int64  
 10  insured_zip                   1000 non-null   int64  
 11  insured_sex                   1000 non-null   object 
 12  insured_education_level       1000 non-null   object 
 13  insu

In [15]:
# Remove the columns that are not used as model features.
# NOTE(review): 'Unnamed: 0' (listed by df.info() after loading) is not dropped here,
# so it remains part of the feature set - confirm this is intended.
to_drop = ['policy_number','policy_bind_date','insured_zip','incident_location','incident_date']
df = df.drop(columns=to_drop)

# Missing values in authorities_contacted: per the original author's note the CSV holds
# the literal word "None" there, so the category is restored as the string 'None'.
df['authorities_contacted'] = df['authorities_contacted'].fillna('None')

# One-hot encode every object-dtype column except the target. drop_first=True drops
# the first level of each feature, so each feature gets (levels - 1) indicator columns.
object_columns = df.select_dtypes(include=['object']).columns
categorical_features = [col for col in object_columns if col != 'fraud_reported']
print(categorical_features)
dummies = pd.get_dummies(df[categorical_features], drop_first = True, dtype=int)

# Swap the raw categorical columns for their indicator columns (appended on the right).
df = pd.concat([df.drop(columns=categorical_features), dummies], axis=1)
df.info()

['policy_state', 'policy_csl', 'insured_sex', 'insured_education_level', 'insured_occupation', 'insured_hobbies', 'insured_relationship', 'incident_type', 'collision_type', 'incident_severity', 'authorities_contacted', 'incident_state', 'incident_city', 'property_damage', 'police_report_available', 'auto_make', 'auto_model', 'customer_category', 'incident_time_of_day']
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Columns: 155 entries, Unnamed: 0 to incident_time_of_day_night
dtypes: float64(4), int32(133), int64(17), object(1)
memory usage: 691.5+ KB


In [17]:
# Separate the target from the features, then hold out 20% of the rows as a test set.
# stratify=y keeps the class proportions of fraud_reported the same in both splits.
y = df['fraud_reported'].copy()
X = df.drop(columns=['fraud_reported']).copy()
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 800 entries, 887 to 594
Columns: 154 entries, Unnamed: 0 to incident_time_of_day_night
dtypes: float64(4), int32(133), int64(17)
memory usage: 553.1 KB


In [19]:
# Balance the classes of the training split only (the test split is left untouched)
# by oversampling with SMOTE, then re-wrap the results as DataFrames.
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)
X_train = pd.DataFrame(X_resampled, columns=X.columns)
y_train = pd.DataFrame(y_resampled, columns=['fraud_reported'])

In [21]:
# Standardise the contiguous run of columns from 'months_as_customer' through
# 'vehicle_age_at_incident_date' (inclusive); the one-hot columns after them are left as-is.
# The scaler is fitted on the training data only and then applied to the test data.
scaler = StandardScaler()
first_scaled_pos = X_train.columns.get_loc('months_as_customer')
last_scaled_pos = X_train.columns.get_loc('vehicle_age_at_incident_date')
columns_to_scale = X_train.columns[first_scaled_pos:last_scaled_pos + 1]

# X_test is a slice produced by train_test_split; take an explicit copy so that
# writing to it cannot trigger SettingWithCopyWarning.
X_test = X_test.copy()

# Assign with df[cols] = ... rather than df.loc[:, cols] = ...: several of these columns
# are int64, and .loc would try to write the scaled floats into the existing integer
# columns (deprecated incompatible-dtype assignment in recent pandas). Column assignment
# replaces the columns outright, so they simply become float64.
X_train[columns_to_scale] = scaler.fit_transform(X_train[columns_to_scale])
X_test[columns_to_scale] = scaler.transform(X_test[columns_to_scale])

In [25]:
# Keep the 10 features with the highest ANOVA F-score (f_classif) against the target.
# The selector is fitted on the training data and the same selection is applied to test.
y_train = np.ravel(y_train)  # flatten the single-column target to 1-D for scikit-learn
kbest = SelectKBest(score_func=f_classif, k=10)
X_train_selected = kbest.fit_transform(X_train, y_train)
X_test_selected = kbest.transform(X_test)

# get_support() is a boolean mask over the input columns; use it to recover the names
# of the surviving features so the reduced arrays can be wrapped as DataFrames again.
train_feature_names = X_train.columns[kbest.get_support()]
test_feature_names = X_test.columns[kbest.get_support()]
X_train = pd.DataFrame(X_train_selected, columns=train_feature_names)
X_test = pd.DataFrame(X_test_selected, columns=test_feature_names)

for frame in (X_train, X_test):
    print("Selected features:", frame.columns)


Selected features: Index(['policy_csl_500/1000', 'incident_severity_Minor Damage',
       'incident_severity_Total Loss', 'incident_severity_Trivial Damage',
       'authorities_contacted_None', 'authorities_contacted_Police',
       'incident_state_NY', 'incident_state_WV', 'property_damage_NO',
       'police_report_available_YES'],
      dtype='object')
Selected features: Index(['policy_csl_500/1000', 'incident_severity_Minor Damage',
       'incident_severity_Total Loss', 'incident_severity_Trivial Damage',
       'authorities_contacted_None', 'authorities_contacted_Police',
       'incident_state_NY', 'incident_state_WV', 'property_damage_NO',
       'police_report_available_YES'],
      dtype='object')


In [27]:
# Candidate classifiers, all with default hyperparameters (seeded where the estimator
# accepts a random_state). Each is scored with 10-fold cross-validated predictions
# on the training data.

models = dict([
    ('Logistic Regression', LogisticRegression(random_state=42)),
    ('KNN', KNeighborsClassifier()),
    ('SVM', SVC(random_state=42)),
    ('Decision Tree', DecisionTreeClassifier(random_state=42)),
    ('Random Forest', RandomForestClassifier(random_state=42)),
    ('Naive Bayes', GaussianNB()),
])

# Flatten the test target to 1-D as well; it is used when scoring on the test set later.
y_test = np.ravel(y_test)

for model_name, estimator in models.items():
    # Out-of-fold prediction for every training row, then a per-class report.
    cv_predictions = cross_val_predict(estimator, X_train, y_train, cv=10)
    print(f"Classification Report for {model_name}:\n", classification_report(y_train, cv_predictions))

Classification Report for Logistic Regression:
               precision    recall  f1-score   support

           N       0.83      0.87      0.85       602
           Y       0.86      0.82      0.84       602

    accuracy                           0.85      1204
   macro avg       0.85      0.85      0.85      1204
weighted avg       0.85      0.85      0.85      1204

Classification Report for KNN:
               precision    recall  f1-score   support

           N       0.69      0.89      0.78       602
           Y       0.85      0.59      0.70       602

    accuracy                           0.74      1204
   macro avg       0.77      0.74      0.74      1204
weighted avg       0.77      0.74      0.74      1204

Classification Report for SVM:
               precision    recall  f1-score   support

           N       0.83      0.88      0.85       602
           Y       0.87      0.82      0.84       602

    accuracy                           0.85      1204
   macro avg    

In [33]:
# Logistic regression was among the best performers in cross-validation and is easy to
# interpret, so it is the model chosen for hyperparameter tuning.

# random_state is fixed because the 'saga' and 'liblinear' solvers in the grid are
# randomised; without a seed the search result can differ from run to run. This also
# matches the seeded estimators used in the model-comparison cell.
clf = LogisticRegression(random_state=42)
param_grid = {
    'penalty': ['l1', 'l2'],                # Penalty term (L1 or L2 regularization)
    'C': [0.001, 0.01, 0.1, 1, 10, 100],    # Inverse of regularization strength
    'solver': ['liblinear', 'saga'],        # Both solvers support l1 and l2 penalties
    'max_iter': [10000]                     # Generous iteration cap for the solvers
}

# Exhaustive search over the grid with 5-fold CV, ranked by macro-averaged F1.
grid_search = GridSearchCV(estimator=clf, param_grid=param_grid, cv=5, scoring='f1_macro', verbose=0)
grid_search.fit(X_train, y_train)
best_params = grid_search.best_params_
best_score = grid_search.best_score_
print("Best Parameters:", best_params)
print("Best Score:", best_score)

# Despite the "rf" in its name this is the tuned LogisticRegression (refitted on all of
# X_train by GridSearchCV); the variable name is kept so existing references still work.
best_rf_classifier = grid_search.best_estimator_

# Evaluate the best model on the test set
y_pred = best_rf_classifier.predict(X_test)
report = classification_report(y_test, y_pred)
print("Classification report for tuned model, on test set:\n", report)


Best Parameters: {'C': 0.1, 'max_iter': 10000, 'penalty': 'l1', 'solver': 'saga'}
Best Score: 0.8414384166177854
Classification report for tuned model, on test set:
               precision    recall  f1-score   support

           N       0.91      0.86      0.88       151
           Y       0.63      0.73      0.68        49

    accuracy                           0.83       200
   macro avg       0.77      0.80      0.78       200
weighted avg       0.84      0.83      0.83       200



In [35]:
# Train the final logistic regression model on all available rows: the SMOTE-resampled
# training rows stacked on top of the held-out test rows.

X = np.concatenate([X_train, X_test], axis=0)
y = np.concatenate([y_train, y_test], axis=0)

# Build the final model from the hyperparameters found by the grid search instead of
# hard-coding them, so the saved model is the configuration that was tuned and
# evaluated above. random_state is fixed because the tuned solver may be randomised.
clf = LogisticRegression(**best_params, random_state=42)
clf.fit(X, y)

In [None]:
# Persist the fitted model with pickle so that the app can load it.
# NOTE(review): this is an absolute Google Drive path specific to one environment;
# the target directory must already exist.
model_save_path = '/content/drive/MyDrive/Professionele ontwikkeling/Data Science/Explore Data Science Course/Integrated Project/Github/explore-integrated-project/ml_model/resources/model.pkl'
with open(model_save_path, mode='wb') as model_file:
    pickle.dump(clf, model_file)

In [45]:
# Example of how one unseen claim is passed to the model; the app is meant to repeat
# this logic. The raw answers are turned into the ten 0/1 indicator features, in the
# same order as the "Selected features" printed by the feature-selection cell.

new_data = {
    'csl': '500/1000',
    'incident_severity': 'Major Damage',
    'incident_state': 'SC',
    'authorities_contacted': 'Police',
    'property_damage': 'NO',
    'police_report_available': 'YES'
}

# Each indicator is 1 when the raw value equals that category and 0 otherwise, so a
# value with no indicator of its own (e.g. 'Major Damage', 'SC') leaves its group all 0.
policy_csl_5001000 = int(new_data['csl'] == '500/1000')

incident_severity_MinorDamage = int(new_data['incident_severity'] == 'Minor Damage')
incident_severity_TotalLoss = int(new_data['incident_severity'] == 'Total Loss')
incident_severity_TrivialDamage = int(new_data['incident_severity'] == 'Trivial Damage')

authorities_contacted_None = int(new_data['authorities_contacted'] == 'None')
authorities_contacted_Police = int(new_data['authorities_contacted'] == 'Police')

incident_state_NY = int(new_data['incident_state'] == 'NY')
incident_state_WV = int(new_data['incident_state'] == 'WV')

property_damage_NO = int(new_data['property_damage'] == 'NO')
police_report_available_YES = int(new_data['police_report_available'] == 'YES')

# A single sample must be 2-D for predict(): one row, ten feature columns.
X_unseen = np.array([
    policy_csl_5001000,
    incident_severity_MinorDamage,
    incident_severity_TotalLoss,
    incident_severity_TrivialDamage,
    authorities_contacted_None,
    authorities_contacted_Police,
    incident_state_NY,
    incident_state_WV,
    property_damage_NO,
    police_report_available_YES,
]).reshape(1, -1)

y_pred = clf.predict(X_unseen)
print(y_pred)

['N']
