**Import the libraries needed for analysis**

In [None]:
!pip install scikit-optimize
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from skopt import BayesSearchCV



**Read the training and test data files**

In [None]:
# Load the training split into a DataFrame, then preview its last five rows
# (the final expression is what the cell renders).
trainData = pd.read_csv(
    filepath_or_buffer=r"/content/sample_data/Insurance Fraud - TRAIN-3000.csv"
)
trainData.tail(n=5)

Unnamed: 0,MONTH,WEEKOFMONTH,DAYOFWEEK,MAKE,ACCIDENTAREA,DAYOFWEEKCLAIMED,MONTHCLAIMED,WEEKOFMONTHCLAIMED,SEX,MARITALSTATUS,...,AGEOFPOLICYHOLDER,POLICEREPORTFILED,WITNESSPRESENT,AGENTTYPE,NUMBEROFSUPPLIMENTS,ADDRESSCHANGE_CLAIM,NUMBEROFCARS,YEAR,BASEPOLICY,FRAUDFOUND
2994,Apr,2,Monday,Pontiac,Rural,Monday,Apr,2,Male,Married,...,36_to_40,No,No,External,none,no_change,1-vehicle,1996,Liability,No
2995,Nov,4,Thursday,Honda,Urban,Friday,Nov,4,Male,Married,...,31_to_35,No,No,External,none,no_change,1-vehicle,1996,Liability,No
2996,Dec,4,Sunday,Chevrolet,Urban,Wednesday,Jan,1,Male,Married,...,over_65,No,No,External,none,no_change,1-vehicle,1996,Liability,No
2997,Apr,1,Monday,Toyota,Urban,Tuesday,Apr,1,Male,Married,...,31_to_35,No,No,External,none,no_change,1-vehicle,1996,All_Perils,No
2998,Jun,4,Friday,Pontiac,Urban,Wednesday,Jul,1,Male,Single,...,31_to_35,No,No,External,3_to_5,no_change,1-vehicle,1996,All_Perils,No


In [None]:
# Data description of the training set:
#   - info() prints each column's dtype and non-null count
#   - describe() summarises the numeric columns; as the last expression of the
#     cell it is rendered as the output table
trainData.info()
trainData.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12918 entries, 0 to 12917
Data columns (total 32 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   MONTH                 12918 non-null  object
 1   WEEKOFMONTH           12918 non-null  int64 
 2   DAYOFWEEK             12918 non-null  object
 3   MAKE                  12918 non-null  object
 4   ACCIDENTAREA          12918 non-null  object
 5   DAYOFWEEKCLAIMED      12918 non-null  object
 6   MONTHCLAIMED          12918 non-null  object
 7   WEEKOFMONTHCLAIMED    12918 non-null  int64 
 8   SEX                   12918 non-null  object
 9   MARITALSTATUS         12918 non-null  object
 10  AGE                   12918 non-null  int64 
 11  FAULT                 12918 non-null  object
 12  POLICYTYPE            12918 non-null  object
 13  VEHICLECATEGORY       12918 non-null  object
 14  VEHICLEPRICE          12918 non-null  object
 15  REPNUMBER             12918 non-null

Unnamed: 0,WEEKOFMONTH,WEEKOFMONTHCLAIMED,AGE,REPNUMBER,DEDUCTIBLE,DRIVERRATING,YEAR
count,12918.0,12918.0,12918.0,12918.0,12918.0,12918.0,12918.0
mean,2.78619,2.693296,39.915854,8.480183,407.586314,2.488853,1994.999458
std,1.289048,1.259564,13.432421,4.602126,43.672599,1.118991,0.786046
min,1.0,1.0,0.0,1.0,300.0,1.0,1994.0
25%,2.0,2.0,31.0,5.0,400.0,1.0,1994.0
50%,3.0,3.0,38.0,8.0,400.0,2.0,1995.0
75%,4.0,4.0,48.0,12.0,400.0,3.0,1996.0
max,5.0,5.0,80.0,16.0,700.0,4.0,1996.0


In [None]:
# Count the missing (NaN) values in each column of the training data
trainData.isna().sum()

Unnamed: 0,0
MONTH,0
WEEKOFMONTH,0
DAYOFWEEK,0
MAKE,0
ACCIDENTAREA,0
DAYOFWEEKCLAIMED,0
MONTHCLAIMED,0
WEEKOFMONTHCLAIMED,0
SEX,0
MARITALSTATUS,0


In [None]:
# Load the held-out test split into a DataFrame, then preview its last five
# rows (the final expression is what the cell renders).
testData = pd.read_csv(
    filepath_or_buffer=r"/content/sample_data/Insurance Fraud -TEST-12900.csv"
)
testData.tail(n=5)

Unnamed: 0,MONTH,WEEKOFMONTH,DAYOFWEEK,MAKE,ACCIDENTAREA,DAYOFWEEKCLAIMED,MONTHCLAIMED,WEEKOFMONTHCLAIMED,SEX,MARITALSTATUS,...,AGEOFPOLICYHOLDER,POLICEREPORTFILED,WITNESSPRESENT,AGENTTYPE,NUMBEROFSUPPLIMENTS,ADDRESSCHANGE_CLAIM,NUMBEROFCARS,YEAR,BASEPOLICY,FRAUDFOUND
12913,Dec,1,Tuesday,Pontiac,Urban,Thursday,Feb,1,Male,Married,...,51_to_65,No,No,External,none,4_to_8_years,2-vehicles,1995,All_Perils,No
12914,Oct,2,Tuesday,Accura,Urban,Thursday,Oct,3,Male,Married,...,31_to_35,No,No,External,3_to_5,no_change,1-vehicle,1994,Collision,No
12915,Apr,4,Monday,Toyota,Urban,Wednesday,Apr,4,Male,Married,...,over_65,No,No,External,more_than_5,no_change,1-vehicle,1995,All_Perils,No
12916,Jun,2,Friday,Mazda,Urban,Friday,Jun,2,Male,Married,...,over_65,No,No,External,more_than_5,no_change,1-vehicle,1996,Collision,No
12917,Jun,2,Wednesday,Chevrolet,Urban,Wednesday,Jun,2,Male,Married,...,41_to_50,No,No,External,3_to_5,no_change,1-vehicle,1996,Collision,No


In [None]:
# Data description of the test set:
#   - info() prints each column's dtype and non-null count
#   - describe() summarises the numeric columns; as the last expression of the
#     cell it is rendered as the output table
testData.info()
testData.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12918 entries, 0 to 12917
Data columns (total 32 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   MONTH                 12918 non-null  object
 1   WEEKOFMONTH           12918 non-null  int64 
 2   DAYOFWEEK             12918 non-null  object
 3   MAKE                  12918 non-null  object
 4   ACCIDENTAREA          12918 non-null  object
 5   DAYOFWEEKCLAIMED      12918 non-null  object
 6   MONTHCLAIMED          12918 non-null  object
 7   WEEKOFMONTHCLAIMED    12918 non-null  int64 
 8   SEX                   12918 non-null  object
 9   MARITALSTATUS         12918 non-null  object
 10  AGE                   12918 non-null  int64 
 11  FAULT                 12918 non-null  object
 12  POLICYTYPE            12918 non-null  object
 13  VEHICLECATEGORY       12918 non-null  object
 14  VEHICLEPRICE          12918 non-null  object
 15  REPNUMBER             12918 non-null

Unnamed: 0,WEEKOFMONTH,WEEKOFMONTHCLAIMED,AGE,REPNUMBER,DEDUCTIBLE,DRIVERRATING,YEAR
count,12918.0,12918.0,12918.0,12918.0,12918.0,12918.0,12918.0
mean,2.78619,2.693296,39.915854,8.480183,407.586314,2.488853,1994.999458
std,1.289048,1.259564,13.432421,4.602126,43.672599,1.118991,0.786046
min,1.0,1.0,0.0,1.0,300.0,1.0,1994.0
25%,2.0,2.0,31.0,5.0,400.0,1.0,1994.0
50%,3.0,3.0,38.0,8.0,400.0,2.0,1995.0
75%,4.0,4.0,48.0,12.0,400.0,3.0,1996.0
max,5.0,5.0,80.0,16.0,700.0,4.0,1996.0


In [None]:
# Count the missing (NaN) values in each column of the test data
testData.isna().sum()

Unnamed: 0,0
MONTH,0
WEEKOFMONTH,0
DAYOFWEEK,0
MAKE,0
ACCIDENTAREA,0
DAYOFWEEKCLAIMED,0
MONTHCLAIMED,0
WEEKOFMONTHCLAIMED,0
SEX,0
MARITALSTATUS,0


**Check for non-numeric values and one-hot encode the categorical features**

In [None]:
# Separate the features from the target column for the training data
X_train = trainData.drop('FRAUDFOUND', axis=1)
y_train = trainData['FRAUDFOUND']

# Same split for the test data
X_test = testData.drop('FRAUDFOUND', axis=1)
y_test = testData['FRAUDFOUND']

# List the non-numeric (object-dtype) columns that need to be encoded
print("Columns with non-numeric values in training set:\n", X_train.select_dtypes(include=['object']).columns)

# One-hot encode the training features. drop_first=True removes the first level
# of every categorical column so that level becomes the implicit baseline.
X_train = pd.get_dummies(X_train, drop_first=True)

# Encode the test features WITHOUT drop_first and align them to the training
# columns. Dropping the first level independently on the test set is unsafe:
# if the two sets do not share the same first level, the test set drops a level
# that the training set kept, reindex() then re-creates that column filled with
# zeros, and every test row with that level is silently encoded as the baseline.
# Aligning the full set of indicators keeps exactly the training columns; levels
# unseen in training are discarded and missing ones are filled with 0.
X_test = pd.get_dummies(X_test).reindex(columns=X_train.columns, fill_value=0)

Columns with non-numeric values in training set:
 Index(['MONTH', 'DAYOFWEEK', 'MAKE', 'ACCIDENTAREA', 'DAYOFWEEKCLAIMED',
       'MONTHCLAIMED', 'SEX', 'MARITALSTATUS', 'FAULT', 'POLICYTYPE',
       'VEHICLECATEGORY', 'VEHICLEPRICE', 'DAYS_POLICY_ACCIDENT',
       'DAYS_POLICY_CLAIM', 'PASTNUMBEROFCLAIMS', 'AGEOFVEHICLE',
       'AGEOFPOLICYHOLDER', 'POLICEREPORTFILED', 'WITNESSPRESENT', 'AGENTTYPE',
       'NUMBEROFSUPPLIMENTS', 'ADDRESSCHANGE_CLAIM', 'NUMBEROFCARS',
       'BASEPOLICY'],
      dtype='object')


**Hyperparameter tuning of two classifiers by changing at least three different hyperparameters for each classifier**

**Hyperparameter tuning of Decision Tree classifier using Random Search by changing max_depth, min_samples_split, min_samples_leaf**

In [None]:
from scipy.stats import randint

# Search space for the randomized search:
#   - max_depth is sampled from a fixed list of candidates
#   - scipy's randint(low, high) draws integers from the half-open range
#     [low, high), i.e. min_samples_split in 2..19 and min_samples_leaf in 1..19
param_dist_dt = {
    'max_depth': [5, 10, 20, 30, None],
    'min_samples_split': randint(2, 20),
    'min_samples_leaf': randint(1, 20)
}

# Base estimator for the search. It has to be created in this cell: `dt` was
# otherwise only defined further down (in the Grid Search cell), so this cell
# raised a NameError when the notebook was run top to bottom on a fresh kernel.
# The fixed seed keeps the cross-validation scores reproducible.
dt = DecisionTreeClassifier(random_state=42)

# Random Search with 5-fold cross-validation over 20 sampled candidates
random_search_dt = RandomizedSearchCV(estimator=dt, param_distributions=param_dist_dt, n_iter=20, cv=5, scoring='accuracy', random_state=42)
random_search_dt.fit(X_train, y_train)

# Best parameters from random search
print("Best parameters from Random Search (Decision Tree):", random_search_dt.best_params_)

Best parameters from Random Search (Decision Tree): {'max_depth': 30, 'min_samples_leaf': 18, 'min_samples_split': 9}


In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

def evaluate_model(model, X_test, y_test, pos_label='Yes'):
    """Print a classification report and return the headline metrics.

    Parameters
    ----------
    model : fitted estimator exposing ``predict``.
    X_test : features passed to ``model.predict``.
    y_test : true labels corresponding to ``X_test``.
    pos_label : label treated as the positive class when computing precision,
        recall and F1. Defaults to ``'Yes'``, so existing three-argument calls
        behave exactly as before.

    Returns
    -------
    dict
        Keys ``'accuracy'``, ``'precision'``, ``'recall'`` and ``'f1_score'``.
    """
    y_pred = model.predict(X_test)
    print(classification_report(y_test, y_pred))
    # Shared settings for the three positive-class metrics
    binary_kwargs = {'pos_label': pos_label, 'average': 'binary'}
    return {
        'accuracy': accuracy_score(y_test, y_pred),
        'precision': precision_score(y_test, y_pred, **binary_kwargs),
        'recall': recall_score(y_test, y_pred, **binary_kwargs),
        'f1_score': f1_score(y_test, y_pred, **binary_kwargs)
    }
# Refit a tree with the best random-search hyperparameters on the full training
# set. The fixed seed makes the fitted tree, and therefore the metrics reported
# below, reproducible across runs.
best_dt_random = DecisionTreeClassifier(random_state=42, **random_search_dt.best_params_)
best_dt_random.fit(X_train, y_train)
print("Evaluation (Decision Tree - Random Search):")
evaluate_model(best_dt_random, X_test, y_test)

Evaluation (Decision Tree - Random Search):
              precision    recall  f1-score   support

          No       0.96      1.00      0.98     12420
         Yes       0.73      0.07      0.13       498

    accuracy                           0.96     12918
   macro avg       0.84      0.54      0.56     12918
weighted avg       0.95      0.96      0.95     12918



{'accuracy': 0.9632296021055891,
 'precision': 0.7254901960784313,
 'recall': 0.07429718875502007,
 'f1_score': 0.13479052823315119}

**Hyperparameter tuning of Decision Tree classifier using Grid Search by changing max_depth, min_samples_split, min_samples_leaf**

In [None]:
# Candidate values for the exhaustive grid search (5 * 4 * 4 = 80 combinations)
param_grid_dt = {
    'max_depth': [5, 10, 20, 30, None],
    'min_samples_split': [2, 5, 10, 20],
    'min_samples_leaf': [1, 5, 10, 20]
}

# Initialize Decision Tree classifier. The fixed seed makes the cross-validation
# scores, and hence the selected parameters, reproducible across runs.
dt = DecisionTreeClassifier(random_state=42)

# Grid Search with 5-fold cross-validation
grid_search_dt = GridSearchCV(estimator=dt, param_grid=param_grid_dt, cv=5, scoring='accuracy')
grid_search_dt.fit(X_train, y_train)

# Best parameters from grid search
print("Best parameters from Grid Search (Decision Tree):", grid_search_dt.best_params_)

Best parameters from Grid Search (Decision Tree): {'max_depth': 5, 'min_samples_leaf': 5, 'min_samples_split': 2}


In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

# evaluate_model is already defined in the Random Search evaluation cell above;
# the identical copy that used to live here has been removed.

# Refit a tree with the best grid-search hyperparameters on the full training
# set. The fixed seed makes the fitted tree, and therefore the metrics reported
# below, reproducible across runs.
best_dt_grid = DecisionTreeClassifier(random_state=42, **grid_search_dt.best_params_)
best_dt_grid.fit(X_train, y_train)
print("Evaluation (Decision Tree - Grid Search):")
evaluate_model(best_dt_grid, X_test, y_test)

Evaluation (Decision Tree - Grid Search):
              precision    recall  f1-score   support

          No       0.96      1.00      0.98     12420
         Yes       0.92      0.04      0.08       498

    accuracy                           0.96     12918
   macro avg       0.94      0.52      0.53     12918
weighted avg       0.96      0.96      0.95     12918



{'accuracy': 0.9629973680136245,
 'precision': 0.9166666666666666,
 'recall': 0.04417670682730924,
 'f1_score': 0.0842911877394636}

**Hyperparameter tuning of Decision Tree classifier using Bayes Search by changing max_depth, min_samples_split, min_samples_leaf**

In [None]:
# Bayesian search over the same candidate values as the grid search
# (`param_grid_dt`), re-using the `dt` estimator from the Grid Search cell.
bayes_search_dt = BayesSearchCV(
    estimator=dt,
    search_spaces=param_grid_dt,
    n_iter=20,
    cv=5,
    scoring='accuracy',
    random_state=42,
)
bayes_search_dt.fit(X_train, y_train)

# Report the winning hyperparameter combination
print("Best parameters from Bayes Search (Decision Tree):", bayes_search_dt.best_params_)

Best parameters from Bayes Search (Decision Tree): OrderedDict([('max_depth', 5), ('min_samples_leaf', 10), ('min_samples_split', 5)])


In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

# evaluate_model is already defined in the Random Search evaluation cell above;
# the identical copy that used to live here has been removed.

# Refit a tree with the best Bayes-search hyperparameters on the full training
# set. The fixed seed makes the fitted tree, and therefore the metrics reported
# below, reproducible across runs.
best_dt_bayes = DecisionTreeClassifier(random_state=42, **bayes_search_dt.best_params_)
best_dt_bayes.fit(X_train, y_train)
print("Evaluation (Decision Tree - Bayes Search):")
evaluate_model(best_dt_bayes, X_test, y_test)

Evaluation (Decision Tree - Bayes Search):
              precision    recall  f1-score   support

          No       0.96      1.00      0.98     12420
         Yes       0.92      0.04      0.08       498

    accuracy                           0.96     12918
   macro avg       0.94      0.52      0.53     12918
weighted avg       0.96      0.96      0.95     12918



{'accuracy': 0.9629973680136245,
 'precision': 0.9166666666666666,
 'recall': 0.04417670682730924,
 'f1_score': 0.0842911877394636}

**Best parameters from Random Search (Decision Tree): {'max_depth': 30, 'min_samples_leaf': 18, 'min_samples_split': 9}**


**Evaluation (Decision Tree - Random Search):**

```text
              precision    recall  f1-score   support

          No       0.96      1.00      0.98     12420
         Yes       0.73      0.07      0.13       498

    accuracy                           0.96     12918
   macro avg       0.84      0.54      0.56     12918
weighted avg       0.95      0.96      0.95     12918

{'accuracy': 0.9632296021055891,
 'precision': 0.7254901960784313,
 'recall': 0.07429718875502007,
 'f1_score': 0.13479052823315119}
```



**Best parameters from Grid Search (Decision Tree): {'max_depth': 5, 'min_samples_leaf': 5, 'min_samples_split': 2}**

**Evaluation (Decision Tree - Grid Search):**

```text
              precision    recall  f1-score   support

          No       0.96      1.00      0.98     12420
         Yes       0.92      0.04      0.08       498

    accuracy                           0.96     12918
   macro avg       0.94      0.52      0.53     12918
weighted avg       0.96      0.96      0.95     12918

{'accuracy': 0.9629973680136245,
 'precision': 0.9166666666666666,
 'recall': 0.04417670682730924,
 'f1_score': 0.0842911877394636}
```

**Best parameters from Bayes Search (Decision Tree): OrderedDict([('max_depth', 5), ('min_samples_leaf', 10), ('min_samples_split', 5)])**

**Evaluation (Decision Tree - Bayes Search):**

```text
              precision    recall  f1-score   support

          No       0.96      1.00      0.98     12420
         Yes       0.92      0.04      0.08       498

    accuracy                           0.96     12918
   macro avg       0.94      0.52      0.53     12918
weighted avg       0.96      0.96      0.95     12918

{'accuracy': 0.9629973680136245,
 'precision': 0.9166666666666666,
 'recall': 0.04417670682730924,
 'f1_score': 0.0842911877394636}
```

**Hyperparameter tuning of Random Forest classifier using Random Search**

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint

# Define the parameter distribution for random search
param_dist_rf = {
    'n_estimators': randint(50, 200),  # Number of trees in the forest
    'max_depth': [5, 10, 20, 30, None],  # Max depth of the trees
    'min_samples_split': randint(2, 20),  # Minimum samples required to split an internal node
    'min_samples_leaf': randint(1, 20),  # Minimum samples required to be at a leaf node
    # Number of features to consider when looking for the best split.
    # 'auto' is intentionally not listed: the installed scikit-learn rejects it
    # with InvalidParameterError, which made every candidate that sampled it
    # fail ("40 fits failed out of a total of 100") and be scored as nan.
    # For classifiers 'auto' used to mean the same as 'sqrt', which is kept.
    'max_features': ['sqrt', 'log2', None]
}

# Initialize Random Forest Classifier
rf = RandomForestClassifier(random_state=42)

# Random Search with cross-validation
random_search_rf = RandomizedSearchCV(estimator=rf, param_distributions=param_dist_rf, n_iter=20, cv=5, scoring='accuracy', random_state=42)

# Fit the random search model
random_search_rf.fit(X_train, y_train)

# Best parameters from random search
print("Best parameters from Random Search (Random Forest):", random_search_rf.best_params_)


40 fits failed out of a total of 100.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
40 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/sklearn/model_selection/_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/base.py", line 1466, in wrapper
    estimator._validate_params()
  File "/usr/local/lib/python3.10/dist-packages/sklearn/base.py", line 666, in _validate_params
    validate_parameter_constraints(
  File "/usr/local/lib/python3.10/dist-packages/sklearn/utils/_param_validation.py", line 95, in validate_parameter_constraints
    raise InvalidParameterError(
skle

Best parameters from Random Search (Random Forest): {'max_depth': 5, 'max_features': None, 'min_samples_leaf': 8, 'min_samples_split': 12, 'n_estimators': 130}


**Hyperparameter tuning of Random Forest classifier using Grid Search**

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# Candidate values for each tuned Random Forest hyperparameter.
# max_features is not part of the grid, so it stays at the estimator default.
param_grid_rf = dict(
    n_estimators=[50, 100, 200],        # trees in the forest
    max_depth=[5, 10, 20, 30, None],    # maximum depth of each tree
    min_samples_split=[2, 5, 10, 20],   # samples needed to split an internal node
    min_samples_leaf=[1, 5, 10, 20],    # samples needed at a leaf node
)

# Seeded base estimator
rf = RandomForestClassifier(random_state=42)

# Exhaustive search over the grid with 5-fold cross-validation
grid_search_rf = GridSearchCV(
    estimator=rf,
    param_grid=param_grid_rf,
    scoring='accuracy',
    cv=5,
)
grid_search_rf.fit(X_train, y_train)

# Report the winning hyperparameter combination
print("Best parameters from Grid Search (Random Forest):", grid_search_rf.best_params_)


Best parameters from Grid Search (Random Forest): {'max_depth': 5, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 50}


**Hyperparameter tuning of Random Forest classifier using Bayes Search**

In [None]:
from sklearn.ensemble import RandomForestClassifier
from skopt import BayesSearchCV
from skopt.space import Integer, Categorical

# Search space for the Bayesian search: integer ranges are declared with
# skopt's Integer, max_depth is chosen from a fixed list of candidates.
# max_features is not searched, so it stays at the estimator default.
param_space_rf = dict(
    n_estimators=Integer(50, 200),       # trees in the forest
    max_depth=[5, 10, 20, 30, None],     # maximum depth of each tree
    min_samples_split=Integer(2, 20),    # samples needed to split an internal node
    min_samples_leaf=Integer(1, 20),     # samples needed at a leaf node
)

# Seeded base estimator
rf = RandomForestClassifier(random_state=42)

# 20 iterations of Bayesian search with 5-fold cross-validation
bayes_search_rf = BayesSearchCV(
    estimator=rf,
    search_spaces=param_space_rf,
    n_iter=20,
    cv=5,
    scoring='accuracy',
    random_state=42,
)
bayes_search_rf.fit(X_train, y_train)

# Report the winning hyperparameter combination
print("Best parameters from Bayes Search (Random Forest):", bayes_search_rf.best_params_)

Best parameters from Bayes Search (Random Forest): OrderedDict([('max_depth', 20), ('min_samples_leaf', 15), ('min_samples_split', 19), ('n_estimators', 97)])
