### Imports

In [2]:
import numpy as np
import pandas as pd

# Data visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from datetime import datetime

# Random Forest model
from sklearn.ensemble import RandomForestClassifier

# Model evaluation
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, make_scorer
from sklearn.metrics import roc_auc_score, roc_curve

# Hyperparameter tuning
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

### Load Data

In [3]:
file_path = 'health_data.csv'
health_data = pd.read_csv(file_path)

# Display data
health_data.head()

Unnamed: 0,ProviderID,PolicyholderID,Sex,DOB,DOD,DeductiblePaid,ClaimPayment,ClaimStartDate,ClaimEndDate,ClaimDuration,...,Cancer,Depression,Arthritis,Stroke,ReimburseIP,ReimburseOP,DeductibleIP,DeductibleOP,ProbableFraud,ID
0,51001,56354,Female,1934-06-01,,0,100,2023-06-08,2023-06-08,0,...,0,0,0,0,0,320,0,80,0,1
1,51001,106078,Female,1942-12-01,,0,90,2023-06-16,2023-06-16,0,...,0,0,0,0,0,190,0,20,0,2
2,51001,137197,Male,1964-01-01,,0,1500,2023-04-22,2023-04-22,0,...,1,1,1,0,0,2560,0,480,0,3
3,51001,38773,Male,1952-05-01,,1068,12000,2023-05-23,2023-05-25,2,...,0,0,0,0,95000,2270,2136,900,0,4
4,51001,32715,Male,1950-03-01,,0,500,2023-03-29,2023-03-30,1,...,1,1,0,0,2020,6700,1068,2700,0,5


### Data Preprocessing

In [4]:
# Convert dates to datetime
health_data['DOB'] = pd.to_datetime(health_data['DOB'], errors='coerce')
health_data['ClaimStartDate'] = pd.to_datetime(health_data['ClaimStartDate'], errors='coerce')
health_data['ClaimEndDate'] = pd.to_datetime(health_data['ClaimEndDate'], errors='coerce')

# Calculate Age at Claim
health_data['AgeAtClaim'] = (health_data['ClaimStartDate'] - health_data['DOB']).dt.days / 365.25

# Extrat Year, Month and Day from collumn with dates
health_data['ClaimStartDate'] = pd.to_datetime(health_data['ClaimStartDate'], errors='coerce')
health_data['ClaimStartYear'] = health_data['ClaimStartDate'].dt.year
health_data['ClaimStartMonth'] = health_data['ClaimStartDate'].dt.month
health_data['ClaimStartDay'] = health_data['ClaimStartDate'].dt.day

health_data['ClaimEndDate'] = pd.to_datetime(health_data['ClaimEndDate'], errors='coerce')
health_data['ClaimEndYear'] = health_data['ClaimEndDate'].dt.year
health_data['ClaimEndMonth'] = health_data['ClaimEndDate'].dt.month
health_data['ClaimEndDay'] = health_data['ClaimEndDate'].dt.day


# Drop columns with high number of missing values and unused columns
columns_to_drop = ['DOD', 'OperatingID', 'OtherID', 'CodeProcedure', 'EnterDate', 'ExitDate', 'StayDuration', 'ID', 'ClaimStartDate', 'ClaimEndDate', 'DOB']
health_data.drop(columns=columns_to_drop, inplace=True)



# Encode categorical variables using label encoding
categorical_columns = ['ProviderID', 'PolicyholderID', 'Sex', 'AttendingID', 'State', 'CodeFirst', 'CodeSecond', 
                       'CodeThird', 'CodeHospital']

health_data = pd.get_dummies(health_data, columns=categorical_columns)

# Checking the dataset after preprocessing
health_data.head()

MemoryError: Unable to allocate 258. MiB for an array with shape (2430, 111418) and data type uint8

### Split train-test data

In [4]:
# Splitting the data into training and test sets
X = health_data.drop('ProbableFraud', axis=1)  # Features
y = health_data['ProbableFraud']  # Target variable

# Performing the split with a 80-20 ratio
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

### Build the Random Forest Model

In [None]:
# Initializing the Random Forest Classifier
rf_classifier = RandomForestClassifier(random_state=42)

# Training the model on the training data
rf_classifier.fit(X_train, y_train)

# Predicting on the test set
y_pred = rf_classifier.predict(X_test)

# Evaluating the model
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

# Model performance metrics
accuracy, precision, recall, f1

In [5]:
# Define the parameter grid
param_grid = {
    'n_estimators': [50, 100, 150],     # Number of trees in the forest
    'max_depth': [10, 20, None],        # Maximum depth of the tree
    'min_samples_split': [2, 5, 10],    # Minimum number of samples required to split an internal node
    'min_samples_leaf': [1, 2, 4],      # Minimum number of samples required to be at a leaf node
    'max_features': ['auto', 'sqrt']    # Number of features to consider at every split
}

# Initialize the classifier
rf = RandomForestClassifier(random_state=42)

# Use F1 score as the scoring metric
f1_scorer = make_scorer(f1_score)

# Set up GridSearchCV
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=3, n_jobs=-1, verbose=2, scoring=f1_scorer)

# Fit the grid search to the data
grid_search.fit(X_train, y_train)

# Print the best parameters and the corresponding F1 score
print("Best parameters:", grid_search.best_params_)
print("Best F1 score:", grid_search.best_score_)

Fitting 3 folds for each of 162 candidates, totalling 486 fits


243 fits failed out of a total of 486.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
97 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Python311\Lib\site-packages\sklearn\model_selection\_validation.py", line 729, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Python311\Lib\site-packages\sklearn\base.py", line 1145, in wrapper
    estimator._validate_params()
  File "c:\Python311\Lib\site-packages\sklearn\base.py", line 638, in _validate_params
    validate_parameter_constraints(
  File "c:\Python311\Lib\site-packages\sklearn\utils\_param_validation.py", line 96, in validate_parameter_constraints
    raise InvalidParameterError(
sklearn.utils._param_validation.Invalid

Best parameters: {'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 150}
Best F1 score: 0.6214279748243395
