### Problem definition and background

As defined by the Oxford dictionary, fraud is "wrongful or criminal deception intended to result in financial or personal gain". Even though fraud is not something new, the phenomenon has grown in size with the increase in internet transactions. When we say transactions we are not referring strictly to online payments, even though payment fraud is one of the most popular types of online fraud.

For this example we are using a dataset that contains insurance claims; each record indicates whether the claim was fraudulent or not.

#### Goals
- read data
- understand the data
- prepare data 
- use Machine Learning algorithms to detect fraudulent claims


#### Import necessary libraries to process the data

Details for loading libraries and reading the data at [Import and read data details]("C:\work\sources\mlcourse\notebooks\insurance\import_dataRead.md")


In [None]:
import pandas as pd
import numpy as np
from matplotlib.pyplot import figure
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay, RocCurveDisplay, roc_curve, auc
from sklearn.impute import SimpleImputer
from sklearn.impute import KNNImputer
from IPython.display import display

pd.set_option('display.max_columns', None) # by default the number of columns shown is truncated; this option enables the "show all" behaviour

#### Read dataset

In [None]:
# Read the data from a location on the disk (the path has no directory part,
# so the file is looked up in the current working directory).
# Dataset source: https://www.kaggle.com/datasets/incarnyx/car-insurance-fraud
dataset = pd.read_excel("car_insurance_fraud.xlsx")
dataset.head()


### Data exploration
The purpose of this step is to understand the data and its structure, in order to be able to modify it later to fit the purpose of the experiment. A lot of insights about the data can be obtained here.

The describe function gives an insight into the data by generating statistics on the numeric columns, such as count, average, min, max, etc.

In [None]:
# Summary statistics for every column that is not of object dtype
# (i.e. the numerical features), one row per feature
dataset.describe(exclude="object").T

In [None]:
# Summary statistics for the non-numerical features, one row per feature
dataset.describe(exclude="number").T

The info() function outputs information about the structure of the data, and the data types.

In [None]:
# Print the structure of the dataset and the data type of each column
dataset.info()

In [None]:
def createHistogramPlot(data):
    """Plot the histogram of each numerical feature in a separate subplot.

    The subplots are laid out in three columns; the ``-1`` in the layout
    leaves the number of rows to be derived from the number of features.
    """
    hist_options = {"bins": 25, "figsize": (30, 25), "layout": (-1, 3)}
    data.hist(**hist_options)
    plt.tight_layout()


createHistogramPlot(dataset)

In [None]:

# Plot each column against the row index in its own subplot, using markers
# only (lw=0 hides the connecting line)
dataset.plot(lw=0, marker="*", subplots=True, layout=(-1, 2),markersize=0.6, figsize=(15, 20))


In [None]:
# Work on a copy so the original dataset is left untouched
dataset_filtered= dataset.copy()
# Box plots for a handful of selected features
# NOTE(review): 'Sex' looks like a categorical column - confirm it is meant to be part of a box plot
dataset_filtered[['Age','DriverRating','RepNumber', 'Sex']].boxplot(figsize=(15, 20))

In [None]:
# Density of Age, drawn as one curve per value of the fraud flag
hue_feature = 'FraudFound_P'
plot_values = 'Age'
g = sns.FacetGrid(
    data=dataset,
    hue=hue_feature,
    height=7,
    aspect=2,
).map(sns.kdeplot, plot_values)
plt.title(plot_values +' distribution for fraud and no fraud claims')
plt.legend()
plt.show()

In [None]:
# Density of ClaimSize, drawn as one curve per value of the fraud flag
hue_feature = 'FraudFound_P'
plot_values = 'ClaimSize'
g = sns.FacetGrid(
    data=dataset,
    hue=hue_feature,
    height=7,
    aspect=2,
).map(sns.kdeplot, plot_values)
plt.title(plot_values +' distribution for fraud and no fraud claims')
plt.legend()
plt.show()

### Data preparation
- resolve issues with the data
- prepare data for model building

More details [Here](here)

In [None]:
## Search for null values in the data
# Work on a copy so the original dataset is left untouched.
dataset_nulls = dataset.copy()
null_values = dataset_nulls.isnull().sum()
print("Missing values in dataset:")
# Series.iteritems() was removed in pandas 2.0; Series.items() is the
# equivalent call and works on older versions as well.
for name, value in null_values.items():
    if value > 0:
        print(name, value)
# Show the full per-column count, reusing the values computed above.
display(null_values)

In [None]:
# Interpolation: compare three ways of filling the missing values.

dataset_interpolation = dataset_nulls.copy()
# Index labels of the rows where the value is missing
age_index = dataset_nulls[dataset_nulls['Age'].isnull()].index.tolist()
driver_index = dataset_nulls[dataset_nulls['DriverRating'].isnull()].index.tolist()

# Only numeric columns can be interpolated. Restricting the call to them avoids
# the "interpolate with object dtype" deprecation of recent pandas versions;
# the non-numeric columns are carried over as they are.
numeric_columns = dataset_interpolation.select_dtypes(include="number").columns

# Linear
ds_linear = dataset_interpolation.copy()
ds_linear[numeric_columns] = dataset_interpolation[numeric_columns].interpolate(method="linear")
# Polynomial (second order)
ds_poly = dataset_interpolation.copy()
ds_poly[numeric_columns] = dataset_interpolation[numeric_columns].interpolate(method="polynomial", order=2)
# Padding: carry the last valid value forward over at most 3 consecutive gaps.
# interpolate(method="pad") is deprecated; ffill() is the direct replacement.
ds_padding = dataset_interpolation.ffill(limit=3)

# The stored values are index labels, so look the rows up with .loc
# (.iloc only gives the same rows while the index is the default 0..n-1).
print("Age replacements")
for position in age_index:
    print(f"Original Value: {dataset_nulls.loc[position, 'Age']}  Linear: {ds_linear.loc[position, 'Age']} Poly: {ds_poly.loc[position, 'Age']} Padding: {ds_padding.loc[position, 'Age']}" )

print("Driver Rating replacements")
for position in driver_index:
    print(f"Original Value: {dataset_nulls.loc[position, 'DriverRating']}  Linear: {ds_linear.loc[position, 'DriverRating']} Poly: {ds_poly.loc[position, 'DriverRating']} Padding: {ds_padding.loc[position, 'DriverRating']}" )

In [None]:
# Imputation: replace the missing values with the mode, median or mean of the column.

dataframe_inputation = dataset_nulls.copy()
# Index labels of the rows where the value is missing
age_index = dataset_nulls[dataset_nulls['Age'].isnull()].index.tolist()
driver_index = dataset_nulls[dataset_nulls['DriverRating'].isnull()].index.tolist()

# The filled column is assigned back explicitly: calling
# df[col].fillna(..., inplace=True) acts on a temporary object and does not
# update the frame once pandas Copy-on-Write is active.

# Mode
ds_mode = dataframe_inputation.copy()
ds_mode['Age'] = ds_mode['Age'].fillna(ds_mode['Age'].mode()[0])
ds_mode['DriverRating'] = ds_mode['DriverRating'].fillna(ds_mode['DriverRating'].mode()[0])

# Median
ds_median = dataframe_inputation.copy()
ds_median['Age'] = ds_median['Age'].fillna(ds_median['Age'].median())
ds_median['DriverRating'] = ds_median['DriverRating'].fillna(ds_median['DriverRating'].median())

# Mean
ds_mean = dataframe_inputation.copy()
ds_mean['Age'] = ds_mean['Age'].fillna(ds_mean['Age'].mean())
ds_mean['DriverRating'] = ds_mean['DriverRating'].fillna(ds_mean['DriverRating'].mean())

# The stored values are index labels, so look the rows up with .loc
print("Age replacements")
for position in age_index:
    print(f"Original Value: {dataset_nulls.loc[position, 'Age']}  Mode: {ds_mode.loc[position, 'Age']} Median: {ds_median.loc[position, 'Age']} Mean: {ds_mean.loc[position, 'Age']}" )
print("----------------------------------------------------------")
print("Driver Rating replacements")
for position in driver_index:
    print(f"Original Value: {dataset_nulls.loc[position, 'DriverRating']}  Mode: {ds_mode.loc[position, 'DriverRating']} Median: {ds_median.loc[position, 'DriverRating']} Mean: {ds_mean.loc[position, 'DriverRating']}" )


In [None]:
# Scikit-learn imputation: SimpleImputer on the whole frame, KNNImputer on the
# two numeric columns that contain missing values.
dataframe_scikit_imputation = dataset_nulls.copy()
# Index labels of the rows where the value is missing
age_index = dataset_nulls[dataset_nulls['Age'].isnull()].index.tolist()
driver_index = dataset_nulls[dataset_nulls['DriverRating'].isnull()].index.tolist()

# np.nan is used instead of np.NaN: the upper-case alias was removed in NumPy 2.0.
imputer_simple = SimpleImputer(missing_values=np.nan, strategy='most_frequent')  # possible values for strategy are mean, median, most_frequent and constant
simple_imputer_arr = imputer_simple.fit_transform(dataframe_scikit_imputation)

imputer_knn = KNNImputer(n_neighbors=2)
knn_arr = imputer_knn.fit_transform(dataframe_scikit_imputation[['Age', 'DriverRating']])

# The imputers return plain arrays; rebuild the frames with the original index
# so that the rows can still be looked up by the labels collected above.
ds_simple_imputer = pd.DataFrame(simple_imputer_arr,
                                 columns=dataframe_scikit_imputation.columns,
                                 index=dataframe_scikit_imputation.index)
ds_knn = pd.DataFrame(knn_arr,
                      columns=['Age', 'DriverRating'],
                      index=dataframe_scikit_imputation.index)

print("Age replacements")
for position in age_index:
    print(f"Original Value: {dataframe_scikit_imputation.loc[position, 'Age']}  Simple Imputer: {ds_simple_imputer.loc[position, 'Age']} KNN: {ds_knn.loc[position, 'Age']}" )
print("----------------------------------------------------------")
print("Driver Rating replacements")
for position in driver_index:
    print(f"Original Value: {dataframe_scikit_imputation.loc[position, 'DriverRating']}  Simple Imputer: {ds_simple_imputer.loc[position, 'DriverRating']} KNN: {ds_knn.loc[position, 'DriverRating']}")



In [None]:
# Drop the records that have null values (modifies dataset_nulls in place)
dataset_nulls.dropna(inplace=True)
# Count the null values left in each column after the drop
dataset_nulls.isnull().sum() 

In [None]:
#List number of uniques values for each columns
dataset_nulls.nunique()


In [None]:
# List the possible values for MonthClaimed
dataset_unique = dataset_nulls.copy()
feature = "MonthClaimed"
display("Posible values for feature field: " + feature)
display(dataset_unique[feature].unique())
# Show the records where the feature is equal to 0
display((dataset_unique[dataset_unique[feature] == 0]))


In [None]:
# Remove the records where MonthClaimed is equal to 0
# (relies on `feature` still being "MonthClaimed", as set in the previous cell)
dataset_unique.drop((dataset_unique[dataset_unique[feature] == 0]).index, inplace=True)
display(f"Number of uniques values for field {feature} is {(dataset_unique[feature]).nunique()}")

In [None]:
# Prepare the dataset for exploratory data analysis and model building:
# text categories that have a natural order are mapped to integers.
moths_array = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep','Oct', 'Nov', 'Dec']
week_array =['Monday','Tuesday','Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
number_claimed = ['none', '1', '2 to 4', 'more than 4']

month_codes = dict(zip(moths_array, range(1, 13)))
weekday_codes = dict(zip(week_array, range(1, 8)))
claims_codes = dict(zip(number_claimed, range(4)))

ordinal_encodings = {
    'Month': month_codes,
    'MonthClaimed': month_codes,
    'DayOfWeek': weekday_codes,
    'DayOfWeekClaimed': weekday_codes,
    'PastNumberOfClaims': claims_codes,
}

transformed_data = dataset_unique.copy()
for column, codes in ordinal_encodings.items():
    # Assign the result back instead of calling replace(..., inplace=True) on
    # the selected column: the chained in-place form acts on a temporary
    # object and does not update the frame once pandas Copy-on-Write is active.
    # infer_objects() turns the replaced column into an integer column when
    # every value was mapped, so the later "dtypes == object" checks do not
    # treat it as categorical.
    transformed_data[column] = transformed_data[column].replace(codes).infer_objects()
transformed_data.head()

In [None]:
# Use Label Encoder to convert all non-numerical values
def createColumnLE(data_column, xdata):
    """Label-encode column ``data_column`` of ``xdata`` in place.

    Uses the notebook-level ``labelEncoder`` instance, which is re-fitted on
    every call.
    """
    print("conveting column " + data_column)
    encoded_values = labelEncoder.fit_transform(xdata[data_column])
    xdata[data_column] = encoded_values


labelEncoder = LabelEncoder()
dataset_label_encoded = transformed_data.copy()

# Every column that is still of object dtype gets encoded
is_object_column = dataset_label_encoded.dtypes == object
catergorical_columns = dataset_label_encoded.columns[is_object_column]
for col in catergorical_columns:
    createColumnLE(col, dataset_label_encoded)


dataset_label_encoded.head()

In [None]:
# Histograms of the features after label encoding
createHistogramPlot(dataset_label_encoded)

In [None]:

# Pearson correlation between every pair of label-encoded features
feature_corelation = dataset_label_encoded.corr(method="pearson")
display(feature_corelation.head())

# Cell annotations by absolute correlation: "P" above 0.75, "M" above 0.5,
# "s" above 0.25, nothing otherwise (the first matching threshold wins)
abs_corelation = feature_corelation.abs().to_numpy()
indicatives = np.select(
    [abs_corelation > 0.75, abs_corelation > 0.5, abs_corelation > 0.25],
    ["P", "M", "s"],
    default="",
)

plt.figure(figsize=(15, 15))
# The identity mask hides the diagonal (each feature against itself)
sns.heatmap(
    feature_corelation,
    mask=np.eye(len(feature_corelation)),
    square=True,
    center=0,
    fmt='',
    annot=indicatives,
    linewidths=.5,
    cmap="vlag",
    cbar_kws={"shrink": 0.8},
);

In [None]:

# Work on a copy so transformed_data is left untouched
dds = transformed_data.copy()
def create_dummy_dataframe(df):
    """Return a new frame in which every object-dtype column is one-hot encoded.

    Columns that are not of object dtype are kept as they are and come first.
    Each object column is replaced by its dummy columns, named
    ``"<column>: <value>"``; the first level of every column is dropped
    (``drop_first=True``). The input frame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to encode.

    Returns
    -------
    pandas.DataFrame
        The non-object columns followed by the dummy columns.
    """
    non_numeric_columns = df.columns[df.dtypes == object]

    # Collect the pieces and concatenate once, rather than growing a frame
    # with a new pd.concat call for every column.
    frames = [df.drop(columns=non_numeric_columns)]
    for col in non_numeric_columns:
        column_dataframe = pd.get_dummies(df[col], drop_first=True)
        column_dataframe.columns = [str(col) + ': ' + str(name) for name in column_dataframe.columns]
        frames.append(column_dataframe)
    return pd.concat(frames, axis=1)

# One-hot encode the remaining object columns and inspect the result
dummy_dataframe = create_dummy_dataframe(dds)
dummy_dataframe.info()
dummy_dataframe.head()

#### Various information about the dataset

In [None]:
# Scalers: compare the effect of MinMax, Standard and Robust scaling on ClaimSize.
from sklearn.decomposition import PCA, TruncatedSVD, NMF, KernelPCA
from sklearn.preprocessing import MinMaxScaler,StandardScaler, RobustScaler

pca_mms = dummy_dataframe.copy()

# Separate the features from the target so that the target is not scaled
scaller_df = pca_mms.drop('FraudFound_P', axis = 1)
scaller_y_v = pca_mms['FraudFound_P'].values

columns = scaller_df.columns
minmax_scaler = MinMaxScaler()
minmax_X_sc = minmax_scaler.fit_transform(scaller_df)
minmax_df= pd.DataFrame(minmax_X_sc, columns=columns)

standard_scaler = StandardScaler()
standard_X_sc = standard_scaler.fit_transform(scaller_df)
standard_df= pd.DataFrame(standard_X_sc, columns=columns)

robust_scaler = RobustScaler()
robust_X_sc = robust_scaler.fit_transform(scaller_df)
robust_df= pd.DataFrame(robust_X_sc, columns=columns)

fig, (default, minmax, standard, robust) = plt.subplots(ncols = 4, figsize =(20, 5))

# Each axis plots the frame produced by the scaler named in its title
# (previously the MinMax and Standard axes both showed the robust-scaled data
# and the Robust axis showed the standard-scaled data).
sns.kdeplot(dataset_label_encoded["ClaimSize"], ax = default, color='red')
default.set_title('Before scaling')

sns.kdeplot(minmax_df['ClaimSize'], ax = minmax, color ='green')
minmax.set_title('MinMax Scaller')

sns.kdeplot(standard_df['ClaimSize'], ax = standard, color ='blue')
standard.set_title('Standard Scaller')

sns.kdeplot(robust_df['ClaimSize'], ax = robust, color ='black')
robust.set_title('Robust Scaller')

plt.show()

In [None]:
# Reduce the MinMax-scaled features to two principal components
pca = PCA(n_components=2, random_state=1)
df_pca = pca.fit_transform(minmax_X_sc)

# Scatter the two components, coloured by the target value
df_vis = pd.DataFrame(df_pca).assign(y=scaller_y_v)
plt.figure(figsize=(12, 8))
sns.scatterplot(data=df_vis, x=0, y=1, hue='y')
plt.show()
df_pca

### Model building

In [None]:
# Start from the MinMax-scaled features and put the target column back
model_building = minmax_df.copy()
model_building['FraudFound_P']=scaller_y_v

# Working copy used by the following cells
mb = model_building.copy()


In [None]:

# Split data
# First hold out 30% of the records as a validation set ...
training_data, validation_data = train_test_split(mb,
                                                  test_size=0.3,
                                                  random_state=101)

# ... then split the remaining records again into the train and test sets
# that the models below are fitted and evaluated on.
X_train, X_test, y_train, y_test = train_test_split(training_data.drop('FraudFound_P', axis = 1),
                                                    training_data['FraudFound_P'],
                                                    test_size=0.3,
                                                    random_state=101)

# Report the size of the sets that are actually used: the models are fitted on
# X_train, which is only a part of training_data.
print(f"Number of examples used for training : {X_train.shape[0]}")
print(f"Number of examples used for testing : {X_test.shape[0]}")
print(f"Number of examples used for validation: {validation_data.shape[0]}")

In [None]:
# Model building: logistic regression.
# class_weight="balanced" weights the classes by their frequency
# (presumably chosen because fraud cases are the minority class - confirm).
lgmodel = LogisticRegression(class_weight="balanced",
    n_jobs = -1,
    random_state = 101)
lgmodel.fit(X_train, y_train)

In [None]:
# Pair every feature with its logistic-regression coefficient and order the
# features from the largest to the smallest coefficient
feature_importance = pd.DataFrame(
    {"feature": X_train.columns, "importance": lgmodel.coef_[0]}
).sort_values(by="importance", ascending=False)

plt.figure(figsize=(15, 25))
sns.barplot(x=feature_importance["importance"], y=feature_importance["feature"])
plt.show()

In [None]:
def outputMetrics(model, prediction, test_y):
    """Print and plot the evaluation metrics of a fitted binary classifier.

    Outputs the classification report, the confusion matrix (as a table and
    as a plot), the ROC curve and, for models that expose
    ``feature_importances_``, the 20 most important features.

    Parameters
    ----------
    model : fitted estimator
        Only inspected for ``feature_importances_`` / ``feature_names_in_``.
    prediction : array-like
        Predicted labels for the evaluated samples (0 = not fraud, 1 = fraud).
    test_y : array-like
        True labels of the same samples.
    """
    print(classification_report(test_y, prediction, target_names=['Not Fraud', 'Fraud']))
    display(pd.DataFrame(confusion_matrix(test_y, prediction),
                         columns=['Predicted Not Fraud', 'Predicted Fraud'],
                         index=['Not Fraud', 'Fraud']))
    # Plot against the labels that were passed in, not the notebook-level
    # y_test, so the function is correct for any evaluation set.
    ConfusionMatrixDisplay.from_predictions(test_y, prediction, labels=[0, 1])
    # NOTE: `prediction` holds hard class labels, so this ROC curve has a
    # single operating point; pass scores/probabilities for a full curve.
    RocCurveDisplay.from_predictions(test_y, prediction)
    if hasattr(model, 'feature_importances_'):
        # Prefer the feature names stored on the fitted model; fall back to the
        # notebook-level X_test for estimators that do not record them.
        feature_names = getattr(model, 'feature_names_in_', None)
        if feature_names is None:
            feature_names = X_test.columns
        featureI = pd.DataFrame({
            'Variable': feature_names,
            'Importance': model.feature_importances_,
        })
        featureI = featureI.sort_values('Importance', ascending=False)
        display(featureI.head(20))

In [None]:

# Make predictions using the testing set and report the metrics of the
# logistic regression model
prediction = lgmodel.predict(X_test)
outputMetrics(lgmodel, prediction, y_test)

In [None]:
# Decision tree classifier: fit on the training set, predict on the testing
# set and report the metrics
dtc = DecisionTreeClassifier(random_state = 101)
dtc.fit(X_train, y_train)
prediction = dtc.predict(X_test)
outputMetrics(dtc,prediction, y_test)


In [None]:
# XGBoost classifier: fit on the training set, predict on the testing set and
# report the metrics.
# scale_pos_weight = 20 gives the positive class a larger weight
# (presumably to compensate for the rarity of fraud cases - confirm the ratio).
# NOTE(review): use_label_encoder looks deprecated in recent xgboost releases -
# confirm against the installed version.
xgbr = XGBClassifier(
    random_state = 1,
    n_jobs = -1,
    scale_pos_weight = 20,
    use_label_encoder=False,
    eval_metric = 'logloss'
)
xgbr.fit(X_train, y_train)
prediction = xgbr.predict(X_test)

outputMetrics(xgbr, prediction, y_test)