#### Student name: Eshaq Rahmani
#### Student number: 22086790


# Data Exploration

## 1. Get data

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split

# Load the coursework CSV into a DataFrame and discard the 'id' column,
# which is not used as a feature for the ML model.
df = pd.read_csv('CW1_data_202223.csv').drop(columns='id')

# Split into the 'Result' target and the remaining feature columns
y = df['Result']
X = df.drop(columns='Result')

# Preview the first ten rows
df.head(10)


## 2. Clean AnnualPremium

In [None]:
# Clean the AnnualPremium column
# Define a custom function to parse the string to a float
def parse_amount(s):
    """Convert a currency-formatted value such as ``'£1,234.50'`` to a float.

    Parameters
    ----------
    s : str, int, float or None
        Raw cell value. Strings may carry surrounding whitespace, a leading
        pound sign and thousands separators.

    Returns
    -------
    float
        The parsed amount, or NaN when the value is missing (``None``, NaN or
        a blank string).

    Raises
    ------
    ValueError
        If the cleaned text is not a valid number.
    """
    # Missing cells reach this function as None/NaN rather than str; return
    # NaN instead of failing on .strip() so they can be handled downstream.
    if s is None:
        return float("nan")
    if isinstance(s, (int, float)):
        return float(s)

    # Drop surrounding whitespace, the currency symbol and thousands separators
    cleaned = str(s).strip().replace("£", "").replace(",", "").strip()
    if not cleaned:
        return float("nan")

    return float(cleaned)

# Convert every 'AnnualPremium' entry from currency text to a float
df['AnnualPremium'] = df['AnnualPremium'].map(parse_amount)
df.head(5)

## 3. Data description

### 3.1 Data info

In [None]:
# Print a summary of the dataframe: column names, non-null counts and dtypes
df.info()


### 3.2 Missing values exploration

In [None]:
# Count the missing (NaN) entries in each column
missing_values = df.isna().sum()
print(missing_values)

### 3.3 Data describe - mean, std, quantiles

In [None]:
# Show summary statistics (count, mean, std, min, quartiles, max);
# by default pandas reports these for the numeric columns
df.describe()

## 4. Count plots

In [None]:
### Count plots for each CATEGORICAL feature and target variable ###
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objs as go
import plotly.io as pio

# Use plotly's built-in "seaborn" template as the default theme
pio.templates.default = "seaborn"


def _count_bar(frame, column, categories, tick_labels=None):
    """Return a bar trace of category counts labelled with relative percentages.

    Parameters
    ----------
    frame : pandas.DataFrame
        Data containing ``column``.
    column : str
        Name of the column to count; also used as the trace name.
    categories : list
        Category values to count, in the order the bars should appear.
    tick_labels : list, optional
        Values shown on the x axis; defaults to ``categories``.

    Returns
    -------
    plotly.graph_objs.Bar
        One bar per category. Percentages are relative to the rows that match
        one of ``categories``, so missing values are left out of the total.
    """
    counts = [(frame[column] == category).sum() for category in categories]
    total = sum(counts)
    return go.Bar(
        x=categories if tick_labels is None else tick_labels,
        y=[int(count) for count in counts],
        name=column,
        text=[str(round(count / total * 100, 2)) + '%' for count in counts],
        textposition='auto',
    )


# Make subplots to align plots
fig = make_subplots(rows=3, cols=3, subplot_titles=("Gender", "HasDrivingLicense", "PastAccident ",
                                                    "Switch", 'VehicleAge' , 'Result' ,
                                                    'SalesChannelID', 'RegionID'))

# One trace per subplot, in the same order as the subplot titles
traces = [
    _count_bar(df, 'Gender', ['Male', 'Female']),
    _count_bar(df, 'HasDrivingLicense', [1, 0], tick_labels=['1', '0']),
    _count_bar(df, 'PastAccident', ['Yes', 'No']),
    _count_bar(df, 'Switch', [1, 0], tick_labels=['1', '0']),
    _count_bar(df, 'VehicleAge', ['< 1 Year', '1-2 Year', '> 2 Years']),
    _count_bar(df, 'Result', [1, 0]),
    go.Histogram(x=df['SalesChannelID'], name='SalesChannelID'),
    go.Histogram(x=df['RegionID'], name='RegionID'),
]

# Fill the 3x3 grid row by row (plotly rows/cols are 1-based).
# add_trace replaces the deprecated append_trace.
for position, trace in enumerate(traces):
    fig.add_trace(trace, row=position // 3 + 1, col=position % 3 + 1)

fig.update_layout(
    title_text='Count Plots',
    height=800,
    width=1000,
    showlegend=False)


fig.show()

## 5. Pie charts

In [None]:
### Pie chart for each feature to check ratios ###

# Columns whose value proportions are charted
columns_to_include = ['Gender', 'Switch', 'HasDrivingLicense', 'VehicleAge', 'PastAccident', 'Result']

# Keep the dataframe's own column order, restricted to the columns above
column_names = df.columns
filtered_column_names = [name for name in column_names if name in columns_to_include]

plt.figure(figsize=(15, 15))

# One pie per column on a 3x3 grid (subplot slots are 1-based)
for slot, column_name in enumerate(filtered_column_names, start=1):
    # Frequency of each distinct value in the column
    counts = df[column_name].value_counts()

    plt.subplot(3, 3, slot)
    plt.pie(counts.values, labels=counts.index, autopct='%1.1f%%')
    plt.title(column_name)

plt.show()



## 7. Count plots - Numerical

In [None]:
### Count plots for each NUMERICAL feature and target variable ###

# Numerical columns to plot; the same names serve as subplot titles
numeric_columns = ("Age", "AnnualPremium", "DaysSinceCreated")

fig = make_subplots(rows=2, cols=2, subplot_titles=numeric_columns)

# One histogram per numerical column
traces = [go.Histogram(x=df[column], name=column) for column in numeric_columns]

# Fill the 2x2 grid row by row (plotly rows/cols are 1-based).
# add_trace replaces the deprecated append_trace.
for position, trace in enumerate(traces):
    fig.add_trace(trace, row=position // 2 + 1, col=position % 2 + 1)

fig.update_layout(
    title_text='Numerical Feature Distribution',
    height=800,
    width=1000,
    showlegend=False
)

fig.show()



## 8. Box Plots

In [None]:
### Box plot to check for outliers ###

import plotly.express as px
import plotly.graph_objs as go

# (column, row, col) placement of each box plot on the 2x2 grid
box_layout = [
    ("AnnualPremium", 1, 1),
    ("Age", 1, 2),
    ("DaysSinceCreated", 2, 1),
]

# Make subplots to align plots
fig = make_subplots(rows=2, cols=2, subplot_titles=("AnnualPremium", "Age", "DaysSinceCreated"))

for box_column, grid_row, grid_col in box_layout:
    # Build the box with plotly express and reuse its single trace in the grid
    box_figure = px.box(df, x=box_column, labels={box_column: box_column})
    fig.add_trace(box_figure["data"][0], row=grid_row, col=grid_col)

fig.update_layout(height=600, width=800)

fig.show()

### 8.1 Outlier count

In [None]:
### Finding number of outliers ###
# Draw a matplotlib box plot of AnnualPremium and count the points it
# marks as fliers (the points drawn beyond the whiskers)
_, ax = plt.subplots()
bp = ax.boxplot(df["AnnualPremium"])

# A single box was drawn, so the first flier line holds all of its outliers
outliers = [flier_line.get_ydata() for flier_line in bp["fliers"]]
first_box_outliers = outliers[0]
outlier_count = len(first_box_outliers)

print(f"Number of outliers: {outlier_count}")

## 9. Feature vs Result plot

Change the feature name in the cell below to analyse a different feature against Result.

In [None]:
# Feature plotted against the target; change this name to analyse another column
feature = "Age"

# Histogram of the chosen feature, coloured by 'Result'. The title is built
# from `feature` so it always names the column that is actually plotted.
fig = px.histogram(df, feature, color='Result', title=f'{feature} Vs Result', width=600, height=400)
fig.show()

# Data pre-processing

## 2. Imputation

### 2.1 Imputation with 'Missing', forward fill and mode

In [None]:
# from sklearn.impute import SimpleImputer

# # Mapping Switch column where NaN is mapped 'Missing'
# df['Switch'] = df['Switch'].map(lambda x: 'Missing' if pd.isnull(x) else x)

# # Mapping PastAccident column where NaN is mapped 'Missing'
# df['PastAccident'] = df['PastAccident'].map({'Yes': 1, 'No': 0, np.nan: 'Missing'})

# # Forward fill the values in the "age" column
# df["Age"].ffill(inplace=True)

# # Gender mode imputation
# imputer = SimpleImputer(strategy='most_frequent') # Create an instance of the SimpleImputer class
# # Fit the imputer to the Gender column
# imputer.fit(df[['Gender']])
# # Transform the Gender column using the fitted imputer
# df['Gender'] = imputer.transform(df[['Gender']])


# # HasDrivingLicense mode imputation
# imputer = SimpleImputer(strategy='most_frequent') # Create an instance of the SimpleImputer class
# # Fit the imputer to the HasDrivingLicense column
# imputer.fit(df[['HasDrivingLicense']])
# # Transform the HasDrivingLicense column using the fitted imputer
# df['HasDrivingLicense'] = imputer.transform(df[['HasDrivingLicense']])


# # Forward fill the values in the "VehicleAge" column
# df["VehicleAge"].ffill(inplace=True)
# # Forward fill the values in the "RegionID" column
# df["RegionID"].ffill(inplace=True)
# df

### 2.2 Imputation with IterativeImputer (Bayesian estimator) - Best performance on Models!

In [None]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

# Encoding must be done before IterativeImputer is used,
# as it cannot handle string inputs

## Encoding

# Mapping for PastAccident
mapping1 = {'Yes': 1, 'No': 0}
df['PastAccident'] = df['PastAccident'].map(mapping1)
# Mapping for Gender
mapping2 = {'Male': 1, 'Female': 0}
df['Gender'] = df['Gender'].map(mapping2)

# Mapping for VehicleAge
df['VehicleAge'] = df['VehicleAge'].map({'< 1 Year': 0, '1-2 Year': 1, '> 2 Years': 2})

# Remember each column's observed range (NaN is skipped) before imputing
observed_min = df.min()
observed_max = df.max()

# Create the imputer object
# NOTE(review): the imputer is fitted on the whole frame, including 'Result'
# and rows that later land in the test split — confirm this is intended.
imputer = IterativeImputer(max_iter=10, random_state=0)

# Impute the missing values; the result comes back as a numpy array
df_imputed = pd.DataFrame(imputer.fit_transform(df), columns=df.columns, index=df.index)

# Round to whole numbers, then clip to the observed range. The regression-based
# estimates are unbounded, so without the clip an encoded column could receive
# a value outside its valid codes (e.g. Gender = 2 or a negative ID).
df = df_imputed.round().clip(lower=observed_min, upper=observed_max, axis=1)
df

### 2.3 Imputation with KNNImputer

In [None]:
# from sklearn.impute import KNNImputer

# # Create a KNNImputer object
# imputer = KNNImputer()

# # Impute the missing values
# df_imputed = imputer.fit_transform(df)

# # The imputed dataset is returned as a numpy array, so you can convert it back to a Pandas DataFrame if needed
# df = pd.DataFrame(df_imputed, columns=df.columns)
# df

### 2.4 Checking distributions after imputation for evaluation

In [None]:
### Pie chart for each feature to check ratios ###

# Columns whose value proportions are charted
columns_to_include = ['Gender', 'Switch', 'VehicleAge', 'PastAccident', 'Result']

# Keep the dataframe's own column order, restricted to the columns above
column_names = df.columns
filtered_column_names = [name for name in column_names if name in columns_to_include]

plt.figure(figsize=(15, 15))

# One pie per column on a 3x3 grid (subplot slots are 1-based)
for slot, column_name in enumerate(filtered_column_names, start=1):
    # Frequency of each distinct value in the column
    counts = df[column_name].value_counts()

    plt.subplot(3, 3, slot)
    plt.pie(counts.values, labels=counts.index, autopct='%1.1f%%')
    plt.title(column_name)

plt.show()


In [None]:
# Histogram of Age coloured by the target variable 'Result'
fig = px.histogram(
    df,
    "Age",
    color='Result',
    nbins=100,
    title='Age & Result distribution',  # fixed the misspelling 'ditribution'
    width=700,
    height=500
)

fig.show()

## 3. Remove outliers and Feature Scaling

### 3.1 Remove Outliers

In [None]:
# Check for outliers after imputation

# Figure with two side-by-side axes, one per box plot
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(21, 5))
ax1, ax2 = axes

# Left: AnnualPremium
sns.boxplot(x=df["AnnualPremium"], ax=ax1)

# Right: Age
sns.boxplot(x=df["Age"], ax=ax2)

In [None]:
### 2. Removing Outliers ####

def _drop_iqr_outliers(frame, column, whisker=1.5):
    """Return a copy of ``frame`` without the IQR outliers of ``column``.

    A row is an outlier when its value lies below ``Q1 - whisker * IQR`` or
    above ``Q3 + whisker * IQR``, with the quartiles taken from ``frame``.
    """
    q1 = frame[column].quantile(0.25)
    q3 = frame[column].quantile(0.75)
    iqr = q3 - q1
    is_outlier = (frame[column] < (q1 - whisker * iqr)) | (frame[column] > (q3 + whisker * iqr))
    # .copy() returns an independent frame, so later column assignments on the
    # result do not raise SettingWithCopyWarning
    return frame[~is_outlier].copy()


# Filter AnnualPremium first, then Age; the Age quartiles are therefore
# computed on the rows that survived the AnnualPremium filter
for column in ('AnnualPremium', 'Age'):
    df = _drop_iqr_outliers(df, column)

## Check outliers ##
# Re-draw the AnnualPremium box plot on the filtered data and count its fliers
_, ax = plt.subplots()

bp = ax.boxplot(df["AnnualPremium"])
outliers = [f.get_ydata() for f in bp["fliers"]]
outlier_count = len(outliers[0])

print(f"Number of outliers: {outlier_count}")


### 3.2 Feature scaling numerical variables

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Numerical columns rescaled to the [0, 1] range
scaled_columns = ['AnnualPremium', 'Age']

# Fit one scaler on both columns. MinMaxScaler scales each column
# independently, so the values match separate fits, and `scaler` keeps the
# parameters of both columns instead of being overwritten by the second fit.
# NOTE(review): the scaler is fitted before the train/test split, so test rows
# influence the min/max — confirm this is acceptable.
scaler = MinMaxScaler()
df[scaled_columns] = scaler.fit_transform(df[scaled_columns])

# # Fit and transform the SalesChannelID column
# df['SalesChannelID'] = scaler.fit_transform(df[['SalesChannelID']])

df

## 4. Encoding Features

### 4.1 Encoding: Ordinal Encode "VehicleAge"

In [None]:
# Ordinal-encode VehicleAge using a mapping
vehicle_age_codes = {'< 1 Year': 0, '1-2 Year': 1, '> 2 Years': 2}

# The IterativeImputer cell already maps VehicleAge to 0/1/2. Mapping numeric
# codes through the string-keyed dict again would turn every value into NaN,
# so only encode when the column still holds the raw string labels.
if not pd.api.types.is_numeric_dtype(df['VehicleAge']):
    df['VehicleAge'] = df['VehicleAge'].map(vehicle_age_codes)

df


### 4.2 One-hot encode categorical values

In [None]:
# One-hot encode the categorical columns in one pass; the dummy columns are
# appended in the order the source columns are listed here
one_hot_columns = ['Gender', 'Switch', 'PastAccident']
df = pd.get_dummies(df, columns=one_hot_columns)
df

## 5. Feature Selection

### 5.1 Chi squared test for categorical features

In [None]:
from sklearn.feature_selection import chi2

# Target variable
y = df['Result']

# Features under test: everything except the target and the numerical columns
X = df.drop(columns=['Result', 'Age', 'AnnualPremium', 'DaysSinceCreated'])

# Chi-squared statistic and p-value of each feature against the target
chi2_scores, p_values = chi2(X, y)

# (feature, score, p-value) rows, ordered from the highest score to the lowest
feature_scores = sorted(
    zip(X.columns, chi2_scores, p_values),
    key=lambda row: row[1],
    reverse=True,
)

for feature, score, p_value in feature_scores:
    print(f"{feature}: ----Chi-score: {score:.2f} ---- p-value: {p_value:.2f}")




### 5.2 Pearson Correlation for numerical features

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Pearson correlation matrix of the numerical features
numeric_features = ['Age', 'AnnualPremium', 'DaysSinceCreated']
corr = df[numeric_features].corr(method='pearson', numeric_only=True)

# Draw the matrix as an annotated heatmap on a small figure
plt.figure(figsize=(5, 3))
sns.heatmap(corr, annot=True, xticklabels=corr.columns, yticklabels=corr.columns)

plt.show()

### 5.3 Feature selection: dropping irrelevant features based on Chi and Pearson

In [None]:
# Drop DaysSinceCreated, and HasDrivingLicense because the majority of its
# values are identical
# df = df.drop('RegionID', axis=1)
df = df.drop(columns=['DaysSinceCreated', 'HasDrivingLicense'])

df

In [None]:
# Display the dataframe after the feature-selection drops
df

## 6. Handling imbalanced data

###  6.0 Splitting data into train and test set

In [None]:
from sklearn.model_selection import train_test_split

# Target is 'Result'; the features are every other column
y = df['Result']
X = df.drop(columns='Result')

# 70/30 split with a fixed seed, stratified on the target
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=0,
    stratify=y,
)

X_train

### 6.2 SMOTEENN

In [None]:
import pandas as pd
from imblearn.combine import SMOTEENN

# SMOTEENN resampler with a fixed seed
smote_enn = SMOTEENN(random_state=0)

# Resample the training split only; the test split is left untouched
resampled = smote_enn.fit_resample(X_train, y_train)
X_sampledEEN, y_sampledEEN = resampled

# Class counts after resampling
print(y_sampledEEN.value_counts())

### 6.3 SMOTENC

In [None]:
### 6. Handling imbalance --- SMOTENC ###

from imblearn.over_sampling import SMOTENC

# X_train holds both numeric and categorical features

# Continuous columns; every other column of X_train is treated as categorical.
# NOTE(review): the previous hard-coded num_idx = [0, 3] presumably referred to
# these two columns — confirm against X_train.columns.
numeric_columns = ['Age', 'AnnualPremium']

# Derive the positions from the column names so they stay correct if the
# column order or the number of dummy columns changes
cat_idx = [i for i, name in enumerate(X_train.columns) if name not in numeric_columns]
num_idx = [i for i, name in enumerate(X_train.columns) if name in numeric_columns]

# Oversample the minority class using SMOTENC
smotenc = SMOTENC(categorical_features=cat_idx, random_state=42)
X_resampledSMOTENC, y_resampledSMOTENC = smotenc.fit_resample(X_train, y_train)

# Class counts after resampling
print(y_resampledSMOTENC.value_counts())

# Model Implementation

## 1. XGBoost

### 1.1 Hyperparameter Optimization using RandomizedSearchCV

In [None]:
import xgboost as xgb
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import RepeatedStratifiedKFold
import random

# Candidate values the randomized search samples from
parameters = {
    'max_depth': [3, 5, 7, 10],
    'learning_rate': [0.1, 0.5, 1],
    'subsample': [0.5, 0.8, 1.0],
    'colsample_bytree': [0.5, 0.8, 1.0],
    'reg_alpha': [0, 0.1, 0.5, 1],
    'reg_lambda': [0, 0.1, 0.5, 1],
    'n_estimators': [10, 20],
}

# 5-fold stratified cross-validation, repeated 3 times
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=1)

# XGBoost classifier with default settings as the base estimator
xgb_model = xgb.XGBClassifier()

# Sample 10 parameter combinations and score each by F1
random_search = RandomizedSearchCV(
    estimator=xgb_model,
    param_distributions=parameters,
    n_iter=10,
    scoring='f1',
    cv=cv,
    random_state=42,
)

random_search.fit(X_train, y_train)

# Best parameter combination found
print(random_search.best_params_)

### 1.2 Model implementation

In [None]:
import xgboost as xgb
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_score

# Class imbalance ratio: negatives per positive in the training split
negative_count = int((y_train == 0).sum())
positive_count = int((y_train == 1).sum())
ratio = negative_count / positive_count

# XGBoost classifier; scale_pos_weight weights the positive class by the ratio
clf = xgb.XGBClassifier(
    n_estimators=10,
    max_depth=7,
    learning_rate=0.1,
    subsample=0.1,
    colsample_bytree=0.8,
    reg_alpha=1,
    reg_lambda=0.1,
    scale_pos_weight=ratio,
)

# Fit on the training split and predict labels for the test split
clf.fit(X_train, y_train)
y_predict = clf.predict(X_test)

# 5-fold cross-validated ROC-AUC on the training split
scores = cross_val_score(clf, X_train, y_train, cv=5, scoring='roc_auc')
print('ROC_AUC Scores: ' + str(scores))
print(f'Mean score: {scores.mean():.2f}')

# Test-split accuracy and per-class precision/recall/F1
accuracy = accuracy_score(y_test, y_predict)
print(f'Accuracy: {accuracy:.2f}')
classification_random = classification_report(y_test, y_predict)
print(classification_random)



#(subsample=1.0, reg_lambda=0.1, reg_alpha=1,
                        #max_depth=7, learning_rate=0.1, colsample_bytree=0.8,
                        #n_estimators=3, scale_pos_weight = ratio) -- BEST PERFORMANCE


#{'subsample': 0.5, 'reg_lambda': 0.1, 'reg_alpha': 1,
#'n_estimators': 10, 'max_depth': 10, 'learning_rate': 1, 'colsample_bytree': 0.5}

### 1.3 Performance Evaluation

In [None]:
from sklearn.metrics import precision_recall_curve, auc
import matplotlib.pyplot as plt
from sklearn.metrics import cohen_kappa_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.metrics import confusion_matrix

# Precision-recall and ROC curves are traced by sweeping a decision threshold,
# so they need a continuous score per sample. Hard 0/1 predictions collapse
# each curve to a single operating point and distort both AUC values.
# Column 1 of predict_proba is the probability of the second class in
# clf.classes_ (assumed to be the positive class, Result == 1).
y_score = clf.predict_proba(X_test)[:, 1]

# Precision-recall curve and the area under it
precision, recall, thresholds = precision_recall_curve(y_test, y_score)
pr_auc = auc(recall, precision)

# Plot the precision-recall curve
plt.plot(recall, precision, label='Precision-Recall curve')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Precision-Recall curve')
plt.show()

# Print the AUC
print('PR-AUC: {:.3f}'.format(pr_auc))

# ROC curve and the area under it
fpr, tpr, thresholds = roc_curve(y_test, y_score)
roc_auc = auc(fpr, tpr)

plt.figure()
plt.plot(fpr, tpr, label='ROC curve (area = %0.2f)' % roc_auc)
plt.plot([0, 1], [0, 1], 'k--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic curve')
plt.legend(loc="lower right")
plt.show()

roc_auc = roc_auc_score(y_test, y_score)
print('ROC-AUC: {:.3f}'.format(roc_auc))

# The confusion matrix and kappa are defined on class labels, so they use the
# hard predictions
cm = confusion_matrix(y_test, y_predict)

# Plot the confusion matrix as a heatmap
plt.imshow(cm, cmap='Blues')

# Add labels to the plot
plt.xlabel('Predicted label')
plt.ylabel('True label')

# Add a colorbar
plt.colorbar()

# Write each count into its cell; white text on dark cells, black on light ones
threshold = cm.max() / 2.0
for i in range(cm.shape[0]):
    for j in range(cm.shape[1]):
        plt.text(j, i, cm[i, j],
                 horizontalalignment='center',
                 color='white' if cm[i, j] > threshold else 'black')

# Show the plot
plt.show()

kappa = cohen_kappa_score(y_test, y_predict)

print(kappa)

## 2. KNN

### 2.1 Hyperparameter Optimization using RandomizedSearchCV

In [None]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.neighbors import KNeighborsClassifier

# Create a KNN classifier
knn = KNeighborsClassifier()

# Define the grid of hyperparameters to search
param_grid = {'n_neighbors': [3, 5, 7, 9, 12, 15]}

# 5-fold stratified cross-validation, repeated 3 times
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=1)

# The grid has only six candidates. The default n_iter=10 exceeds that, which
# makes scikit-learn warn and evaluate all six anyway, so request exactly the
# grid size instead.
rsearch = RandomizedSearchCV(
    knn,
    param_grid,
    n_iter=len(param_grid['n_neighbors']),
    cv=cv,
    scoring='f1',
    random_state=0,
)

# Fit the randomized search object to the SMOTEENN-resampled training data
rsearch.fit(X_sampledEEN, y_sampledEEN)

# Print the best hyperparameters found by the search
print(rsearch.best_params_)


### 2.2 Model implementation

In [None]:
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report

# Create a KNN classifier
knn = KNeighborsClassifier(n_neighbors = 3)

# 5-fold stratified cross-validation, repeated 5 times
rskf = RepeatedStratifiedKFold(n_splits=5, n_repeats=5, random_state=0)

# Cross-validate on the same SMOTEENN-resampled data the final model is fitted
# on. Scoring on the SMOTENC set instead would report the performance of a
# model trained on different data.
# NOTE(review): folds drawn from already-resampled data tend to give
# optimistic scores; resampling inside each fold would be stricter.
scores = cross_val_score(knn, X_sampledEEN, y_sampledEEN, cv=rskf, scoring = 'roc_auc')

# Train the classifier
knn.fit(X_sampledEEN, y_sampledEEN)

# Predict the class labels for the test split
y_predict = knn.predict(X_test)

# Print the scores with their mean and standard deviation
print(scores)
print(f'Mean score: {scores.mean():.2f}')
print(f'Standard deviation: {scores.std():.2f}')

# Metric scores
print('\n')
classification_random = (classification_report(y_test, y_predict))
print(classification_random)

### 2.3 Performance Evaluation

In [None]:
from sklearn.metrics import precision_recall_curve, auc
import matplotlib.pyplot as plt
from sklearn.metrics import cohen_kappa_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.metrics import confusion_matrix

# Precision-recall and ROC curves are traced by sweeping a decision threshold,
# so they need a continuous score per sample. Hard 0/1 predictions collapse
# each curve to a single operating point and distort both AUC values.
# Column 1 of predict_proba is the probability of the second class in
# knn.classes_ (assumed to be the positive class, Result == 1).
y_score = knn.predict_proba(X_test)[:, 1]

# Precision-recall curve and the area under it
precision, recall, thresholds = precision_recall_curve(y_test, y_score)
pr_auc = auc(recall, precision)

# Plot the precision-recall curve
plt.plot(recall, precision, label='Precision-Recall curve')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Precision-Recall curve')
plt.show()

# Print the AUC
print('PR-AUC: {:.3f}'.format(pr_auc))

# ROC curve and the area under it
fpr, tpr, thresholds = roc_curve(y_test, y_score)
roc_auc = auc(fpr, tpr)

plt.figure()
plt.plot(fpr, tpr, label='ROC curve (area = %0.2f)' % roc_auc)
plt.plot([0, 1], [0, 1], 'k--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic curve')
plt.legend(loc="lower right")
plt.show()

roc_auc = roc_auc_score(y_test, y_score)
print('ROC-AUC: {:.3f}'.format(roc_auc))

# The confusion matrix and kappa are defined on class labels, so they use the
# hard predictions
cm = confusion_matrix(y_test, y_predict)

# Plot the confusion matrix as a heatmap
plt.imshow(cm, cmap='Blues')

# Add labels to the plot
plt.xlabel('Predicted label')
plt.ylabel('True label')

# Add a colorbar
plt.colorbar()

# Write each count into its cell; white text on dark cells, black on light ones
threshold = cm.max() / 2.0
for i in range(cm.shape[0]):
    for j in range(cm.shape[1]):
        plt.text(j, i, cm[i, j],
                 horizontalalignment='center',
                 color='white' if cm[i, j] > threshold else 'black')

# Show the plot
plt.show()

kappa = cohen_kappa_score(y_test, y_predict)

print(kappa)

## 3. Random Forest

### 3.1 Hyperparameter Optimization using RandomizedSearchCV

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import RepeatedStratifiedKFold

# Define the parameter space to search over.
# min_samples_split must be at least 2 when given as an integer; the previous
# candidate value 1 is rejected by scikit-learn, so every sampled combination
# that used it failed to fit.
param_grid = {
    'n_estimators': [10, 50, 100],
    'max_depth': [None, 2, 7, 13],
    'min_samples_split': [2, 3, 8],
    'min_samples_leaf': [1, 3, 6],
    'max_features': [2, 4, 7]
}

# Create a random forest classifier
rfc = RandomForestClassifier()

# 5-fold stratified cross-validation, repeated 3 times
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=1)

# Create a randomized search object scored by F1
rsearch = RandomizedSearchCV(rfc, param_grid, cv=cv, scoring = 'f1')

# Fit the randomized search object to the SMOTEENN-resampled training data
rsearch.fit(X_sampledEEN, y_sampledEEN)

# Print the best hyperparameters found by the search
print(rsearch.best_params_)


### 3.2 Model implementation

In [None]:
### Model Implementation: RandomForestClassifier ###
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score

random_forest = RandomForestClassifier(n_estimators = 10, max_features = 2, min_samples_leaf = 6, min_samples_split = 3,
                                       criterion = 'gini', random_state = 0, max_depth =None)

# Fit on the SMOTEENN-resampled training data and predict the test split
random_forest.fit(X_sampledEEN, y_sampledEEN)
y_predict = random_forest.predict(X_test)

# Perform K-Fold cross-validation with K=5 on the random forest itself.
# Passing `clf` here would re-score the XGBoost model from the earlier section.
# NOTE(review): the folds use the un-resampled training split while the final
# model is fitted on the SMOTEENN data — confirm which is intended.
scores = cross_val_score(random_forest, X_train, y_train, cv=5, scoring = 'roc_auc')

# Print the score values
print('ROC_AUC Scores: ' + str(scores))
# Print the mean of the scores
print(f'Mean score: {scores.mean():.2f}')

# Metric scores
print('\n')
classification_random = (classification_report(y_test, y_predict))
print(classification_random)

### 3.3 Performance Evaluation

In [None]:
from sklearn.metrics import precision_recall_curve, auc
import matplotlib.pyplot as plt
from sklearn.metrics import cohen_kappa_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.metrics import confusion_matrix

# Precision-recall and ROC curves are traced by sweeping a decision threshold,
# so they need a continuous score per sample. Hard 0/1 predictions collapse
# each curve to a single operating point and distort both AUC values.
# Column 1 of predict_proba is the probability of the second class in
# random_forest.classes_ (assumed to be the positive class, Result == 1).
y_score = random_forest.predict_proba(X_test)[:, 1]

# Precision-recall curve and the area under it
precision, recall, thresholds = precision_recall_curve(y_test, y_score)
pr_auc = auc(recall, precision)

# Plot the precision-recall curve
plt.plot(recall, precision, label='Precision-Recall curve')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Precision-Recall curve')
plt.show()

# Print the AUC
print('PR-AUC: {:.3f}'.format(pr_auc))

# ROC curve and the area under it
fpr, tpr, thresholds = roc_curve(y_test, y_score)
roc_auc = auc(fpr, tpr)

plt.figure()
plt.plot(fpr, tpr, label='ROC curve (area = %0.2f)' % roc_auc)
plt.plot([0, 1], [0, 1], 'k--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic curve')
plt.legend(loc="lower right")
plt.show()

roc_auc = roc_auc_score(y_test, y_score)
print('ROC-AUC: {:.3f}'.format(roc_auc))

# The confusion matrix and kappa are defined on class labels, so they use the
# hard predictions
cm = confusion_matrix(y_test, y_predict)

# Plot the confusion matrix as a heatmap
plt.imshow(cm, cmap='Blues')

# Add labels to the plot
plt.xlabel('Predicted label')
plt.ylabel('True label')

# Add a colorbar
plt.colorbar()

# Write each count into its cell; white text on dark cells, black on light ones
threshold = cm.max() / 2.0
for i in range(cm.shape[0]):
    for j in range(cm.shape[1]):
        plt.text(j, i, cm[i, j],
                 horizontalalignment='center',
                 color='white' if cm[i, j] > threshold else 'black')

# Show the plot
plt.show()

kappa = cohen_kappa_score(y_test, y_predict)

print(kappa)
