# **0.0 DATA PROCESSING**

In [None]:
!git clone https://github.com/edwinkmusaasizi/Machine-Learning.git

Cloning into 'Machine-Learning'...
remote: Enumerating objects: 69, done.[K
remote: Counting objects: 100% (69/69), done.[K
remote: Compressing objects: 100% (65/65), done.[K
remote: Total 69 (delta 31), reused 7 (delta 0), pack-reused 0 (from 0)[K
Receiving objects: 100% (69/69), 467.68 KiB | 4.37 MiB/s, done.
Resolving deltas: 100% (31/31), done.


In [None]:
%cd Machine-Learning
%cd data
%cd interim
!ls

/content/Machine-Learning/data/interim/Machine-Learning/data/interim/Machine-Learning/data/interim/Machine-Learning
/content/Machine-Learning/data/interim/Machine-Learning/data/interim/Machine-Learning/data/interim/Machine-Learning/data
/content/Machine-Learning/data/interim/Machine-Learning/data/interim/Machine-Learning/data/interim/Machine-Learning/data/interim
cleaned_mental_health_data.csv


0.1 Data Processing

In [None]:
import pandas as pd
import numpy as np
import torch
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder

# Load data
df = pd.read_csv("cleaned_mental_health_data.csv")

# Define adherence labels based on questionnaire responses
non_adherence_columns = [
    "Do you ever forget to take your medication?",
    "Are you careless at times about taking your medication?",
    "When you feel better, do you sometimes stop taking your medication?",
    "Sometimes if you feel worse when you take the medication, do you stop taking it?",
    "I take my medication only when I am sick"
]

df["adherence"] = np.where(df[non_adherence_columns].eq("Yes").any(axis=1), 0, 1)

# Drop redundant columns
df = df.drop(columns=non_adherence_columns + ["If you have any further comments about medication or this questionnaire, please write them below"])

# Identify all categorical columns
categorical_cols = df.select_dtypes(include=['object', 'category']).columns
print("Categorical columns to encode:", categorical_cols)

# Encode all categorical features
for col in categorical_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col].astype(str))

# Split features and labels
X = df.drop(columns="adherence").values
y = df["adherence"].values

# Split data into train, validation, test (70-15-15)
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, stratify=y, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, stratify=y_temp, random_state=42)

# Apply SMOTE to only the training set
from imblearn.over_sampling import SMOTE

# Apply SMOTE only to the training data
smote = SMOTE(sampling_strategy='auto', random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Check new class distribution
from collections import Counter
print("New class distribution:", Counter(y_train_resampled))

# Normalize numerical features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)
X_test = scaler.transform(X_test)

# Convert to PyTorch tensors
train_dataset = TensorDataset(torch.FloatTensor(X_train), torch.FloatTensor(y_train))
val_dataset = TensorDataset(torch.FloatTensor(X_val), torch.FloatTensor(y_val))
test_dataset = TensorDataset(torch.FloatTensor(X_test), torch.FloatTensor(y_test))

# Create DataLoaders
batch_size = 32
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

Categorical columns to encode: Index(['sex', 'Religion', 'marital status', 'education status', 'residence',
       'substance use', 'comorbidity',
       'It is unnatural for my mind and body to be controlled by medication?',
       'My thoughts are clearer on medication',
       'By staying on medication, I can prevent getting sick',
       'I feel weird, like a ‘zombie’ on medication',
       'Medication makes me feel tired and sluggish',
       'Some of your symptoms are made by your mind.', 'You are mentally well',
       'You do not need medication', 'Your stay in the hospital is necessary',
       'The doctor is right in prescribing medication for you.',
       'You do not need to be seen by a doctor or psychiatrist',
       'If someone said you have a nervous or mental illness, they would be right',
       'None of the unusual things you are experiencing are due to an illness.',
       '. Loss of energy or drive', 'Feeling unmotivated or numb',
       'Daytime sedation or drowsi

In [None]:
# Check class distribution in the training set
class_distribution = np.bincount(y_train)
print("Class Distribution in Training Set:")
print(f"Class 0 (Non-Adherent): {class_distribution[0]}")
print(f"Class 1 (Adherent): {class_distribution[1]}")

Class Distribution in Training Set:
Class 0 (Non-Adherent): 55
Class 1 (Adherent): 26


Implement class weight

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.utils.class_weight import compute_class_weight
import numpy as np
import pandas as pd

# Compute class weights
class_weights = compute_class_weight('balanced', classes=np.unique(y_train), y=y_train)
class_weights = torch.tensor(class_weights, dtype=torch.float32)

# Define loss function with class weights
criterion = nn.CrossEntropyLoss(weight=class_weights)

# Example model (Assuming a simple neural network)
class AdherenceModel(nn.Module):
    def __init__(self, input_size):
        super(AdherenceModel, self).__init__()
        self.fc1 = nn.Linear(input_size, 64)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(64, 2)  # Output has 2 classes (0 and 1)

    def forward(self, x):
        x = self.fc1(x)
        x = self.relu(x)
        x = self.fc2(x)  # No softmax needed for CrossEntropyLoss
        return x

# Initialize model
input_size = X_train.shape[1]  # Number of features
model = AdherenceModel(input_size)

# Define optimizer
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop (simplified)
for epoch in range(10):  # Adjust epochs as needed
    model.train()
    for batch_X, batch_y in train_loader:
        optimizer.zero_grad()
        outputs = model(batch_X)
        batch_y = batch_y.long()  # Convert to long for CrossEntropyLoss
        loss = criterion(outputs, batch_y)
        loss.backward()
        optimizer.step()
    print(f"Epoch {epoch+1}, Loss: {loss.item()}")



Epoch 1, Loss: 0.6953945159912109
Epoch 2, Loss: 0.7195978164672852
Epoch 3, Loss: 0.63300621509552
Epoch 4, Loss: 0.6330546736717224
Epoch 5, Loss: 0.6474331021308899
Epoch 6, Loss: 0.5303978323936462
Epoch 7, Loss: 0.5786312818527222
Epoch 8, Loss: 0.5541043281555176
Epoch 9, Loss: 0.5522679090499878
Epoch 10, Loss: 0.46876275539398193


# MODEL IMPLEMANTION

## Implement Logisitic Regretion

1 Import Libraries

In [190]:
from sklearn.linear_model import LogisticRegression
import time
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score
from sklearn.preprocessing import StandardScaler

# Assuming data preprocessing has been done and the data is available in these variables
# X_train, X_val, X_test, y_train, y_val, y_test
# Scaling features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)


2. Training

In [191]:
# Initialize the Logistic Regression model
logreg_model = LogisticRegression(random_state=42, max_iter=1000)

# Record the start time for training
start_time = time.time()

# Train the model
logreg_model.fit(X_train_scaled, y_train)

# Record the end time for training
end_time = time.time()

# Calculate the training time
training_time = end_time - start_time


3. Evaluation

In [192]:
import pandas as pd

# Get predicted probabilities
y_prob_logreg = logreg_model.predict_proba(X_test_scaled)[:, 1]  # Probability for class 1

# Convert probabilities to binary predictions (threshold = 0.5)
y_pred_logreg = (y_prob_logreg >= 0.5).astype(int)

# Calculate evaluation metrics
logreg_precision = precision_score(y_test, y_pred_logreg)
logreg_recall = recall_score(y_test, y_pred_logreg)
logreg_f1 = f1_score(y_test, y_pred_logreg)
logreg_auc = roc_auc_score(y_test, y_prob_logreg)

# Store results in a dictionary for comparison
model_results = {
    "Model": ["Logistic Regression"],
    "Precision": [logreg_precision],
    "Recall": [logreg_recall],
    "F1-Score": [logreg_f1],
    "AUC": [logreg_auc],
    "Training Time (s)": [training_time]
}

# Convert to DataFrame
results_df = pd.DataFrame(model_results)

# Display results
print(results_df)


                 Model  Precision    Recall  F1-Score       AUC  \
0  Logistic Regression       0.25  0.166667       0.2  0.347222   

   Training Time (s)  
0           0.015465  


## Implementing Support Vector Machine

In [None]:
from sklearn.svm import SVC
import time
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score
from sklearn.preprocessing import StandardScaler

# Assuming data preprocessing has been done and the data is available in these variables
# X_train, X_val, X_test, y_train, y_val, y_test
# Scaling features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)


# Initialize the SVM model with a radial basis function kernel
svm_model = SVC(probability=True, random_state=42)

# Record the start time for training
start_time = time.time()

# Train the model
svm_model.fit(X_train_scaled, y_train)

# Record the end time for training
end_time = time.time()

# Calculate the training time
training_time = end_time - start_time


Evaluation

In [193]:
# Get predicted probabilities
y_prob_svm = svm_model.predict_proba(X_test_scaled)[:, 1]  # Probability for class 1

# Convert probabilities to binary predictions (threshold = 0.5)
y_pred_svm = (y_prob_svm >= 0.5).astype(int)

# Calculate evaluation metrics
svm_precision = precision_score(y_test, y_pred_svm)
svm_recall = recall_score(y_test, y_pred_svm)
svm_f1 = f1_score(y_test, y_pred_svm)
svm_auc = roc_auc_score(y_test, y_prob_svm)

# Store results in a dictionary for comparison
svm_results = {
    "Model": ["Support Vector Machine (SVM)"],
    "Precision": [svm_precision],
    "Recall": [svm_recall],
    "F1-Score": [svm_f1],
    "AUC": [svm_auc],
    "Training Time (s)": [training_time]
}

# Convert to DataFrame
svm_results_df = pd.DataFrame(svm_results)

# Append to the existing results DataFrame
results_df = pd.concat([results_df, svm_results_df], ignore_index=True)

# Display updated results
print(results_df)


                          Model  Precision    Recall  F1-Score       AUC  \
0           Logistic Regression       0.25  0.166667       0.2  0.347222   
1  Support Vector Machine (SVM)       0.00  0.000000       0.0  0.416667   

   Training Time (s)  
0           0.015465  
1           0.015465  


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


## Implementing Random Forest Classifier

In [195]:
from sklearn.ensemble import RandomForestClassifier
import time
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score
from sklearn.preprocessing import StandardScaler

# Assuming data preprocessing has been done and the data is available in these variables
# X_train, X_val, X_test, y_train, y_val, y_test
# Scaling features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)

#TRAINING

# Initialize the Random Forest model
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)

# Record the start time for training
start_time = time.time()

# Train the model
rf_model.fit(X_train_scaled, y_train)

# Record the end time for training
end_time = time.time()

# Calculate the training time
training_time = end_time - start_time




Evaluation

In [194]:
# Get predicted probabilities
y_prob_rf = rf_model.predict_proba(X_test_scaled)[:, 1]  # Probability for class 1

# Convert probabilities to binary predictions (threshold = 0.5)
y_pred_rf = (y_prob_rf >= 0.5).astype(int)

# Calculate evaluation metrics
rf_precision = precision_score(y_test, y_pred_rf)
rf_recall = recall_score(y_test, y_pred_rf)
rf_f1 = f1_score(y_test, y_pred_rf)
rf_auc = roc_auc_score(y_test, y_prob_rf)

# Store results in a dictionary for comparison
rf_results = {
    "Model": ["Random Forest"],
    "Precision": [rf_precision],
    "Recall": [rf_recall],
    "F1-Score": [rf_f1],
    "AUC": [rf_auc],
    "Training Time (s)": [training_time]
}

# Convert to DataFrame
rf_results_df = pd.DataFrame(rf_results)

# Append to the existing results DataFrame
results_df = pd.concat([results_df, rf_results_df], ignore_index=True)

# Display updated results
print(results_df)


                          Model  Precision    Recall  F1-Score       AUC  \
0           Logistic Regression       0.25  0.166667  0.200000  0.347222   
1  Support Vector Machine (SVM)       0.00  0.000000  0.000000  0.416667   
2                 Random Forest       1.00  0.166667  0.285714  0.465278   

   Training Time (s)  
0           0.015465  
1           0.015465  
2           0.015465  


## Implementing a K-Nearest Neighbor

In [197]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, accuracy_score
import time

# Initialize KNN model
knn_model = KNeighborsClassifier(n_neighbors=5)

# Record the start time for training
start_time = time.time()

# Train the model
knn_model.fit(X_train_scaled, y_train)

# Record the end time for training
end_time = time.time()

# Calculate training time
training_time = end_time - start_time

# Make predictions on the test set
y_pred_knn = knn_model.predict(X_test_scaled)




In [196]:
# Get predicted probabilities
y_prob_knn = knn_model.predict_proba(X_test_scaled)[:, 1]  # Probability for class 1

# Convert probabilities to binary predictions (threshold = 0.5)
y_pred_knn = (y_prob_knn >= 0.5).astype(int)

# Calculate evaluation metrics
knn_precision = precision_score(y_test, y_pred_knn)
knn_recall = recall_score(y_test, y_pred_knn)
knn_f1 = f1_score(y_test, y_pred_knn)
knn_auc = roc_auc_score(y_test, y_prob_knn)

# Store results in a dictionary for comparison
knn_results = {
    "Model": ["K-Nearest Neighbors (KNN)"],
    "Precision": [knn_precision],
    "Recall": [knn_recall],
    "F1-Score": [knn_f1],
    "AUC": [knn_auc],
    "Training Time (s)": [training_time]
}

# Convert to DataFrame
knn_results_df = pd.DataFrame(knn_results)

# Append to the existing results DataFrame
results_df = pd.concat([results_df, knn_results_df], ignore_index=True)

# Display updated results
print(results_df)


                          Model  Precision    Recall  F1-Score       AUC  \
0           Logistic Regression   0.250000  0.166667  0.200000  0.347222   
1  Support Vector Machine (SVM)   0.000000  0.000000  0.000000  0.416667   
2                 Random Forest   1.000000  0.166667  0.285714  0.465278   
3     K-Nearest Neighbors (KNN)   0.666667  0.333333  0.444444  0.645833   

   Training Time (s)  
0           0.015465  
1           0.015465  
2           0.015465  
3           0.239112  


Implementing Gradient Boost machine

In [198]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score
import time

# Initialize Gradient Boosting Classifier
gbm_model = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42)

# Record the start time for training
start_time = time.time()

# Train the model
gbm_model.fit(X_train_scaled, y_train)

# Record the end time for training
end_time = time.time()
training_time = end_time - start_time

# Predict on the validation and test sets
y_val_pred = gbm_model.predict(X_val_scaled)
y_test_pred = gbm_model.predict(X_test_scaled)

# Get predicted probabilities
y_prob_knn = knn_model.predict_proba(X_test_scaled)[:, 1]  # Probability for class 1

# Convert probabilities to binary predictions (threshold = 0.5)
y_pred_knn = (y_prob_knn >= 0.5).astype(int)

# Calculate evaluation metrics
knn_precision = precision_score(y_test, y_pred_knn)
knn_recall = recall_score(y_test, y_pred_knn)
knn_f1 = f1_score(y_test, y_pred_knn)
knn_auc = roc_auc_score(y_test, y_prob_knn)

# Store results in a dictionary for comparison
knn_results = {
    "Model": ["K-Nearest Neighbors (KNN)"],
    "Precision": [knn_precision],
    "Recall": [knn_recall],
    "F1-Score": [knn_f1],
    "AUC": [knn_auc],
    "Training Time (s)": [training_time]
}

# Convert to DataFrame
knn_results_df = pd.DataFrame(knn_results)

# Append to the existing results DataFrame
results_df = pd.concat([results_df, knn_results_df], ignore_index=True)

# Display updated results
print(results_df)


                          Model  Precision    Recall  F1-Score       AUC  \
0           Logistic Regression   0.250000  0.166667  0.200000  0.347222   
1  Support Vector Machine (SVM)   0.000000  0.000000  0.000000  0.416667   
2                 Random Forest   1.000000  0.166667  0.285714  0.465278   
3     K-Nearest Neighbors (KNN)   0.666667  0.333333  0.444444  0.645833   
4     K-Nearest Neighbors (KNN)   0.666667  0.333333  0.444444  0.645833   

   Training Time (s)  
0           0.015465  
1           0.015465  
2           0.015465  
3           0.239112  
4           0.253225  


## Implementing XGBoost

In [None]:
!pip install xgboost




In [199]:
import xgboost as xgb
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score
import time

# Initialize XGBoost Classifier
xgboost_model = xgb.XGBClassifier(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42)

# Record the start time for training
start_time = time.time()

# Train the model
xgboost_model.fit(X_train_scaled, y_train)

# Record the end time for training
end_time = time.time()
training_time = end_time - start_time

# Predict on the validation and test sets
y_val_pred = xgboost_model.predict(X_val_scaled)
y_test_pred = xgboost_model.predict(X_test_scaled)

# Get predicted probabilities
y_prob_xgb = xgboost_model.predict_proba(X_test_scaled)[:, 1]  # Probability for class 1

# Convert probabilities to binary predictions (threshold = 0.5)
y_pred_xgb = (y_prob_xgb >= 0.5).astype(int)

# Calculate evaluation metrics
xgb_precision = precision_score(y_test, y_pred_xgb)
xgb_recall = recall_score(y_test, y_pred_xgb)
xgb_f1 = f1_score(y_test, y_pred_xgb)
xgb_auc = roc_auc_score(y_test, y_prob_xgb)

# Store results in a dictionary for comparison
xgb_results = {
    "Model": ["XGBoost"],
    "Precision": [xgb_precision],
    "Recall": [xgb_recall],
    "F1-Score": [xgb_f1],
    "AUC": [xgb_auc],
    "Training Time (s)": [training_time]
}

# Convert to DataFrame
xgb_results_df = pd.DataFrame(xgb_results)

# Append to the existing results DataFrame
results_df = pd.concat([results_df, xgb_results_df], ignore_index=True)

# Display updated results
print(results_df)



                          Model  Precision    Recall  F1-Score       AUC  \
0           Logistic Regression   0.250000  0.166667  0.200000  0.347222   
1  Support Vector Machine (SVM)   0.000000  0.000000  0.000000  0.416667   
2                 Random Forest   1.000000  0.166667  0.285714  0.465278   
3     K-Nearest Neighbors (KNN)   0.666667  0.333333  0.444444  0.645833   
4     K-Nearest Neighbors (KNN)   0.666667  0.333333  0.444444  0.645833   
5                       XGBoost   0.200000  0.166667  0.181818  0.375000   

   Training Time (s)  
0           0.015465  
1           0.015465  
2           0.015465  
3           0.239112  
4           0.253225  
5           0.066676  


## Implementing LightGBM

In [None]:
!pip install lightgbm


In [200]:
import lightgbm as lgb
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score
import time

# Initialize LightGBM Classifier
lgbm_model = lgb.LGBMClassifier(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42)

# Record the start time for training
start_time = time.time()

# Train the model
lgbm_model.fit(X_train_scaled, y_train)

# Record the end time for training
end_time = time.time()
training_time = end_time - start_time

# Predict on the validation and test sets
y_val_pred = lgbm_model.predict(X_val_scaled)
y_test_pred = lgbm_model.predict(X_test_scaled)

# Get predicted probabilities
y_prob_lgbm = lgbm_model.predict_proba(X_test_scaled)[:, 1]  # Probability for class 1

# Convert probabilities to binary predictions (threshold = 0.5)
y_pred_lgbm = (y_prob_lgbm >= 0.5).astype(int)

# Calculate evaluation metrics
lgbm_precision = precision_score(y_test, y_pred_lgbm)
lgbm_recall = recall_score(y_test, y_pred_lgbm)
lgbm_f1 = f1_score(y_test, y_pred_lgbm)
lgbm_auc = roc_auc_score(y_test, y_prob_lgbm)

# Store results in a dictionary for comparison
lgbm_results = {
    "Model": ["LightGBM"],
    "Precision": [lgbm_precision],
    "Recall": [lgbm_recall],
    "F1-Score": [lgbm_f1],
    "AUC": [lgbm_auc],
    "Training Time (s)": [training_time]
}

# Convert to DataFrame
lgbm_results_df = pd.DataFrame(lgbm_results)

# Append to the existing results DataFrame
results_df = pd.concat([results_df, lgbm_results_df], ignore_index=True)

# Display updated results
print(results_df)




[LightGBM] [Info] Number of positive: 26, number of negative: 55
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000201 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 217
[LightGBM] [Info] Number of data points in the train set: 81, number of used features: 45
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.320988 -> initscore=-0.749237
[LightGBM] [Info] Start training from score -0.749237
                          Model  Precision    Recall  F1-Score       AUC  \
0           Logistic Regression   0.250000  0.166667  0.200000  0.347222   
1  Support Vector Machine (SVM)   0.000000  0.000000  0.000000  0.416667   
2                 Random Forest   1.000000  0.166667  0.285714  0.465278   
3     K-Nearest Neighbors (KNN)   0.666667  0.333333  0.444444  0.645833   
4     K-Nearest Neighbors (KNN)   0.666667  0.333333  0.444444  0.645833   
5                       XGBoost   0.200000  0.166667  0.1818

## Implementing CATBoost

In [None]:
!pip install catboost


Collecting catboost
  Downloading catboost-1.2.7-cp311-cp311-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.7-cp311-cp311-manylinux2014_x86_64.whl (98.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.7/98.7 MB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.7


In [215]:
from catboost import CatBoostClassifier
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score
import time

# Initialize CatBoost Classifier
catboost_model = CatBoostClassifier(iterations=100, learning_rate=0.1, depth=3, random_seed=42, verbose=0)

# Record start time
start_time = time.time()

# Train the model
catboost_model.fit(X_train_scaled, y_train)

# Record end time
end_time = time.time()
training_time = end_time - start_time

# Predict on validation and test sets
y_val_pred = catboost_model.predict(X_val_scaled)
y_test_pred = catboost_model.predict(X_test_scaled)

# Get predicted probabilities
y_prob_catboost = catboost_model.predict_proba(X_test_scaled)[:, 1]  # Probability for class 1

# Convert probabilities to binary predictions (threshold = 0.5)
y_pred_catboost = (y_prob_catboost >= 0.5).astype(int)

# Calculate evaluation metrics
catboost_precision = precision_score(y_test, y_pred_catboost)
catboost_recall = recall_score(y_test, y_pred_catboost)
catboost_f1 = f1_score(y_test, y_pred_catboost)
catboost_auc = roc_auc_score(y_test, y_prob_catboost)

# Store results in a dictionary for comparison
catboost_results = {
    "Model": ["CatBoost"],
    "Precision": [catboost_precision],
    "Recall": [catboost_recall],
    "F1-Score": [catboost_f1],
    "AUC": [catboost_auc],
    "Training Time (s)": [training_time]
}

# Convert to DataFrame
catboost_results_df = pd.DataFrame(catboost_results)

# Append to the existing results DataFrame
results_df = pd.concat([results_df, catboost_results_df], ignore_index=True)

# Display updated results
print(results_df)


                          Model  Precision    Recall  F1-Score       AUC  \
0           Logistic Regression   0.250000  0.166667  0.200000  0.347222   
1  Support Vector Machine (SVM)   0.000000  0.000000  0.000000  0.416667   
2                 Random Forest   1.000000  0.166667  0.285714  0.465278   
3     K-Nearest Neighbors (KNN)   0.666667  0.333333  0.444444  0.645833   
4     K-Nearest Neighbors (KNN)   0.666667  0.333333  0.444444  0.645833   
5                       XGBoost   0.200000  0.166667  0.181818  0.375000   
6                      LightGBM   0.250000  0.166667  0.200000  0.458333   
7                      CatBoost   0.250000  0.166667  0.200000  0.416667   
8                      CatBoost   0.250000  0.166667  0.200000  0.416667   
9                      CatBoost   0.250000  0.166667  0.200000  0.416667   

   Training Time (s)  
0           0.015465  
1           0.015465  
2           0.015465  
3           0.239112  
4           0.253225  
5           0.066676  
6 

## Implementing Fastforwar-NN

In [216]:
import torch
import torch.nn as nn
import torch.optim as optim
import time
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score

# Define the Feedforward Neural Network (FNN) Model
class FNNModel(nn.Module):
    def __init__(self, input_dim):
        super(FNNModel, self).__init__()
        self.fc1 = nn.Linear(input_dim, 64)  # Input layer
        self.fc2 = nn.Linear(64, 32)         # Hidden layer
        self.fc3 = nn.Linear(32, 1)          # Output layer
        self.relu = nn.ReLU()
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        x = self.relu(self.fc1(x))   # Apply ReLU activation
        x = self.relu(self.fc2(x))   # Apply ReLU activation
        x = self.sigmoid(self.fc3(x))  # Output layer with sigmoid for binary classification
        return x

# Initialize the FNN model
input_dim = X_train_scaled.shape[1]  # Number of features
fnn_model = FNNModel(input_dim)

# Move model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
fnn_model.to(device)

# Define the loss function and optimizer
criterion = nn.BCELoss()
optimizer = optim.Adam(fnn_model.parameters(), lr=0.001)

# Record start time for training
start_time = time.time()

# Train the FNN Model (assuming X_train_scaled and y_train are tensors)
for epoch in range(100):  # Number of epochs
    # Move data to device
    inputs = torch.tensor(X_train_scaled, dtype=torch.float32).to(device)
    labels = torch.tensor(y_train, dtype=torch.float32).view(-1, 1).to(device)

    # Forward pass
    outputs = fnn_model(inputs)
    loss = criterion(outputs, labels)

    # Backward pass and optimization
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

# Record end time for training
end_time = time.time()
training_time = end_time - start_time

# Predict on validation and test sets
X_val_tensor = torch.tensor(X_val_scaled, dtype=torch.float32).to(device)
y_val_pred = fnn_model(X_val_tensor).cpu().detach().numpy()
y_val_pred = (y_val_pred > 0.5).astype(int)  # Threshold at 0.5

X_test_tensor = torch.tensor(X_test_scaled, dtype=torch.float32).to(device)
y_test_pred = fnn_model(X_test_tensor).cpu().detach().numpy()
y_test_pred = (y_test_pred > 0.5).astype(int)  # Threshold at 0.5

# Get predicted probabilities
model.eval()  # Set the model to evaluation mode
with torch.no_grad():
    # Convert the test data to a tensor
    X_test_tensor = torch.tensor(X_test_scaled, dtype=torch.float32).to(device)

    # Get predictions
    y_prob_fnn = model(X_test_tensor).cpu().numpy().flatten()  # Probabilities for class 1

# Convert probabilities to binary predictions (threshold = 0.5)
y_pred_fnn = (y_prob_fnn >= 0.5).astype(int)

# Calculate evaluation metrics
fnn_precision = precision_score(y_test, y_pred_fnn)
fnn_recall = recall_score(y_test, y_pred_fnn)
fnn_f1 = f1_score(y_test, y_pred_fnn)
fnn_auc = roc_auc_score(y_test, y_prob_fnn)

# Calculate training time (assuming it was previously calculated or use current time)
start_time = time.time()
# model.fit(X_train_scaled, y_train)  # Uncomment and use this if not done yet
end_time = time.time()
training_time = end_time - start_time

# Store results in a dictionary for comparison
fnn_results = {
    "Model": ["Feedforward Neural Network (FNN)"],
    "Precision": [fnn_precision],
    "Recall": [fnn_recall],
    "F1-Score": [fnn_f1],
    "AUC": [fnn_auc],
    "Training Time (s)": [training_time]
}

# Convert to DataFrame
fnn_results_df = pd.DataFrame(fnn_results)

# Initialize an empty DataFrame to hold results if it doesn't exist
try:
    results_df
except NameError:
    results_df = pd.DataFrame(columns=["Model", "Precision", "Recall", "F1-Score", "AUC", "Training Time (s)"])

# Append to the existing results DataFrame
results_df = pd.concat([results_df, fnn_results_df], ignore_index=True)

# Display updated results
print(results_df)

                               Model  Precision    Recall  F1-Score       AUC  \
0                Logistic Regression   0.250000  0.166667  0.200000  0.347222   
1       Support Vector Machine (SVM)   0.000000  0.000000  0.000000  0.416667   
2                      Random Forest   1.000000  0.166667  0.285714  0.465278   
3          K-Nearest Neighbors (KNN)   0.666667  0.333333  0.444444  0.645833   
4          K-Nearest Neighbors (KNN)   0.666667  0.333333  0.444444  0.645833   
5                            XGBoost   0.200000  0.166667  0.181818  0.375000   
6                           LightGBM   0.250000  0.166667  0.200000  0.458333   
7                           CatBoost   0.250000  0.166667  0.200000  0.416667   
8                           CatBoost   0.250000  0.166667  0.200000  0.416667   
9                           CatBoost   0.250000  0.166667  0.200000  0.416667   
10  Feedforward Neural Network (FNN)   0.250000  0.166667  0.200000  0.347222   

    Training Time (s)  
0  

## Implementing A neural network

In [205]:
import torch
import torch.nn as nn
import torch.optim as optim
import time
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score

# Define the Neural Network Model
class NeuralNetwork(nn.Module):
    def __init__(self, input_dim):
        super(NeuralNetwork, self).__init__()
        self.fc1 = nn.Linear(input_dim, 64)
        self.fc2 = nn.Linear(64, 32)
        self.output = nn.Linear(32, 1)
        self.relu = nn.ReLU()
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        x = self.relu(self.fc1(x))
        x = self.relu(self.fc2(x))
        x = self.sigmoid(self.output(x))
        return x

# Initialize the model
input_dim = X_train_scaled.shape[1]  # Number of features
model = NeuralNetwork(input_dim)

# Move model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Prepare data (assuming X_train_scaled, y_train are numpy arrays or tensors)
X_train_tensor = torch.tensor(X_train_scaled, dtype=torch.float32).to(device)
y_train_tensor = torch.tensor(y_train, dtype=torch.float32).view(-1, 1).to(device)

X_val_tensor = torch.tensor(X_val_scaled, dtype=torch.float32).to(device)
y_val_tensor = torch.tensor(y_val, dtype=torch.float32).view(-1, 1).to(device)

X_test_tensor = torch.tensor(X_test_scaled, dtype=torch.float32).to(device)
y_test_tensor = torch.tensor(y_test, dtype=torch.float32).view(-1, 1).to(device)

# Initialize the loss function and optimizer
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training the model
start_time = time.time()

# Set the model to training mode
model.train()
for epoch in range(100):  # Training for 100 epochs
    optimizer.zero_grad()
    outputs = model(X_train_tensor)
    loss = criterion(outputs, y_train_tensor)
    loss.backward()
    optimizer.step()

# Record the end time for training
end_time = time.time()
training_time = end_time - start_time

# Evaluation
model.eval()

# Make predictions on the validation and test sets
with torch.no_grad():
    y_val_pred_prob = model(X_val_tensor).squeeze().cpu().numpy()
    y_test_pred_prob = model(X_test_tensor).squeeze().cpu().numpy()

# Convert probabilities to binary predictions (threshold = 0.5)
y_val_pred = (y_val_pred_prob >= 0.5).astype(int)
y_test_pred = (y_test_pred_prob >= 0.5).astype(int)

# Calculate evaluation metrics for validation set
val_precision = precision_score(y_val, y_val_pred)
val_recall = recall_score(y_val, y_val_pred)
val_f1 = f1_score(y_val, y_val_pred)
val_auc = roc_auc_score(y_val, y_val_pred_prob)

# Calculate evaluation metrics for test set
test_precision = precision_score(y_test, y_test_pred)
test_recall = recall_score(y_test, y_test_pred)
test_f1 = f1_score(y_test, y_test_pred)
test_auc = roc_auc_score(y_test, y_test_pred_prob)

# Print the evaluation results
print(f"Neural Network Model Evaluation (Validation Set):")
print(f"Validation Precision: {val_precision:.4f} | Validation Recall: {val_recall:.4f}")
print(f"Validation F1-Score: {val_f1:.4f} | Validation AUC: {val_auc:.4f}")

print(f"\nNeural Network Model Evaluation (Test Set):")
print(f"Test Precision: {test_precision:.4f} | Test Recall: {test_recall:.4f}")
print(f"Test F1-Score: {test_f1:.4f} | Test AUC: {test_auc:.4f}")

print(f"\nTraining Time: {training_time:.4f} seconds")


Neural Network Model Evaluation (Validation Set):
Validation Precision: 0.7500 | Validation Recall: 0.5000
Validation F1-Score: 0.6000 | Validation AUC: 0.7361

Neural Network Model Evaluation (Test Set):
Test Precision: 0.2500 | Test Recall: 0.1667
Test F1-Score: 0.2000 | Test AUC: 0.3889

Training Time: 0.3942 seconds


## Implementing DNN

In [206]:
import torch
import torch.nn as nn
import torch.optim as optim
import time
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score

# Define the Deep Neural Network (DNN) Model
class DNN(nn.Module):
    def __init__(self, input_dim):
        super(DNN, self).__init__()
        self.fc1 = nn.Linear(input_dim, 128)  # First hidden layer with 128 units
        self.fc2 = nn.Linear(128, 64)         # Second hidden layer with 64 units
        self.fc3 = nn.Linear(64, 32)          # Third hidden layer with 32 units
        self.fc4 = nn.Linear(32, 1)           # Output layer with 1 unit (for binary classification)
        self.relu = nn.ReLU()
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        x = self.relu(self.fc1(x))  # ReLU activation after first layer
        x = self.relu(self.fc2(x))  # ReLU activation after second layer
        x = self.relu(self.fc3(x))  # ReLU activation after third layer
        x = self.sigmoid(self.fc4(x))  # Sigmoid activation for binary classification
        return x

# Initialize the model
input_dim = X_train_scaled.shape[1]  # Number of features (input dimension)
model = DNN(input_dim)

# Move model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Prepare data (convert to tensors)
X_train_tensor = torch.tensor(X_train_scaled, dtype=torch.float32).to(device)
y_train_tensor = torch.tensor(y_train, dtype=torch.float32).view(-1, 1).to(device)

X_val_tensor = torch.tensor(X_val_scaled, dtype=torch.float32).to(device)
y_val_tensor = torch.tensor(y_val, dtype=torch.float32).view(-1, 1).to(device)

X_test_tensor = torch.tensor(X_test_scaled, dtype=torch.float32).to(device)
y_test_tensor = torch.tensor(y_test, dtype=torch.float32).view(-1, 1).to(device)

# Initialize loss function and optimizer
criterion = nn.BCELoss()  # Binary Cross Entropy Loss for binary classification
optimizer = optim.Adam(model.parameters(), lr=0.001)  # Adam optimizer

# Training the model
start_time = time.time()

# Set model to training mode
model.train()
for epoch in range(100):  # Training for 100 epochs
    optimizer.zero_grad()  # Zero the gradients
    outputs = model(X_train_tensor)  # Forward pass
    loss = criterion(outputs, y_train_tensor)  # Calculate the loss
    loss.backward()  # Backpropagate the loss
    optimizer.step()  # Update the model parameters

# Record the end time for training
end_time = time.time()
training_time = end_time - start_time

# Evaluation
model.eval()  # Set model to evaluation mode

# Make predictions on validation and test sets
with torch.no_grad():  # No gradients needed for evaluation
    y_val_pred_prob = model(X_val_tensor).squeeze().cpu().numpy()  # Validation predictions
    y_test_pred_prob = model(X_test_tensor).squeeze().cpu().numpy()  # Test predictions

# Convert probabilities to binary predictions (threshold = 0.5)
y_val_pred = (y_val_pred_prob >= 0.5).astype(int)
y_test_pred = (y_test_pred_prob >= 0.5).astype(int)

# Calculate evaluation metrics for validation set
val_precision = precision_score(y_val, y_val_pred)
val_recall = recall_score(y_val, y_val_pred)
val_f1 = f1_score(y_val, y_val_pred)
val_auc = roc_auc_score(y_val, y_val_pred_prob)

# Calculate evaluation metrics for test set
test_precision = precision_score(y_test, y_test_pred)
test_recall = recall_score(y_test, y_test_pred)
test_f1 = f1_score(y_test, y_test_pred)
test_auc = roc_auc_score(y_test, y_test_pred_prob)

# Print the evaluation results
print(f"DNN Model Evaluation (Validation Set):")
print(f"Validation Precision: {val_precision:.4f} | Validation Recall: {val_recall:.4f}")
print(f"Validation F1-Score: {val_f1:.4f} | Validation AUC: {val_auc:.4f}")

print(f"\nDNN Model Evaluation (Test Set):")
print(f"Test Precision: {test_precision:.4f} | Test Recall: {test_recall:.4f}")
print(f"Test F1-Score: {test_f1:.4f} | Test AUC: {test_auc:.4f}")

print(f"\nTraining Time: {training_time:.4f} seconds")


DNN Model Evaluation (Validation Set):
Validation Precision: 1.0000 | Validation Recall: 0.5000
Validation F1-Score: 0.6667 | Validation AUC: 0.7778

DNN Model Evaluation (Test Set):
Test Precision: 0.2500 | Test Recall: 0.1667
Test F1-Score: 0.2000 | Test AUC: 0.3472

Training Time: 0.3088 seconds


# Comparing The Models