# Import Libraries

In [1]:
import os
import io
import sys
import ast

from google.cloud import storage
from google.cloud import bigquery

import json
import torch
import sklearn
import xgboost
import numpy as np
import pandas as pd
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

# Import data

In [2]:
# GCP authorization
bq_client = bigquery.Client()

In [3]:
# Variables
PROJECT_ID = "hmh-em-deepasm"
BQ_ML_DATASET = "ml_250bp_3" # Ali: hg19_250_ml #ml_250bp

In [None]:
# dic_data = {'train': {'samples': ['gm12878',
#                                   'CD14',
#                                   'fibroblast',
#                                   'A549',
#                                   'spleen_female_adult',
#                                   'HeLa_S3']},
#             'validation': {'samples': ['mammary_epithelial',
#                                        'sk_n_sh',
#                                        'CD34']},
#             'test': {'samples': ['HepG2',
#                                  'righ_lobe_liver',
#                                  't_cell_male_adult']}}

In [4]:
query = f"SELECT * FROM {PROJECT_ID}.{BQ_ML_DATASET}.TRAINING WHERE cpg_directional_fm IS NOT NULL AND asm IS NOT NULL"

# Execute Query and store as DF
df_train = bq_client.query(query).to_dataframe()

In [5]:
query = f"SELECT * FROM {PROJECT_ID}.{BQ_ML_DATASET}.VALIDATION WHERE cpg_directional_fm IS NOT NULL AND asm IS NOT NULL"

# Execute Query and store as DF
df_validation = bq_client.query(query).to_dataframe()

In [6]:
query = f"SELECT * FROM {PROJECT_ID}.{BQ_ML_DATASET}.TESTING WHERE cpg_directional_fm IS NOT NULL AND asm IS NOT NULL"

# Execute Query and store as DF
df_test = bq_client.query(query).to_dataframe()

# Preparing data

In [None]:
df_train.head()

Unnamed: 0,chr,region_inf,region_sup,region_nb_cpg,nb_cpg_found,sample,clustering_index,asm,nb_reads,std_read_fm,...,transition_probability_from_6_to_7,transition_probability_from_7_to_0,transition_probability_from_7_to_1,transition_probability_from_7_to_2,transition_probability_from_7_to_3,transition_probability_from_7_to_4,transition_probability_from_7_to_5,transition_probability_from_7_to_6,transition_probability_from_7_to_7,entropy_state_distribution
0,9,1648501,1648750,6,5,spleen_female_adult,1926,0,54,0.2202,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.673
1,1,38241751,38242000,6,5,fibroblast,47,0,67,0.3425,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0549
2,16,76524001,76524250,3,3,spleen_female_adult,3107,0,48,0.2414,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.6365
3,8,22006751,22007000,5,5,gm12878,1768,0,92,0.3338,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.673
4,5,80235501,80235750,5,5,HeLa_S3,1202,0,113,0.2128,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.673


In [None]:
df_train.columns.tolist()

### Getting Tabular data

In [None]:
columns_to_drop = ['cpg_directional_fm', "cpgs_w_padding", 'sample', 'chr', 'clustering_index', 'region_inf', 'region_sup', 'region_nb_cpg']
df_train_tabular = df_train.drop(columns=columns_to_drop)
df_validation_tabular = df_validation.drop(columns=columns_to_drop)
df_test_tabular = df_test.drop(columns=columns_to_drop)

In [None]:
df_X_train_tabular = df_train_tabular.drop("asm", axis=1)
df_X_test_tabular = df_test_tabular.drop("asm", axis=1)
df_X_validation_tabular = df_validation_tabular.drop("asm", axis=1)

# df_X_train_tabular['cpg_fm'] = df_X_train_tabular['cpg_fm'].apply(lambda x: np.mean(x))
# df_X_test_tabular['cpg_fm'] = df_X_test_tabular['cpg_fm'].apply(lambda x: np.mean(x))
# df_X_validation_tabular['cpg_fm'] = df_X_validation_tabular['cpg_fm'].apply(lambda x: np.mean(x))

df_y_train_tabular = df_train_tabular["asm"]
df_y_test_tabular = df_test_tabular["asm"]
df_y_validation_tabular = df_validation_tabular["asm"]

### Getting Imagery Data

In [None]:
df_train

Unnamed: 0,chr,region_inf,region_sup,region_nb_cpg,nb_cpg_found,sample,clustering_index,asm,nb_reads,std_read_fm,...,transition_probability_from_3_to_1,transition_probability_from_3_to_2,transition_probability_from_3_to_3,transition_probability_from_3_to_4,transition_probability_from_4_to_0,transition_probability_from_4_to_1,transition_probability_from_4_to_2,transition_probability_from_4_to_3,transition_probability_from_4_to_4,entropy_state_distribution
0,8,1756001,1756250,6,5,HeLa_S3,1743,0,101,0.2114,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.3322
1,18,41006501,41006750,5,5,HeLa_S3,3277,0,59,0.3262,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5004
2,8,10570001,10570250,6,6,HeLa_S3,1754,1,89,0.2875,...,0.0,0.0,0.5,0.5,0.0,0.0,0.5,0.0,0.5,1.0114
3,16,19591751,19592000,5,5,fibroblast,3036,0,94,0.1160,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0000
4,1,5721751,5722000,6,6,CD14,7,0,137,0.2734,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
644788,5,154379501,154379750,4,3,HeLa_S3,1295,0,67,0.3517,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.6365
644789,5,156696501,156696750,3,3,HeLa_S3,1297,0,82,0.4222,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0000
644790,4,32407751,32408000,3,3,A549,903,0,135,0.3858,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.6365
644791,15,78919751,78920000,4,4,HeLa_S3,2982,0,83,0.3836,...,0.0,0.5,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.5623


In [7]:
imagery_var = 'cpgs_w_padding' # 'cpg_directional_fm' OR 'cpgs_w_padding'
imagery_cols = ['asm', imagery_var]
df_train_imagery = df_train[imagery_cols]
df_test_imagery = df_test[imagery_cols]
df_validation_imagery = df_validation[imagery_cols]

In [None]:
df_train_imagery

Unnamed: 0,asm,cpgs_w_padding
0,0,"""[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0..."
1,0,"""[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0..."
2,0,"""[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0..."
3,0,"""[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0..."
4,0,"""[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0..."
...,...,...
644788,0,"""[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0..."
644789,0,"""[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0..."
644790,0,"""[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0..."
644791,0,"""[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0..."


In [8]:
# Function to convert data strings to numpy arrays
df_train_imagery[imagery_var] = df_train_imagery[imagery_var].apply(
        lambda x: ast.literal_eval(x.strip('"'))
    )

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_train_imagery[imagery_var] = df_train_imagery[imagery_var].apply(


In [9]:
from imblearn.over_sampling import RandomOverSampler
from collections import Counter

X = df_train_imagery.drop('asm', axis=1)
y = df_train_imagery['asm']

# Initialize the RandomOverSampler object
ros = RandomOverSampler(random_state=42)

# Fit and apply the transform
X_resampled, y_resampled = ros.fit_resample(X, y)

# Convert X_resampled back to a DataFrame
X_resampled_df = pd.DataFrame(X_resampled, columns=X.columns)

# Add the resampled target column back to the DataFrame
X_resampled_df['asm'] = y_resampled

# Now df_train_imagery is the balanced dataset
df_train_imagery = X_resampled_df

In [10]:
# random oversampling
df_train_imagery['asm'].value_counts()

asm
0    633665
1    633665
Name: count, dtype: Int64

In [11]:
df_test_imagery[imagery_var] = df_test_imagery[imagery_var].apply(
        lambda x: ast.literal_eval(x.strip('"'))
    )
df_validation_imagery[imagery_var] = df_validation_imagery[imagery_var].apply(
        lambda x: ast.literal_eval(x.strip('"'))
    )

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test_imagery[imagery_var] = df_test_imagery[imagery_var].apply(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_validation_imagery[imagery_var] = df_validation_imagery[imagery_var].apply(


In [12]:
df_X_train_imagery = df_train_imagery.drop("asm", axis=1)
df_X_test_imagery = df_test_imagery.drop('asm', axis=1)
df_X_validation_imagery = df_validation_imagery.drop("asm", axis=1)

df_y_train_imagery = df_train_imagery.drop(imagery_var, axis=1)
df_y_test_imagery = df_test_imagery.drop(imagery_var, axis=1)
df_y_validation_imagery = df_validation_imagery.drop(imagery_var, axis=1)

In [None]:
#np.stack(df_X_t rain_imagery[imagery_var].values)

In [13]:
df_X_train_imagery[imagery_var] = df_X_train_imagery[imagery_var].apply(np.array)
df_X_validation_imagery[imagery_var] = df_X_validation_imagery[imagery_var].apply(np.array)
df_X_test_imagery[imagery_var] = df_X_test_imagery[imagery_var].apply(np.array)

# Step 2: Stack the arrays
all_arrays_train = np.stack(df_X_train_imagery[imagery_var].values)
all_arrays_val = np.stack(df_X_validation_imagery[imagery_var].values)
all_arrays_test = np.stack(df_X_test_imagery[imagery_var].values)

# Step 3: Convert to a PyTorch tensor
X_train = torch.tensor(all_arrays_train)
X_test = torch.tensor(all_arrays_test)
X_val = torch.tensor(all_arrays_val)

In [14]:
y_train = torch.tensor(df_y_train_imagery.iloc[:, 0].values)
y_test = torch.tensor(df_y_test_imagery.iloc[:, 0].values)
y_val = torch.tensor(df_y_validation_imagery.iloc[:, 0].values)

In [15]:
print(X_train.shape, X_test.shape, X_val.shape, y_train.shape, y_test.shape, y_val.shape)

torch.Size([1267330, 10, 20]) torch.Size([198085, 10, 20]) torch.Size([239833, 10, 20]) torch.Size([1267330]) torch.Size([198085]) torch.Size([239833])


# Baseline Models

## Random Forest

### Training

In [None]:
df_X_train_tabular.columns.tolist()


['nb_cpg_found',
 'nb_reads',
 'std_read_fm',
 'mean_read_fm',
 'std_cpg_fm',
 'mean_cpg_fm',
 'std_cpg_cov',
 'mean_cpg_cov',
 'std_cpg_dist',
 'mean_cpg_dist',
 'read_fm_kd_0',
 'read_fm_kd_1',
 'read_fm_kd_2',
 'read_fm_kd_3',
 'read_fm_kd_4',
 'read_fm_kd_5',
 'read_fm_kd_6',
 'read_fm_kd_7',
 'read_fm_kd_8',
 'read_fm_kd_9',
 'read_fm_kd_10',
 'cpg_fm_kd_0',
 'cpg_fm_kd_1',
 'cpg_fm_kd_2',
 'cpg_fm_kd_3',
 'cpg_fm_kd_4',
 'cpg_fm_kd_5',
 'cpg_fm_kd_6',
 'cpg_fm_kd_7',
 'cpg_fm_kd_8',
 'cpg_fm_kd_9',
 'cpg_fm_kd_10',
 'cpg_cov_kd_0',
 'cpg_cov_kd_1',
 'cpg_cov_kd_2',
 'cpg_cov_kd_3',
 'cpg_cov_kd_4',
 'cpg_cov_kd_5',
 'cpg_cov_kd_6',
 'cpg_cov_kd_7',
 'cpg_cov_kd_8',
 'cpg_cov_kd_9',
 'cpg_cov_kd_10',
 'cpg_cov_kd_11',
 'cpg_cov_kd_12',
 'cpg_cov_kd_13',
 'cpg_cov_kd_14',
 'cpg_cov_kd_15',
 'cpg_cov_kd_16',
 'cpg_cov_kd_17',
 'cpg_cov_kd_18',
 'cpg_cov_kd_19',
 'cpg_cov_kd_20',
 'cpg_dist_kd_0',
 'cpg_dist_kd_1',
 'cpg_dist_kd_2',
 'cpg_dist_kd_3',
 'cpg_dist_kd_4',
 'cpg_dist_kd_5

In [None]:
from sklearn.ensemble import RandomForestClassifier
RFClassifier_ = RandomForestClassifier(class_weight= 'balanced')
RFClassifier_.fit(df_X_train_tabular, df_y_train_tabular)

### Validation

In [None]:
from sklearn.metrics import confusion_matrix
y_pred = RFClassifier_.predict(df_X_validation_tabular)
print(confusion_matrix(df_y_validation_tabular, y_pred))
print(sklearn.metrics.classification_report(df_y_validation_tabular, y_pred))

## Logistic Regression

### Preprocessing

In [None]:
from sklearn.preprocessing import StandardScaler
Scaler_logistic_regression = StandardScaler()
Scaler_logistic_regression.fit(df_X_train_tabular)

df_X_train_scaled = Scaler_logistic_regression.transform(df_X_train_tabular)
df_X_validation_scaled = Scaler_logistic_regression.transform(df_X_validation_tabular)

### Training

In [None]:
from sklearn.linear_model import LogisticRegression
LR_ = LogisticRegression(class_weight= 'balanced')
LR_.fit(df_X_train_scaled, df_y_train_tabular)

### Validation

In [None]:
y_pred = LR_.predict(df_X_validation_scaled)
print(sklearn.metrics.classification_report(df_y_validation_tabular, y_pred))

## XGboost

### Training

In [None]:
from xgboost import XGBClassifier
XGBClassifier_ = XGBClassifier(class_weight= 'balanced')
XGBClassifier_.fit(df_X_train_tabular, df_y_train_tabular)

Parameters: { "class_weight" } are not used.



### Validation

In [None]:
y_pred = XGBClassifier_.predict(df_X_validation_tabular)
print(sklearn.metrics.classification_report(df_y_validation_tabular, y_pred))

              precision    recall  f1-score   support

         0.0       0.99      1.00      0.99    237014
         1.0       0.60      0.17      0.26      2819

    accuracy                           0.99    239833
   macro avg       0.80      0.58      0.63    239833
weighted avg       0.99      0.99      0.99    239833



## CNN

### Preprocessing

In [16]:
# Getting class weights
df = pd.concat([df_train_imagery])
neg, pos = np.bincount(df['asm'])
total = neg + pos
print('Number of regions assessed for ASM: {}\nRegions with ASM found: {} ({:.2f}% of total)\n'.format(
    total, pos, 100 * pos / total))

weight_for_0 = (1 / neg)*(total)/2.0
weight_for_1 = (1 / pos)*(total)/2.0

class_weight_asm = {0: weight_for_0, 1: weight_for_1}

print('Weight for class 0: {:.2f}'.format(weight_for_0))
print('Weight for class 1: {:.2f}'.format(weight_for_1))

Number of regions assessed for ASM: 1267330
Regions with ASM found: 633665 (50.00% of total)

Weight for class 0: 1.00
Weight for class 1: 1.00


### Model definition

In [None]:
class SimpleCNNModel(nn.Module):
    def __init__(self):
        super(SimpleCNNModel, self).__init__()
        self.conv1 = nn.Conv2d(in_channels=1, out_channels=32, kernel_size=(1, 1))
        self.bn1 = nn.BatchNorm2d(32)
        self.pool = nn.MaxPool2d(kernel_size=(2, 1))
        self.conv2 = nn.Conv2d(in_channels=32, out_channels=32, kernel_size=(1, 1))
        self.bn2 = nn.BatchNorm2d(32)
        self.conv3 = nn.Conv2d(in_channels=32, out_channels=64, kernel_size=(1, 1))
        self.bn3 = nn.BatchNorm2d(64)
        self.flatten = nn.Flatten()
        self.fc1 = nn.Linear(in_features=1280, out_features=64)  # Adjusted the input features after calculations
        self.dropout = nn.Dropout(p=0.5)
        self.fc2 = nn.Linear(in_features=64, out_features=1)

    def forward(self, x):
        x = F.relu(self.bn1(self.conv1(x)))
        x = self.pool(x)
        x = F.relu(self.bn2(self.conv2(x)))
        x = self.pool(x)
        x = F.relu(self.bn3(self.conv3(x)))
        x = self.pool(x)
        x = self.flatten(x)
        x = F.relu(self.fc1(x))
        x = self.dropout(x)
        x = self.fc2(x)
        return x  # returns logits

    def predict(self, x):
        logit = self.forward(x)
        probability = torch.sigmoid(logit)
        return probability > 0.5


### Training

In [None]:
# Creating the PyTorch model
CNN_ = SimpleCNNModel()


# Define the loss function and the optimizer
optimizer = torch.optim.AdamW(CNN_.parameters(), lr=0.001, weight_decay=0.001)

# Taking the class weights into account
pos_weight = torch.tensor(class_weight_asm[1], dtype=torch.float32)
criterion = nn.BCEWithLogitsLoss(pos_weight=pos_weight)

# Device Handling (GPU usage if possible)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Passing the objects to the device
CNN_ = CNN_.to(device)
X_train = X_train.to(device)
X_train = X_train.float()
X_train = X_train.unsqueeze(1)
X_test = X_test.to(device)
X_test = X_test.float()
X_test = X_test.unsqueeze(1)
X_val = X_val.to(device)
X_val = X_val.float()
X_val = X_val.unsqueeze(1)
y_train = y_train.to(device)
y_train = y_train.float()
y_test = y_test.to(device)
y_test = y_test.float()
y_val = y_val.to(device)
y_val = y_val.float()

In [None]:
from torch.utils.data import DataLoader, TensorDataset
train_dataset = TensorDataset(X_train, y_train)
val_dataset = TensorDataset(X_val, y_val)

# Define the batch size
batch_size = 64  # You can adjust this size depending on your GPU memory

# Create data loaders
train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(dataset=val_dataset, batch_size=batch_size, shuffle=False)

In [None]:
def calculate_accuracy(y_pred, y_true):
    predicted = torch.round(torch.sigmoid(y_pred))
    correct = (predicted == y_true).float()
    acc = correct.sum() / len(correct)
    return acc

num_epochs = 250

for epoch in range(num_epochs):
    CNN_.train()  # Set the model to training mode
    running_loss = 0.0
    running_accuracy = 0.0

    for i, (inputs, labels) in enumerate(train_loader):
        inputs = inputs.to(device)
        labels = labels.to(device)

        # Forward pass
        outputs = CNN_(inputs).squeeze()
        loss = criterion(outputs, labels)

        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        running_loss += loss.item() * inputs.size(0)
        running_accuracy += calculate_accuracy(outputs, labels) * inputs.size(0)

    epoch_loss = running_loss / len(train_loader.dataset)
    epoch_accuracy = running_accuracy / len(train_loader.dataset)

    # Validation loss
    CNN_.eval()  # Set the model to evaluation mode
    val_loss = 0.0
    val_accuracy = 0.0
    with torch.no_grad():
        for inputs, labels in val_loader:
            inputs = inputs.to(device)
            labels = labels.to(device)
            outputs = CNN_(inputs).squeeze()
            loss = criterion(outputs, labels)
            val_loss += loss.item() * inputs.size(0)
            val_accuracy += calculate_accuracy(outputs, labels) * inputs.size(0)

    val_loss /= len(val_loader.dataset)
    val_accuracy /= len(val_loader.dataset)

    # Print losses
    print(f'Epoch {epoch+1}/{num_epochs}, Train Loss: {epoch_loss:.4f}, Train Acc: {epoch_accuracy:.4f}, Val Loss: {val_loss:.4f}, Val Acc: {val_accuracy:.4f}')

Epoch 1/250, Train Loss: 0.5537, Train Acc: 0.8360, Val Loss: 0.3612, Val Acc: 0.9573
Epoch 2/250, Train Loss: 0.5338, Train Acc: 0.8308, Val Loss: 0.3307, Val Acc: 0.9153
Epoch 3/250, Train Loss: 0.5279, Train Acc: 0.8280, Val Loss: 0.3372, Val Acc: 0.8889
Epoch 4/250, Train Loss: 0.5236, Train Acc: 0.8288, Val Loss: 0.3298, Val Acc: 0.8963
Epoch 5/250, Train Loss: 0.5191, Train Acc: 0.8329, Val Loss: 0.3304, Val Acc: 0.9005
Epoch 6/250, Train Loss: 0.5182, Train Acc: 0.8312, Val Loss: 0.3267, Val Acc: 0.9143
Epoch 7/250, Train Loss: 0.5155, Train Acc: 0.8331, Val Loss: 0.3209, Val Acc: 0.9096
Epoch 8/250, Train Loss: 0.5166, Train Acc: 0.8344, Val Loss: 0.3222, Val Acc: 0.9050
Epoch 9/250, Train Loss: 0.5135, Train Acc: 0.8347, Val Loss: 0.3210, Val Acc: 0.9046
Epoch 10/250, Train Loss: 0.5109, Train Acc: 0.8361, Val Loss: 0.3200, Val Acc: 0.8978
Epoch 11/250, Train Loss: 0.5104, Train Acc: 0.8361, Val Loss: 0.3255, Val Acc: 0.9279
Epoch 12/250, Train Loss: 0.5124, Train Acc: 0.8359,

KeyboardInterrupt: 

### Validation

In [None]:
# import classification report
from sklearn.metrics import classification_report

In [None]:
# classification report
y_pred = CNN_(X_val).squeeze()
y_pred = torch.round(torch.sigmoid(y_pred))
print(classification_report(y_val.detach().numpy(), y_pred.detach().numpy(), digits=4))

              precision    recall  f1-score   support

         0.0     0.9983    0.9208    0.9580    237014
         1.0     0.1156    0.8709    0.2041      2819

    accuracy                         0.9202    239833
   macro avg     0.5570    0.8958    0.5811    239833
weighted avg     0.9880    0.9202    0.9491    239833



# CNN 2

In [None]:
class SecondCNNModel(nn.Module):
    def __init__(self):
        super(SecondCNNModel, self).__init__()
        self.conv1 = nn.Conv2d(in_channels=1, out_channels=32, kernel_size=(2, 2), padding=1)
        self.bn1 = nn.BatchNorm2d(32)
        self.pool = nn.MaxPool2d(kernel_size=(2, 2), stride=(2, 2))
        self.conv2 = nn.Conv2d(in_channels=32, out_channels=32, kernel_size=(2, 2), padding=1)
        self.bn2 = nn.BatchNorm2d(32)
        self.flatten = nn.Flatten()
        # Recalculate the output dimensions:
        # Pooling now correctly outputs (5 x 10) after first, and (3 x 5) after second
        self.fc1 = nn.Linear(in_features=32 * 3 * 5, out_features=64)  # Recalculated input features
        self.dropout = nn.Dropout(p=0.5)
        self.fc2 = nn.Linear(in_features=64, out_features=1)

    def forward(self, x):
        x = F.relu(self.bn1(self.conv1(x)))
        x = self.pool(x)
        x = F.relu(self.bn2(self.conv2(x)))
        x = self.pool(x)
        x = self.flatten(x)
        x = F.relu(self.fc1(x))
        x = self.dropout(x)
        x = self.fc2(x)
        return x

    def predict(self, x):
        logit = self.forward(x)
        probability = torch.sigmoid(logit)
        return probability > 0.5

In [None]:
CNN_ = SecondCNNModel()


# Define the loss function and the optimizer
optimizer = torch.optim.AdamW(CNN_.parameters(), lr=0.001, weight_decay=0.001)

# Taking the class weights into account
pos_weight = torch.tensor(class_weight_asm[1], dtype=torch.float32)
criterion = nn.BCEWithLogitsLoss(pos_weight=pos_weight)

# Device Handling (GPU usage if possible)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
def calculate_accuracy(y_pred, y_true):
    predicted = torch.round(torch.sigmoid(y_pred))
    correct = (predicted == y_true).float()
    acc = correct.sum() / len(correct)
    return acc

num_epochs = 250

for epoch in range(num_epochs):
    CNN_.train()  # Set the model to training mode
    running_loss = 0.0
    running_accuracy = 0.0

    for i, (inputs, labels) in enumerate(train_loader):
        inputs = inputs.to(device)
        labels = labels.to(device)

        # Forward pass
        outputs = CNN_(inputs).squeeze()
        loss = criterion(outputs, labels)

        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        running_loss += loss.item() * inputs.size(0)
        running_accuracy += calculate_accuracy(outputs, labels) * inputs.size(0)

    epoch_loss = running_loss / len(train_loader.dataset)
    epoch_accuracy = running_accuracy / len(train_loader.dataset)

    # Validation loss
    CNN_.eval()  # Set the model to evaluation mode
    val_loss = 0.0
    val_accuracy = 0.0
    with torch.no_grad():
        for inputs, labels in val_loader:
            inputs = inputs.to(device)
            labels = labels.to(device)
            outputs = CNN_(inputs).squeeze()
            loss = criterion(outputs, labels)
            val_loss += loss.item() * inputs.size(0)
            val_accuracy += calculate_accuracy(outputs, labels) * inputs.size(0)

    val_loss /= len(val_loader.dataset)
    val_accuracy /= len(val_loader.dataset)

    # Print losses
    print(f'Epoch {epoch+1}/{num_epochs}, Train Loss: {epoch_loss:.4f}, Train Acc: {epoch_accuracy:.4f}, Val Loss: {val_loss:.4f}, Val Acc: {val_accuracy:.4f}')

Epoch 1/250, Train Loss: 0.5446, Train Acc: 0.8494, Val Loss: 0.3020, Val Acc: 0.9228
Epoch 2/250, Train Loss: 0.4938, Train Acc: 0.8521, Val Loss: 0.3007, Val Acc: 0.9087
Epoch 3/250, Train Loss: 0.4864, Train Acc: 0.8535, Val Loss: 0.2963, Val Acc: 0.9152
Epoch 4/250, Train Loss: 0.4736, Train Acc: 0.8558, Val Loss: 0.3226, Val Acc: 0.8817
Epoch 5/250, Train Loss: 0.4753, Train Acc: 0.8573, Val Loss: 0.3063, Val Acc: 0.8921
Epoch 6/250, Train Loss: 0.4679, Train Acc: 0.8592, Val Loss: 0.3057, Val Acc: 0.9366
Epoch 7/250, Train Loss: 0.4629, Train Acc: 0.8594, Val Loss: 0.2974, Val Acc: 0.9023
Epoch 8/250, Train Loss: 0.4623, Train Acc: 0.8599, Val Loss: 0.2928, Val Acc: 0.9058
Epoch 9/250, Train Loss: 0.4621, Train Acc: 0.8610, Val Loss: 0.3051, Val Acc: 0.9225
Epoch 10/250, Train Loss: 0.4596, Train Acc: 0.8626, Val Loss: 0.3130, Val Acc: 0.8790
Epoch 11/250, Train Loss: 0.4559, Train Acc: 0.8619, Val Loss: 0.2902, Val Acc: 0.9159
Epoch 12/250, Train Loss: 0.4589, Train Acc: 0.8608,

In [None]:
# classification report
y_pred = CNN_(X_val).squeeze()
y_pred = torch.round(torch.sigmoid(y_pred))
print(classification_report(y_val.detach().numpy(), y_pred.detach().numpy(), digits=4))

              precision    recall  f1-score   support

         0.0     0.9982    0.9183    0.9566    237014
         1.0     0.1118    0.8645    0.1980      2819

    accuracy                         0.9177    239833
   macro avg     0.5550    0.8914    0.5773    239833
weighted avg     0.9878    0.9177    0.9477    239833



# 3rd CNN

In [None]:
class ThirdCNNModel(nn.Module):
    def __init__(self):
        super(ThirdCNNModel, self).__init__()
        # Using 3x3 kernels now, with padding=1 to maintain dimensions
        self.conv1 = nn.Conv2d(in_channels=1, out_channels=32, kernel_size=(3, 3), padding=1)
        self.bn1 = nn.BatchNorm2d(32)
        self.pool = nn.MaxPool2d(kernel_size=(2, 2), stride=(2, 2))
        self.conv2 = nn.Conv2d(in_channels=32, out_channels=32, kernel_size=(3, 3), padding=1)
        self.bn2 = nn.BatchNorm2d(32)
        self.flatten = nn.Flatten()

        # Calculating the output dimensions:
        # First Convolution: (10 x 20) -> (10 x 20) with padding=1
        # First Pooling: (10 x 20) -> (5 x 10)
        # Second Convolution: (5 x 10) -> (5 x 10) with padding=1
        # Second Pooling: (5 x 10) -> (2 x 5)
        self.fc1 = nn.Linear(in_features=32 * 2 * 5, out_features=64)  # Updated dimensions to 2 x 5
        self.dropout = nn.Dropout(p=0.5)
        self.fc2 = nn.Linear(in_features=64, out_features=1)

    def forward(self, x):
        x = F.relu(self.bn1(self.conv1(x)))
        x = self.pool(x)
        x = F.relu(self.bn2(self.conv2(x)))
        x = self.pool(x)
        x = self.flatten(x)
        x = F.relu(self.fc1(x))
        x = self.dropout(x)
        x = self.fc2(x)
        return x

    def predict(self, x):
        logit = self.forward(x)
        probability = torch.sigmoid(logit)
        return probability > 0.5

In [None]:
CNN_ = ThirdCNNModel()

# Define the loss function and the optimizer
optimizer = torch.optim.AdamW(CNN_.parameters(), lr=0.001, weight_decay=0.001)

# Taking the class weights into account
pos_weight = torch.tensor(class_weight_asm[1], dtype=torch.float32)
criterion = nn.BCEWithLogitsLoss(pos_weight=pos_weight)

# Device Handling (GPU usage if possible)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
def calculate_accuracy(y_pred, y_true):
    predicted = torch.round(torch.sigmoid(y_pred))
    correct = (predicted == y_true).float()
    acc = correct.sum() / len(correct)
    return acc

num_epochs = 250

for epoch in range(num_epochs):
    CNN_.train()  # Set the model to training mode
    running_loss = 0.0
    running_accuracy = 0.0

    for i, (inputs, labels) in enumerate(train_loader):
        inputs = inputs.to(device)
        labels = labels.to(device)

        # Forward pass
        outputs = CNN_(inputs).squeeze()
        loss = criterion(outputs, labels)

        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        running_loss += loss.item() * inputs.size(0)
        running_accuracy += calculate_accuracy(outputs, labels) * inputs.size(0)

    epoch_loss = running_loss / len(train_loader.dataset)
    epoch_accuracy = running_accuracy / len(train_loader.dataset)

    # Validation loss
    CNN_.eval()  # Set the model to evaluation mode
    val_loss = 0.0
    val_accuracy = 0.0
    with torch.no_grad():
        for inputs, labels in val_loader:
            inputs = inputs.to(device)
            labels = labels.to(device)
            outputs = CNN_(inputs).squeeze()
            loss = criterion(outputs, labels)
            val_loss += loss.item() * inputs.size(0)
            val_accuracy += calculate_accuracy(outputs, labels) * inputs.size(0)

    val_loss /= len(val_loader.dataset)
    val_accuracy /= len(val_loader.dataset)

    # Print losses
    print(f'Epoch {epoch+1}/{num_epochs}, Train Loss: {epoch_loss:.4f}, Train Acc: {epoch_accuracy:.4f}, Val Loss: {val_loss:.4f}, Val Acc: {val_accuracy:.4f}')

Epoch 1/250, Train Loss: 0.5087, Train Acc: 0.8621, Val Loss: 0.3024, Val Acc: 0.9330
Epoch 2/250, Train Loss: 0.4708, Train Acc: 0.8585, Val Loss: 0.2910, Val Acc: 0.9059
Epoch 3/250, Train Loss: 0.4629, Train Acc: 0.8613, Val Loss: 0.2875, Val Acc: 0.9137
Epoch 4/250, Train Loss: 0.4556, Train Acc: 0.8626, Val Loss: 0.2944, Val Acc: 0.9003
Epoch 5/250, Train Loss: 0.4539, Train Acc: 0.8634, Val Loss: 0.2838, Val Acc: 0.9219
Epoch 6/250, Train Loss: 0.4491, Train Acc: 0.8681, Val Loss: 0.2940, Val Acc: 0.9360
Epoch 7/250, Train Loss: 0.4475, Train Acc: 0.8676, Val Loss: 0.2902, Val Acc: 0.9016
Epoch 8/250, Train Loss: 0.4423, Train Acc: 0.8678, Val Loss: 0.2859, Val Acc: 0.9088
Epoch 9/250, Train Loss: 0.4413, Train Acc: 0.8688, Val Loss: 0.2871, Val Acc: 0.9136
Epoch 10/250, Train Loss: 0.4405, Train Acc: 0.8683, Val Loss: 0.2834, Val Acc: 0.9189
Epoch 11/250, Train Loss: 0.4385, Train Acc: 0.8679, Val Loss: 0.2849, Val Acc: 0.9135
Epoch 12/250, Train Loss: 0.4351, Train Acc: 0.8694,

In [None]:
# classification report
y_pred = CNN_(X_val).squeeze()
y_pred = torch.round(torch.sigmoid(y_pred))
print(classification_report(y_val.detach().numpy(), y_pred.detach().numpy(), digits=4))

              precision    recall  f1-score   support

         0.0     0.9983    0.9174    0.9561    237014
         1.0     0.1108    0.8652    0.1964      2819

    accuracy                         0.9168    239833
   macro avg     0.5545    0.8913    0.5763    239833
weighted avg     0.9878    0.9168    0.9472    239833



# ViT

In [17]:
class VisionTransformer(nn.Module):
    def __init__(self, in_channels=1, patch_size=2, emb_size=128, img_size=(10, 20), num_heads=4, depth=6, num_classes=1):
        super(VisionTransformer, self).__init__()
        self.patch_size = patch_size
        num_patches = (img_size[0] // patch_size) * (img_size[1] // patch_size) + \
                      (img_size[0] % patch_size != 0) + (img_size[1] % patch_size != 0)
        self.patch_embedding = nn.Linear(patch_size * patch_size * in_channels, emb_size)
        self.position_embeddings = nn.Parameter(torch.randn(1, num_patches + 1, emb_size))  # Modified line
        self.cls_token = nn.Parameter(torch.randn(1, 1, emb_size))
        self.dropout = nn.Dropout(0.1)
        self.transformer = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(d_model=emb_size, nhead=num_heads),
            num_layers=depth
        )
        self.to_cls_token = nn.Identity()
        self.fc = nn.Linear(emb_size, num_classes)

    def forward(self, x):
        if len(x.shape) != 4:
          raise ValueError("Expected input to have 4 dimensions [B, C, H, W], got {}".format(x.shape))

        B, C, H, W = x.shape
        x = x.view(B, C, H // self.patch_size, self.patch_size, W // self.patch_size, self.patch_size)
        x = x.permute(0, 2, 4, 1, 3, 5).contiguous()
        x = x.view(B, -1, self.patch_size * self.patch_size * C)

        x = self.patch_embedding(x)

        cls_tokens = self.cls_token.expand(B, -1, -1)
        x = torch.cat((cls_tokens, x), dim=1)
        x += self.position_embeddings

        x = self.dropout(x)
        x = self.transformer(x)
        x = self.to_cls_token(x[:, 0])

        return self.fc(x)

    def predict(self, x):
        logit = self.forward(x)
        probability = torch.sigmoid(logit)
        return probability > 0.5

In [18]:
# Creating the PyTorch model
ViT_ = VisionTransformer(in_channels=1)

# Define the loss function and the optimizer
optimizer = torch.optim.AdamW(ViT_.parameters(), lr=0.001, weight_decay=0.001)

# Taking the class weights into account
pos_weight = torch.tensor(class_weight_asm[1], dtype=torch.float32)
criterion = nn.BCEWithLogitsLoss(pos_weight=pos_weight)

# Device Handling (GPU usage if possible)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Passing the objects to the device
ViT_.to(device)
criterion.to(device)



BCEWithLogitsLoss()

In [19]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [20]:
X_train = X_train.to(device)
X_train = X_train.float()
X_train = X_train.unsqueeze(1)
X_test = X_test.to(device)
X_test = X_test.float()
X_test = X_test.unsqueeze(1)
X_val = X_val.to(device)
X_val = X_val.float()
X_val = X_val.unsqueeze(1)
y_train = y_train.to(device)
y_train = y_train.float()
y_test = y_test.to(device)
y_test = y_test.float()
y_val = y_val.to(device)
y_val = y_val.float()

In [21]:
from torch.utils.data import DataLoader, TensorDataset
train_dataset = TensorDataset(X_train, y_train)
val_dataset = TensorDataset(X_val, y_val)

# Define the batch size
batch_size = 16

# Create data loaders
train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(dataset=val_dataset, batch_size=batch_size, shuffle=False)

In [None]:
def calculate_accuracy(y_pred, y_true):
    predicted = torch.round(torch.sigmoid(y_pred))
    correct = (predicted == y_true).float()
    acc = correct.sum() / len(correct)
    return acc

num_epochs = 50

for epoch in range(num_epochs):
    ViT_.train()  # Set the model to training mode
    running_loss = 0.0
    running_accuracy = 0.0

    for i, (inputs, labels) in enumerate(train_loader):
        inputs = inputs.to(device)
        labels = labels.to(device)

        # Forward pass
        outputs = ViT_(inputs).squeeze()
        loss = criterion(outputs, labels)

        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # Print losses
    print(f'Epoch {epoch+1}/{num_epochs}')

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50


In [None]:
torch.save(ViT_, 'ViT_oversampled.pth')
torch.save(ViT_.state_dict(), 'ViT_oversampled_state_dict.pth')

In [None]:
# Load the model
ViT_ = torch.load('ViT_oversampled.pth', map_location=torch.device('cpu'))

ViT_.load_state_dict(torch.load('ViT_oversampled_state_dict.pth', map_location=torch.device('cpu')))
ViT_.eval()

# Create a DataLoader for your validation dataset
batch_size = 32  # You can adjust the batch size according to your system's capability
dataset = TensorDataset(X_val, y_val)
data_loader = DataLoader(dataset, batch_size=batch_size, shuffle=False)

# Initialize lists to store true labels and predictions
all_preds = []
all_labels = []

# Evaluate in batches
for X_batch, y_batch in data_loader:
    with torch.no_grad():  # No need to compute gradients
        y_pred = ViT_(X_batch).squeeze()
        y_pred = torch.round(torch.sigmoid(y_pred))
        all_preds.extend(y_pred.detach().numpy())
        all_labels.extend(y_batch.detach().numpy())

# Classification report
from sklearn.metrics import classification_report
print(classification_report(all_labels, all_preds, digits=4))

# 2D RNN

In [None]:
class RNN(nn.Module):
    def __init__(self, in_channels, hidden_size, num_layers, num_classes):
        super(RNN, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.rnn = nn.RNN(in_channels, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        # Reshape input to (batch_size, seq_len, in_channels)
        batch_size, _, height, width = x.size()
        x = x.view(batch_size, 1, height * width)

        # Set initial hidden state
        h0 = torch.zeros(self.num_layers, batch_size, self.hidden_size).to(x.device)

        # Forward propagate RNN
        out, _ = self.rnn(x, h0)

        # Decode the hidden state of the last time step
        out = out[:, 0, :]

        # Fully connected layer to get the final output
        out = self.fc(out)

        return out

in_channels = 1  # Number of input channels (e.g., grayscale image)
hidden_size = 64  # Size of the hidden state
num_layers = 2  # Number of RNN layers
num_classes = 1  # Number of output classes
RNN_ = RNN(in_channels, hidden_size, num_layers, num_classes)

In [None]:
# Define the loss function and the optimizer
optimizer = torch.optim.AdamW(RNN_.parameters(), lr=0.001, weight_decay=0.001)

# Taking the class weights into account
pos_weight = torch.tensor(class_weight_asm[1], dtype=torch.float32)
criterion = nn.BCEWithLogitsLoss(pos_weight=pos_weight)

# Device Handling (GPU usage if possible)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Passing the objects to the device
RNN_.to(device)
criterion.to(device)

In [None]:
def calculate_accuracy(y_pred, y_true):
    predicted = torch.round(torch.sigmoid(y_pred))
    correct = (predicted == y_true).float()
    acc = correct.sum() / len(correct)
    return acc

num_epochs = 50

for epoch in range(num_epochs):
    RNN_.train()  # Set the model to training mode
    running_loss = 0.0
    running_accuracy = 0.0

    for i, (inputs, labels) in enumerate(train_loader):
        inputs = inputs.to(device)
        labels = labels.to(device)

        # Forward pass
        outputs = RNN_(inputs).squeeze()
        loss = criterion(outputs, labels)

        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # Print losses
    print(f'Epoch {epoch+1}/{num_epochs}')

In [None]:
torch.save(RNN_, 'path/to/save/ViT_.pth')

# Or, save only the model's state dictionary
torch.save(RNN_.state_dict(), 'path/to/save/RNN_state_dict.pth')

In [None]:
# classification report
RNN_.eval()
y_pred = RNN_(X_val).squeeze()
y_pred = torch.round(torch.sigmoid(y_pred))
print(classification_report(y_val.detach().numpy(), y_pred.detach().numpy(), digits=4))

## RNN

In [None]:
train_scalar_features = np.array(df_X_train_tabular)
val_scalar_features = np.array(df_X_validation_tabular)
test_scalar_features = np.array(df_X_test_tabular)

train_scalar_features_torch = torch.tensor(train_scalar_features)
test_scalar_features_torch = torch.tensor(test_scalar_features)
val_scalar_features_torch = torch.tensor(val_scalar_features)

TypeError: can't convert np.ndarray of type numpy.object_. The only supported types are: float64, float32, float16, complex64, complex128, int64, int32, int16, int8, uint8, and bool.

In [None]:
class RNN_MODEL(nn.Module):
    def __init__(self, input_dim, hidden_dim, layer_dim, output_dim):
        super(RNN_MODEL, self).__init__()
        self.hidden_dim = hidden_dim
        self.layer_dim = layer_dim
        self.rnn = nn.RNN(input_dim, hidden_dim, layer_dim, batch_first=True, nonlinearity='relu')
        self.fc = nn.Linear(hidden_dim, output_dim)


    def forward(self, x):
      h0 = torch.randn(self.layer_dim, x.size(0), self.hidden_dim)
      out, hn = self.rnn(x, h0.detach())
      out = self.fc(out[:, -1, :])
      return out

In [None]:
RNN = RNN_MODEL(train_scalar_features.shape[1], 100, 5, 1)

In [None]:
train_scalar_features_RNN = train_scalar_features_torch.unsqueeze(1) # N, L, H_in
test_scalar_features_RNN = test_scalar_features_torch.unsqueeze(1) # N, L, H_in
val_scalar_features_RNN = val_scalar_features_torch.unsqueeze(1) # N, L, H_in

print(train_scalar_features_RNN.shape)
print(test_scalar_features_RNN.shape)
print(val_scalar_features_RNN.shape)

In [None]:
optimizer = torch.optim.AdamW(RNN.parameters(), lr=0.001, weight_decay=0.001)
pos_weight = torch.tensor(class_weight_asm[1])
criterion = nn.BCEWithLogitsLoss(pos_weight=pos_weight)

In [None]:
# Device handling
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
RNN.to(device)
train_scalar_features_RNN = train_scalar_features_RNN.to(device)
train_labels_torch = train_labels_torch.to(device)
test_scalar_features_RNN = test_scalar_features_RNN.to(device)
test_labels_torch = test_labels_torch.to(device)
val_scalar_features_RNN = val_scalar_features_RNN.to(device)
val_labels_torch = val_labels_torch.to(device)

In [None]:
# Training the model
train_loss  = []

trainloader = DataLoader(torch.utils.data.TensorDataset(train_scalar_features_RNN, train_labels_torch.unsqueeze(1)), batch_size=BATCH_SIZE, shuffle=True)
testloader = DataLoader(torch.utils.data.TensorDataset(test_scalar_features_RNN, test_labels_torch.unsqueeze(1)), batch_size=BATCH_SIZE, shuffle=True)
valloader = DataLoader(torch.utils.data.TensorDataset(val_scalar_features_RNN, val_labels_torch.unsqueeze(1)), batch_size=BATCH_SIZE, shuffle=True)
# Iterate through each epoch
for epoch in range(1000):
  for batch in trainloader:
    # Get the inputs and labels
    inputs, labels = batch
    # Forward pass
    outputs = RNN(inputs)
    # Calculate loss
    loss = criterion(outputs, labels)

    train_loss.append(loss.item())


    # Zero gradients, backward pass, update weights
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

## LSTM

## ViT

# Fine-tuning Baseline models and choosing the best one

### Random Forest

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import make_scorer, f1_score, recall_score, precision_score, accuracy_score
from sklearn.metrics import classification_report

# Define the hyperparameter grid
grid = {
    'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None],
    'n_estimators': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140, 150],
    'max_features': ['auto', 'sqrt'],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False]
}




# pick 10 random hyperparameter combinations
n = 10
random_grid = {k: np.random.choice(v, n) for k, v in grid.items()}

# fit logistic regression with each hyperparameter combination
results = []
for i in range(n):
    max_depth = random_grid['max_depth'][i]
    n_estimators = random_grid['n_estimators'][i]
    max_features = random_grid['max_features'][i]
    min_samples_split = random_grid['min_samples_split'][i]
    min_samples_leaf = random_grid['min_samples_leaf'][i]
    bootstrap = random_grid['bootstrap'][i]


    # fit logistic regression
    model = RandomForestClassifier(class_weight='balanced', max_depth=max_depth, n_estimators=n_estimators, max_features=max_features, min_samples_split=min_samples_split, min_samples_leaf=min_samples_leaf, bootstrap=bootstrap)
    model.fit(df_X_train_tabular, df_y_train_tabular)

    # evaluate model on f_1 score
    y_pred = model.predict(df_X_validation_tabular)
    f1 = f1_score(df_y_validation_tabular, y_pred)

    # save results
    print(max_depth, n_estimators, max_features, min_samples_split, min_samples_leaf, bootstrap, f1)
    results.append((max_depth, n_estimators, max_features, min_samples_split, min_samples_leaf, bootstrap, f1))

# get the best hyperparameters
best_hyperparameters = max(results, key=lambda x: x[6])
best_hyperparameters


10 150 sqrt 10 2 False 0.1686506617011372
90 40 sqrt 5 4 True 0.3577549271636675
None 90 sqrt 5 2 False 0.25362318840579706


  warn(


None 110 auto 10 4 False 0.3666046703864435
10 90 sqrt 10 1 False 0.1671716506389145
90 30 sqrt 10 2 True 0.29967909158232536
40 40 sqrt 2 2 False 0.3304592644729896


  warn(


100 140 auto 2 4 False 0.36686134544667964


  warn(


10 50 auto 10 2 False 0.1699912687218752
50 40 sqrt 2 4 False 0.3544510083520065


(100, 140, 'auto', 2, 4, False, 0.36686134544667964)

In [None]:
best_hyperparameters = max(results, key=lambda x: x[6])
best_hyperparameters

(100, 140, 'auto', 2, 4, False, 0.36686134544667964)

In [None]:
best_model = RandomForestClassifier(class_weight='balanced', max_depth=best_hyperparameters[0] , n_estimators=best_hyperparameters[1], max_features=best_hyperparameters[2], min_samples_split=best_hyperparameters[3], min_samples_leaf=best_hyperparameters[4], bootstrap=best_hyperparameters[5])
best_model.fit(df_X_train_tabular, df_y_train_tabular)
y_pred = best_model.predict(df_X_validation_tabular)
print(classification_report(df_y_validation_tabular, y_pred))

  warn(


              precision    recall  f1-score   support

         0.0       0.99      1.00      0.99    237014
         1.0       0.46      0.30      0.36      2819

    accuracy                           0.99    239833
   macro avg       0.73      0.65      0.68    239833
weighted avg       0.99      0.99      0.99    239833



### Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.metrics import make_scorer, f1_score, recall_score, precision_score, accuracy_score
from sklearn.metrics import classification_report
grid = {'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
        'penalty': ['l1', 'l2'],
        'solver': ['liblinear', 'saga']}

lr_ = LogisticRegression(class_weight='balanced')

# pick 10 random hyperparameter combinations
n = 10
random_grid = {k: np.random.choice(v, n) for k, v in grid.items()}

# fit logistic regression with each hyperparameter combination
results = []
for i in range(n):
    C = random_grid['C'][i]
    penalty = random_grid['penalty'][i]
    solver = random_grid['solver'][i]

    # fit logistic regression
    model = LogisticRegression(C=C, penalty=penalty, solver=solver)
    model.fit(df_X_train_scaled, df_y_train_tabular)

    # evaluate model on f_1 score
    y_pred = model.predict(df_X_validation_scaled)
    f1 = f1_score(df_y_validation_tabular, y_pred)

    # save results
    print(C, penalty, solver, f1)
    results.append((C, penalty, solver, f1))

# get the best hyperparameters
best_hyperparameters = max(results, key=lambda x: x[3])
best_hyperparameters

# defining best_model
best_model = LogisticRegression(C=best_hyperparameters[0], penalty=best_hyperparameters[1], solver=best_hyperparameters[2])


In [None]:
# get classificaiton report of best_model
best_model.fit(df_X_train_scaled, df_y_train_tabular)
y_pred = best_model.predict(df_X_validation_scaled)

# classification report
print(classification_report(df_y_validation_tabular, y_pred))

### XGBoost

In [None]:
from xgboost import XGBClassifier
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.metrics import make_scorer, f1_score, recall_score, precision_score, accuracy_score
from sklearn.metrics import classification_report
grid = {'max_depth': [5, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None],
        'n_estimators': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140, 150],
        'learning_rate': [0.001, 0.01, 0.1, 1, 10],
        'min_child_weight': [1, 5, 10],
        'subsample': [0.6, 0.8, 1.0],
        'colsample_bytree': [0.6, 0.8, 1.0],
        'reg_alpha': [0, 0.001, 0.01, 0.1]}

xgb = XGBClassifier(class_weight='balanced')

# pick 10 random hyperparameter combinations
n = 10
random_grid = {k: np.random.choice(v, n) for k, v in grid.items()}

# fit logistic regression with each hyperparameter combination
results = []
for i in range(n):
    max_depth = random_grid['max_depth'][i]
    n_estimators = random_grid['n_estimators'][i]
    learning_rate = random_grid['learning_rate'][i]
    min_child_weight = random_grid['min_child_weight'][i]
    subsample = random_grid['subsample'][i]
    colsample_bytree = random_grid['colsample_bytree'][i]
    reg_alpha = random_grid['reg_alpha'][i]

    # fit logistic regression
    model = XGBClassifier(max_depth=max_depth, n_estimators=n_estimators, learning_rate=learning_rate,
                          min_child_weight=min_child_weight, subsample=subsample, colsample_bytree=colsample_bytree,
                          reg_alpha=reg_alpha)
    model.fit(df_X_train_tabular, df_y_train_tabular)

    # evaluate model on f_1 score
    y_pred = model.predict(df_X_validation_tabular)
    f1 = f1_score(df_y_validation_tabular, y_pred)

    # save results
    print(max_depth, n_estimators, learning_rate, min_child_weight, subsample, colsample_bytree, reg_alpha, f1)
    results.append((max_depth, n_estimators, learning_rate, min_child_weight, subsample, colsample_bytree, reg_alpha, f1))

# get the best hyperparameters
best_hyperparameters = max(results, key=lambda x: x[7])
best_hyperparameters

# get classificaiton report
best_model = XGBClassifier(max_depth=best_hyperparameters[0], n_estimators=best_hyperparameters[1],
                           learning_rate=best_hyperparameters[2], min_child_weight=best_hyperparameters[3],
                           subsample=best_hyperparameters[4], colsample_bytree=best_hyperparameters[5],
                           reg_alpha=best_hyperparameters[6])
best_model.fit(df_X_train_tabular, df_y_train_tabular)
y_pred = best_model.predict(df_X_validation_tabular)
print(classification_report(df_y_validation_tabular, y_pred))


100 90 0.01 10 0.8 1.0 0.1 0.024509803921568627
70 150 10.0 5 1.0 0.8 0.01 0.0
50 20 1.0 10 1.0 1.0 0.1 0.2669809299335762
None 30 0.1 5 0.6 0.6 0.001 0.13856960408684546
30 20 0.01 10 0.8 0.8 0.1 0.0
100 70 1.0 1 0.6 1.0 0.001 0.25920408597573946
30 70 1.0 5 1.0 1.0 0.001 0.2681797128300139
50 130 1.0 5 1.0 0.6 0.1 0.26986365773389753
30 70 0.1 1 0.8 0.6 0.01 0.21195172210774213
70 10 10.0 5 0.8 0.6 0.0 0.05950500263296471
              precision    recall  f1-score   support

         0.0       0.99      1.00      0.99    237014
         1.0       0.40      0.20      0.27      2819

    accuracy                           0.99    239833
   macro avg       0.70      0.60      0.63    239833
weighted avg       0.98      0.99      0.98    239833

