In [1]:
import sys
import os
import io
import logging
import ast

from google.cloud import bigquery
from google.cloud import storage

import dask.dataframe as dd
#from dask.distributed import Client
from google.cloud import bigquery
from dask import delayed
import pandas as pd
from sklearn.utils import check_random_state
from scipy.stats import entropy

import concurrent.futures
import ast

# PyTorch
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, TensorDataset
from sklearn.utils.class_weight import compute_class_weight


# Python packages for data, stats
import numpy as np
import pandas as pd
import seaborn as sns
import random

# ML
import sklearn
from sklearn.decomposition import PCA, KernelPCA, NMF, TruncatedSVD
from sklearn.manifold import TSNE, LocallyLinearEmbedding, SpectralEmbedding
from sklearn.metrics import confusion_matrix, classification_report, f1_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression


# To get the time
from datetime import datetime

# To write on the same line
from IPython.display import clear_output

# Set a random seed for reproducibility
random_seed = 42

# Setting the random seed for various libraries
random.seed(random_seed)
np.random.seed(random_seed)
torch.manual_seed(random_seed)

# Plotly
import plotly.graph_objs as go
from plotly.subplots import make_subplots
import matplotlib.pyplot as plt
from scipy.stats import norm

# Print different versions
print(sys.version)
print("Numpy version:", np.__version__)
print("Pandas version:", pd.__version__)
print("Seaborn version:", sns.__version__)
print("Sklearn version:", sklearn.__version__)


# Check for GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")


3.10.12 (main, Nov 20 2023, 15:14:05) [GCC 11.4.0]
Numpy version: 1.25.2
Pandas version: 2.0.3
Seaborn version: 0.13.1
Sklearn version: 1.2.2
Using device: cuda


In [2]:
# Variables
PROJECT_ID = "hmh-em-deepasm"
BQ_ML_DATASET = "ml_250bp_3" # Ali: hg19_250_ml #ml_250bp
bq_client = bigquery.Client()

In [3]:
datasets = ['TRAINING', 'VALIDATION', 'TESTING']
seq_var = 'cpg_directional_fm'
dic_data = {dataset: {} for dataset in datasets}

for dataset in datasets:
  print(f"Processing {dataset} dataset...")
  query = f"""
    SELECT asm, {seq_var}
    FROM {PROJECT_ID}.{BQ_ML_DATASET}.{dataset}
    WHERE {seq_var} IS NOT NULL AND asm IS NOT NULL
    """

  dic_data[dataset]['raw'] = bq_client.query(query).to_dataframe()
  dic_data[dataset]['raw'][seq_var] = dic_data[dataset]['raw'][seq_var].apply(
        lambda x: ast.literal_eval(x.strip('"'))
    )


Processing TRAINING dataset...
Processing VALIDATION dataset...
Processing TESTING dataset...


In [4]:
padding_value = 0.0
batch_size = 5000  # Batch size for training
max_sequence = 0

for dataset in datasets:
  print(f"Processing: {dataset}")
  sequences = [list(row) for row in dic_data[dataset]['raw'][seq_var]]
  labels = list(dic_data[dataset]['raw']['asm'])

  if len(max(sequences, key=len)) > max_sequence:
    max_sequence = len(max(sequences, key=len))
    print(f" Max sequence length: {max_sequence}")

    # Convert sequences to tensors and pad them

  # Convert sequences to tensors and pad them
  padded_sequences = torch.nn.utils.rnn.pad_sequence([torch.tensor(s) for s in sequences],
                                                    batch_first=True,
                                                    padding_value=padding_value)


  # Convert labels to a tensor
  labels = torch.tensor(labels, dtype=torch.float32)

  if dataset == 'TRAINING':
    # Compute class weights
    class_weights = compute_class_weight('balanced', classes=[0, 1], y=labels.numpy())
    class_weights = torch.tensor(class_weights, dtype=torch.float32).to(device)

  # Create DataLoader for batch processing
  data = TensorDataset(padded_sequences, labels)
  dic_data[dataset]['dataloader'] = DataLoader(data, batch_size=batch_size, shuffle=True)


Processing: TRAINING
 Max sequence length: 38
Processing: VALIDATION
Processing: TESTING
 Max sequence length: 45


In [None]:
# # Example data
# sequences = [
#     [1.5, 2.3, 3.1],
#     [0.1, 0.2],
#     [1.0, 2.1, 3.0, 4.1]
# ]
# labels = [0, 1, 0]

# # Convert sequences to tensors and pad them
# padded_sequences = torch.nn.utils.rnn.pad_sequence([torch.tensor(s) for s in sequences], batch_first=True, padding_value=0.0)

# # Convert labels to a tensor
# labels = torch.tensor(labels, dtype=torch.float32)

# # Create DataLoader for batch processing
# dataset = TensorDataset(padded_sequences, labels)
# dataloader = DataLoader(dataset, batch_size=2, shuffle=True)

# # Compute class weights
# class_weights = compute_class_weight('balanced', classes=[0, 1], y=labels.numpy())
# class_weights = torch.tensor(class_weights, dtype=torch.float32).to(device)

# # Define the BCELoss with class weights correctly applied
# criterion = nn.BCELoss(weight=class_weights[1])

# # Define a Transformer model
# class TransformerModel(nn.Module):
#     def __init__(self):
#         super(TransformerModel, self).__init__()
#         self.encoder_layer = nn.TransformerEncoderLayer(d_model=32, nhead=2, dim_feedforward=128)
#         self.transformer_encoder = nn.TransformerEncoder(self.encoder_layer, num_layers=1)
#         self.position_embeddings = nn.Embedding(100, 32)  # Assuming max sequence length is 100
#         self.fc = nn.Linear(32, 1)

#     def forward(self, x):
#         x = x.unsqueeze(-1).repeat(1, 1, 32)  # Extend features to d_model size
#         seq_length, N = x.shape[1], x.shape[0]
#         positions = torch.arange(0, seq_length, device=x.device).unsqueeze(0).repeat(N, 1)
#         x += self.position_embeddings(positions)
#         x = x.permute(1, 0, 2)  # Reshape x to [seq_length, batch, features]
#         x = self.transformer_encoder(x)
#         x = self.fc(x[-1, :, :])  # Take the last sequence output
#         return torch.sigmoid(x.view(-1))  # Ensure output is flat

# # Instantiate model and move it to the selected device
# transformer_model = TransformerModel().to(device)

# # Define optimizer
# optimizer_transformer = torch.optim.Adam(transformer_model.parameters(), lr=0.001)

# # Training loop
# for epoch in range(10):
#     for data, targets in dataloader:
#         data, targets = data.to(device), targets.to(device)

#         # Training Transformer
#         transformer_model.zero_grad()
#         outputs = transformer_model(data)

#         # Compute loss
#         loss = criterion(outputs, targets)
#         loss.backward()
#         optimizer_transformer.step()

#     print(f"Epoch {epoch+1}, Loss: {loss.item()}")

Epoch 1, Loss: 1.5338258743286133
Epoch 2, Loss: 0.2836953401565552
Epoch 3, Loss: 0.46044814586639404
Epoch 4, Loss: 0.6938964128494263
Epoch 5, Loss: 0.21642085909843445
Epoch 6, Loss: 0.1471812129020691
Epoch 7, Loss: 0.4084697663784027
Epoch 8, Loss: 0.2152155339717865
Epoch 9, Loss: 0.08698108792304993
Epoch 10, Loss: 0.17779676616191864




In [None]:
# import torch
# import torch.nn as nn
# from torch.utils.data import DataLoader, TensorDataset
# from sklearn.utils.class_weight import compute_class_weight

# # Check for GPU
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# print(f"Using device: {device}")

# # Example data
# sequences = [
#     [1.5, 2.3, 3.1],
#     [0.1, 0.2],
#     [1.0, 2.1, 3.0, 4.1]
# ]
# labels = [0, 1, 0]

# # Convert sequences to tensors and pad them
# padded_sequences = torch.nn.utils.rnn.pad_sequence([torch.tensor(s) for s in sequences], batch_first=True, padding_value=0.0)

# # Convert labels to a tensor
# labels = torch.tensor(labels, dtype=torch.float32)

# # Create DataLoader for batch processing
# dataset = TensorDataset(padded_sequences, labels)
# dataloader = DataLoader(dataset, batch_size=2, shuffle=True)

# # Compute class weights
# class_weights = compute_class_weight('balanced', classes=[0, 1], y=labels.numpy())
# class_weights = torch.tensor(class_weights, dtype=torch.float32).to(device)

# # Define the BCELoss with class weights correctly applied
# criterion = nn.BCELoss(weight=class_weights[1])

# # Define an RNN model
# class RNNModel(nn.Module):
#     def __init__(self):
#         super(RNNModel, self).__init__()
#         self.rnn = nn.LSTM(input_size=1, hidden_size=64, batch_first=True)
#         self.fc = nn.Linear(64, 1)

#     def forward(self, x):
#         x, _ = self.rnn(x)
#         x = self.fc(x[:, -1, :])  # Take the output of the last sequence step
#         return torch.sigmoid(x.view(-1))  # Ensure output is flat

# # Instantiate model and move it to the selected device
# rnn_model = RNNModel().to(device)

# # Define optimizer
# optimizer_rnn = torch.optim.Adam(rnn_model.parameters(), lr=0.001)

# # Training loop
# for epoch in range(10):
#     for data, targets in dataloader:
#         data, targets = data.to(device), targets.to(device)

#         # Training RNN
#         rnn_model.zero_grad()
#         outputs = rnn_model(data.unsqueeze(-1))  # Add a feature dimension
#         loss = criterion(outputs, targets)
#         loss.backward()
#         optimizer_rnn.step()

#     print(f"Epoch {epoch+1}, Loss: {loss.item()}")

# # Prediction can be done by calling rnn_model(data) after training


Using device: cpu
Epoch 1, Loss: 0.957861065864563
Epoch 2, Loss: 1.1194591522216797
Epoch 3, Loss: 0.923400342464447
Epoch 4, Loss: 0.9046634435653687
Epoch 5, Loss: 0.8847229480743408
Epoch 6, Loss: 0.8639021515846252
Epoch 7, Loss: 1.1244492530822754
Epoch 8, Loss: 1.1254537105560303
Epoch 9, Loss: 0.8033512830734253
Epoch 10, Loss: 1.1244105100631714


In [None]:
# import torch
# import torch.nn as nn
# import torch.optim as optim
# from torch.utils.data import TensorDataset, DataLoader
# from sklearn.metrics import accuracy_score, f1_score

# # Device configuration
# device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# # Hyperparameters
# input_size = 32    # Number of features (change this to make it divisible by num_heads)
# hidden_size = 128  # Number of hidden units in the RNN/Transformer
# num_layers = 2     # Number of layers in the RNN/Transformer
# num_heads = 8      # Number of attention heads (for the Transformer)
# batch_size = 64    # Batch size
# num_epochs = 100   # Number of training epochs
# lr = 0.001         # Learning rate

# # Example dataset
# sequences = [[0.1, 0.2, 0.3], [0.4, 0.5], [0.6, 0.7, 0.8, 0.9]]
# labels = [0, 1, 0]

# # Convert data to PyTorch tensors
# seq_lengths = [len(seq) for seq in sequences]
# seq_tensor = torch.tensor([seq + [0.0] * (max(seq_lengths) - len(seq)) for seq in sequences], dtype=torch.float32)
# label_tensor = torch.tensor(labels, dtype=torch.long)

# # Create a TensorDataset and DataLoader
# dataset = TensorDataset(seq_tensor, label_tensor)
# dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

# # RNN Model
# class RNNModel(nn.Module):
#     def __init__(self, input_size, hidden_size, num_layers):
#         super(RNNModel, self).__init__()
#         self.hidden_size = hidden_size
#         self.num_layers = num_layers
#         self.rnn = nn.RNN(input_size, hidden_size, num_layers, batch_first=True)
#         self.fc = nn.Linear(hidden_size, 2)  # Binary classification

#     def forward(self, x):
#         # Pack padded batch of sequences for RNN module
#         packed_input = nn.utils.rnn.pack_padded_sequence(x, [len(seq) for seq in x], batch_first=True, enforce_sorted=False)

#         # Forward propagate RNN
#         out, _ = self.rnn(packed_input)

#         # Unpack padding
#         out, _ = nn.utils.rnn.pad_packed_sequence(out, batch_first=True)

#         # Get the last output from each sequence
#         out = out[torch.arange(out.size(0)), [len(seq) - 1 for seq in x]]

#         # Pass through fully connected layer
#         out = self.fc(out)
#         return out

# # Transformer Model
# class TransformerModel(nn.Module):
#     def __init__(self, input_size, hidden_size, num_layers, num_heads):
#         super(TransformerModel, self).__init__()
#         self.hidden_size = hidden_size
#         self.num_layers = num_layers
#         self.transformer_encoder = nn.TransformerEncoder(nn.TransformerEncoderLayer(hidden_size, num_heads), num_layers)
#         self.fc = nn.Linear(hidden_size, 2)  # Binary classification

#     def forward(self, x):
#         # Add dummy dimension for transformer
#         x = x.unsqueeze(1)

#         # Forward propagate Transformer Encoder
#         out = self.transformer_encoder(x)

#         # Get the last output from each sequence
#         out = out[:, 0]

#         # Pass through fully connected layer
#         out = self.fc(out)
#         return out


# # Function to handle imbalanced dataset
# def weighted_binary_cross_entropy(output, target, weights=None):
#     if weights is None:
#         weights = torch.tensor([1.0, 1.0])
#     loss = nn.CrossEntropyLoss(weight=weights)
#     return loss(output, target)

# # Instantiate and train the model
# model = TransformerModel(input_size, hidden_size, num_layers, num_heads).to(device)
# optimizer = optim.Adam(model.parameters(), lr=lr)
# criterion = weighted_binary_cross_entropy

# for epoch in range(num_epochs):
#     for batch_seqs, batch_labels in dataloader:
#         batch_seqs = batch_seqs.to(device)
#         batch_labels = batch_labels.to(device)

#         # Forward pass
#         outputs = model(batch_seqs)
#         loss = criterion(outputs, batch_labels)

#         # Backward and optimize
#         optimizer.zero_grad()
#         loss.backward()
#         optimizer.step()

#     # Compute accuracy and F1-score
#     y_true = label_tensor.tolist()
#     y_pred = torch.max(outputs, 1)[1].cpu().tolist()
#     acc = accuracy_score(y_true, y_pred)
#     f1 = f1_score(y_true, y_pred)
#     print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}, Accuracy: {acc:.4f}, F1-score: {f1:.4f}')

# # Example usage
# example_seq = [[0.1, 0.2, 0.3, 0.4]]
# example_tensor = torch.tensor(example_seq, dtype=torch.float32).to(device)
# output = model(example_tensor)
# pred = torch.max(output, 1)[1].item()
# print(f"Predicted label for example sequence: {pred}")

In [None]:

# # Convert sequences to tensors and pad them
# #padded_sequences = torch.nn.utils.rnn.pad_sequence([torch.tensor(s) for s in sequences], batch_first=True, padding_value=0.0)

# # # Convert labels to a tensor
# #labels = torch.tensor(labels, dtype=torch.float32)

# # # Create DataLoader for batch processing
# # dataset = TensorDataset(padded_sequences, labels)
# # dataloader = DataLoader(dataset, batch_size=100, shuffle=True)

# # # Compute class weights
# class_weights = compute_class_weight('balanced', classes=[0, 1], y=labels.numpy())
# class_weights = torch.tensor(class_weights, dtype=torch.float32).to(device)

# # # Define the BCELoss with class weights correctly applied
# criterion = nn.BCELoss(weight=class_weights[1])

# # Define a Transformer model
# class TransformerModel(nn.Module):
#     def __init__(self):
#         super(TransformerModel, self).__init__()
#         self.encoder_layer = nn.TransformerEncoderLayer(d_model=32, nhead=2, dim_feedforward=128)
#         self.transformer_encoder = nn.TransformerEncoder(self.encoder_layer, num_layers=1)
#         self.position_embeddings = nn.Embedding(100, 32)  # Assuming max sequence length is 100
#         self.fc = nn.Linear(32, 1)

#     def forward(self, x):
#         x = x.unsqueeze(-1).repeat(1, 1, 32)  # Extend features to d_model size
#         seq_length, N = x.shape[1], x.shape[0]
#         positions = torch.arange(0, seq_length, device=x.device).unsqueeze(0).repeat(N, 1)
#         x += self.position_embeddings(positions)
#         x = x.permute(1, 0, 2)  # Reshape x to [seq_length, batch, features]
#         x = self.transformer_encoder(x)
#         x = self.fc(x[-1, :, :])  # Take the last sequence output
#         return torch.sigmoid(x.view(-1))  # Ensure output is flat

# # # Instantiate model and move it to the selected device
# transformer_model = TransformerModel().to(device)

# # # Define optimizer
# optimizer_transformer = torch.optim.Adam(transformer_model.parameters(), lr=0.001)

# # # Training loop
# for epoch in range(100):
#     for data, targets in dataloader:
#         data, targets = data.to(device), targets.to(device)

#         # Training Transformer
#         transformer_model.zero_grad()
#         outputs = transformer_model(data)

#         # Compute loss
#         loss = criterion(outputs, targets)
#         loss.backward()
#         optimizer_transformer.step()

#     print(f"Epoch {epoch+1}, Loss: {loss.item()}")

In [None]:
sequences = [
    [1.5, 2.3, 3.1],
    [0.1, 0.2],
    [1.0, 2.1, 3.0, 4.1]
]
labels = [0, 1, 0]

In [None]:
class_weights[1]

tensor(28.9716, device='cuda:0')

In [18]:
# Hyperparameters
d_model = 32  # Dimensionality of the model
nhead = 4  # Number of heads in the multi-head attention models
num_layers = 8  # Number of sub-encoder-layers in the transformer
dim_feedforward = 128  # Size of the feedforward model in nn.TransformerEncoder
max_seq_length = max_sequence
learning_rate = 0.0005  # Learning rate for the optimizer
num_epochs = 20  # Number of training epochs
dropout = 0.3
weight_decay = 0.01

# # Define a Transformer model
class TransformerModel(nn.Module):
    def __init__(self, d_model, nhead, dim_feedforward, max_seq_length, num_layers, dropout):
        super(TransformerModel, self).__init__()
        self.encoder_layer = nn.TransformerEncoderLayer(
            d_model=d_model,
            nhead=nhead,
            dim_feedforward=dim_feedforward,
            dropout=dropout,  # Adding dropout to the encoder layer
            layer_norm_eps=1e-6  # Using a smaller epsilon for layer normalization for more precise calculations
        )
        self.transformer_encoder = nn.TransformerEncoder(self.encoder_layer, num_layers=num_layers)
        self.position_embeddings = nn.Embedding(max_seq_length, d_model)  # Prepare position embeddings
        self.fc = nn.Linear(d_model, 1)

    def forward(self, x):
        x = x.unsqueeze(-1).repeat(1, 1, d_model)  # Extend features to match d_model
        seq_length, N = x.shape[1], x.shape[0]
        positions = torch.arange(0, seq_length, device=x.device).unsqueeze(0).repeat(N, 1)
        x += self.position_embeddings(positions)
        x = x.permute(1, 0, 2)  # Reshape x to [seq_length, batch, features]
        x = self.transformer_encoder(x)
        x = self.fc(x[-1, :, :])  # Take the last sequence output
        return torch.sigmoid(x.view(-1))  # Flatten the output for compatibility with target

# Define loss function with class weights
criterion = nn.BCELoss(weight=class_weights[1])

# Instantiate model and move it to the selected device
transformer_model = TransformerModel(d_model, nhead, dim_feedforward, max_seq_length, num_layers, dropout).to(device)

# Define optimizer
optimizer_transformer = torch.optim.Adam(transformer_model.parameters(),
                                         lr=learning_rate,
                                         weight_decay = weight_decay)

# # Training loop
for epoch in range(num_epochs):
    for data, targets in dic_data['TRAINING']['dataloader']:
        data, targets = data.to(device), targets.to(device)

        # Training Transformer
        transformer_model.zero_grad()
        outputs = transformer_model(data)

        # Compute loss
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer_transformer.step()

    print(f"Epoch {epoch+1}, Loss: {loss.item()}")



Epoch 1, Loss: 2.7219479084014893
Epoch 2, Loss: 2.489398956298828
Epoch 3, Loss: 2.6095423698425293
Epoch 4, Loss: 1.9272940158843994
Epoch 5, Loss: 1.8746535778045654
Epoch 6, Loss: 1.9068864583969116
Epoch 7, Loss: 1.9541019201278687
Epoch 8, Loss: 1.6243597269058228
Epoch 9, Loss: 1.5730787515640259
Epoch 10, Loss: 1.6436265707015991
Epoch 11, Loss: 1.5704618692398071
Epoch 12, Loss: 1.6391793489456177
Epoch 13, Loss: 1.836776852607727
Epoch 14, Loss: 1.6030900478363037
Epoch 15, Loss: 1.7205359935760498
Epoch 16, Loss: 1.6805227994918823
Epoch 17, Loss: 1.609703540802002
Epoch 18, Loss: 1.8406842947006226
Epoch 19, Loss: 1.7668598890304565
Epoch 20, Loss: 1.853647232055664


In [23]:
def evaluate_model(model, dataloader, criterion, device, threshold_prediction = 0.5):
    model = model.to(device)
    model.eval()  # Set the model to evaluation mode

    total_loss = 0
    all_preds = []
    all_targets = []

    with torch.no_grad():
        for data, targets in dataloader:
            data, targets = data.to(device), targets.to(device)

            # Forward pass
            outputs = model(data)
            loss = criterion(outputs, targets)

            # Get the predictions
            preds = (outputs > threshold_prediction).float()  # Threshold the output

            # Update lists
            all_preds.extend(preds.view(-1).cpu().numpy())
            all_targets.extend(targets.view(-1).cpu().numpy())

            # Update loss
            total_loss += loss.item() * data.size(0)

    # Calculate the average loss
    average_loss = total_loss / len(dataloader.dataset)

    # Generate the classification report
    report = classification_report(all_targets, all_preds, output_dict=True)

    # Print the classification report
    print("Classification Report:\n")
    print(classification_report(all_targets, all_preds))

    # Additionally, print out the average loss
    print(f"Average Test Loss: {average_loss:.4f}")

    # Return the report as a dictionary for further analysis if needed
    return report

# Assuming you have your model, test_dataloader, and criterion defined, you can call:
report = evaluate_model(transformer_model, dic_data['TESTING']['dataloader'], criterion, device, 0.15)
print(report)

Classification Report:

              precision    recall  f1-score   support

         0.0       0.99      0.97      0.98    193452
         1.0       0.32      0.58      0.42      4633

    accuracy                           0.96    198085
   macro avg       0.66      0.78      0.70    198085
weighted avg       0.97      0.96      0.97    198085

Average Test Loss: 2.0233
{'0.0': {'precision': 0.9898076416337286, 'recall': 0.9708661580133573, 'f1-score': 0.9802454058172975, 'support': 193452}, '1.0': {'precision': 0.3238152369526095, 'recall': 0.5825598963954242, 'f1-score': 0.4162553979025293, 'support': 4633}, 'accuracy': 0.9617840825908069, 'macro avg': {'precision': 0.656811439293169, 'recall': 0.7767130272043907, 'f1-score': 0.6982504018599134, 'support': 198085}, 'weighted avg': {'precision': 0.9742307791207285, 'recall': 0.9617840825908069, 'f1-score': 0.9670542721793687, 'support': 198085}}


In [None]:
class EarlyStopping:
    def __init__(self, patience=10, verbose=False, delta=0):
        """
        Args:
            patience (int): How long to wait after last time validation loss improved.
                            Default: 3
            verbose (bool): If True, prints a message for each validation loss improvement.
                            Default: False
            delta (float): Minimum change in the monitored quantity to qualify as an improvement.
                           Default: 0
        """
        self.patience = patience
        self.verbose = verbose
        self.delta = delta
        self.best_score = None
        self.epochs_no_improve = 0
        self.early_stop = False

    def __call__(self, val_loss):
        score = -val_loss

        if self.best_score is None:
            self.best_score = score
        elif score < self.best_score + self.delta:
            self.epochs_no_improve += 1
            if self.epochs_no_improve >= self.patience:
                self.early_stop = True
        else:
            self.best_score = score
            self.epochs_no_improve = 0

        if self.early_stop and self.verbose:
            print("Early stopping")

def train_model(model, device, dataloaders, criterion, optimizer, threshold_prediction = 0.5, num_epochs=10):
    model = model.to(device)
    early_stopping = EarlyStopping(patience=5, verbose=True)

    for epoch in range(num_epochs):
        print(f'Epoch {epoch+1}/{num_epochs}')
        # Each epoch has a training and validation phase
        for phase in ['TRAINING', 'VALIDATION']:
            if phase == 'TRAINING':
                model.train()  # Set model to training mode
            else:
                model.eval()   # Set model to evaluate mode

            running_loss = 0.0
            all_preds = []
            all_targets = []

            # Iterate over data.
            for data, targets in dataloaders[phase]:
                data, targets = data.to(device), targets.to(device)

                if phase == 'TRAINING':
                    loss, preds = train_step(model, data, targets, criterion, optimizer, threshold_prediction, device)
                else:
                    with torch.no_grad():
                        outputs = model(data)
                        loss = criterion(outputs, targets)
                        preds = (outputs > threshold_prediction).float()

                running_loss += loss.item() * data.size(0)
                all_preds.extend(preds.view(-1).cpu().numpy())
                all_targets.extend(targets.cpu().numpy())

            epoch_loss = running_loss / len(dataloaders[phase].dataset)
            epoch_f1 = f1_score(all_targets, all_preds, average=None)  # Calculate F1 for each class

            print(f'{phase.capitalize()} Loss: {epoch_loss:.4f} F1-Score (per class): {epoch_f1}')

            if phase == 'VALIDATION':
                early_stopping(epoch_loss)
                if early_stopping.early_stop:
                    print("Early stopping triggered.")
                    return

def train_step(model, data, targets, criterion, optimizer, threshold_prediction, device):
    optimizer.zero_grad()
    outputs = model(data)
    loss = criterion(outputs, targets)
    preds = (outputs > threshold_prediction).float()
    loss.backward()
    optimizer.step()
    return loss, preds

# Assuming you have your model, dataloaders, criterion, and optimizer defined, you can call:
train_model(transformer_model,
            device,
            {'TRAINING': dic_data['TRAINING']['dataloader'], 'VALIDATION': dic_data['VALIDATION']['dataloader']},
            criterion,
            optimizer_transformer,
            num_epochs = num_epochs,
            )

Epoch 1/100
Training Loss: 1.8361 F1-Score (per class): [9.91277549e-01 8.95977063e-04]
Validation Loss: 1.1724 F1-Score (per class): [0.99408825 0.        ]
Epoch 2/100
Training Loss: 1.7535 F1-Score (per class): [9.91290989e-01 5.38599641e-04]
Validation Loss: 1.1507 F1-Score (per class): [0.99408825 0.        ]
Epoch 3/100
Training Loss: 1.7528 F1-Score (per class): [0.99129251 0.00125606]
Validation Loss: 1.1527 F1-Score (per class): [0.99408825 0.        ]
Epoch 4/100
Training Loss: 1.7393 F1-Score (per class): [9.91288636e-01 3.59034198e-04]
Validation Loss: 1.1638 F1-Score (per class): [0.99408825 0.        ]
Epoch 5/100
Training Loss: 1.7503 F1-Score (per class): [9.91295723e-01 5.38889887e-04]
Validation Loss: 1.1686 F1-Score (per class): [0.99408825 0.        ]
Epoch 6/100
Training Loss: 1.7365 F1-Score (per class): [9.91294934e-01 5.38841491e-04]
Validation Loss: 1.1653 F1-Score (per class): [0.99408825 0.        ]
Epoch 7/100
Training Loss: 1.7520 F1-Score (per class): [0.9