In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install datasets
!pip install transformers
!pip install gensim
!pip install spacy
!pip install torch
!pip install colorama

Collecting datasets
  Downloading datasets-2.16.1-py3-none-any.whl (507 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m507.1/507.1 kB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow-hotfix (from datasets)
  Downloading pyarrow_hotfix-0.6-py3-none-any.whl (7.9 kB)
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m6.8 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: pyarrow-hotfix, dill, multiprocess, datasets
Successfully installed datasets-2.16.1 dill-0.3.7 multiprocess-0.70.15 pyarrow-hotfix-0.6
Collecting colorama
  Downloading colorama-0.4.6-py2.py3-none-any.whl (25 kB)
Installing co

In [None]:
# Imports
from datasets import load_from_disk, Dataset
from transformers import AutoModel, AutoTokenizer

import time
import os
import torch
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt
import inspect

from colorama import Fore, Style

# Save dataset
def save_dataset(dataset: Dataset, path) -> None:
  dataset.save_to_disk(path)

# Load roberta model
def load_roberta_model():
  return AutoModel.from_pretrained("roberta-base")

def print_colored(variable, color):
  color_map = {
      "blue": Fore.BLUE,
      "red": Fore.RED,
      "green": Fore.GREEN
  }

  if color not in color_map:
      print("Couleur non supportée.")
      return

  color_code = color_map[color]
  reset_code = Style.RESET_ALL

  frame = inspect.currentframe().f_back
  variable_name = [name for name, value in frame.f_locals.items() if value is variable][0]

  print(f"{color_code}{variable_name} = {variable}{reset_code}")

def exist_model(folder_path):
  files = os.listdir(folder_path)
  if 'model.pth' in files:
      return True
  else :
      return False

# Locations
UNMASKED_DATASET_LOCATION = "/content/drive/MyDrive/Datasets/Unmasked"
MASKED_DATASET_LOCATION = "/content/drive/MyDrive/Datasets/Masked"
AUGMENTED_MASKED_DATASET_LOCATION = "/content/drive/MyDrive/Datasets/Augmented_Masked"

UNMASKED_REID_MODEL_LOCATION = "/content/drive/MyDrive/Models/Unmasked"
MASKED_REID_MODEL_LOCATION = "/content/drive/MyDrive/Models/Masked"
AUGMENTED_MASKED_REID_MODEL_LOCATION = "/content/drive/MyDrive/Models/Augmented_Masked"

# Parameters
NB_EPOCHS = 1000
PATIENCE = 5
LR = 1e-5
NB_NEURONS = 200

# Loading the datasets
unmasked_dataset = load_from_disk(UNMASKED_DATASET_LOCATION)
print_colored(unmasked_dataset, "red")
masked_dataset = load_from_disk(MASKED_DATASET_LOCATION)
print_colored(masked_dataset, "blue")
augmented_masked_dataset = load_from_disk(AUGMENTED_MASKED_DATASET_LOCATION)
print_colored(augmented_masked_dataset, "green")

class CustomModel(nn.Module):
  def __init__(self, dataset: Dataset):
      super(CustomModel, self).__init__()
      self.inputs = self.get_inputs(dataset)
      self.input_size = self.get_input_size(dataset)
      self.labels = self.get_labels(dataset)
      self.output_size = len(self.labels)
      self.losses = []
      self.accuracies = []

      self.fc1 = nn.Linear(self.input_size, NB_NEURONS)
      self.fc2 = nn.Linear(NB_NEURONS, NB_NEURONS)
      self.fc3 = nn.Linear(NB_NEURONS, self.output_size)

  def get_inputs(self, dataset: Dataset):
      return [sample['target_embeddings'] for sample in dataset]

  def get_labels(self, dataset: Dataset):
      labels = set(sample['label'] for sample in dataset)
      return list(labels)

  def get_input_size(self, dataset: Dataset):
      if len(dataset) == 0:
          raise ValueError("The dataset is empty")

      first_instance_size = len(dataset[0]['target_embeddings'])
      for instance in dataset:
          current_size = len(instance['target_embeddings'])
          if current_size != first_instance_size:
              raise ValueError("Sizes of target_embeddings are not consistent across the entire dataset")

      return first_instance_size

  # Forward function (we can change here layers)
  def forward(self, x):
      x = self.fc1(x)
      x = torch.nn.functional.relu(self.fc2(x))
      x = self.fc3(x)
      return x

  # Loss function designed for multi-class classification problems: combines a log-softmax function and the negative log-likelihood loss
  def compute_loss(self, predictions, label):
      loss = nn.CrossEntropyLoss()(predictions, label)
      return loss

  # To get the accuracy of 1 iteration
  def compute_accuracy(self, predictions, label):
      # Get the index of the maximum value in predictions
      predicted_label = torch.argmax(predictions)
      # print_colored(predicted_label, "red")

      # Convert label to tensor
      label = torch.tensor(label).clone().detach().long()
      # print_colored(label, "blue")

      # Check if the prediction matches the true label
      correct_prediction = (predicted_label == label).item()
      # print_colored(correct_prediction, "green")

      # Calculate accuracy (1 if correct, 0 otherwise)
      accuracy = 1 if correct_prediction else 0

      return accuracy

  # Step function
  def train_step(self, target_embeddings, label, optimizer):
      # Set the model in training mode
      self.train()

      # Convert label to tensor
      label = torch.tensor(label).clone().detach().long()

      # Convert input to tensor
      target_embeddings = torch.tensor(target_embeddings, dtype=torch.float32)

      # Pass embeddings through the model
      predictions = self.forward(target_embeddings)
      # print_colored(predictions, "red")

      # Calculate the loss
      loss = self.compute_loss(predictions, label)
      # print_colored(loss, "blue")

      # Backpropagation and weight update
      optimizer.zero_grad()
      loss.backward()
      optimizer.step()

      # Calculate accuracy
      accuracy = self.compute_accuracy(predictions, label)

      return loss.item(), accuracy

  def train_model(self, optimizer, epochs, path):
      best_val_loss = float('inf')
      no_improvement_count = 0

      # Training loop
      start_time = time.time()
      for epoch in range(epochs):
          total_loss = 0.0
          total_accuracy = 0.0
          for input, label in zip(self.inputs, self.labels):
              loss, accuracy = self.train_step(input, label, optimizer)
              total_loss += loss
              total_accuracy += accuracy

          average_loss = total_loss / len(self.inputs)
          average_accuracy = total_accuracy / len(self.inputs)
          self.losses.append(average_loss)
          self.accuracies.append(average_accuracy)
          print(f"Epoch {epoch + 1}/{epochs}, Loss: {average_loss}, Accuracy: {average_accuracy}, Time: {time.time() - start_time}")

          if epoch % 20 == 0:
              if average_loss < best_val_loss:
                  best_val_loss = average_loss
                  no_improvement_count = 0
              else :
                  no_improvement_count += 1

          if epoch % 100 == 0 :
              save_path=os.path.join(path, 'model.pth')
              torch.save(self.state_dict(), save_path)
              print("Model saved successfully.")

          if average_accuracy == 1.0 or no_improvement_count >= PATIENCE:
              print("Training stopped. Accuracy reached 1.0 or Early Stopping.")
              save_path=os.path.join(path, 'model.pth')
              torch.save(self.state_dict(), save_path)
              print("Model saved successfully.")
              break

  def plot_metrics(self, path=None):
      # Plotting loss
      plt.figure()
      plt.plot(self.losses, label='Loss')
      plt.title('Evolution of Loss')
      plt.xlabel('Epoch')
      plt.ylabel('Loss')
      plt.legend()

      if path:
          plt.savefig(os.path.join(path, 'loss_plot.png'))
      else:
          plt.show()

      # Plotting accuracy
      plt.figure()
      plt.plot(self.accuracies, label='Accuracy')
      plt.title('Evolution of Accuracy')
      plt.xlabel('Epoch')
      plt.ylabel('Accuracy')
      plt.legend()

      if path:
          plt.savefig(os.path.join(path, 'accuracy_plot.png'))
      else:
          plt.show()

      # Close all figures
      plt.close('all')


# Unmasked model
unmasked_model = CustomModel(unmasked_dataset)
unmasked_optimizer = optim.Adam(unmasked_model.parameters(), lr=LR)
unmasked_model.train_model(unmasked_optimizer, epochs=NB_EPOCHS, path=UNMASKED_REID_MODEL_LOCATION)
unmasked_model.plot_metrics(path=UNMASKED_REID_MODEL_LOCATION)

# Masked model
masked_model = CustomModel(masked_dataset)
masked_optimizer = optim.Adam(masked_model.parameters(), lr=LR)
masked_model.train_model(masked_optimizer, epochs=NB_EPOCHS*5, path=MASKED_REID_MODEL_LOCATION)
masked_model.plot_metrics(path=MASKED_REID_MODEL_LOCATION)

# Augmented Masked model
augmented_masked_model = CustomModel(augmented_masked_dataset)
augmented_masked_optimizer = optim.Adam(augmented_masked_model.parameters(), lr=LR)
augmented_masked_model.train_model(augmented_masked_optimizer, epochs=NB_EPOCHS*10, path=AUGMENTED_MASKED_REID_MODEL_LOCATION)
augmented_masked_model.plot_metrics(path=AUGMENTED_MASKED_REID_MODEL_LOCATION)

[31munmasked_dataset = Dataset({
    features: ['target_text', 'input_text', 'target_tokens', 'target_attention_mask', 'target_embeddings', 'input_tokens', 'input_attention_mask', 'label'],
    num_rows: 1000
})[0m
[34mmasked_dataset = Dataset({
    features: ['target_tokens', 'target_attention_mask', 'label', 'target_embeddings'],
    num_rows: 2000
})[0m
[32maugmented_masked_dataset = Dataset({
    features: ['target_tokens', 'target_attention_mask', 'label', 'target_embeddings'],
    num_rows: 5000
})[0m


  label = torch.tensor(label).clone().detach().long()


Epoch 1/5000, Loss: 3.462979799985886, Accuracy: 0.0, Time: 1.8268609046936035
Model saved successfully.
Epoch 2/5000, Loss: 3.4533475048542024, Accuracy: 0.0, Time: 3.697840929031372
Epoch 3/5000, Loss: 3.448990528345108, Accuracy: 0.0005, Time: 5.590865850448608
Epoch 4/5000, Loss: 3.4425659096240997, Accuracy: 0.001, Time: 7.638948202133179
Epoch 5/5000, Loss: 3.433959473848343, Accuracy: 0.002, Time: 9.899642944335938
Epoch 6/5000, Loss: 3.422606560230255, Accuracy: 0.004, Time: 11.920069932937622
Epoch 7/5000, Loss: 3.407670157670975, Accuracy: 0.0055, Time: 13.764780521392822
Epoch 8/5000, Loss: 3.388075436115265, Accuracy: 0.0075, Time: 15.70287013053894
Epoch 9/5000, Loss: 3.3629002759456634, Accuracy: 0.005, Time: 17.667414665222168
Epoch 10/5000, Loss: 3.3319980380535124, Accuracy: 0.004, Time: 19.606078624725342
Epoch 11/5000, Loss: 3.2959878921508787, Accuracy: 0.0045, Time: 21.67618179321289
Epoch 12/5000, Loss: 3.256439040660858, Accuracy: 0.0045, Time: 23.938897132873535