In [1]:
# Use huggingface datasets
from datasets import Dataset
import torch
import torch.optim as optim
from torch.optim import lr_scheduler
import numpy as np
import matplotlib.pyplot as plt
import os
import pandas as pd
import transformers
import mlflow
import importlib
import json

import sys
sys.path.append("../")
from src import models

# Re-import the project package and its submodules so that edits made to the
# source files are picked up without restarting the kernel.
importlib.reload(models)
# The package itself is reloaded first, then each submodule used further down.
importlib.reload(models.lstm_models)
importlib.reload(models.train_model)
importlib.reload(models.eval_model)

<module 'src.models.eval_model' from '/home/mas-server/etu/nn/paraphrase_detection/notebooks/../src/models/eval_model.py'>

In [2]:
# MLflow bookkeeping: runs are stored in a local SQLite database and grouped
# under a single experiment name.
MLFLOW_TRACKING_URI = "sqlite:///mlflow.db"
MLFLOW_EXPERIMENT_NAME = "lstms"

mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
mlflow.set_experiment(MLFLOW_EXPERIMENT_NAME)

<Experiment: artifact_location='/home/mas-server/etu/nn/paraphrase_detection/notebooks/mlruns/1', creation_time=1711035737638, experiment_id='1', last_update_time=1711035737638, lifecycle_stage='active', name='lstms', tags={}>

In [3]:
# Ask PyTorch once whether a CUDA device is visible and report it when so.
cuda_banner = "Using CUDA"
use_gpu = torch.cuda.is_available()
if use_gpu:
    print(cuda_banner)

Using CUDA


In [4]:
# Pretrained checkpoint whose (fast) tokenizer turns the question text into ids.
TOKENIZER_CHECKPOINT = "microsoft/deberta-v3-large"
tokenizer = transformers.AutoTokenizer.from_pretrained(
    TOKENIZER_CHECKPOINT,
    use_fast=True,
)
def tokenize_function(samples):
    """Tokenize a batch of question pairs into fixed-length sequences.

    Args:
        samples: Mapping with "question1" and "question2" entries holding the
            two questions of each pair (called with ``batched=True`` by
            ``Dataset.map`` in this notebook).

    Returns:
        Whatever the module-level ``tokenizer`` returns for the pair, padded /
        truncated to 128 tokens and including token type ids.
    """
    return tokenizer(
        samples["question1"], # first string
        # Fixed: this used to pass "question1" again, so the second question
        # of the pair was never encoded.
        samples["question2"], # second string
        return_tensors="pt", # return torch tensor
        padding="max_length", # Pad sequences
        max_length=128, # Max len for padded seq
        truncation=True, # Truncate string
        return_token_type_ids=True, # Return mask for q1 and q2
    )
def collate_fn(data):
    """Assemble a list of examples into one batch.

    Each example must provide "input_ids" and "labels" tensors. Only these two
    fields are kept; the labels are stacked and reshaped into a single column.
    """
    token_rows = []
    targets = []
    for item in data:
        token_rows.append(item["input_ids"])
        targets.append(item["labels"])
    return {
        "input_ids": torch.stack(token_rows),
        "labels": torch.stack(targets).reshape(-1, 1),
    }



In [5]:
# Location of the pre-split CSV files and the names of the three splits.
data_dir = '../data/processed'
TRAIN = 'train'
TEST = 'test'
VAL = 'val'

# For every split: read "<data_dir>/<split>.csv", tokenize the questions and
# copy the "is_duplicate" column into a "labels" column.
text_datasets = {
    x: Dataset.from_pandas(pd.read_csv(f"{data_dir}/{x}.csv"))
        # Tokenize questions
        .map(tokenize_function, batched=True)
        # Expose is_duplicate under the name "labels"
        .map(lambda examples: {"labels": examples["is_duplicate"]}, batched=True)
    for x in [TRAIN, VAL, TEST]
}

# Hand the selected columns back as torch tensors when the datasets are indexed.
for dataset in text_datasets:
    text_datasets[dataset].set_format(
        type="torch",
        columns=["input_ids", "token_type_ids", "attention_mask", "labels"],
    )

dataloaders = {
    x: torch.utils.data.DataLoader(
        text_datasets[x],
        batch_size=128,
        # Only the training split is shuffled; validation and test batches are
        # served in file order so evaluation runs are repeatable.
        shuffle=(x == TRAIN),
        num_workers=12,
        collate_fn=collate_fn,
    )
    for x in [TRAIN, VAL, TEST]
}

Map:   0%|          | 0/283000 [00:00<?, ? examples/s]

Map:   0%|          | 0/283000 [00:00<?, ? examples/s]

Map:   0%|          | 0/60643 [00:00<?, ? examples/s]

Map:   0%|          | 0/60643 [00:00<?, ? examples/s]

Map:   0%|          | 0/60644 [00:00<?, ? examples/s]

Map:   0%|          | 0/60644 [00:00<?, ? examples/s]

In [6]:
# Smoke test: pull one batch from the validation loader and display it.
val_batches = iter(dataloaders["val"])
next(val_batches)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

{'input_ids': tensor([[   1,  577,  295,  ...,    0,    0,    0],
         [   1,  458,  281,  ...,    0,    0,    0],
         [   1, 2597,  269,  ...,    0,    0,    0],
         ...,
         [   1, 3549,  278,  ...,    0,    0,    0],
         [   1,  458,  269,  ...,    0,    0,    0],
         [   1,  771, 1568,  ...,    0,    0,    0]]),
 'labels': tensor([[1],
         [0],
         [1],
         [1],
         [0],
         [1],
         [1],
         [0],
         [0],
         [0],
         [1],
         [1],
         [0],
         [0],
         [1],
         [0],
         [1],
         [1],
         [0],
         [1],
         [1],
         [0],
         [0],
         [1],
         [0],
         [0],
         [1],
         [0],
         [0],
         [1],
         [1],
         [0],
         [1],
         [0],
         [1],
         [0],
         [0],
         [0],
         [0],
         [1],
         [1],
         [0],
         [0],
         [1],
         [0],
         [0],

In [7]:
def run_mlflow_experiment(
    exp_name,
    model_config_path,
    dataloaders,
    epochs
):
    """Train and evaluate the model described by a JSON config inside one MLflow run.

    Args:
        exp_name: Name given to the MLflow run.
        model_config_path: Path to a JSON file. Its "type" entry selects the
            model builder, and the whole mapping is logged as run parameters.
        dataloaders: Passed through unchanged to ``train_model`` and
            ``eval_model``.
        epochs: Passed to ``train_model`` and logged as the "epochs" parameter.

    Raises:
        NotImplementedError: If the config asks for a model type that is
            planned but has no builder yet ("residual_lstm", "transformers").
        ValueError: If the config has no "type" entry or an unknown one.
    """
    with open(model_config_path, "r") as f:
        model_config = json.load(f)

    # Resolve the model before opening an MLflow run so that a bad config
    # fails fast and does not leave an empty run behind.
    model_type = model_config.get("type")
    if model_type == "lstm":
        model = models.lstm_models.build_SimpleBiLSTM(model_config)
    elif model_type in ("residual_lstm", "transformers"):
        # NotImplementedError is the exception class; the previous
        # `NotImplemented()` is a non-callable constant and raised TypeError.
        raise NotImplementedError(
            f"Model type {model_type!r} is not implemented yet"
        )
    else:
        raise ValueError(
            f"Unsupported or missing model type in {model_config_path!r}: {model_type!r}"
        )

    # Use the GPU when one is available instead of assuming it.
    # NOTE(review): train_model/eval_model are not visible here; confirm they
    # do not hard-code a device before relying on the CPU fallback.
    device = "cuda" if torch.cuda.is_available() else "cpu"

    with mlflow.start_run(run_name=exp_name):
        model.to(device)
        mlflow.log_params(model_config)
        mlflow.log_param("epochs", epochs)
        mlflow.set_tag("model_name", model_type)
        model = models.train_model.train_model(
            model,
            torch.nn.BCEWithLogitsLoss(),
            optim.Adam(model.parameters()),
            dataloaders,
            epochs
        )
        models.eval_model.eval_model(model, dataloaders)
        mlflow.pytorch.log_model(model, "torch_models")

In [12]:
# Folder holding the JSON model configs. The trailing slash matters: file
# paths are later built by plain string concatenation (config_paths + name).
config_paths = "../models/"
# Only a single config is run here. To sweep every config in the folder instead:
# configs = [x for x in os.listdir(config_paths) if x.endswith(".json")]
configs = ["lstm4.json"]
print(configs)

['lstm4.json']


In [13]:
import os

# Switch off the tokenizers library's own parallelism to silence the fork
# warnings it prints otherwise.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# One MLflow run per config file; the run name is the file name without its
# 5-character ".json" suffix.
for config_file in configs:
    run_name = config_file[:-5]
    run_mlflow_experiment(
        exp_name=run_name,
        model_config_path=config_paths + config_file,
        dataloaders=dataloaders,
        epochs=8,
    )

Epoch 0/8
----------
Training batch 2210/2211
Validation batch 473/474
Epoch 1/8
----------
Training batch 2210/2211
Validation batch 473/474
Epoch 2/8
----------
Training batch 2210/2211
Validation batch 473/474
Epoch 3/8
----------
Training batch 2210/2211
Validation batch 473/474
Epoch 4/8
----------
Training batch 2210/2211
Validation batch 473/474
Epoch 5/8
----------
Training batch 2210/2211
Validation batch 473/474
Epoch 6/8
----------
Training batch 2210/2211
Validation batch 473/474
Epoch 7/8
----------
Training batch 2210/2211
Validation batch 473/474

Training completed in 4m 26s


\Testing completed in 0m 3s
