In [None]:
# Fetch the project sources and make the checkout the working directory for the following cells.
!git clone https://github.com/cher-liang/Gravitas-NLP
%cd Gravitas-NLP

In [None]:
!pip install docx2txt

In [None]:
# Create the directory that is meant to hold the processed dataset, relative to the current
# working directory (-p: create parents, no error if it already exists).
%mkdir -p datasets/semeval2013-Task7-5way/processed

In [None]:
%cd Gravitas-NLP/datasets/semeval2013-Task7-5way/processed
!curl -L -o dataset.zip "https://drive.google.com/uc?id=12LAWEMQpGCxkFQbZFRN6_v8imQkg40rp"
!unzip dataset.zip

In [None]:
# A bare %cd moves to the home directory; the next line then jumps to the repository root.
# NOTE(review): the absolute /content/... path looks Colab-specific — confirm before running elsewhere.
%cd
%cd /content/Gravitas-NLP/

In [1]:
import torch
from torch.utils.data import DataLoader
from torch import nn
from torch import optim
from torch.optim import Optimizer

import transformers
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoConfig
from tqdm.autonotebook import tqdm, trange

import pandas as pd

from data import torch_dataset
from basis.enums import TrainingType
from basis import config
from pytorch_model import QuestionAnswerPairSimilarityModel

from pathlib import Path
import logging
import math
import os
from typing import Dict, Type, Callable, List

logger = logging.getLogger(__name__)

In [2]:
# Prefer the GPU whenever PyTorch can see one; otherwise run on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
target_device = torch.device(device)

# Token limit forwarded to the tokenizer by the collate function; None passes no explicit limit.
max_length: int = None

In [3]:
# Checkpoint identifiers handed to `from_pretrained` below and to the model constructor later on.
qnli_model_name = "cross-encoder/qnli-distilroberta-base"
sts_model_name = "cross-encoder/stsb-distilroberta-base"

# Tokenizer and config are both loaded from the QNLI checkpoint; this single tokenizer is the one
# the collate function applies to every text pair.
# NOTE(review): assumes the STS checkpoint shares the same vocabulary — confirm.
tokenizer = AutoTokenizer.from_pretrained(qnli_model_name)
model_config = AutoConfig.from_pretrained(qnli_model_name)

# Kept for reference only: direct loading of the two cross-encoders (currently unused).
# qnli_bert_model = AutoModelForSequenceClassification.from_pretrained(qnli_model_name)
# sts_bert_model = AutoModelForSequenceClassification.from_pretrained(sts_model_name)

In [4]:
# def smart_batching_collate_text_only(batch):
#     texts = [[] for _ in range(len(batch[0]))]

#     for example in batch:
#         for idx, text in enumerate(example):
#             texts[idx].append(text.strip())

#     tokenized = tokenizer(
#         *texts, padding=True, truncation="longest_first", return_tensors="pt"
#     )

#     for name in tokenized:
#         tokenized[name] = tokenized[name].to(target_device)

#     return tokenized

In [5]:
# def smart_batching_collate(batch):
#     texts = [[] for _ in range(len(batch[0].texts))]
#     labels = []

#     for example in batch:
#         for idx, text in enumerate(example.texts):
#             texts[idx].append(text.strip())

#         labels.append(example.label)

#     tokenized = tokenizer(
#         *texts,
#         padding=True,
#         truncation="longest_first",
#         return_tensors="pt",
#         max_length=max_length
#     )

#     labels = torch.tensor(labels, dtype=torch.float).to(target_device)

#     for name in tokenized:
#         tokenized[name] = tokenized[name].to(target_device)

#     return tokenized, labels

In [4]:
def smart_batching_collate_new(batch):
    """Collate examples into three tokenized text-pair batches plus a score tensor.

    Every example must expose ``ques_ref_ans_texts``, ``ques_stud_ans_texts`` and
    ``ref_ans_stud_ans_texts`` (sequences of strings) as well as ``scores``.
    Texts are stripped, regrouped position-wise across the batch and tokenized
    with the module-level ``tokenizer`` (padding enabled, ``longest_first``
    truncation, module-level ``max_length``). All resulting tensors and the
    scores are moved to ``target_device``.

    :param batch: Non-empty list of examples; the first example determines how
        many text positions each pair type has.
    :return: ``(ques_ref_tokenized, ques_stud_tokenized, ref_stud_tokenized, scores)``
        with ``scores`` as a float tensor.
    """
    head = batch[0]
    qr_columns = [[] for _ in head.ques_ref_ans_texts]
    qs_columns = [[] for _ in head.ques_stud_ans_texts]
    rs_columns = [[] for _ in head.ref_ans_stud_ans_texts]
    targets = []

    for item in batch:
        # Walk the three text lists of one example in lockstep, one position at a time.
        aligned = zip(
            item.ques_ref_ans_texts,
            item.ques_stud_ans_texts,
            item.ref_ans_stud_ans_texts,
        )
        for position, (qr_text, qs_text, rs_text) in enumerate(aligned):
            qr_columns[position].append(qr_text.strip())
            qs_columns[position].append(qs_text.strip())
            rs_columns[position].append(rs_text.strip())

        targets.append(item.scores)

    # One set of tokenizer settings shared by all three pair types.
    tokenizer_options = dict(
        padding=True,
        truncation="longest_first",
        return_tensors="pt",
        max_length=max_length,
    )
    ques_ref_tokenized = tokenizer(*qr_columns, **tokenizer_options)
    ques_stud_tokenized = tokenizer(*qs_columns, **tokenizer_options)
    ref_stud_tokenized = tokenizer(*rs_columns, **tokenizer_options)

    scores = torch.tensor(targets, dtype=torch.float).to(target_device)

    # Iterate the keys of the three encodings together and move each tensor to the device.
    key_triples = zip(ques_ref_tokenized, ques_stud_tokenized, ref_stud_tokenized)
    for qr_key, qs_key, rs_key in key_triples:
        ques_ref_tokenized[qr_key] = ques_ref_tokenized[qr_key].to(target_device)
        ques_stud_tokenized[qs_key] = ques_stud_tokenized[qs_key].to(target_device)
        ref_stud_tokenized[rs_key] = ref_stud_tokenized[rs_key].to(target_device)

    return ques_ref_tokenized, ques_stud_tokenized, ref_stud_tokenized, scores

In [5]:
class GravitasData:
    """One graded answer, held as the three text pairs used downstream plus its score.

    ``row`` is any mapping-like object indexable by the column names
    ``question_text``, ``best_match_reference_answer``, ``answer_text`` and
    ``normalized_scores``.
    """

    def __init__(self, row) -> None:
        question = row["question_text"]
        reference = row["best_match_reference_answer"]
        answer = row["answer_text"]

        # Each pair is stored as [first_text, second_text].
        self.ques_ref_ans_texts = [question, reference]
        self.ques_stud_ans_texts = [question, answer]
        self.ref_ans_stud_ans_texts = [reference, answer]
        self.scores = row["normalized_scores"]

    def __str__(self) -> str:
        # (template, value) pairs rendered in a fixed order, one line each.
        labelled = (
            ("Question & Ref Ans: {}\n", self.ques_ref_ans_texts),
            ("Question & Student Ans: {}\n", self.ques_stud_ans_texts),
            ("Ref Ans & Student Ans: {}\n", self.ref_ans_stud_ans_texts),
            ("Scores: {}\n", self.scores),
        )
        return "".join(template.format(value) for template, value in labelled)

In [7]:
# Location of the processed SemEval-2013 Task 7 (5-way) files below the project root.
root_path = Path(config.ROOT_PATH).joinpath("datasets", "semeval2013-Task7-5way", "processed")

# One dataset object per split; the default split is used for training.
training_dataset = torch_dataset.GravitasDataset(root_path)
eval_dataset = torch_dataset.GravitasDataset(
    root_path, train=TrainingType.TESTING_UNSEEN_ANSWERS
)
test_dataset = torch_dataset.GravitasDataset(
    root_path, train=TrainingType.TESTING_UNSEEN_QUESTIONS
)

# Turn every row of each split's frame into a GravitasData example, keeping the split order.
training_data, eval_data, test_data = (
    split.data.apply(GravitasData, axis=1).tolist()
    for split in (training_dataset, eval_dataset, test_dataset)
)

d:\Dev\Gravitas-NLP\basis\..\datasets\semeval2013-Task7-5way\processed


In [9]:
# model=QuestionAnswerPairSimilarityModel(256,qnli_model_name=qnli_model_name,sts_model_name=sts_model_name)
# NOTE(review): 768 and 256 are passed positionally; presumably the encoder hidden size and the
# width of the layer on top of it — confirm against QuestionAnswerPairSimilarityModel's signature.
model = QuestionAnswerPairSimilarityModel(768,256,qnli_model_name,sts_model_name)

In [10]:
# print(training_data[2])

In [11]:
# Batch size shared by all three loaders.
batch_size = 32

# Only the training loader is shuffled. The evaluation and test loaders keep a fixed order so
# that the validation loss used for early stopping is computed over the same batches every epoch.
train_dataloader = DataLoader(
    training_data, batch_size=batch_size, collate_fn=smart_batching_collate_new, shuffle=True
)
eval_dataloader = DataLoader(
    eval_data, batch_size=batch_size, collate_fn=smart_batching_collate_new, shuffle=False
)
test_dataloader = DataLoader(
    test_data, batch_size=batch_size, collate_fn=smart_batching_collate_new, shuffle=False
)

# iterator = tqdm(train_dataloader, desc="Batches")
# for ques_ref_features, ques_stud_features, ref_stud_features, scores in iterator:
#     pass

In [12]:
# # Define the loss function
# criterion = nn.BCELoss()

# # Choose the optimizer
# optimizer = optim.AdamW(model.parameters(), lr=0.001)

# # Number of epochs
# epochs = 100

# # Initialize variables for early stopping
# patience = 10
# best_val_loss = None
# epochs_no_improve = 0

# # Train the model
# for epoch in range(epochs):
#     # Training phase
#     running_loss = 0.0
#     for i, data in enumerate(train_data, 0):
#         inputs1, inputs2, labels = data
#         optimizer.zero_grad()
#         outputs = model(inputs1, inputs2)
#         loss = criterion(outputs, labels)
#         loss.backward()
#         optimizer.step()
#         running_loss += loss.item()

#     # Validation phase
#     val_loss = 0.0
#     with torch.no_grad():
#         for data in val_data:
#             inputs1, inputs2, labels = data
#             outputs = model(inputs1, inputs2)
#             loss = criterion(outputs, labels)
#             val_loss += loss.item()

#     # Check for early stopping
#     if best_val_loss is None or val_loss < best_val_loss:
#         best_val_loss = val_loss
#         epochs_no_improve = 0
#     else:
#         epochs_no_improve += 1
#         if epochs_no_improve == patience:
#             print('Early stopping!')
#             break

# print('Finished Training')

In [13]:
def get_scheduler(optimizer, scheduler: str, warmup_steps: int, t_total: int):
    """
    Build the learning-rate scheduler selected by name (case-insensitive).

    Recognised names: constantlr, warmupconstant, warmuplinear, warmupcosine,
    warmupcosinewithhardrestarts.

    :param optimizer: Optimizer the schedule is attached to.
    :param scheduler: Name of the schedule.
    :param warmup_steps: Number of warm-up steps (ignored by ``constantlr``).
    :param t_total: Total number of training steps (used by the linear and cosine schedules).
    :raises ValueError: If the name is not one of the recognised schedules.
    """
    scheduler = scheduler.lower()

    # Factories are wrapped in lambdas so nothing is constructed until the name has been resolved.
    factories = {
        'constantlr': lambda: transformers.get_constant_schedule(optimizer),
        'warmupconstant': lambda: transformers.get_constant_schedule_with_warmup(
            optimizer, num_warmup_steps=warmup_steps
        ),
        'warmuplinear': lambda: transformers.get_linear_schedule_with_warmup(
            optimizer, num_warmup_steps=warmup_steps, num_training_steps=t_total
        ),
        'warmupcosine': lambda: transformers.get_cosine_schedule_with_warmup(
            optimizer, num_warmup_steps=warmup_steps, num_training_steps=t_total
        ),
        'warmupcosinewithhardrestarts': lambda: transformers.get_cosine_with_hard_restarts_schedule_with_warmup(
            optimizer, num_warmup_steps=warmup_steps, num_training_steps=t_total
        ),
    }

    build = factories.get(scheduler)
    if build is None:
        raise ValueError("Unknown scheduler {}".format(scheduler))
    return build()

In [14]:
def save(path, model, tokenizer):
    """
    Write the model and the tokenizer to ``path`` through their ``save_pretrained`` methods.

    Nothing happens (and nothing is logged) when ``path`` is None.
    """
    if path is not None:
        logger.info("Save model to {}".format(path))
        # Model first, then tokenizer, both into the same directory.
        for artefact in (model, tokenizer):
            artefact.save_pretrained(path)

In [15]:
def eval_during_training(
    evaluator,
    output_path,
    save_best_model,
    epoch,
    steps,
    callback,
    best_score,
    model=None,
    tokenizer=None,
):
    """Runs evaluation during the training and tracks the best score seen so far.

    :param evaluator: Callable invoked as ``evaluator(output_path=..., epoch=..., steps=...)``
        that returns a score where higher is better.
    :param output_path: Forwarded to the evaluator and used as the save location.
    :param save_best_model: If true, the model and tokenizer are saved whenever the score improves.
    :param epoch: Current epoch, forwarded to the evaluator and the callback.
    :param steps: Current step count, forwarded to the evaluator and the callback.
    :param callback: Optional callable invoked as ``callback(score, epoch, steps)``.
    :param best_score: Best score observed before this call.
    :param model: Model to save; required when ``save_best_model`` is true.
    :param tokenizer: Tokenizer to save; required when ``save_best_model`` is true.
    :return: The updated best score.
    :raises ValueError: If a new best model should be saved but ``model`` or
        ``tokenizer`` was not supplied.
    """
    score = evaluator(output_path=output_path, epoch=epoch, steps=steps)
    if callback is not None:
        callback(score, epoch, steps)

    if score > best_score:
        best_score = score
        if save_best_model:
            # `save` needs the objects to persist; the previous call passed only the path and
            # therefore failed with a TypeError as soon as a new best score was reached.
            if model is None or tokenizer is None:
                raise ValueError(
                    "save_best_model=True requires both `model` and `tokenizer` to be passed"
                )
            save(output_path, model, tokenizer)

    return best_score

In [20]:
def fit(
    model,
    train_dataloader: DataLoader,
    eval_dataloader: DataLoader,
    epochs: int = 1,
    criterion=None,
    activation_fct=nn.Identity(),
    scheduler: str = "WarmupLinear",
    warmup_steps: int = 10000,
    optimizer_class: Type[Optimizer] = torch.optim.AdamW,
    optimizer_params: Dict[str, object] = {"lr": 2e-5},
    weight_decay: float = 0.01,
    output_path: str = None,
    max_grad_norm: float = 1,
    show_progress_bar: bool = True,
    patience: int = 10,
):
    """
    Train ``model`` on ``train_dataloader`` and validate on ``eval_dataloader`` after every epoch.

    Training stops early once the summed validation loss has failed to improve for
    ``patience`` consecutive epochs. Mixed precision (autocast + gradient scaling) is
    used only when the module-level ``target_device`` is a CUDA device. After the last
    validation pass the model is left in evaluation mode.

    :param model: Module called as ``model(ques_ref_features, ques_stud_features, ref_stud_features)``.
    :param train_dataloader: DataLoader yielding ``(ques_ref, ques_stud, ref_stud, scores)`` batches for training.
    :param eval_dataloader: DataLoader yielding the same kind of batches for validation.
    :param epochs: Maximum number of epochs for training.
    :param criterion: Loss function. If None, ``nn.CrossEntropyLoss()`` is used.
        NOTE(review): the predictions are flattened to 1-D and the targets are continuous
        scores, so a regression/BCE-style loss may be what is actually intended — confirm.
    :param activation_fct: Activation function applied on top of the model output.
    :param scheduler: Learning rate scheduler name (constantlr, warmupconstant, warmuplinear,
        warmupcosine, warmupcosinewithhardrestarts) or an already constructed scheduler object.
    :param warmup_steps: Behavior depends on the scheduler. For WarmupLinear (default), the learning
        rate is increased from 0 up to the maximal learning rate over this many steps and then
        decreased linearly back to zero.
    :param optimizer_class: Optimizer class.
    :param optimizer_params: Keyword arguments for the optimizer. The default dict is shared
        between calls and is never mutated here.
    :param weight_decay: Weight decay for all parameters except biases and LayerNorm parameters.
    :param output_path: If given, the directory is created; nothing is written to it by this function.
    :param max_grad_norm: Maximum gradient norm used for gradient clipping.
    :param show_progress_bar: If True, output tqdm progress bars.
    :param patience: Number of consecutive epochs without validation improvement after which
        training stops.
    """
    # Automatic Mixed Precision: only active on CUDA. On CPU both helpers are explicitly
    # disabled and behave as pass-throughs.
    from torch.cuda.amp import GradScaler, autocast

    use_amp = target_device.type == "cuda"
    scaler = GradScaler(enabled=use_amp)

    model.to(target_device)

    if output_path is not None:
        os.makedirs(output_path, exist_ok=True)

    num_train_steps = int(len(train_dataloader) * epochs)

    # Prepare optimizer: biases and LayerNorm parameters are excluded from weight decay.
    param_optimizer = list(model.named_parameters())
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [
                p for n, p in param_optimizer if not any(nd in n for nd in no_decay)
            ],
            "weight_decay": weight_decay,
        },
        {
            "params": [
                p for n, p in param_optimizer if any(nd in n for nd in no_decay)
            ],
            "weight_decay": 0.0,
        },
    ]

    optimizer = optimizer_class(optimizer_grouped_parameters, **optimizer_params)

    if isinstance(scheduler, str):
        scheduler = get_scheduler(
            optimizer,
            scheduler=scheduler,
            warmup_steps=warmup_steps,
            t_total=num_train_steps,
        )

    if criterion is None:
        criterion = nn.CrossEntropyLoss()

    # Early-stopping state
    best_val_loss = None
    epochs_no_improve = 0

    for epoch in trange(epochs, desc="Epoch", disable=not show_progress_bar):
        # Training phase
        model.zero_grad()
        model.train()

        for ques_ref_features, ques_stud_features, ref_stud_features, scores in tqdm(
            train_dataloader,
            desc="Iteration",
            smoothing=0.05,
            disable=not show_progress_bar,
        ):
            with autocast(enabled=use_amp):
                model_predictions = model(
                    ques_ref_features,
                    ques_stud_features,
                    ref_stud_features,
                )
                logits = activation_fct(model_predictions)
                logits = logits.view(-1)
                loss_value = criterion(logits, scores)

            scale_before_step = scaler.get_scale()
            scaler.scale(loss_value).backward()
            # Gradients must be unscaled before clipping so the norm is measured in real units.
            scaler.unscale_(optimizer)
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
            scaler.step(optimizer)
            scaler.update()

            optimizer.zero_grad()

            # The scaler lowers its scale only when it skipped the optimizer step because of
            # inf/NaN gradients. A raised scale follows a successful step, so the scheduler
            # must still advance in that case.
            optimizer_step_skipped = scaler.get_scale() < scale_before_step
            if not optimizer_step_skipped:
                scheduler.step()

        # Validation phase: switch off dropout etc. so the loss is comparable across epochs.
        model.eval()
        val_loss = 0.0
        with torch.no_grad():
            for ques_ref_features, ques_stud_features, ref_stud_features, scores in eval_dataloader:
                model_predictions = model(
                    ques_ref_features,
                    ques_stud_features,
                    ref_stud_features,
                )
                logits = activation_fct(model_predictions)
                logits = logits.view(-1)
                val_loss += criterion(logits, scores).item()

        # Check for early stopping
        if best_val_loss is None or val_loss < best_val_loss:
            best_val_loss = val_loss
            epochs_no_improve = 0
        else:
            epochs_no_improve += 1
            if epochs_no_improve >= patience:
                print('Early stopping!')
                break

        #     training_steps += 1

        #     if evaluation_steps > 0 and training_steps % evaluation_steps == 0:
        #         best_score = eval_during_training(
        #             evaluator,
        #             output_path,
        #             save_best_model,
        #             epoch,
        #             training_steps,
        #             callback,
        #             best_score,
        #         )

        #         model.zero_grad()
        #         model.train()

        # if evaluator is not None:
        #     best_score=eval_during_training(
        #         evaluator, output_path, save_best_model, epoch, -1, callback, best_score
        #     )

In [21]:
num_epochs = 100
warmup_steps = math.ceil(
    len(train_dataloader) * num_epochs * 0.1
)  # 10% of the total training steps are used for warm-up

# `epochs` is taken from `num_epochs` so the warm-up length and the actual training length
# cannot drift apart when the value is changed.
fit(
    model,
    train_dataloader=train_dataloader,
    eval_dataloader=eval_dataloader,
    epochs=num_epochs,
    warmup_steps=warmup_steps,
)

Epoch:   0%|          | 0/100 [00:00<?, ?it/s]

Iteration:   0%|          | 0/279 [00:00<?, ?it/s]

tensor([0.4202, 0.4617, 0.5190, 0.4209, 0.4414, 0.4758, 0.5024, 0.4053, 0.4639,
        0.5190, 0.5278, 0.5137, 0.3728, 0.4514, 0.4763, 0.3313, 0.4583, 0.3938,
        0.4287, 0.3879, 0.4553, 0.4243, 0.4700, 0.4478, 0.4326, 0.4534, 0.3804,
        0.3914, 0.4053, 0.4302, 0.4021, 0.3660], device='cuda:0',
       dtype=torch.float16, grad_fn=<ViewBackward0>) tensor([1.0000, 0.0000, 0.0000, 0.0000, 0.5000, 0.0000, 0.0000, 0.0000, 1.0000,
        0.5000, 0.0000, 0.0000, 0.5000, 0.0000, 0.5000, 1.0000, 0.5000, 0.5000,
        1.0000, 1.0000, 1.0000, 0.5000, 0.5000, 0.5000, 0.5000, 0.0000, 1.0000,
        0.5000, 1.0000, 1.0000, 0.5000, 0.5000], device='cuda:0')
tensor([0.4133, 0.4072, 0.4587, 0.5288, 0.4478, 0.5259, 0.4128, 0.4739, 0.3879,
        0.4751, 0.4043, 0.4058, 0.3923, 0.4106, 0.3813, 0.4995, 0.4622, 0.3613,
        0.4172, 0.4753, 0.4783, 0.4973, 0.3235, 0.3777, 0.3892, 0.4241, 0.5195,
        0.4541, 0.4260, 0.4807, 0.4412, 0.3621], device='cuda:0',
       dtype=torch.float16, g

KeyboardInterrupt: 

In [None]:
# inp_dataloader = DataLoader(
#     question_answer,
#     batch_size=32,
#     collate_fn=smart_batching_collate_text_only,
#     shuffle=False,
# )

# iterator = tqdm(inp_dataloader, desc="Batches")

In [None]:
# model.eval()
# model.to(torch.device(device))
# with torch.no_grad():
#     for features in iterator:
#         qnli_predictions1 = model.base_model(**features, return_dict=True)
#         embeddings = qnli_predictions1[0]

#         print(embeddings.size())

torch.Size([32, 56, 768])
torch.Size([32, 47, 768])
torch.Size([32, 37, 768])
torch.Size([32, 47, 768])
torch.Size([32, 41, 768])
torch.Size([32, 39, 768])
torch.Size([32, 40, 768])
torch.Size([32, 43, 768])
torch.Size([32, 37, 768])
torch.Size([32, 38, 768])
torch.Size([32, 34, 768])
torch.Size([32, 41, 768])
torch.Size([32, 43, 768])
torch.Size([32, 64, 768])
torch.Size([32, 59, 768])
torch.Size([32, 73, 768])
torch.Size([32, 72, 768])
torch.Size([32, 96, 768])
torch.Size([32, 67, 768])
torch.Size([32, 42, 768])
torch.Size([32, 50, 768])
torch.Size([32, 39, 768])
torch.Size([32, 72, 768])
torch.Size([32, 87, 768])
torch.Size([32, 57, 768])
torch.Size([32, 71, 768])
torch.Size([32, 25, 768])
torch.Size([32, 22, 768])
torch.Size([32, 39, 768])
torch.Size([32, 40, 768])
torch.Size([32, 43, 768])
torch.Size([32, 39, 768])
torch.Size([32, 41, 768])
torch.Size([32, 50, 768])
torch.Size([32, 54, 768])
torch.Size([32, 37, 768])
torch.Size([32, 46, 768])
torch.Size([32, 45, 768])
torch.Size([