In [None]:
# Fetch the project sources and make the checkout the working directory.
# Later cells import CustomCrossEncoder / CustomEvaluator and read relative
# `datasets/` paths; presumably both resolve against this checkout — TODO confirm.
# NOTE(review): running this cell a second time (after the %cd) clones a nested copy.
!git clone https://github.com/cher-liang/Gravitas-NLP
%cd Gravitas-NLP

In [None]:
# Download the dataset archive from Google Drive (-L follows redirects),
# unpack it into the current directory, then remove the archive.
# NOTE(review): presumably the archive contains the `datasets/` folder that the
# loading cells read from — confirm after extraction.
!curl -L -o dataset.zip "https://drive.google.com/uc?id=12LAWEMQpGCxkFQbZFRN6_v8imQkg40rp"
!unzip dataset.zip
!rm dataset.zip

In [1]:
import pandas as pd
import numpy as np

import torch
from torch.utils.data import DataLoader
# from torch import nn
# from torch import optim
# from torch.optim import Optimizer

# from tqdm.autonotebook import tqdm, trange
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
# from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoConfig
# import transformers

import os
import logging 
from dataclasses import dataclass
from typing import List, Dict, Type
from datetime import datetime
import math

from CustomCrossEncoder import CrossEncoder
from CustomEvaluator import CECustomEvaluator
# from CustomSTSBert import BertLSTMClassificationLayer

  from tqdm.autonotebook import tqdm, trange


In [2]:
# Notebook-wide logging: INFO level, each record prefixed with a timestamp.
logging.basicConfig(
    level=logging.INFO,
    datefmt="%Y-%m-%d %H:%M:%S",
    format="%(asctime)s - %(message)s",
)

# Logger named after the current module.
logger = logging.getLogger(__name__)

In [3]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [4]:
# 5-fold splitter shared by every dataset. The seed is fixed so the shuffled
# fold assignment is the same on every run; without it each Restart & Run All
# would produce different train/develop/test partitions.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

In [5]:
class StsData:
    """One scored sentence pair built from a single spreadsheet row.

    Attributes:
        sentence_pair: ``(reference_answer, answer)`` tuple taken from the row.
        score: value of the row's ``normalized_score`` field.
        dataset: value of the row's ``source`` field.
    """

    def __init__(self, row) -> None:
        """Copy the relevant fields out of ``row`` (any mapping-like row object)."""
        reference, answer = row["reference_answer"], row["answer"]
        self.sentence_pair = (reference, answer)
        self.score = row["normalized_score"]
        self.dataset = row["source"]

    def __str__(self) -> str:
        """Return a two-line, human-readable summary of the example."""
        pair_line = "Sentence Pair: {}\n".format(self.sentence_pair)
        meta_line = "Score: {}\t Dataset: {}\n".format(self.score, self.dataset)
        return pair_line + meta_line

In [6]:
@dataclass
class DataFrameWrapper:
    """Pairs a loaded pandas DataFrame with the name of the dataset it came from."""

    # Dataset identifier; filled from `dataset_names` and later used as the key of `datasets`.
    dataset_source: str
    # Contents of the dataset's spreadsheet as read by pandas.
    df: pd.DataFrame


# Names of the spreadsheets under `datasets/`, one per source corpus.
dataset_names = ["asap_sas", "cunlp", "misc", "sag", "semeval", "stita"]

# Read every spreadsheet once and keep it alongside its dataset name.
dataframes: List[DataFrameWrapper] = []
for dataset_name in dataset_names:
    frame = pd.read_excel(f"datasets/{dataset_name}.xlsx")
    dataframes.append(DataFrameWrapper(dataset_source=dataset_name, df=frame))


In [7]:
# Map each dataset name to its rows converted into StsData examples.
datasets: Dict[str, List[StsData]] = {}
for dataframe in dataframes:
    # Row-wise apply builds one StsData per spreadsheet row.
    sts_rows = dataframe.df.apply(StsData, axis=1)
    datasets[dataframe.dataset_source] = sts_rows.tolist()

In [8]:
# SemEval comes with dedicated develop/test spreadsheets; load them separately.
test_semeval_df = pd.read_excel("datasets/test/semeval_unseen_domains.xlsx")
dev_semeval_df1 = pd.read_excel("datasets/develop/semeval_unseen_answers.xlsx")
dev_semeval_df2 = pd.read_excel("datasets/develop/semeval_unseen_questions.xlsx")

# Test set: one StsData per row of the unseen-domains sheet.
test_semeval_dataset = test_semeval_df.apply(StsData, axis=1).tolist()

# Develop set: unseen-answers rows followed by unseen-questions rows.
dev_semeval_dataset = [
    *dev_semeval_df1.apply(StsData, axis=1).tolist(),
    *dev_semeval_df2.apply(StsData, axis=1).tolist(),
]

In [9]:
# "semeval" has its own develop/test spreadsheets, so it is excluded from the
# names that take part in the K-fold split. Filtering (instead of list.remove)
# keeps this cell idempotent: re-running it no longer raises ValueError once
# "semeval" is already gone.
dataset_names = [name for name in dataset_names if name != "semeval"]

In [10]:
# sts_model_name = "cross-encoder/stsb-distilroberta-base"

# tokenizer = AutoTokenizer.from_pretrained(sts_model_name)
# config = AutoConfig.from_pretrained(sts_model_name)

# model = AutoModelForSequenceClassification.from_pretrained(sts_model_name)

In [11]:
# Very low sentinel so that any real score compares greater.
best_score = -9_999_999

In [12]:
# Start from the pretrained STS-B cross-encoder checkpoint.
sts_model_name = "cross-encoder/stsb-distilroberta-base"
model = CrossEncoder(sts_model_name)

# Debugging aids for inspecting the wrapped network:
# print(model.model.base_model)
# print(model.model)

2024-02-04 22:22:24 - Use pytorch device: cuda


In [13]:
# Training hyper-parameters.
train_batch_size = 16
num_epochs = 20

# Output directory, made unique per run by a timestamp suffix.
run_timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
model_save_path = "output/training_sts_continue_training-" + run_timestamp

In [14]:
# 5-fold cross-validation with a 60-20-20 train/develop/test split.
# Every dataset except "semeval" is split with KFold: the held-out fold (20%)
# becomes the develop set and the remaining 80% is divided 75/25 into train and
# test. "semeval" contributes its predefined train/develop/test sets to every fold.
#
# The names paired with the splits are derived here from `datasets` itself, so
# the pairing cannot go out of sync with a separately mutated name list.
kfold_dataset_names = [key for key in datasets if key != "semeval"]
kfSplits = [kf.split(datasets[key]) for key in kfold_dataset_names]

for fold, indexes in enumerate(zip(*kfSplits)):
    # Seed each partition with the fixed SemEval splits.
    train_datasets_list, test_datasets_list, dev_datasets_list = (
        [np.array(datasets["semeval"])],
        [np.array(test_semeval_dataset)],
        [np.array(dev_semeval_dataset)],
    )

    # Split each remaining dataset into train, develop and test for this fold.
    for (train_test_index, dev_index), dataset_name in zip(
        indexes, kfold_dataset_names
    ):
        # 25% of the non-develop 80% -> 20% of the whole dataset for testing.
        # Seeding with the fold number makes the split repeatable for a given fold.
        train_index, test_index = train_test_split(
            train_test_index, test_size=0.25, random_state=fold
        )
        train_datasets_list.append(np.take(datasets[dataset_name], train_index))
        dev_datasets_list.append(np.take(datasets[dataset_name], dev_index))
        test_datasets_list.append(np.take(datasets[dataset_name], test_index))

    train_dataset = torch.utils.data.ConcatDataset(train_datasets_list)
    dev_dataset = torch.utils.data.ConcatDataset(dev_datasets_list)
    test_dataset = torch.utils.data.ConcatDataset(test_datasets_list)

    train_dataloader = DataLoader(
        dataset=train_dataset,
        batch_size=train_batch_size,
        shuffle=True,
    )

    evaluator = CECustomEvaluator.from_input_examples(dev_dataset, name="sts-dev")

    # Warm up the learning rate over the first 10% of all training steps.
    warmup_steps = math.ceil(len(train_dataloader) * num_epochs * 0.1)

    # Start every fold from the pretrained checkpoint. Reusing one model across
    # folds would let it train on examples that later folds use for testing,
    # which invalidates the cross-validation scores.
    model = CrossEncoder("cross-encoder/stsb-distilroberta-base")

    # Keep each fold's artifacts in their own directory instead of overwriting
    # the previous fold's output.
    fold_save_path = os.path.join(model_save_path, f"fold-{fold}")
    os.makedirs(fold_save_path, exist_ok=True)

    model.fit(
        train_dataloader=train_dataloader,
        evaluator=evaluator,
        epochs=num_epochs,
        evaluation_steps=1000,
        warmup_steps=warmup_steps,
        output_path=fold_save_path,
    )

    # NOTE(review): this scores the in-memory model as left by fit(); if fit()
    # stores a best checkpoint under output_path, consider reloading it first —
    # confirm against CustomCrossEncoder.
    test_evaluator = CECustomEvaluator.from_input_examples(test_dataset, name="sts-test")
    test_evaluator(model, output_path=fold_save_path)


    # dev_dataloader = DataLoader(
    #     dataset=dev_dataset,
    #     batch_size=batch_size,
    #     # collate_fn=smart_batching_collate,
    #     shuffle=True,
    # )

    # test_dataloader = DataLoader(
    #     dataset=test_dataset,
    #     batch_size=batch_size,
    #     # collate_fn=smart_batching_collate,
    #     shuffle=True,
    # )

Epoch:   0%|          | 0/20 [00:00<?, ?it/s]

Iteration:   0%|          | 0/1315 [00:00<?, ?it/s]

KeyboardInterrupt: 