In [1]:
# Fetch the project sources and make the checkout the working directory.
# Later cells read files by relative path (e.g. "datasets/...") and import
# CustomSentenceTransformer, presumably a module from this repository — confirm.
!git clone https://github.com/cher-liang/Gravitas-NLP
%cd Gravitas-NLP

d:\Dev\Gravitas-NLP\Gravitas-NLP


Cloning into 'Gravitas-NLP'...


In [2]:
# Use the %pip magic rather than !pip: `!pip` runs whichever pip is first on
# the shell PATH, which may belong to a different environment than the one
# the kernel is running in. %pip always installs into the kernel's environment.
%pip install sentence-transformers

^C


In [None]:
# Switch the checkout to `restart`
# (presumably the branch this notebook is meant to run against — confirm).
!git checkout restart

In [None]:
# Unpack datasets.zip with 7-Zip, keeping the archive's folder structure
# (presumably this provides the datasets/ folder read by later cells — confirm).
!7z x datasets.zip

In [1]:
import pandas as pd
import numpy as np

import torch
from torch.utils.data import DataLoader

from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split

import logging 
from dataclasses import dataclass
from typing import List, Dict 
from datetime import datetime
import math

from sentence_transformers import losses
from CustomSentenceTransformer import SentenceTransformer
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
# from CustomEvaluator import CECustomEvaluator

In [2]:
# Root logger: INFO level, each record rendered as "<timestamp> - <message>".
logging.basicConfig(
    level=logging.INFO,
    datefmt="%Y-%m-%d %H:%M:%S",
    format="%(asctime)s - %(message)s",
)

# Logger for this notebook's own messages.
logger = logging.getLogger(__name__)

In [3]:
# Prefer the GPU when CUDA is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [4]:
class GravitasData:
    """A single graded answer: a (reference answer, answer) text pair plus its score.

    Instances expose ``texts``, ``label`` and ``dataset`` attributes.

    Args:
        row: Any object indexable by column name (the notebook passes DataFrame
            rows via ``df.apply(GravitasData, axis=1)``). It must provide the
            keys ``"reference_answer"``, ``"answer"``, ``"normalized_score"``
            and ``"source"``.
    """

    def __init__(self, row) -> None:
        # (reference answer, answer to be scored)
        self.texts = (row["reference_answer"], row["answer"])
        self.label = row["normalized_score"]
        # Name of the dataset this example came from.
        self.dataset = row["source"]

    def __str__(self) -> str:
        """Return a two-line, human-readable summary of the example."""
        # The attributes are `texts` and `label`; the previous implementation
        # read the non-existent `sentence_pair` / `score` and raised
        # AttributeError whenever an instance was printed.
        string = "Sentence Pair: {}\n".format(self.texts)
        string += "Score: {}\t Dataset: {}\n".format(self.label, self.dataset)
        return string

In [5]:
@dataclass
class DataFrameWrapper:
    """Pairs a loaded spreadsheet with the name of the dataset it came from."""

    dataset_source: str  # dataset name, e.g. "sag"
    df: pd.DataFrame  # the frame returned by pd.read_excel for that dataset


# Each dataset lives in its own workbook: datasets/<name>.xlsx
dataset_names = ["asap_sas", "cunlp", "misc", "sag", "semeval", "stita"]
dataframes: List[DataFrameWrapper] = [
    DataFrameWrapper(
        dataset_source=name,
        df=pd.read_excel(f"datasets/{name}.xlsx"),
    )
    for name in dataset_names
]



In [6]:
# Turn every spreadsheet row into a GravitasData example, grouped by dataset name.
datasets: Dict[str, List[GravitasData]] = {
    wrapper.dataset_source: wrapper.df.apply(GravitasData, axis=1).tolist()
    for wrapper in dataframes
}

In [7]:
# SemEval has dedicated evaluation workbooks: one test file and two develop files.
test_semeval_df = pd.read_excel("datasets/test/semeval_unseen_domains.xlsx")
dev_semeval_df1 = pd.read_excel("datasets/develop/semeval_unseen_answers.xlsx")
dev_semeval_df2 = pd.read_excel("datasets/develop/semeval_unseen_questions.xlsx")

test_semeval_dataset = test_semeval_df.apply(GravitasData, axis=1).tolist()

# Develop set: the unseen-answers examples followed by the unseen-questions examples.
dev_semeval_dataset = [
    example
    for dev_frame in (dev_semeval_df1, dev_semeval_df2)
    for example in dev_frame.apply(GravitasData, axis=1).tolist()
]

In [8]:
# Drop "semeval" from the list of dataset names (it has its own develop/test
# files). Filtering in place instead of calling list.remove() keeps this cell
# idempotent: re-running it no longer raises ValueError once "semeval" is gone.
dataset_names[:] = [name for name in dataset_names if name != "semeval"]

In [9]:
# model = SentenceTransformer("avsolatorio/GIST-Embedding-v0")
# model_save_path = f'output/gist'+datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
# print(model.model.base_model)
# print(model.model)

In [10]:
# Training hyperparameters used by every cross-validation fold.
train_batch_size, num_epochs = 16, 5

In [11]:
# Shuffled 5-fold splitter; the fixed random_state keeps fold membership
# reproducible from run to run.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

In [12]:
# 5-fold cross-validation with a 60-20-20 train/develop/test split.
#
# For every dataset except "semeval", KFold holds out one fold (20%) as the
# develop split; the remaining 80% is divided 75/25 into train and test,
# i.e. 60% / 20% of the whole dataset. "semeval" is not split: all of its
# main examples are used for training, and its develop/test examples come
# from the dedicated spreadsheets loaded earlier.
#
# The dataset names are derived from `datasets` itself rather than from the
# mutable `dataset_names` list, so each split is always paired with the
# dataset it was computed on. (zip() would otherwise silently mis-pair them
# if "semeval" were still present in `dataset_names`.)
kfold_dataset_names = [name for name in datasets if name != "semeval"]
kfSplits = [kf.split(datasets[name]) for name in kfold_dataset_names]

for fold, indexes in enumerate(zip(*kfSplits)):
    # Every fold starts again from the same pretrained checkpoint.
    model = SentenceTransformer("avsolatorio/GIST-Embedding-v0")
    model_save_path = "output/gist-" + datetime.now().strftime("%Y-%m-%d_%H-%M-%S")

    # Seed each split with the SemEval examples, which are never K-folded.
    train_datasets_list = [np.array(datasets["semeval"])]
    test_datasets_list = [np.array(test_semeval_dataset)]
    dev_datasets_list = [np.array(dev_semeval_dataset)]

    # Split each remaining dataset into train / develop / test for this fold.
    for (train_test_index, dev_index), dataset_name in zip(
        indexes, kfold_dataset_names
    ):
        # 25% of the non-held-out 80% becomes the test split (20% overall).
        train_index, test_index = train_test_split(
            train_test_index, test_size=0.25, random_state=42
        )
        train_datasets_list.append(np.take(datasets[dataset_name], train_index))
        dev_datasets_list.append(np.take(datasets[dataset_name], dev_index))
        test_datasets_list.append(np.take(datasets[dataset_name], test_index))

    train_dataset = torch.utils.data.ConcatDataset(train_datasets_list)
    dev_dataset = torch.utils.data.ConcatDataset(dev_datasets_list)
    test_dataset = torch.utils.data.ConcatDataset(test_datasets_list)
    logger.info(
        "Fold %d: %d train / %d dev / %d test examples",
        fold,
        len(train_dataset),
        len(dev_dataset),
        len(test_dataset),
    )

    train_dataloader = DataLoader(
        dataset=train_dataset,
        batch_size=train_batch_size,
        shuffle=True,
    )

    train_loss = losses.CosineSimilarityLoss(model=model)
    evaluator = EmbeddingSimilarityEvaluator.from_input_examples(
        dev_dataset, name="gist-dev"
    )

    # Warm up over the first 10% of the total number of training steps.
    warmup_steps = math.ceil(len(train_dataloader) * num_epochs * 0.1)

    # NOTE(review): `freeze` is an argument of the project's
    # CustomSentenceTransformer.fit, not of the upstream library — presumably
    # it freezes part of the model; confirm against that module.
    model.fit(
        train_objectives=[(train_dataloader, train_loss)],
        freeze=True,
        evaluator=evaluator,
        epochs=num_epochs,
        evaluation_steps=1000,
        warmup_steps=warmup_steps,
        output_path=model_save_path,
    )

    # Score the trained model on this fold's held-out test split and keep the
    # result visible in the log instead of discarding it.
    test_evaluator = EmbeddingSimilarityEvaluator.from_input_examples(
        test_dataset, name="gist-test"
    )
    test_score = test_evaluator(model, output_path=model_save_path)
    logger.info("Fold %d: test score = %s", fold, test_score)

2024-02-19 13:46:14 - Load pretrained SentenceTransformer: avsolatorio/GIST-Embedding-v0


2024-02-19 13:46:15 - Use pytorch device: cuda


Epoch:   0%|          | 0/5 [00:00<?, ?it/s]

Iteration:   0%|          | 0/1315 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [None]:
# Mount Google Drive at /content/drive.
# NOTE(review): google.colab is only importable inside Google Colab, so this
# cell is expected to fail elsewhere — confirm it is meant to be Colab-only.
from google.colab import drive
drive.mount('/content/drive')

# Create a "PyTorch Models" folder at the top level of the mounted Drive (-p: no error if it already exists)
!mkdir -p "/content/drive/My Drive/PyTorch Models"
