In [None]:
# Clone the Gravitas-NLP repository and make the checkout the working
# directory for the rest of the notebook.
# NOTE(review): the recorded output shows the resulting cwd as
# ...\Gravitas-NLP\Gravitas-NLP, i.e. the clone was made from inside an
# existing checkout -- confirm this nesting is intended.
!git clone https://github.com/cher-liang/Gravitas-NLP
%cd Gravitas-NLP

d:\Dev\Gravitas-NLP\Gravitas-NLP


Cloning into 'Gravitas-NLP'...


In [None]:
!pip install sentence-transformers

^C


In [None]:
# Check out the `restart` ref (presumably a branch -- confirm) of the
# repository in the current working directory.
!git checkout restart

In [None]:
# Extract datasets.zip (with its internal directory structure, 7z `x`) into
# the current working directory.
# NOTE(review): if the files already exist on a re-run, 7z may ask before
# overwriting; consider adding `-y` so the cell cannot stall -- confirm.
!7z x datasets.zip

In [1]:
import pandas as pd
import numpy as np

import torch
from torch.utils.data import DataLoader

from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split

import logging 
from dataclasses import dataclass
from typing import List, Dict 
from datetime import datetime
import math

from CustomCrossEncoder import CrossEncoder
from CustomEvaluator import CECustomEvaluator

  from tqdm.autonotebook import tqdm, trange


In [2]:
# Configure root logging once: INFO and above, each record prefixed with a
# "YYYY-MM-DD HH:MM:SS" timestamp.
logging.basicConfig(
    level=logging.INFO,
    datefmt="%Y-%m-%d %H:%M:%S",
    format="%(asctime)s - %(message)s",
)

# Logger named after the current module for messages emitted by this notebook.
logger = logging.getLogger(__name__)

In [3]:
# Select the compute device: CUDA when PyTorch reports it available, CPU otherwise.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [4]:
class QnliData:
    """A single scored question/answer example built from one dataset row.

    Attributes:
        sentence_pair: ``(question, answer)`` tuple taken from the row.
        score: Value of the row's ``normalized_score`` field.
        dataset: Value of the row's ``source`` field.
    """

    def __init__(self, row) -> None:
        """Populate the example from ``row``.

        Args:
            row: Any mapping-like object indexable by the keys ``question``,
                ``answer``, ``normalized_score`` and ``source``.
        """
        question, answer = row["question"], row["answer"]
        self.sentence_pair = (question, answer)
        self.score = row["normalized_score"]
        self.dataset = row["source"]

    def __str__(self) -> str:
        """Return a two-line, human-readable summary of the example."""
        pair_line = f"Sentence Pair: {self.sentence_pair}\n"
        meta_line = f"Score: {self.score}\t Dataset: {self.dataset}\n"
        return pair_line + meta_line

In [5]:
def removeIrrelevantQuestions(df: pd.DataFrame):
    """Drop rows whose ``question`` is one of a fixed set of generic prompts.

    Args:
        df: Frame with a ``question`` column.

    Returns:
        The rows of ``df`` whose question is not exactly one of the listed
        prompts; the original index labels are kept.
    """
    generic_prompts = [
        "Why?",
        "Explain your reasoning.",
        "Why not?",
        "Why did it happen?",
    ]
    is_generic = df["question"].isin(generic_prompts)
    return df[~is_generic]

In [6]:
# Load each SemEval spreadsheet and immediately drop the generic prompts.
train_semeval_df = removeIrrelevantQuestions(
    pd.read_excel("datasets/semeval.xlsx")
)
test_semeval_df = removeIrrelevantQuestions(
    pd.read_excel("datasets/test/semeval_unseen_domains.xlsx")
)
dev_semeval_df1 = removeIrrelevantQuestions(
    pd.read_excel("datasets/develop/semeval_unseen_answers.xlsx")
)
dev_semeval_df2 = removeIrrelevantQuestions(
    pd.read_excel("datasets/develop/semeval_unseen_questions.xlsx")
)

# Wrap every row in a QnliData example. Train is kept as a NumPy array and
# test as a plain list, exactly as before.
train_semeval_dataset = train_semeval_df.apply(QnliData, axis=1).to_numpy()
test_semeval_dataset = test_semeval_df.apply(QnliData, axis=1).tolist()

# The development set is the two development spreadsheets joined end to end.
dev_semeval_dataset = np.append(
    *(
        frame.apply(QnliData, axis=1).to_numpy()
        for frame in (dev_semeval_df1, dev_semeval_df2)
    )
)

In [7]:
# SAG spreadsheet -> list of QnliData examples (one per row).
sag_df = pd.read_excel("datasets/sag.xlsx")
sag_dataset = sag_df.apply(QnliData, axis=1).tolist()

# Misc spreadsheet -> list of QnliData examples (one per row).
misc_df = pd.read_excel("datasets/misc.xlsx")
misc_dataset = misc_df.apply(QnliData, axis=1).tolist()

In [None]:
# Training hyperparameters consumed by the cross-validation cell.
train_batch_size: int = 16  # batch size handed to the training DataLoader
num_epochs: int = 5  # epochs passed to model.fit for every fold

In [8]:
# 5-fold cross-validation over the SAG and misc datasets.
#
# For every fold the held-out KFold partition of SAG/misc is used as the
# development set, and the remaining indices are split 75/25 into train and
# test. The SemEval train/dev/test arrays are added unchanged (apart from
# being shuffled) to the corresponding split in every fold. A fresh
# CrossEncoder is fine-tuned per fold and then scored on that fold's test set.
#
# All sources of randomness are seeded so the folds, the splits and the
# training order are reproducible across runs.
SEED = 42
shuffle_rng = np.random.default_rng(SEED)

kf = KFold(n_splits=5, shuffle=True, random_state=SEED)
for fold, (sag_index, misc_index) in enumerate(
    zip(kf.split(sag_dataset), kf.split(misc_dataset))
):
    # Seed PyTorch per fold so DataLoader shuffling (and any other torch
    # randomness during training) is repeatable for each fold on its own.
    torch.manual_seed(SEED + fold)

    # In-place shuffles of the SemEval splits (array or list).
    shuffle_rng.shuffle(train_semeval_dataset)
    shuffle_rng.shuffle(test_semeval_dataset)
    shuffle_rng.shuffle(dev_semeval_dataset)

    # KFold yields (remaining, held-out); the held-out part is the dev set.
    train_test_sag_index, dev_sag_index = sag_index
    train_test_misc_index, dev_misc_index = misc_index

    # Split the remaining indices 75% train / 25% test.
    train_sag_index, test_sag_index = train_test_split(
        train_test_sag_index, test_size=0.25, random_state=SEED
    )
    train_misc_index, test_misc_index = train_test_split(
        train_test_misc_index, test_size=0.25, random_state=SEED
    )

    train_datasets_list = [
        train_semeval_dataset,
        np.take(sag_dataset, train_sag_index),
        np.take(misc_dataset, train_misc_index),
    ]

    dev_datasets_list = [
        dev_semeval_dataset,
        np.take(sag_dataset, dev_sag_index),
        np.take(misc_dataset, dev_misc_index),
    ]

    test_datasets_list = [
        test_semeval_dataset,
        np.take(sag_dataset, test_sag_index),
        np.take(misc_dataset, test_misc_index),
    ]

    train_dataset = torch.utils.data.ConcatDataset(train_datasets_list)
    dev_dataset = torch.utils.data.ConcatDataset(dev_datasets_list)
    test_dataset = torch.utils.data.ConcatDataset(test_datasets_list)

    model = CrossEncoder("cross-encoder/qnli-electra-base")
    # Separate the fold index from the timestamp; without the underscore the
    # two run together (e.g. "fold02024-...") and the fold number is ambiguous.
    timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    model_save_path = f"output/training_qnli_fold{fold}_{timestamp}"

    train_dataloader = DataLoader(
        dataset=train_dataset,
        batch_size=train_batch_size,
        shuffle=True,
    )

    evaluator = CECustomEvaluator.from_input_examples(dev_dataset, name="sts-dev")

    # Warm up over 10% of the total number of training steps
    # (batches per epoch * epochs).
    warmup_steps = math.ceil(len(train_dataloader) * num_epochs * 0.1)

    # `freeze` is an argument of the project's custom CrossEncoder.fit; its
    # exact semantics are defined in CustomCrossEncoder (not shown here).
    model.fit(
        train_dataloader=train_dataloader,
        freeze=True,
        evaluator=evaluator,
        epochs=num_epochs,
        evaluation_steps=600,
        warmup_steps=warmup_steps,
        output_path=model_save_path,
    )

    # Score the fine-tuned model on this fold's test split; results are
    # written under the same output directory as the model.
    test_evaluator = CECustomEvaluator.from_input_examples(
        test_dataset, name="sts-test"
    )
    test_evaluator(model, output_path=model_save_path)


9542 2711 5104
Sentence Pair: ('Look at the schematic diagram. What will happen to the other 2 bulbs if the middle bulb burns out?', 'They will stay lit.')
Score: 1.0	 Dataset: semeval

9542 2711 5104
Sentence Pair: ('Under what circumstances will a switch affect a bulb?', 'when the switch is connected between the bulb and the battery')
Score: 0.5	 Dataset: semeval

9542 2711 5104
Sentence Pair: ('Why do both bulbs A and B stay on when bulb C is burned out?', 'If bulb C is damaged, there is still a closed path with the battery.')
Score: 1.0	 Dataset: semeval

9543 2710 5104
Sentence Pair: ('Design a way to use carbon printing to find out if 2 Labrador retrievers have the same paw patterns. How will you know if the 2 dogs have paw patterns that are the same?', 'By comparing the 2 prints.')
Score: 0.5	 Dataset: semeval

9543 2710 5104
Sentence Pair: ('Mary told her friend Sharice that she had a rug to sell. Sharice asked if the rug would fit perfectly, wall to wall, in her bedroom. Mary 