In [None]:
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)


In [None]:
from psutil import virtual_memory

# Total memory reported by psutil, expressed in units of 1e9 bytes.
ram_gb = virtual_memory().total / 1e9
print('Your runtime has {:.1f} gigabytes of available RAM\n'.format(ram_gb))

# 20 GB is the threshold this notebook uses to tell a high-RAM runtime apart.
print('You are using a high-RAM runtime!' if ram_gb >= 20 else 'Not using a high-RAM runtime')

In [None]:
# Install the Hugging Face packages imported by the following cell.
# NOTE(review): no versions are pinned here, so every fresh runtime installs
# whatever release is current at that moment.
!pip install transformers
!pip install transformers[torch]
!pip install accelerate -U

In [None]:
#!pip install transformers==4.1.1
import torch
from tqdm import tqdm
import random
import csv
#from transformers import Trainer, TrainingArguments, RobertaForSequenceClassification, RobertaTokenizer
from transformers import (
    AdamW,
    AutoModelForSequenceClassification,
    AutoTokenizer,
    BertForSequenceClassification,
    BertModel,
    BertPreTrainedModel,
    BertTokenizer,
    RobertaForSequenceClassification,
    RobertaTokenizer,
    Trainer,
    TrainingArguments,
)
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report
from sklearn.model_selection import train_test_split
import os
from os import listdir
from os.path import isfile, join
import shutil
from pathlib import Path
import json
import pandas as pd
import shutil
import ast

from google.colab import drive
# Mount Google Drive at /gdrive and make it the working directory; the data
# paths used further down are rooted at /gdrive/MyDrive.
drive.mount('/gdrive')
%cd /gdrive

In [None]:
# Load the tokenizer that belongs to the pretrained RoBERTa-large NLI checkpoint.
# The training cell loads this same checkpoint name again before tokenizing.
tokenizer = RobertaTokenizer.from_pretrained('ynie/roberta-large-snli_mnli_fever_anli_R1_R2_R3-nli')

In [None]:
class FEVEROUSDataset(torch.utils.data.Dataset):
    """Map-style dataset that pairs tokenizer encodings with optional labels.

    Args:
        encodings: Mapping from feature name to a per-example indexable
            sequence (anything exposing ``.items()``).
        labels: Per-example labels, or ``None`` when the dataset is unlabeled.
        use_labels: When True, every item also carries a ``'labels'`` tensor
            built from ``labels[idx]``.
    """

    def __init__(self, encodings, labels, use_labels = True):
        self.encodings = encodings
        self.labels = labels
        self.use_labels = use_labels

    def __getitem__(self, idx):
        """Return a dict of tensors for example ``idx``."""
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        if self.use_labels:
            item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        """Return the number of examples.

        The label list defines the length whenever it is present. An
        unlabeled dataset (``labels is None``) falls back to the length of
        the first encoded feature instead of failing on ``len(None)``.
        """
        if self.labels is not None:
            return len(self.labels)
        for _, values in self.encodings.items():
            return len(values)
        # No labels and no encoded features: the dataset is empty.
        return 0

In [None]:
def compute_metrics(pred):
    """Compute evaluation metrics for the three-class verdict classifier.

    Args:
        pred: Prediction object exposing ``label_ids`` (gold class ids) and
            ``predictions`` (per-class scores; the argmax over the last axis
            is taken as the predicted class).

    Returns:
        dict with micro-averaged ``precision``/``recall``/``f1``, the overall
        ``accuracy`` and ``class_rep``, the per-class report as a nested dict.
    """
    # Class ids follow the label encoding used in this notebook:
    # 0 = NOT ENOUGH INFO, 1 = SUPPORTS, 2 = REFUTES.
    class_ids = [0, 1, 2]
    class_names = ['NOT ENOUGH INFO', 'SUPPORTS', 'REFUTES']

    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='micro')
    acc = accuracy_score(labels, preds)
    # Passing `labels` explicitly keeps the report aligned with `target_names`
    # even when a class is absent from both the gold labels and the
    # predictions; without it scikit-learn raises a ValueError in that case.
    class_rep = classification_report(
        labels,
        preds,
        labels=class_ids,
        target_names=class_names,
        output_dict=True,
    )
    print(class_rep)
    print("Acc: {}, Recall: {}, Precision: {}, F1: {}".format(acc, recall, precision, f1))
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall,
        'class_rep': class_rep
    }

In [None]:
def datasetStats(sentences, labels, useAmbLabel=False):
  """Count how many examples carry each label id.

  Counters exist for ids 0, 1 and 2, plus id 3 when ``useAmbLabel`` is True.
  Only as many labels as there are sentences are counted, and a label
  without a counter raises ``KeyError``.
  """
  numClasses = 4 if useAmbLabel else 3
  stats = dict.fromkeys(range(numClasses), 0)
  for _, label in zip(sentences, labels):
    stats[label] += 1
  return stats

In [None]:
def readTSVFile(file):
  """Read the first column of a TSV file and label every row by file name.

  The label is chosen from the path: 1 (SUPPORTS) when it contains
  'uniform_true', 2 (REFUTES) when it contains 'uniform_false' (this wins if
  both substrings occur), and 0 (CONTRADICTING / NEI) otherwise.

  Args:
      file: Path of the tab-separated file, as a string.

  Returns:
      (texts, labels): the first field of every non-empty row, and one label
      per returned text.
  """
  labelToUse = 0 ## CONTRADICTING / NEI
  if 'uniform_true' in file:
    labelToUse = 1 ## SUPPORTS
  if 'uniform_false' in file:
    labelToUse = 2 ## REFUTES

  texts = []
  # newline='' lets the csv module do its own newline handling, as its
  # documentation recommends for files passed to csv.reader.
  with open(file, newline='') as fd:
    rd = csv.reader(fd, delimiter="\t", quotechar='"')
    for row in rd:
      # A blank line is parsed as an empty row; skip it instead of failing
      # on row[0].
      if not row:
        continue
      texts.append(row[0])
  labels = [labelToUse] * len(texts)
  return texts, labels

In [None]:
def printPredictions(text_test, labels_test, predictions, labelToFilter=None):
  """Print one ``text label prediction`` line per example.

  When ``labelToFilter`` is given, only the examples whose gold label equals
  it are printed; otherwise every example is printed.
  """
  showAll = labelToFilter is None
  for text, label, prediction in zip(text_test, labels_test, predictions):
    if showAll or label == labelToFilter:
      print(text, label, prediction)

# FEVEROUS DATASET

In [None]:
def loadJsonL(fileName):
    """Load a file holding one serialized record per line.

    Each non-blank line is parsed as JSON first. Lines that are not valid
    JSON (for instance Python-literal dicts written with single quotes) are
    parsed with ``ast.literal_eval`` instead, so files in either format load.

    Args:
        fileName: Path of the file to read.

    Returns:
        list of the parsed records, in file order.

    Raises:
        ValueError / SyntaxError: from ``ast.literal_eval`` when a line is
            neither valid JSON nor a valid Python literal.
    """
    examples = []
    # The context manager closes the file even when a line fails to parse.
    with open(fileName, encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                # Blank lines (e.g. a trailing newline) carry no record.
                continue
            try:
                example = json.loads(line)
            except json.JSONDecodeError:
                example = ast.literal_eval(line)
            examples.append(example)
    return examples

def to_sentence(selectedData):
  """Render a sequence as ``"a ; b ; c."``.

  Every element is converted with ``str`` and stripped, the pieces are joined
  with " ; ", a final "." is appended and the result is stripped. An empty
  sequence yields an empty string.
  """
  count = len(selectedData)
  if count == 0:
    return ""
  pieces = [str(selectedData[pos]).strip() for pos in range(count)]
  return (" ; ".join(pieces) + ".").strip()

def to_feverous_input(claim, evidence, evidenceContext, title):
    """Build the single-string model input for one example.

    The claim, the title, the evidence and the evidence context are joined in
    that order with ' </s> '; the last three are first rendered by
    ``to_sentence`` (the title after being passed through ``list``).
    """
    segments = [
        claim,
        to_sentence(list(title)),
        to_sentence(evidence),
        to_sentence(evidenceContext),
    ]
    return ' </s> '.join(segments)

def toFeverousLabel(label):
  """Map a verdict string to its class id.

  'SUPPORTS' -> 1, 'REFUTES' -> 2, anything else -> 0 (CONTRADICTING / NEI).
  """
  if label == 'REFUTES':
    return 2 ## REFUTES
  if label == 'SUPPORTS':
    return 1 ## SUPPORTS
  return 0 ## CONTRADICTING / NEI

def getTextLabes(loadedFile):
  """Turn loaded records into model inputs and class ids.

  Entries that are not dicts are skipped. Every remaining record must provide
  the keys 'claim', 'label', 'evidence', 'evidence_ctxt' and 'title'.

  Returns:
      (sentences, sentencesSet, labels): the input strings in record order,
      the set of distinct input strings, and the class id of each record.
  """
  sentences = []
  labels = []
  for record in loadedFile:
    if not isinstance(record, dict):
      continue
    claim, label = record['claim'], record['label']
    evidence, evidenceContext = record['evidence'], record['evidence_ctxt']
    title = record['title']
    text = to_feverous_input(claim, evidence, evidenceContext, title)
    labelData = toFeverousLabel(label)
    sentences.append(text)
    labels.append(labelData)
  return sentences, set(sentences), labels

# --- Paths and seed --------------------------------------------------------
BASE_PATH = "/gdrive/MyDrive/research/tenet/data/feverousWithValues/"  ## TODO: Change wrt to your GDRIVE Path.
RANDOM_SEED = 42
fileTrainFeverous = BASE_PATH + "feverous_train.jsonl"
fileTestFeverous = BASE_PATH + "feverous_dev.jsonl"

# --- Training split --------------------------------------------------------
print("FILE TO OPEN:", fileTrainFeverous)
sentencesTrain, sentencesSetTrain, labelsTrain = getTextLabes(loadJsonL(fileTrainFeverous))
statsTrain = datasetStats(sentencesTrain, labelsTrain)
print(statsTrain)

# Shuffle sentences and labels together with a fixed seed so the order is
# reproducible, then split the pairs back into two lists.
pairs = list(zip(sentencesTrain, labelsTrain))
random.Random(RANDOM_SEED).shuffle(pairs)
sentencesTrain, labelsTrain = (list(column) for column in zip(*pairs))

# --- Dev split -------------------------------------------------------------
print("FILE TO OPEN:", fileTestFeverous)
sentencesTest, sentencesSetTest, labelsTest = getTextLabes(loadJsonL(fileTestFeverous))
# One synthetic claim with label 0 is appended to the dev data.
# NOTE(review): it is part of sentencesTest/labelsTest from here on, so it is
# included in the statistics printed below.
sentencesTest += ["* Test Claim should be 0"]
labelsTest += [0]
statsTest = datasetStats(sentencesTest, labelsTest)
print(statsTest)

def extendTrainingSet(fileName, sentences, labels, seed):
  """Append the examples stored in ``fileName`` to a training set and reshuffle.

  Prints the file being opened, the label statistics of the added examples
  and the label statistics of the merged set.

  Args:
      fileName: Path of the file with the additional examples.
      sentences: Current training inputs.
      labels: Current training labels, aligned with ``sentences``.
      seed: Seed of the ``random.Random`` instance used for shuffling.

  Returns:
      (sentences, labels, stats): the merged and shuffled inputs and labels
      as new lists, and the per-label counts of the merged set.
  """
  print("FILE TO OPEN:", fileName)
  extraSentences, _, extraLabels = getTextLabes(loadJsonL(fileName))
  print(datasetStats(extraSentences, extraLabels))

  # Shuffle inputs and labels as pairs so they stay aligned.
  merged = list(zip(sentences + extraSentences, labels + extraLabels))
  random.Random(seed).shuffle(merged)
  mergedSentences = [sentence for sentence, _ in merged]
  mergedLabels = [label for _, label in merged]

  print("Training Set Augmented")
  mergedStats = datasetStats(mergedSentences, mergedLabels)
  print(mergedStats)
  return mergedSentences, mergedLabels, mergedStats


# Augmentation switches. EXTEND_WITH_WARM and EXTEND_WITH_COLD are only
# consulted when EXTEND_WITH_TENET is True.
EXTEND_WITH_TENET = False
EXTEND_WITH_WARM = True
EXTEND_WITH_COLD = False
EXTEND_WITH_BASELINES = True

if EXTEND_WITH_TENET:
  if EXTEND_WITH_WARM:
    fileExtendTrainFeverous = BASE_PATH+"tenet_generated_warm_50.jsonl"
    #fileExtendTrainFeverous = BASE_PATH+"tenet_generated_warm_100.jsonl"
    #fileExtendTrainFeverous = BASE_PATH+"tenet_generated_warm_200.jsonl"
    #fileExtendTrainFeverous = BASE_PATH+"tenet_generated_warm_300.jsonl"
    #fileExtendTrainFeverous = BASE_PATH+"tenet_generated_warm_400.jsonl"
    sentencesTrain, labelsTrain, statsTrain = extendTrainingSet(
        fileExtendTrainFeverous, sentencesTrain, labelsTrain, RANDOM_SEED)
  if EXTEND_WITH_COLD:
    #fileExtendTrainFeverous = BASE_PATH+"tenet_generated_cold_300.jsonl"
    fileExtendTrainFeverous = BASE_PATH+"tenet_generated_cold_reversed_100.jsonl"
    sentencesTrain, labelsTrain, statsTrain = extendTrainingSet(
        fileExtendTrainFeverous, sentencesTrain, labelsTrain, RANDOM_SEED)

if EXTEND_WITH_BASELINES:
  fileExtendTrainBaseline = BASE_PATH+"feverous_train_baseline_augmentation_not_nei.jsonl" ## all dataset
  sentencesTrain, labelsTrain, statsTrain = extendTrainingSet(
      fileExtendTrainBaseline, sentencesTrain, labelsTrain, RANDOM_SEED)

In [None]:
### TRAIN MODEL

def model_trainer(train_dataset, test_dataset, model_path, config):
    """Build a Trainer around the pretrained RoBERTa-large NLI checkpoint.

    Args:
        train_dataset: Dataset used for training.
        test_dataset: Dataset used for evaluation, or ``None`` to create the
            trainer without an evaluation set.
        model_path: Output directory passed to ``TrainingArguments``. The
            weights themselves are always loaded from the hub checkpoint
            named below, not from this path.
        config: Dict whose ``"device"`` entry is the device the model is
            moved to.

    Returns:
        (trainer, model): the configured ``Trainer`` and the model it wraps.
    """
    #model = RobertaForSequenceClassification.from_pretrained(model_path, num_labels =3, return_dict=True).to(config["device"])
    model = RobertaForSequenceClassification.from_pretrained(
        "ynie/roberta-large-snli_mnli_fever_anli_R1_R2_R3-nli",
        num_labels=3,
        return_dict=True,
    ).to(config["device"])

    training_args = TrainingArguments(
        output_dir=model_path,  # output directory
        num_train_epochs=1,  # total # of training epochs
        per_device_train_batch_size=4, #16,  # batch size per device during training (2 works with free plan, 4 with V100)
        per_device_eval_batch_size=4, #16,   # batch size for evaluation (2 work with free plan, 4 with V100)
        gradient_accumulation_steps=1,
        warmup_steps=0,  # number of warmup steps for learning rate scheduler
        weight_decay=0.01,  # strength of weight decay
        #logging_dir=os.path.join(config["model_path"], "logs"),  # directory for storing logs
        logging_steps=1200,
        save_steps=5900,  # 1200,
        learning_rate=0.00001
        # save_strategy='epoch'
    )

    # Arguments shared by both configurations; the evaluation set is added
    # only when one was supplied.
    trainer_kwargs = dict(
        model=model,  # the instantiated Transformers model to be trained
        args=training_args,  # training arguments, defined above
        train_dataset=train_dataset,  # training dataset
        compute_metrics=compute_metrics,
    )
    # Identity check: `!= None` would call the dataset's own comparison
    # operators, which is not what is meant here.
    if test_dataset is not None:
        trainer_kwargs["eval_dataset"] = test_dataset  # evaluation dataset

    trainer = Trainer(**trainer_kwargs)
    return trainer, model

# Directory handed to model_trainer as its model_path argument.
MODEL_PATH_LOCAL = '/content/feverous_verdict_predictor/'

# Use the GPU when torch reports one, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
config = {"device": device}

tokenizer = RobertaTokenizer.from_pretrained('ynie/roberta-large-snli_mnli_fever_anli_R1_R2_R3-nli')

# Tokenize the training inputs and wrap them with their labels.
text_train = tokenizer(sentencesTrain, padding=True, truncation=True)
train_dataset = FEVEROUSDataset(text_train, labelsTrain)

# NOTE(review): `text_train` is rebound to the dev encodings here, so after
# this cell the name no longer refers to the training encodings.
text_train = tokenizer(sentencesTest, padding=True, truncation=True)
test_dataset = FEVEROUSDataset(text_train, labelsTest)

trainer, model = model_trainer(
    train_dataset=train_dataset,
    test_dataset=test_dataset,
    model_path=MODEL_PATH_LOCAL,
    config=config,
)
trainer.train()

# Evaluate on the dev dataset and show the per-class report.
scores = trainer.evaluate()
print(scores["eval_class_rep"])

// Browser-side helper (presumably pasted into the developer console of the
// Colab tab): logs a message and clicks the toolbar "connect" button.
function ClickConnect(){
    console.log("Working");
    // querySelector returns null when no element matches; skip the click in
    // that case instead of throwing a TypeError on every tick.
    const connectButton = document.querySelector("colab-toolbar-button#connect");
    if (connectButton !== null) {
        connectButton.click();
    }
}
// Run ClickConnect every 60000 ms.
setInterval(ClickConnect,60000)