In [None]:
#!pip install cloud-tpu-client==0.10 torch==1.9.0 https://storage.googleapis.com/tpu-pytorch/wheels/torch_xla-1.9-cp37-cp37m-linux_x86_64.whl
!pip install transformers==4.1.1

Collecting transformers==4.1.1
  Downloading transformers-4.1.1-py3-none-any.whl (1.5 MB)
[?25l[K     |▏                               | 10 kB 38.7 MB/s eta 0:00:01[K     |▍                               | 20 kB 43.9 MB/s eta 0:00:01[K     |▋                               | 30 kB 47.5 MB/s eta 0:00:01[K     |▉                               | 40 kB 51.1 MB/s eta 0:00:01[K     |█                               | 51 kB 29.9 MB/s eta 0:00:01[K     |█▎                              | 61 kB 33.0 MB/s eta 0:00:01[K     |█▌                              | 71 kB 23.4 MB/s eta 0:00:01[K     |█▊                              | 81 kB 25.2 MB/s eta 0:00:01[K     |██                              | 92 kB 27.1 MB/s eta 0:00:01[K     |██▏                             | 102 kB 25.8 MB/s eta 0:00:01[K     |██▍                             | 112 kB 25.8 MB/s eta 0:00:01[K     |██▋                             | 122 kB 25.8 MB/s eta 0:00:01[K     |██▉                             | 133 k

In [None]:
import torch
from tqdm import tqdm
import random
import csv
from transformers import Trainer, TrainingArguments, RobertaForSequenceClassification, RobertaTokenizer
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report
import os
from os import listdir
from os.path import isfile, join
import shutil
from pathlib import Path

In [None]:
# Report the visible CUDA devices. The device name is only queried when a GPU
# is present, because torch.cuda.get_device_name(0) raises on a CPU-only runtime
# and would otherwise abort a "Run all".
gpu_count = torch.cuda.device_count()
print(gpu_count)
if gpu_count > 0:
  print(torch.cuda.get_device_name(0))
else:
  print("No CUDA device available")

1
Tesla T4


In [None]:
from google.colab import drive
# Mount Google Drive at /gdrive and make it the working directory; the data
# folder used later (BASE_PATH) is written as a path relative to this directory.
drive.mount('/gdrive')
%cd /gdrive

Mounted at /gdrive
/gdrive


In [None]:
def readTSVFile(file):
  """Read the first column of a tab-separated file and label every row by file name.

  The label is derived from the path itself, not from the file content:
  1 if the path contains 'uniform_true', 2 if it contains 'uniform_false'
  (checked last, so it wins if both occur), 0 otherwise.

  Args:
    file: path of the TSV file to read.

  Returns:
    A (texts, labels) pair of equally long lists: the first field of every
    non-empty row, and the label repeated once per row.
  """
  labelToUse = 0 ## CONTRADICTING / NEI
  if 'uniform_true' in file:
    labelToUse = 1 ## SUPPORTS
  if 'uniform_false' in file:
    labelToUse = 2 ## REFUTES
  texts = []
  # newline='' is what the csv module expects (keeps quoted embedded newlines
  # intact); the explicit encoding avoids depending on the platform default.
  with open(file, newline='', encoding='utf-8') as fd:
    rd = csv.reader(fd, delimiter="\t", quotechar='"')
    for row in rd:
      # Blank lines are yielded as empty lists; row[0] would raise IndexError.
      if not row:
        continue
      texts.append(row[0])
  labels = [labelToUse] * len(texts)
  return texts, labels

In [None]:
def extractSize(fileName):
  """Return, as an int, the second-to-last '_'-separated field of fileName."""
  # Only the last two separators matter, so split from the right.
  sizeField = fileName.rsplit("_", 2)[-2]
  return int(sizeField)


# Quick sanity check of extractSize on two representative file names.
for sampleName in ('basket_full_names_uniform_true_fd_4_train.tsv',
                   'soccer_uniform_true_row_1000_train.tsv'):
  print(extractSize(sampleName))

4
1000


In [None]:
BASE_PATH = "MyDrive/ricerca/pythia_feverous_vldb/"
#templatesInTrain = ['attribute', 'full', 'fd', 'row']
templatesInTrain = ['attribute']
trainSuffix = "_train.tsv"
testSuffix = "_test.tsv"
sizeMax = 100
sizeMin= 0
random_seed = 42

## fname = baseFolder + tName + "_" + matchType + "_" + type + "_" + str(datasetSize) + suffix


def _sizeInRange(fileName):
  """Return True when the size encoded in fileName lies in (sizeMin, sizeMax]."""
  return sizeMin < extractSize(fileName) <= sizeMax


def _selectedForTrain(fileName):
  """Return True when at least one requested template admits fileName.

  Returning a single boolean (instead of appending once per matching template)
  keeps a file from being loaded twice when its name contains several
  templates, e.g. 'full' also occurs in 'basket_full_names_..._attribute_...'.
  """
  for template in templatesInTrain:
    if template not in fileName:
      continue
    # 'fd' files are taken regardless of the size encoded in their name.
    if template == 'fd' or _sizeInRange(fileName):
      return True
  return False


def _loadFiles(fileNames):
  """Print and read every file in fileNames; return the concatenated texts and labels."""
  texts = []
  labels = []
  for fileName in fileNames:
    print(fileName)
    fileTexts, fileLabels = readTSVFile(join(BASE_PATH, fileName))
    texts.extend(fileTexts)
    labels.extend(fileLabels)
  return texts, labels


# listdir returns entries in arbitrary order; sort them so that the seeded
# shuffle below really yields the same training order on every run.
files = sorted(f for f in listdir(BASE_PATH) if isfile(join(BASE_PATH, f)))
trainFiles = []
testFiles = []
for f in files:
  if trainSuffix in f and _selectedForTrain(f):
    trainFiles.append(f)

  if testSuffix in f:
    # NOTE(review): test files are selected by size only, not by template --
    # confirm this asymmetry with the training selection is intended.
    if _sizeInRange(f):
      testFiles.append(f)
    elif 'fd' in templatesInTrain and 'fd' in f:
      # Previously this branch read the loop variable `template` leaked from the
      # training loop (NameError if no train file came first, and otherwise only
      # the last template was ever compared). Test the 'fd' request explicitly.
      testFiles.append(f)

print("** TRAIN **")
text_train, labels_train = _loadFiles(trainFiles)
print("** Test **")
text_test, labels_test = _loadFiles(testFiles)

train_data = list(zip(text_train, labels_train))
if not train_data:
  # zip(*[]) cannot be unpacked into two names; fail with an actionable message.
  raise ValueError("No training examples found in " + BASE_PATH + " for templates " + str(templatesInTrain))
random.Random(random_seed).shuffle(train_data) #to get deterministic results
text_train, labels_train = zip(*train_data)

print("TRAIN:", len(text_train))
print("TEST:", len(text_test))

** TRAIN **
iris_uniform_false_attribute_100_train.tsv
iris_contradicting_attribute_100_train.tsv
iris_uniform_true_attribute_100_train.tsv
basket_full_names_contradicting_attribute_100_train.tsv
basket_full_names_uniform_false_attribute_100_train.tsv
basket_full_names_uniform_true_attribute_100_train.tsv
soccer_contradicting_attribute_100_train.tsv
soccer_uniform_false_attribute_100_train.tsv
soccer_uniform_true_attribute_100_train.tsv
** Test **
basket_acronyms_contradicting_attribute_100_test.tsv
basket_acronyms_uniform_false_attribute_100_test.tsv
basket_acronyms_uniform_true_attribute_100_test.tsv
abalone_contradicting_attribute_100_test.tsv
abalone_uniform_false_attribute_100_test.tsv
abalone_uniform_true_attribute_100_test.tsv
adult_short_contradicting_attribute_100_test.tsv
adult_short_uniform_false_attribute_100_test.tsv
adult_short_uniform_true_attribute_100_test.tsv
mushroom_short_contradicting_attribute_100_test.tsv
mushroom_short_uniform_false_attribute_100_test.tsv
mushro

In [None]:
def _loadDemoSplit(basePath, datasetNames, suffix):
  """Read the three claim files of every dataset for one split.

  For each name in datasetNames the files
  basePath + name + {"_uniform_true", "_uniform_false", "_contradicting"} + suffix
  are read, in that order, with readTSVFile.

  Returns:
    (texts, labels): two lists concatenating the results of all files read.
  """
  texts = []
  labels = []
  for datasetName in datasetNames:
    for kind in ("_uniform_true", "_uniform_false", "_contradicting"):
      kindTexts, kindLabels = readTSVFile(basePath + datasetName + kind + suffix)
      texts.extend(kindTexts)
      labels.extend(kindLabels)
  return texts, labels


# Alternative data loading for the demo setting; disabled by default. When
# enabled it replaces BASE_PATH and the train/test texts and labels.
sigmodDemo = False
if sigmodDemo:
  BASE_PATH = "MyDrive/ricerca/pythia-feverous/"
  datasetTrain = ['iris', 'basket_full_names', 'soccer']
  datasetTest = ['abalone', 'adult', 'basket_acronyms', 'mushroom']
  text_train, labels_train = _loadDemoSplit(BASE_PATH, datasetTrain, "_train.tsv")
  text_test, labels_test = _loadDemoSplit(BASE_PATH, datasetTest, "_test.tsv")

  train_data = list(zip(text_train, labels_train))
  # Fixed seed (same value as the main data-loading cell) so that the demo
  # split is shuffled reproducibly as well.
  random.Random(42).shuffle(train_data)
  text_train, labels_train = zip(*train_data)

  print("TRAIN:", len(text_train))
  print("TEST:", len(text_test))

In [None]:
class FEVEROUSDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with optional labels.

    Args:
        encodings: mapping from feature name to a per-example sequence
            (anything exposing ``items()`` and ``values()``).
        labels: per-example labels; may be ``None`` when ``use_labels`` is False.
        use_labels: when True, each item also carries a ``'labels'`` tensor.
    """

    def __init__(self, encodings, labels, use_labels = True):
        self.encodings = encodings
        self.labels = labels
        self.use_labels = use_labels

    def __getitem__(self, idx):
        """Return a dict of tensors for example ``idx`` (plus 'labels' if enabled)."""
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        if self.use_labels:
            item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        """Return the number of examples.

        Labels define the length whenever they are given. Without labels
        (the ``use_labels=False`` case) the length is taken from the first
        encoded feature, so unlabeled datasets no longer fail on ``len(None)``.
        """
        if self.labels is not None:
            return len(self.labels)
        first_feature = next(iter(self.encodings.values()), [])
        return len(first_feature)

In [None]:
def compute_metrics(pred):
    """Compute accuracy, micro-averaged P/R/F1 and a per-class report.

    Args:
        pred: object exposing ``label_ids`` (gold labels) and ``predictions``
            (per-class scores; the arg-max over the last axis is the prediction).

    Returns:
        dict with keys 'accuracy', 'f1', 'precision', 'recall' and 'class_rep'
        (the per-class report as a nested dict).
    """
    # Class ids 0 / 1 / 2 are reported under these names, in this order.
    class_names = ['NOT ENOUGH INFO', 'SUPPORTS', 'REFUTES']
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds, average='micro', zero_division=0)
    acc = accuracy_score(labels, preds)
    # Passing `labels` explicitly keeps the report working when one of the
    # classes is absent from both gold labels and predictions (otherwise the
    # number of observed classes no longer matches target_names and sklearn
    # raises). zero_division=0 keeps the 0.0 scores but silences the warning
    # emitted for classes that are never predicted.
    class_rep = classification_report(
        labels, preds,
        labels=list(range(len(class_names))),
        target_names=class_names,
        output_dict=True,
        zero_division=0)
    print(class_rep)
    print("Acc: {}, Recall: {}, Precision: {}, F1: {}".format(acc, recall, precision, f1))
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall,
        'class_rep': class_rep
    }

# Code for training and testing

In [None]:
tokenizer = RobertaTokenizer.from_pretrained('ynie/roberta-large-snli_mnli_fever_anli_R1_R2_R3-nli')
# Store the encodings under their own names instead of rebinding text_train /
# text_test: the raw strings stay available, so this cell can be re-run and the
# later claim_predictor(...) call (which tokenizes its input itself) still
# receives text rather than already-encoded data.
train_encodings = tokenizer(text_train, padding=True, truncation=True)
train_dataset = FEVEROUSDataset(train_encodings, labels_train)
test_encodings = tokenizer(text_test, padding=True, truncation=True)
test_dataset = FEVEROUSDataset(test_encodings, labels_test)

# To keep the code unchanged, the NEI class string is reused for the AMB class.

In [None]:
def model_trainer_new(model_path, train_dataset, test_dataset):
    """Create a 3-class RoBERTa classifier and a Trainer configured to fine-tune it.

    Args:
        model_path: output directory for checkpoints; logs go to ``<model_path>/logs``.
        train_dataset: dataset used for training.
        test_dataset: dataset used for evaluation, or ``None`` for none.

    Returns:
        (trainer, model): the configured Trainer and the model it wraps.
    """
    model = RobertaForSequenceClassification.from_pretrained('ynie/roberta-large-snli_mnli_fever_anli_R1_R2_R3-nli', num_labels =3, return_dict=True)

    training_args = TrainingArguments(
        output_dir=model_path,           # output directory
        num_train_epochs=1,              # total # of training epochs
        per_device_train_batch_size=2,   # batch size per device during training (we used 2 to avoid memory errors; 16 was the earlier setting)
        per_device_eval_batch_size=1,    # batch size for evaluation (16 was the earlier setting)
        warmup_steps=0,                  # number of warmup steps for learning rate scheduler
        weight_decay=0.01,               # strength of weight decay
        logging_dir=os.path.join(model_path, 'logs'),  # directory for storing logs
        logging_steps=1200,
        save_steps=5900,
        learning_rate=1e-05,
    )

    # eval_dataset defaults to None in Trainer, so a single construction covers
    # both the "with test set" and the "train only" case that used to be two
    # copy-pasted branches selected with `!= None`.
    trainer = Trainer(
        model=model,                     # the instantiated Transformers model to be trained
        args=training_args,              # training arguments, defined above
        train_dataset=train_dataset,     # training dataset
        eval_dataset=test_dataset,       # evaluation dataset (may be None)
        compute_metrics=compute_metrics,
    )
    return trainer, model

In [None]:
MODEL_PATH = '/models/feverous_verdict_predictor_pythia_2'
dirpath = Path(MODEL_PATH)
# Start from a clean output directory: delete what an earlier run left behind.
# is_dir() is already False for a path that does not exist.
if dirpath.is_dir():
  shutil.rmtree(dirpath)
  print("MODEL_PATH removed")

MODEL_PATH removed


In [None]:
#!export PYTORCH_CUDA_ALLOC_CONF=max_split_size_mb:128
# Build the trainer, fine-tune on train_dataset, then evaluate on test_dataset
# (passed to the Trainer as its eval_dataset) and print the metrics dict.
trainer, model = model_trainer_new(MODEL_PATH, train_dataset, test_dataset)
trainer.train()
scores = trainer.evaluate()
print(scores)

Some weights of the model checkpoint at ynie/roberta-large-snli_mnli_fever_anli_R1_R2_R3-nli were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Step,Training Loss


Trainer is attempting to log a value of "{'NOT ENOUGH INFO': {'precision': 0.8983666061705989, 'recall': 0.8195364238410596, 'f1-score': 0.8571428571428571, 'support': 604}, 'SUPPORTS': {'precision': 0.9222423146473779, 'recall': 0.8443708609271523, 'f1-score': 0.881590319792567, 'support': 604}, 'REFUTES': {'precision': 0.7796610169491526, 'recall': 0.9139072847682119, 'f1-score': 0.8414634146341463, 'support': 604}, 'accuracy': 0.859271523178808, 'macro avg': {'precision': 0.8667566459223766, 'recall': 0.859271523178808, 'f1-score': 0.86006553052319, 'support': 1812}, 'weighted avg': {'precision': 0.8667566459223764, 'recall': 0.859271523178808, 'f1-score': 0.8600655305231901, 'support': 1812}}" of type <class 'dict'> for key "eval/class_rep" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.


{'NOT ENOUGH INFO': {'precision': 0.8983666061705989, 'recall': 0.8195364238410596, 'f1-score': 0.8571428571428571, 'support': 604}, 'SUPPORTS': {'precision': 0.9222423146473779, 'recall': 0.8443708609271523, 'f1-score': 0.881590319792567, 'support': 604}, 'REFUTES': {'precision': 0.7796610169491526, 'recall': 0.9139072847682119, 'f1-score': 0.8414634146341463, 'support': 604}, 'accuracy': 0.859271523178808, 'macro avg': {'precision': 0.8667566459223766, 'recall': 0.859271523178808, 'f1-score': 0.86006553052319, 'support': 1812}, 'weighted avg': {'precision': 0.8667566459223764, 'recall': 0.859271523178808, 'f1-score': 0.8600655305231901, 'support': 1812}}
Acc: 0.859271523178808, Recall: 0.859271523178808, Precision: 0.859271523178808, F1: 0.8592715231788081
{'eval_loss': 0.9429283738136292, 'eval_accuracy': 0.859271523178808, 'eval_f1': 0.8592715231788081, 'eval_precision': 0.859271523178808, 'eval_recall': 0.859271523178808, 'eval_class_rep': {'NOT ENOUGH INFO': {'precision': 0.89836

# Test the published FEVEROUS baseline

In [None]:
## link model: https://drive.google.com/file/d/1Zu3RUFzThPpsSkBhlYc0CBoRpIRxauGR/view?usp=sharing
!pip install googledrivedownloader



In [None]:
from google_drive_downloader import GoogleDriveDownloader as gdd

# Download the archive identified by the Drive file id from the link above into
# /content and unzip it (unzip=True); the baseline is later loaded from
# /content/feverous_cell_extractor/.
gdd.download_file_from_google_drive(file_id='1Zu3RUFzThPpsSkBhlYc0CBoRpIRxauGR',
                                    dest_path='/content/feverous_cell_extractor.zip',
                                    unzip=True)

Downloading 1Zu3RUFzThPpsSkBhlYc0CBoRpIRxauGR into /content/feverous_cell_extractor.zip... Done.
Unzipping...Done.


In [None]:
def model_trainer(model_path, test_dataset):
    """Wrap the checkpoint at model_path in a Trainer set up for evaluation.

    Args:
        model_path: checkpoint location passed to ``from_pretrained``.
        test_dataset: dataset registered as the Trainer's evaluation dataset.

    Returns:
        (trainer, model): the Trainer and the 3-label classifier it wraps.
    """
    model = RobertaForSequenceClassification.from_pretrained(
        model_path, num_labels =3, return_dict=True)

    # No training arguments are set here: only the output directory and the
    # evaluation batch size.
    eval_args = TrainingArguments(
        output_dir='/content/results',
        per_device_eval_batch_size=32,
    )

    trainer = Trainer(
        model=model,
        args=eval_args,
        eval_dataset=test_dataset,
        compute_metrics = compute_metrics,
    )
    return trainer, model

In [None]:
def claim_predictor(model_path, text_test, labels_test):
    """Predict a class id for every text with the checkpoint at model_path.

    Args:
        model_path: checkpoint location handed to model_trainer.
        text_test: raw texts; they are tokenized inside this function.
        labels_test: labels stored alongside the encodings in the dataset.

    Returns:
        The arg-max over the last axis of the predicted scores.
    """
    roberta_tokenizer = RobertaTokenizer.from_pretrained('ynie/roberta-large-snli_mnli_fever_anli_R1_R2_R3-nli')
    encodings = roberta_tokenizer(text_test, padding=True, truncation=True)
    eval_dataset = FEVEROUSDataset(encodings, labels_test)

    trainer, _ = model_trainer(model_path, eval_dataset)
    output = trainer.predict(eval_dataset)
    return output.predictions.argmax(-1)

In [None]:
# Run the downloaded baseline on the test claims. claim_predictor tokenizes its
# input itself, so text_test must hold the raw strings at this point.
# NOTE(review): the recorded load log for this checkpoint reports the classifier
# weights as newly initialized, and every recorded prediction is class 2 --
# confirm this archive is the intended verdict-predictor checkpoint.
MODEL_PATH_LOCAL = '/content/feverous_cell_extractor/'
predictions = claim_predictor(MODEL_PATH_LOCAL, text_test, labels_test)

Downloading:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/772 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

Some weights of the model checkpoint at /content/feverous_cell_extractor/ were not used when initializing RobertaForSequenceClassification: ['classifier.weight', 'classifier.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at /content/feverous_cell_extractor/ and are newly initialized: ['classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.weight', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-st

{'NOT ENOUGH INFO': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 604}, 'SUPPORTS': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 604}, 'REFUTES': {'precision': 0.3333333333333333, 'recall': 1.0, 'f1-score': 0.5, 'support': 604}, 'accuracy': 0.3333333333333333, 'macro avg': {'precision': 0.1111111111111111, 'recall': 0.3333333333333333, 'f1-score': 0.16666666666666666, 'support': 1812}, 'weighted avg': {'precision': 0.1111111111111111, 'recall': 0.3333333333333333, 'f1-score': 0.16666666666666666, 'support': 1812}}
Acc: 0.3333333333333333, Recall: 0.3333333333333333, Precision: 0.3333333333333333, F1: 0.3333333333333333


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
# Display the predicted class ids returned by claim_predictor.
predictions

array([2, 2, 2, ..., 2, 2, 2])

In [None]:
# Collapse the predictions to the distinct class ids the model actually produced.
unique_pred = set(predictions)
print(unique_pred)

{2}
