# Fact Verification and Extraction of Climate-Related Claims

# TRAINER: Evidence Ranking (Step 2)

### Imports

In [1]:
import json
import pandas as pd
from tqdm import tqdm
import pickle
from datasets import Dataset
from transformers import AutoModelForSequenceClassification, AutoTokenizer, TrainingArguments, Trainer
import os

### Training Arguments

In [2]:
class ER_Train_Arguments():
    ev_path='./data/evidence.json'
    train_claim_pickle = './pickles/er_train_claims.pkl'
    er_model_name = 'roberta-large'
    er_training_name = './models/evidence_ranking_training_folder'
    er_save_dir = './models/evidence_ranking_trained_model'
    dev_claims_pickle = './pickles/er_dev_claims.pkl'
    learning_rate=1e-5
    num_train_epochs=2
    hard_neg_samples = 2
    random_neg_samples = 2
    neg_sample_range_low = 500
    neg_sample_range_high = 1500
    include_dev = True #Change to 'True' for final training run

my_args = ER_Train_Arguments()

### Functions

In [3]:
# Get evidence dataframe from pickle
def build_evidence(path):
    print("Reading evidence from %s ..." % path, end="")
    with open(path, 'r') as f:
        data = json.load(f)
    print("done.")
    evidence_list = []
    for ev_id, text in data.items():
        evidence_list.append([ev_id,text])
    headers = ["id","text"]
    evidence = pd.DataFrame(evidence_list, columns=headers)
    print("Number of ev: ", len(evidence))
    return evidence

# Get claims dataframe from pickle
def get_claims_from_pickle(claim_pickle):
    print("Getting claims from pickle %s." % claim_pickle)
    with open(claim_pickle, 'rb') as f:
        claims = pickle.load(f)
    print("Number of claims: ", len(claims))
    return claims

# Build claim-evidence pairs for training:
def build_pairs(claims, evidence, hard_neg_samples, random_neg_samples, neg_sample_range_low, neg_sample_range_high):
    print("Building training pairs:")
    pairs = []
    no_hard_neg_samples = hard_neg_samples
    no_random_neg_samples = random_neg_samples

    for _, row in tqdm(claims.iterrows(), total=len(claims)):
        evidences = evidence[evidence['id'].isin(row['evidences'])]['text'].to_list()
        for e in evidences:
            pairs.append([row['text'],e,1])
        hard_negative_samples = evidence[evidence['id'].isin(row['top_10k_consolidated'][neg_sample_range_low:neg_sample_range_high])].sample(len(evidences) * no_hard_neg_samples)
        for _, hns in hard_negative_samples.iterrows():
            pairs.append([row['text'],hns['text'],0])
        random_negative_samples = evidence.sample(len(evidences) * no_random_neg_samples)
        for _ , rns in random_negative_samples.iterrows():
            pairs.append([row['text'], rns['text'],0])

    headers = ["claim_text","ev_text", "labels"]
    pairs = pd.DataFrame(pairs, columns=headers)
    print("Number of training pairs:",len(pairs))
    return pairs

# Preprocessing function for mapping claim=ev pair text
def preprocess_function(item):
    
    claim = item['claim_text']
    evidence = item['ev_text']
    encoded_input = tokenizer(
        [[claim,evidence]],
        add_special_tokens=True,
        max_length=128, 
        truncation = True,
        padding='max_length', 
        return_attention_mask=True, 
        return_tensors='pt' 
    )

    return {
        'input_ids': encoded_input['input_ids'].squeeze(),
        'attention_mask': encoded_input['attention_mask'].squeeze(),
        'labels': item['labels']
    }


### Get Data

In [4]:
claims = get_claims_from_pickle(my_args.train_claim_pickle)
if my_args.include_dev:
    claims = pd.concat([claims,get_claims_from_pickle(my_args.dev_claims_pickle)])
evidence = build_evidence(my_args.ev_path)
pairs = build_pairs(claims, evidence, my_args.hard_neg_samples, my_args.random_neg_samples, my_args.neg_sample_range_low, my_args.neg_sample_range_high)

Getting claims from pickle ./pickles/er_train_claims.pkl.
Number of claims:  1228
Getting claims from pickle ./pickles/er_dev_claims.pkl.
Number of claims:  154
Reading evidence from ./data/evidence.json ...done.
Number of ev:  1208827
Building training pairs:


100%|██████████| 1382/1382 [01:56<00:00, 11.88it/s]

Number of training pairs: 23065





### Training Arguments

In [5]:
model_name = my_args.er_model_name
training_name = my_args.er_training_name
save_dir = my_args.er_save_dir

tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

dataset = Dataset.from_pandas(pairs)
encoded_dataset = dataset.map(preprocess_function)
encoded_dataset.shuffle()

os.environ["WANDB_DISABLED"] = "true"

args = TrainingArguments(
    training_name,
    learning_rate=my_args.learning_rate,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=my_args.num_train_epochs,
    weight_decay=0.01,
    push_to_hub=False,
    report_to=None,
    save_strategy='no'
)

trainer = Trainer(
    model,
    args,
    train_dataset=encoded_dataset
)

Some weights of the model checkpoint at roberta-large were not used when initializing RobertaForSequenceClassification: ['lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.decoder.weight', 'lm_head.dense.bias', 'roberta.pooler.dense.bias', 'roberta.pooler.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.dense.weight', 'classif

Map:   0%|          | 0/23065 [00:00<?, ? examples/s]

Using the `WAND_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


### Train Model

In [6]:
trainer.train()

The following columns in the training set don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: ev_text, claim_text. If ev_text, claim_text are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 23065
  Num Epochs = 2
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 5768


  0%|          | 0/5768 [00:00<?, ?it/s]

{'loss': 0.369, 'learning_rate': 9.133148404993066e-06, 'epoch': 0.17}
{'loss': 0.3153, 'learning_rate': 8.266296809986132e-06, 'epoch': 0.35}
{'loss': 0.2919, 'learning_rate': 7.399445214979196e-06, 'epoch': 0.52}
{'loss': 0.245, 'learning_rate': 6.5325936199722614e-06, 'epoch': 0.69}
{'loss': 0.2216, 'learning_rate': 5.665742024965326e-06, 'epoch': 0.87}
{'loss': 0.2221, 'learning_rate': 4.798890429958391e-06, 'epoch': 1.04}
{'loss': 0.1539, 'learning_rate': 3.932038834951457e-06, 'epoch': 1.21}
{'loss': 0.1383, 'learning_rate': 3.0651872399445217e-06, 'epoch': 1.39}
{'loss': 0.1448, 'learning_rate': 2.198335644937587e-06, 'epoch': 1.56}
{'loss': 0.1521, 'learning_rate': 1.331484049930652e-06, 'epoch': 1.73}
{'loss': 0.111, 'learning_rate': 4.646324549237171e-07, 'epoch': 1.91}




Training completed. Do not forget to share your model on huggingface.co/models =)




{'train_runtime': 2727.2704, 'train_samples_per_second': 16.914, 'train_steps_per_second': 2.115, 'train_loss': 0.21179461644525832, 'epoch': 2.0}


TrainOutput(global_step=5768, training_loss=0.21179461644525832, metrics={'train_runtime': 2727.2704, 'train_samples_per_second': 16.914, 'train_steps_per_second': 2.115, 'train_loss': 0.21179461644525832, 'epoch': 2.0})

### Save Model

In [7]:
trainer.save_model(save_dir)

Saving model checkpoint to ./models/evidence_ranking_trained_model
Configuration saved in ./models/evidence_ranking_trained_model\config.json
Model weights saved in ./models/evidence_ranking_trained_model\pytorch_model.bin
