# Fact Verification and Extraction of Climate-Related Claims

# TRAINER: Claim Classification (Step 3)

### Imports

In [1]:
import pickle
import json
import pandas as pd
from tqdm import tqdm
from datasets import Dataset
from transformers import AutoModelForSequenceClassification, AutoTokenizer, TrainingArguments, Trainer
import os

### Training Arguments

In [2]:
class CC_Train_Arguments():
    ev_path='./data/evidence.json'
    train_claims_path = './data/train-claims.json'
    cc_model_name = 'roberta-large-mnli'
    cc_training_name = './models/claim_classification_training_folder'
    cc_save_dir = './models/claim_classification_trained_model'
    dev_claims_path = './data/dev-claims.json'
    dev_claims_pickle = './pickles/dev_claims.pkl'
    bootstrap_pickle = './pickles/bootstrap_pairs.pkl'
    include_bootstrap = False
    learning_rate=1e-5
    num_train_epochs=2
    include_dev = True # Change to 'True' to include dev in final train run

my_args = CC_Train_Arguments()

### Functions

In [3]:
# Map claim labels to numbered labels
label_dict ={
    "REFUTES" : 0,
    "NOT_ENOUGH_INFO":1,
    "SUPPORTS": 2,    
    "DISPUTED":3
}

# Build evidence dataframe
def build_evidence(path):
    print("Reading evidence from %s ..." % path, end="")
    with open(path, 'r') as f:
        data = json.load(f)
    print("done.")
    evidence_list = []
    for ev_id, text in data.items():
        evidence_list.append([ev_id,text])
    headers = ["id","text"]
    evidence = pd.DataFrame(evidence_list, columns=headers)
    print("Number of ev: ", len(evidence))
    return evidence

# Build claims dataframe
def build_claims(path):

    print("Reading claims from %s ..." % path,end="")
    with open(path, 'r') as f:
        data = json.load(f)
    claims = []
    print("done.")
    for claim, info in data.items():
        claims.append([claim,info['claim_text'],info['claim_label'],info['evidences']])
    headers = ["id","text","claim_label","evidences"]
    claims = pd.DataFrame(claims, columns=headers)
    print("Number of claims: ", len(claims))
    return claims

# Build ev-claim pairs for training
def build_pairs(claims, evidence):

    print("Building pairs ...")

    pairs = []
    
    for _, row in tqdm(claims.iterrows(), total=len(claims)):
        if row['claim_label'] != "DISPUTED":
            evidences = evidence[evidence['id'].isin(row['evidences'])]['text'].to_list()
            for e in evidences:
                pairs.append([row['text'],e,label_dict[row['claim_label']]])

    headers = ["claim_text","ev_text", "labels"]
    pairs = pd.DataFrame(pairs, columns=headers)
    print("Total training pairs:",len(pairs))
    for item in label_dict:
        print("%15s: %d" % (item, len(pairs[pairs['labels']==label_dict[item]])))
        
    return pairs

# Append bootstrapped pairs (for dev only - not used in final implementation)
def append_bootstrapped_pairs(pairs):
    try:
        with open(my_args.bootstrap_pickle, 'rb') as f:
            bootstrap_pairs = pickle.load(f)
        pairs = pd.concat([pairs,bootstrap_pairs])
        print("Added %d bootstrapped pairs. Total pairs now %d" % (len(bootstrap_pairs), len(pairs)))
        for item in label_dict:
            print("%15s: %d" % (item, len(pairs[pairs['labels']==label_dict[item]])))
    except:
        print("No bootstrapped pairs located.")
    return pairs

# Mapped preprocessing function for ev-claim text pairs
def preprocess_function(item):
    
    claim = item['claim_text']
    evidence = item['ev_text']
    encoded_input = tokenizer(
        [[claim,evidence]],
        add_special_tokens=True,
        max_length=128, 
        truncation = True,
        padding='max_length', 
        return_attention_mask=True, 
        return_tensors='pt' 
    )

    return {
        'input_ids': encoded_input['input_ids'].squeeze(),
        'attention_mask': encoded_input['attention_mask'].squeeze(),
        'labels': item['labels']
    }


### Get Data

In [4]:
claims = build_claims(my_args.train_claims_path)
if my_args.include_dev:
    claims = pd.concat([claims,build_claims(my_args.dev_claims_path)])
evidence = build_evidence(my_args.ev_path)
pairs = build_pairs(claims,evidence)

Reading claims from ./data/train-claims.json ...done.
Number of claims:  1228
Reading claims from ./data/dev-claims.json ...done.
Number of claims:  154
Reading evidence from ./data/evidence.json ...done.
Number of ev:  1208827
Building pairs ...


100%|██████████| 1382/1382 [00:37<00:00, 36.58it/s]

Total training pairs: 4163
        REFUTES: 514
NOT_ENOUGH_INFO: 2135
       SUPPORTS: 1514
       DISPUTED: 0





### Training Arguments

In [5]:
model_name = my_args.cc_model_name
training_name = my_args.cc_training_name
save_dir = my_args.cc_save_dir

tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=3)

dataset = Dataset.from_pandas(pairs)
encoded_dataset = dataset.map(preprocess_function)
encoded_dataset.shuffle()

os.environ["WANDB_DISABLED"] = "true"

args = TrainingArguments(
    training_name,
    learning_rate=my_args.learning_rate,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=my_args.num_train_epochs,
    weight_decay=0.01,
    push_to_hub=False,
    report_to=None,
    save_strategy='no'
)

trainer = Trainer(
    model,
    args,
    train_dataset=encoded_dataset
)

Some weights of the model checkpoint at roberta-large-mnli were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Map:   0%|          | 0/4163 [00:00<?, ? examples/s]

Using the `WAND_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


### Train Model

In [6]:
trainer.train()

The following columns in the training set don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: claim_text, ev_text. If claim_text, ev_text are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 4163
  Num Epochs = 2
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 1042


  0%|          | 0/1042 [00:00<?, ?it/s]

{'loss': 0.6615, 'learning_rate': 5.201535508637236e-06, 'epoch': 0.96}
{'loss': 0.3421, 'learning_rate': 4.0307101727447224e-07, 'epoch': 1.92}




Training completed. Do not forget to share your model on huggingface.co/models =)




{'train_runtime': 484.6454, 'train_samples_per_second': 17.18, 'train_steps_per_second': 2.15, 'train_loss': 0.49361161398567305, 'epoch': 2.0}


TrainOutput(global_step=1042, training_loss=0.49361161398567305, metrics={'train_runtime': 484.6454, 'train_samples_per_second': 17.18, 'train_steps_per_second': 2.15, 'train_loss': 0.49361161398567305, 'epoch': 2.0})

### Save Model

In [7]:
trainer.save_model(save_dir)

Saving model checkpoint to ./models/claim_classification_trained_model
Configuration saved in ./models/claim_classification_trained_model\config.json
Model weights saved in ./models/claim_classification_trained_model\pytorch_model.bin


# Bootstrapping (Dev only)

### Bootstrap Functions (Used for dev only; not used in final version)

In [8]:
import pickle
import torch

# Get predicted scores
def get_pred_scores(pairs, model, tokenizer):

    print("Getting prediction scores ...")

    S = []
    R = []
    NEI = []

    text_pairs = [[pairs.iloc[j]['claim_text'], pairs.iloc[j]['ev_text']] for j,_ in pairs.iterrows()]
    encodings = tokenizer(text_pairs, 
                            add_special_tokens=True,
                            max_length=128, 
                            truncation=True, 
                            padding='max_length', 
                            return_attention_mask=True, 
                            return_tensors='pt').to('cuda')
    with torch.no_grad():
        outputs = model(**encodings)
    for _, score in enumerate(outputs.logits):
        S.append(score[0].item())
        R.append(score[1].item())
        NEI.append(score[2].item())

    return S, R, NEI

# Get additional ev-claim pairs from bootstraping
def get_bootstrap_pairs():

    claims = build_claims(my_args.train_claims_path)
    if my_args.include_dev:
        claims = pd.concat([claims,build_claims(my_args.dev_claims_path)])
    evidence = build_evidence(my_args.ev_path)

    disputed_claims = claims[claims['claim_label']=="DISPUTED"]
    NEI_Label = ["NOT_ENOUGH_INFO" for i in range(0,len(disputed_claims))]
    disputed_claims['claim_label'] = NEI_Label
    disputed_pairs = build_pairs(disputed_claims,evidence)

    disputed_pairs['S'], disputed_pairs['R'], disputed_pairs['NEI'] = get_pred_scores(disputed_pairs, classifier_model, classifier_tokenizer)
    print("Disputed pairs analysed:", len(disputed_pairs))

    print("Locating max SUPPORTS and REFUTES pairs...")
    bootstrap_pairs = []
    unique_claims = disputed_pairs['claim_text'].unique()
    s_pairs = []
    r_pairs = []
    for uc in unique_claims:
        grouping = disputed_pairs[disputed_pairs['claim_text'] == uc]
        if grouping['S'].max() > 2:
            s_pairs.append(grouping['S'].idxmax())
        if grouping['R'].max() > 2:
            r_pairs.append(grouping['R'].idxmax())

    for s in s_pairs:
        row = {
            "claim_text":disputed_pairs.iloc[s]['claim_text'],
            "ev_text":disputed_pairs.iloc[s]['ev_text'],
            "labels":0
        }
        bootstrap_pairs.append(row)
    for r in r_pairs:
        row = {
            "claim_text":disputed_pairs.iloc[r]['claim_text'],
            "ev_text":disputed_pairs.iloc[r]['ev_text'],
            "labels":1
        }
        bootstrap_pairs.append(row)

    bootstrap_pairs = pd.DataFrame(bootstrap_pairs)
    bootstrap_pairs.head()

    print("Saving %d additional bootstrap pairs to pickle..." % len(bootstrap_pairs), end='')
    with open(my_args.bootstrap_pickle, 'wb') as f:
        pickle.dump(bootstrap_pairs, f)
    print("done.")

# Run bootstrapping training:
def run_bootstrapping():
    model_name = my_args.cc_model_name
    training_name = my_args.cc_training_name
    save_dir = my_args.cc_save_dir

    print("Loading saved model:")
    tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
    model = AutoModelForSequenceClassification.from_pretrained(pretrained_model_name_or_path=save_dir).to('cuda')

    dataset = Dataset.from_pandas(bootstrap_pairs)
    encoded_dataset = dataset.map(preprocess_function)
    encoded_dataset.shuffle()

    os.environ["WANDB_DISABLED"] = "true"

    args = TrainingArguments(
        training_name,
        learning_rate=my_args.learning_rate,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        num_train_epochs=1,
        weight_decay=0.01,
        push_to_hub=False,
        report_to=None,
        save_strategy='no'
    )

    trainer = Trainer(
        model,
        args,
        train_dataset=encoded_dataset
    )

    trainer.train()
    trainer.save_model(save_dir)