**Manual setup of required environment variables.**

In [None]:
# Root folder of the repository; a later cell reads it back with `%env FND_ROOT`.
# NOTE(review): hard-coded absolute path — adjust to the local checkout.
%env FND_ROOT=/workspace/fnd-building

**Define constants and import all randomness sources first.**

In [None]:
import numpy as np
import random
import torch

from datetime import datetime

**Configuration constants.**

In [None]:
# Index of the CUDA device to use when a GPU is available.
CUDA_DEVICE = 0

# Global original seed for randomness reproducibility.
OG_SEED = 30082010

# Timestamp suffix used when naming stored result files.
RUN_SUFFIX = datetime.now().strftime("%Y_%m_%d_%H_%M_%S")

# Absolute path to root folder of the repository.
# Read back from the environment through the IPython `%env` magic.
FND_ROOT=%env FND_ROOT

# Folder that holds the stance datasets used by this notebook.
DATA_PATH_PREFIX=f"{FND_ROOT}/datasets/datasets-fnd-stance"

**Initialize randomness sources with original seed, for full reproducibility of results.**

In [None]:
# Seed NumPy, PyTorch and Python's `random` (in that order) with the same value.
for _seed_fn in (np.random.seed, torch.manual_seed, random.seed):
    _seed_fn(OG_SEED)

**All other imports**

In [None]:
import csv
import gc
import glob
import json
import os
import pandas as pd
import re
import transformers

from ekphrasis.classes.preprocessor import TextPreProcessor
from ekphrasis.classes.tokenizer import SocialTokenizer
from ekphrasis.dicts.emoticons import emoticons

from sklearn.preprocessing import LabelEncoder
from sklearn.utils import resample
from sklearn import metrics
from sklearn.utils import class_weight

from torch import cuda
from torch.utils.data import Dataset, DataLoader

from transformers import get_linear_schedule_with_warmup
from transformers import AutoModel, AutoTokenizer, AutoModelForSequenceClassification

# Record library versions in the notebook output, one per line.
print(
    f"Pytorch version: {torch.__version__}",
    f"Transformers version: {transformers.__version__}",
    sep="\n",
)

**Setup papermill parameters.** The cell below must be tagged with the 'parameters' tag.
See: https://papermill.readthedocs.io/en/latest/usage-parameterize.html

In [None]:
# papermill parameters
# Every assignment below is a default that papermill can override at run time.

## Must be set to True when running via papermill.
PAPERMILL = False

## Seed to be used in CLM Building mode, where only 1 seed is used.
INITIAL_SEED = 0

## Name for saving model and tokenizer in huggingface hub.
## Requires previous login using HF_TOKEN
## Use command: huggingface-cli login --token $HF_TOKEN
## where HF_TOKEN must be set as an environment variable with your login token.
SAVE_CHECKPOINT_NAME="<user>/<model-tokenizer-name>"

## Whether to run as standalone task or as CLM building task.
## Options: ["standalone", "clm"]
# Standalone task:
#      Use train and validation sets separately, for training and validation.
#      Test with test set.
# CLM building task:
#      Merge train and validation sets for training, test with test set.
RUN_MODE = "standalone"

## Folder to store Excel result files
XLS_RESULTS_FOLDER = "./"

## Set to 1 to store results in S3, set to 0 otherwise.
# Defaults to 0.
STORE_RESULTS_S3 = 0

## Parameters for saving and uploading results.
# Both values become part of the Excel results file name.
EXPERIMENT_NAME = "EXP-TBD"
RUN_SETTING = "-1"

# Actual parameters for running the model

## Starting transformers checkpoint.
# To use private checkpoints, user must be logged in
# using the hugging-face-cli login method.
CHECKPOINT = "bert-base-uncased" 

## Number of labels in the dataset
NUM_LABELS = 4

## Number of seeds to explore in standalone mode.
SEEDS_NUM = 1

## Maximum length for BERT tokens
# Every encoded example is padded to exactly this many token ids.
MAX_LEN = 200

## Training parameters
TRAIN_BATCH_SIZE = 2
VALID_BATCH_SIZE = 2
EPOCHS = 2
LEARNING_RATE = 1e-06
# When True, the cross-entropy loss is weighted per class.
WEIGHTED_LOSS = True

**Clear CUDA cache and perform garbage collection.**

In [None]:
# Release cached CUDA memory, then run a garbage-collection pass.
# gc.collect() is the last expression, so its return value is shown as cell output.
torch.cuda.empty_cache()
gc.collect()

**Setup CUDA device if GPU is available.**

In [None]:
# Use the configured GPU when CUDA is available, otherwise fall back to CPU.
device = f"cuda:{CUDA_DEVICE}" if cuda.is_available() else 'cpu'
print(device)

# Query the GPU name only when CUDA is available: on a CPU-only machine
# torch.cuda.get_device_name() raises instead of returning a placeholder.
if cuda.is_available():
    print(torch.cuda.get_device_name(CUDA_DEVICE))

**Configure ekphrasis text preprocessor.**

In [None]:
# ekphrasis preprocessing pipeline; the dataset class runs every raw text
# through `text_processor.pre_process_doc`.

# Terms that will be normalized.
_NORMALIZED_TERMS = [
    'url', 'email', 'percent', 'money', 'phone',
    'user', 'time', 'date', 'number',
]

# Tokenizer callable: takes a string and returns a list of (lowercased) tokens.
_social_tokenize = SocialTokenizer(lowercase=True).tokenize

# Options deliberately left disabled (previously present only as comments):
#   annotate={"hashtag", "allcaps", "elongated", "repeated", 'emphasis', 'censored'}
#   corrector="twitter"        (corpus statistics for spell correction)
#   unpack_contractions=True   (can't -> can not)
text_processor = TextPreProcessor(
    normalize=_NORMALIZED_TERMS,
    fix_html=True,               # fix HTML tokens
    segmenter="twitter",         # corpus statistics used for word segmentation
    unpack_hashtags=True,        # perform word segmentation on hashtags
    spell_correct_elong=False,   # spell correction for elongated words
    tokenizer=_social_tokenize,
    dicts=[emoticons],           # dictionaries for replacing extracted tokens
)

**BERT-specific settings.**

In [None]:
# Extra keyword arguments passed to `from_pretrained` when the model is loaded.
BERT_CONFIG = dict(
    attention_probs_dropout_prob=0.1,
    hidden_dropout_prob=0.1,
    ignore_mismatched_sizes=True,
)

**Run-specific settings, taken from constants and papermill parameters.**

In [None]:
# Snapshot of the constants and papermill parameters that define this run.
# The keys also become the leading columns of the results table.
settings = dict(
    OG_SEED=OG_SEED,
    SEEDS_NUM=SEEDS_NUM,
    INITIAL_SEED=INITIAL_SEED,
    CHECKPOINT=CHECKPOINT,
    NUM_LABELS=NUM_LABELS,
    MAX_LEN=MAX_LEN,
    TRAIN_BATCH_SIZE=TRAIN_BATCH_SIZE,
    VALID_BATCH_SIZE=VALID_BATCH_SIZE,
    EPOCHS=EPOCHS,
    LEARNING_RATE=LEARNING_RATE,
    WEIGHTED_LOSS=WEIGHTED_LOSS,
)

In [None]:
# Record which run setting (a papermill parameter) this execution belongs to.
print(f"RUN SETTING: {RUN_SETTING}")

In [None]:
# Echo the effective settings so they are kept in the executed notebook's output.
print(settings)

In [None]:
# Pick the progress-bar flavour: plain tqdm under papermill, auto tqdm otherwise.
if not PAPERMILL:
    print("Importing auto tqdm")
    from tqdm.auto import tqdm
else:
    print("Importing plain tqdm")
    from tqdm import tqdm

## Define custom dataset loader and labels

### Labels: Support (0), Comment (1), Deny (2), Query (3)

In [None]:
# Stance label names; a name's position in the list is its integer class id.
idx2label = ["support", "comment", "deny", "query"]
encoded_labels = list(range(len(idx2label)))
label2idx = {name: class_id for class_id, name in enumerate(idx2label)}
label2idx

In [None]:
def make_text_and_labels(jsonfile):
    """Load stance examples from a JSON file and return them sorted by length.

    Args:
        jsonfile: path to a JSON file whose top-level "Examples" entry is a
            list of example dicts.

    Returns:
        A list of dicts, one per example, holding the copied fields plus
        "triplet_len": the summed character length of the reply, previous
        and source raw texts. The list is sorted by "triplet_len" ascending
        (stable for ties).

    Raises:
        KeyError: if "Examples" or any of the copied fields is missing.
    """
    copied_fields = (
        "id",
        "tweet_id",
        "raw_text",
        "raw_text_prev",
        "raw_text_src",
        "spacy_processed_text",
        "spacy_processed_text_prev",
        "spacy_processed_text_src",
        "stance_label",
    )

    # Read explicitly as UTF-8 so the result does not depend on the platform's
    # default locale encoding.
    with open(jsonfile, encoding="utf-8") as jsonf:
        data = json.load(jsonf)

    triplets = []
    for item in data["Examples"]:
        triplet = {field: item[field] for field in copied_fields}
        triplet["triplet_len"] = (
            len(item["raw_text"])
            + len(item["raw_text_prev"])
            + len(item["raw_text_src"])
        )
        triplets.append(triplet)

    # Sort by triplet_len ASC
    triplets.sort(key=lambda t: t["triplet_len"])
    return triplets

In [None]:
class CustomDataset(Dataset):
    """Dataset that encodes (source, previous, reply) text triplets for BERT.

    Every item is laid out as ``[CLS] source + previous [SEP] reply [SEP]``
    and padded with zeros up to ``max_len``.
    """

    def __init__(self, triplets, tokenizer, max_len):
        """Store the examples and the encoding configuration.

        Args:
            triplets: sequence of dicts providing at least "raw_text",
                "raw_text_prev", "raw_text_src" and "stance_label".
            tokenizer: object exposing ``tokenize``, ``convert_tokens_to_ids``
                and a ``vocab`` mapping that contains "[CLS]" and "[SEP]".
            max_len: length of every returned id/mask/segment sequence.
        """
        self.triplets = triplets
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of triplets."""
        return len(self.triplets)

    def _encode(self, raw_text):
        """Preprocess text with the module-level `text_processor`, then map it to token ids."""
        cleaned = " ".join(text_processor.pre_process_doc(raw_text))
        return self.tokenizer.convert_tokens_to_ids(self.tokenizer.tokenize(cleaned))

    def __getitem__(self, index):
        """Encode the triplet at `index`.

        Returns:
            dict of ``torch.long`` tensors: 'ids', 'mask' and 'token_type_ids'
            (each of length ``max_len``) and the scalar 'stance_label'.
        """
        # Room left for content tokens once [CLS] and the two [SEP] are counted.
        content_budget = self.max_len - 3

        # Use the tokenizer handed to this dataset, not a notebook-level global.
        cls_id = self.tokenizer.vocab["[CLS]"]
        sep_id = self.tokenizer.vocab["[SEP]"]

        current_triplet = self.triplets[index]
        text = self._encode(current_triplet["raw_text"])
        prev = self._encode(current_triplet["raw_text_prev"])
        src = self._encode(current_triplet["raw_text_src"])
        label = current_triplet["stance_label"]

        segment_A = src + prev
        segment_B = text

        # Truncate only when the content really exceeds the budget. The special
        # tokens are already accounted for in `content_budget`, so they must not
        # be counted a second time here.
        if len(segment_A) + len(segment_B) > content_budget:
            segment_A = segment_A[:content_budget // 2]
            if len(segment_A) + len(segment_B) > content_budget:
                # Truncate also segment B
                segment_B = segment_B[:content_budget // 2]

        text_ids = [cls_id] + segment_A + [sep_id] + segment_B + [sep_id]

        # Segment 0 covers [CLS] + A + [SEP]; segment 1 covers B + [SEP].
        token_type_ids = [0] * (len(segment_A) + 2) + [1] * (len(segment_B) + 1)
        attention_mask = [1] * len(token_type_ids)

        # Zero-pad all three sequences up to max_len.
        pad = [0] * (self.max_len - len(text_ids))
        padded_ids = text_ids + pad
        padded_mask = attention_mask + pad
        padded_token_type_ids = token_type_ids + pad

        assert len(padded_ids) == len(padded_mask) == len(padded_token_type_ids)
        assert len(padded_ids) <= 512

        return {
            'ids': torch.tensor(padded_ids, dtype=torch.long),
            'mask': torch.tensor(padded_mask, dtype=torch.long),
            'token_type_ids': torch.tensor(padded_token_type_ids, dtype=torch.long),
            'stance_label': torch.tensor(label, dtype=torch.long),
        }

## Construct training, validation, and test datasets.

In [None]:
# JSON files of the RumourEval 2019 splits, all located under DATA_PATH_PREFIX.
TRAIN_TEXTS_FILE = f"{DATA_PATH_PREFIX}/butfit_rumoureval2019_all/train/train.json"
VALIDATION_TEXTS_FILE = f"{DATA_PATH_PREFIX}/butfit_rumoureval2019_all/dev/dev.json"
TEST_TEXTS_FILE = f"{DATA_PATH_PREFIX}/butfit_rumoureval2019_all/test/test_with_labels.json"
# Presumably the Twitter-only subset of the test set (judging by the file name) — TODO confirm.
TEST_TWITTER_TEXTS_FILE = f"{DATA_PATH_PREFIX}/butfit_rumoureval2019_all/test/test_with_labels_twitter.json"

In [None]:
def show_triplets_per_label(triplets):
    """Print one line per known stance label with the number of matching triplets.

    Args:
        triplets: iterable-of-dicts collection; each dict has a 'stance_label'
            entry holding the integer class id. It is traversed once per label.
    """
    for label in idx2label:
        idx = label2idx[label]
        qty = sum(1 for triplet in triplets if triplet['stance_label'] == idx)
        print(f"Label {idx} ({label}): {qty}")

In [None]:
# Load the training split, then report its size and per-label counts.
triplets_train = make_text_and_labels(TRAIN_TEXTS_FILE)
print(len(triplets_train))
show_triplets_per_label(triplets_train)

In [None]:
# Load the validation (dev) split, then report its size and per-label counts.
triplets_val = make_text_and_labels(VALIDATION_TEXTS_FILE)
print(len(triplets_val))
show_triplets_per_label(triplets_val)

In [None]:
# Load the full test split, then report its size and per-label counts.
triplets_test = make_text_and_labels(TEST_TEXTS_FILE)
print(len(triplets_test))
show_triplets_per_label(triplets_test)

In [None]:
# Load the second test file (TEST_TWITTER_TEXTS_FILE), then report its size and per-label counts.
triplets_test_twitter = make_text_and_labels(TEST_TWITTER_TEXTS_FILE)
print(len(triplets_test_twitter))
show_triplets_per_label(triplets_test_twitter)

#### CLM Building Mode Only: Merge train and validation data into train data.

In [None]:
# CLM building mode trains on the training and validation examples together.
# NOTE(review): re-running this cell appends the validation triplets again.
if RUN_MODE == "clm":
    triplets_train = [*triplets_train, *triplets_val]
    print(len(triplets_train))
    show_triplets_per_label(triplets_train)

In [None]:
# Keyword arguments for the DataLoader of each split.
train_params = {
    'batch_size': settings["TRAIN_BATCH_SIZE"],
    'shuffle': True,
    'num_workers': 0
}

# Validation is an evaluation pass, so it uses the validation batch size
# (it previously picked up the training batch size).
# NOTE(review): shuffling is kept for the evaluation loaders to leave the
# random-number consumption of existing runs unchanged.
validate_params = {
    'batch_size': settings["VALID_BATCH_SIZE"],
    'shuffle': True,
    'num_workers': 0
}

test_params = {
    'batch_size': settings["VALID_BATCH_SIZE"],
    'shuffle': True,
    'num_workers': 0
}

## Define training, test, and validation procedures.

In [None]:
# Per-class loss weights derived from the training label distribution using
# scikit-learn's "balanced" heuristic.
# For reference, the loss weight vector taken from BUT-FIT implementations:
# weights = [3.8043243885040283, 1.0, 9.309523582458496, 8.90886116027832]
# Those weights are normalized so the loss on class 1 has a value of 1.
LOSS_WEIGHTS = class_weight.compute_class_weight(
    "balanced",
    # scikit-learn documents `classes` as an ndarray, so pass an array rather
    # than a plain list.
    classes=np.array([0, 1, 2, 3]),
    y=[t["stance_label"] for t in triplets_train],
)
print("Got loss weights: ", LOSS_WEIGHTS)

# Normalize weights wrt minimum loss value
LOSS_WEIGHTS = LOSS_WEIGHTS / np.min(LOSS_WEIGHTS)
print("Got normalized loss weights: ", LOSS_WEIGHTS)

# Using this requires extra work because weights are only at mini-batch level
# https://stackoverflow.com/questions/67639540/pytorch-cross-entropy-loss-weights-not-working/67639895#67639895
if settings["WEIGHTED_LOSS"]:
    # The weights are float64 NumPy values; the loss expects a float32 tensor
    # on the same device as the logits.
    loss_fn = torch.nn.CrossEntropyLoss(
        weight=torch.tensor(LOSS_WEIGHTS, dtype=torch.float).to(device)
    )
else:
    loss_fn = torch.nn.CrossEntropyLoss()

In [None]:
def train(epoch, model, training_loader, optimizer = None, scheduler = None):
    """Run one training epoch and return the epoch-level mean loss.

    Relies on the notebook-level `device`, `loss_fn`, `settings` and `tqdm`.

    Args:
        epoch: index of the current epoch (not used by the computation).
        model: model called as ``model(ids, attention_mask=..., token_type_ids=...)``
            whose output provides ``["logits"]``.
        training_loader: sized iterable of batches with the keys 'ids', 'mask',
            'token_type_ids' and 'stance_label'.
        optimizer: optional optimizer; zeroed before and stepped after each batch.
        scheduler: optional learning-rate scheduler, stepped after each batch.

    Returns:
        The mean loss over all samples of the epoch. With a weighted loss this
        is the class-weighted mean, i.e. the value the loss function would
        return for the whole epoch as a single batch. NaN if the loader is empty.
    """
    model.train()
    weighted = settings["WEIGHTED_LOSS"]
    loss_acum = 0.0
    N = 0.0

    for data in tqdm(training_loader, total = len(training_loader)):

        # Reset optimizer gradients
        if optimizer is not None:
            optimizer.zero_grad(set_to_none=True)

        # Get model input from custom dataset
        ids = data['ids'].to(device, dtype = torch.long)
        mask = data['mask'].to(device, dtype = torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
        targets = data['stance_label'].to(device, dtype = torch.long)

        # Make forward and backward passes in the model
        outputs = model(ids, attention_mask=mask, token_type_ids=token_type_ids)
        logits = outputs["logits"]
        loss = loss_fn(logits, targets)
        loss.backward()

        # The batch loss is already a mean: it is normalized by the number of
        # samples, or by the sum of their class weights when weighted. Undo
        # that normalization before accumulating so the final division yields
        # a correct epoch-level mean.
        if weighted:
            batch_norm = loss_fn.weight[targets].sum().item()
        else:
            batch_norm = targets.size(0)
        loss_acum += loss.item() * batch_norm
        N += batch_norm

        # Update optimizer and scheduler, if any
        if optimizer is not None:
            optimizer.step()
        if scheduler is not None:
            scheduler.step()

    return loss_acum / N if N else float("nan")

In [None]:
def validation(model, testing_loader):
    """Evaluate the model on a loader without updating it.

    Relies on the notebook-level `device`, `loss_fn`, `settings` and `tqdm`.

    Args:
        model: model called as ``model(ids, attention_mask=..., token_type_ids=...)``
            whose output provides ``["logits"]``.
        testing_loader: sized iterable of batches with the keys 'ids', 'mask',
            'token_type_ids' and 'stance_label'.

    Returns:
        A tuple ``(loss, outputs, targets)``: the mean loss over all samples
        (class-weighted when the loss is weighted; NaN for an empty loader),
        the softmax probabilities as an array with one row per sample, and
        the target labels as an array.
    """
    model.eval()
    weighted = settings["WEIGHTED_LOSS"]
    loss_acum = 0.0
    N = 0.0
    fin_targets = []
    fin_outputs = []

    with torch.no_grad():
        for data in tqdm(testing_loader, total = len(testing_loader)):
            # Get model input from custom dataset
            ids = data['ids'].to(device, dtype = torch.long)
            mask = data['mask'].to(device, dtype = torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
            targets = data['stance_label'].to(device, dtype = torch.long)

            # Make forward pass for prediction
            outputs = model(ids, attention_mask=mask, token_type_ids=token_type_ids)
            logits = outputs["logits"]
            loss = loss_fn(logits, targets)

            # The batch loss is already a mean: it is normalized by the number
            # of samples, or by the sum of their class weights when weighted.
            # Undo that normalization before accumulating so the final
            # division yields a correct overall mean.
            if weighted:
                batch_norm = loss_fn.weight[targets].sum().item()
            else:
                batch_norm = targets.size(0)
            loss_acum += loss.item() * batch_norm
            N += batch_norm

            # Compute expected outputs vs model outputs for reporting progress
            fin_targets.extend(targets.cpu().detach().numpy().tolist())
            fin_outputs.extend(torch.softmax(logits, dim=1).cpu().detach().numpy())

    mean_loss = loss_acum / N if N else float("nan")
    return mean_loss, np.array(fin_outputs), np.array(fin_targets)

## Training

**Generate `settings["SEEDS_NUM"]` seeds to explore model performance.**

In [None]:
# Draw SEEDS_NUM integer seeds in [0, 42069) from NumPy's global RNG, which an
# earlier cell seeded with OG_SEED.
# NOTE(review): 42069 is an unexplained upper bound.
seeds = np.random.randint(0, 42069, size=settings["SEEDS_NUM"])
seeds

**If running in CLM building mode, set initial seed.**

In [None]:
# CLM building mode trains a single model, using only the configured INITIAL_SEED.
if RUN_MODE == "clm":
    seeds = [settings["INITIAL_SEED"]]

### Make dataframe to tabulate results

In [None]:
# Columns of the results table: the run settings first, then one group of
# metrics per evaluated split.
column_names = [
    *settings.keys(),
    "seed",
    "epoch",
    "train_loss",
    "val_loss", "val_accuracy", "val_f1_score_micro", "val_f1_score_macro",
    "test_loss", "test_accuracy", "test_f1_score_micro", "test_f1_score_macro",
    "test_twitter_loss", "test_twitter_accuracy",
    "test_twitter_f1_score_micro", "test_twitter_f1_score_macro",
    # The training loop's result rows also carry 'run_mode'. It is declared
    # here, last, so the table has the same column order as when the column
    # was only introduced by appending the first row.
    "run_mode",
]
column_names

In [None]:
# Empty results table; the training loop appends one row per (seed, epoch).
run_results = pd.DataFrame(columns=column_names)
run_results

### Find best model across seeds

In [None]:
# Main experiment loop: for every seed, fine-tune a fresh model for
# settings["EPOCHS"] epochs and, after each epoch, evaluate on the validation
# set (standalone mode only) and on both test sets, appending one row of
# metrics to `run_results`.

# Best validation loss seen so far across all seeds and epochs.
best_valid_loss = float('inf')
# NOTE(review): the next two trackers are initialized but never updated below.
best_valid_accuracy = 0
best_valid_f1_macro = 0
best_val_loss_epoch = None

_checkpoint = settings["CHECKPOINT"]
_num_labels = settings["NUM_LABELS"]
_max_length = settings["MAX_LEN"]
_learning_rate = settings["LEARNING_RATE"]
_epochs = settings["EPOCHS"]

for seed in seeds:

    # ###########################################
    # ! Setup randomness to use current seed
    print(f"Seed: {seed}")
    np.random.seed(seed)
    torch.manual_seed(seed)
    random.seed(seed)
    # ###########################################

    print("Train dataset statistics")
    print("Total Tweets: ", len(triplets_train))
    print("Label 0 (support): ", len(list(filter(lambda x: x['stance_label'] == 0, triplets_train))))
    print("Label 1 (comment): ", len(list(filter(lambda x: x['stance_label'] == 1, triplets_train))))
    print("Label 2 (deny): ", len(list(filter(lambda x: x['stance_label'] == 2, triplets_train))))
    print("Label 3 (query): ", len(list(filter(lambda x: x['stance_label'] == 3, triplets_train))))

    print(f"Tokenizer: {_checkpoint}")
    tokenizer = AutoTokenizer.from_pretrained(_checkpoint)
    
    # Datasets and loaders are rebuilt for every seed, after re-seeding above.
    training_set = CustomDataset(triplets_train, tokenizer, _max_length)
    training_loader = DataLoader(training_set, **train_params)
    
    # NOTE(review): the validation loader is built in "clm" mode too, although
    # it is only used in "standalone" mode.
    validation_set = CustomDataset(triplets_val, tokenizer, _max_length)
    validation_loader = DataLoader(validation_set, **validate_params)

    test_set = CustomDataset(triplets_test, tokenizer, _max_length)
    test_loader = DataLoader(test_set, **test_params)

    test_twitter_set = CustomDataset(triplets_test_twitter, tokenizer, _max_length)
    test_twitter_loader = DataLoader(test_twitter_set, **test_params)
    
    # Load pretrained model
    print(f"Model from: {_checkpoint} with {_num_labels} labels")
    model = AutoModelForSequenceClassification.from_pretrained(
        _checkpoint,
        num_labels=_num_labels, **BERT_CONFIG
    )

    print("Model config:")
    print(model.config)
    
    model.to(device)
    optimizer = torch.optim.AdamW(
        params =  model.parameters(),
        lr=_learning_rate,
        weight_decay=0.01
    )
    
    # No learning-rate scheduler is used; train() accepts None.
    scheduler = None
    
    for epoch in range(_epochs):
        print(f"===== EPOCH {epoch} / SEED {seed}")

        # ######################################################################################
        # Train
        train_loss = train(epoch, model, training_loader, optimizer, scheduler)

        # ######################################################################################
        # Validation: only in standalone task mode.
        if RUN_MODE == "standalone":
            val_loss, val_outputs, val_targets = validation(model, validation_loader)
            
            # Predicted class = index of the highest probability per sample.
            val_outputs_bin = np.argmax(val_outputs, axis=1)
            val_accuracy = metrics.accuracy_score(val_targets, val_outputs_bin)
            val_f1_score_micro = metrics.f1_score(val_targets, val_outputs_bin, average='micro')
            val_f1_score_macro = metrics.f1_score(val_targets, val_outputs_bin, average='macro')
    
            # NOTE(review): these print the full prediction arrays for every epoch.
            print("Validation Outputs: ")
            print(val_outputs)
            
            print("Validation Outputs bin: ")
            print(val_outputs_bin)
            
            # Only the loss value and its epoch are tracked; no model copy is kept.
            if best_valid_loss > val_loss:
                best_valid_loss = val_loss
                best_val_loss_epoch = epoch
                print(f"Best val loss: {best_valid_loss} at epoch {epoch}")
    
            print(f"Validatiton Accuracy Score = {val_accuracy}")
            print(f"Validation F1 Score (Micro) = {val_f1_score_micro}")
            print(f"Validation F1 Score (Macro) = {val_f1_score_macro}")
            print(f'Train loss: {train_loss}\t Validation loss:{val_loss}')
        else:
            # -1 marks "not computed" in the results table.
            val_loss = -1
            val_accuracy = -1
            val_f1_score_micro = -1
            val_f1_score_macro = -1

        # ######################################################################################
        # Test with full test set
        test_loss, test_outputs, test_targets = validation(model, test_loader)
        
        test_outputs_bin = np.argmax(test_outputs, axis=1)
        test_accuracy = metrics.accuracy_score(test_targets, test_outputs_bin)
        test_f1_score_micro = metrics.f1_score(test_targets, test_outputs_bin, average='micro')
        test_f1_score_macro = metrics.f1_score(test_targets, test_outputs_bin, average='macro')

        print(f"Test Accuracy Score = {test_accuracy}")
        print(f"Test F1 Score (Micro) = {test_f1_score_micro}")
        print(f"Test F1 Score (Macro) = {test_f1_score_macro}")
        print(f"Test loss: {test_loss}")

        # ######################################################################################
        # Test with Twitter test set
        test_twitter_loss, test_twitter_outputs, test_twitter_targets = validation(model, test_twitter_loader)
        
        test_twitter_outputs_bin = np.argmax(test_twitter_outputs, axis=1)
        test_twitter_accuracy = metrics.accuracy_score(test_twitter_targets, test_twitter_outputs_bin)
        test_twitter_f1_score_micro = metrics.f1_score(test_twitter_targets, test_twitter_outputs_bin, average='micro')
        test_twitter_f1_score_macro = metrics.f1_score(test_twitter_targets, test_twitter_outputs_bin, average='macro')

        print(f"Test Twitter Accuracy Score = {test_twitter_accuracy}")
        print(f"Test Twitter F1 Score (Micro) = {test_twitter_f1_score_micro}")
        print(f"Test Twitter F1 Score (Macro) = {test_twitter_f1_score_macro}")
        print(f"Test Twitter loss: {test_twitter_loss}")

        # One results row per (seed, epoch): run settings plus all metrics.
        results_row = {
            **settings,
            'seed': seed,
            'epoch': epoch,
            'run_mode': RUN_MODE,
            'train_loss': train_loss,
            'val_loss': val_loss,
            'val_accuracy': val_accuracy,
            'val_f1_score_micro': val_f1_score_micro,
            'val_f1_score_macro': val_f1_score_macro,
            'test_loss': test_loss,
            'test_accuracy': test_accuracy,
            'test_f1_score_micro': test_f1_score_micro,
            'test_f1_score_macro': test_f1_score_macro,
            'test_twitter_loss': test_twitter_loss,
            'test_twitter_accuracy': test_twitter_accuracy,
            'test_twitter_f1_score_micro': test_twitter_f1_score_micro,
            'test_twitter_f1_score_macro': test_twitter_f1_score_macro,
        }

        # NOTE(review): the table is re-concatenated on every epoch; collecting
        # the rows in a list and building the frame once would be cheaper.
        run_results = pd.concat([run_results, pd.DataFrame([results_row])], ignore_index=True)

    # Save model to huggingface model hub. Only in CLM Building mode.
    if RUN_MODE == "clm":
        model.push_to_hub(SAVE_CHECKPOINT_NAME, private=True)
        tokenizer.push_to_hub(SAVE_CHECKPOINT_NAME, private=True)
            
    # Drop references to this seed's objects and release cached GPU memory
    # before the next seed.
    del model
    del optimizer
    del scheduler
    torch.cuda.empty_cache()

In [None]:
# Display the accumulated results table (one row per seed and epoch).
run_results

In [None]:
if PAPERMILL:
    # Store results file in Excel
    EXCEL_OUTPUT = f"{EXPERIMENT_NAME}_SETTING_{RUN_SETTING:02d}_RUN_{RUN_SUFFIX}.xlsx"
    run_results.to_excel(f"{XLS_RESULTS_FOLDER}/{EXCEL_OUTPUT}")

    if STORE_RESULTS_S3 == 1:
        # Upload results to S3. Depends on environment variables.
        # We use $$ to force environment variable and to be able to combine it with local variable {EXCEL_OUTPUT}
        !aws s3 cp {XLS_RESULTS_FOLDER}/{EXCEL_OUTPUT} s3://$$S3_BUCKET/ --endpoint-url=$$S3_ENDPOINT

In [None]:
# Marker in the output showing that the notebook ran to completion.
print("END")