**Manual setup of required environment variables.**

In [None]:
%env FND_ROOT=/workspace/fnd-building

**Define constants and import all randomness sources first.**

In [None]:
import torch
import random
import numpy as np

from datetime import datetime

**Configuration constants.**

In [None]:
# Index of the CUDA device to use when a GPU is available.
CUDA_DEVICE = 0

# Global original seed for randomness reproducibility.
OG_SEED = 30082010

# Timestamp suffix used when naming stored result files; computed once per run.
RUN_SUFFIX = datetime.now().strftime("%Y_%m_%d_%H_%M_%S")

# Total number of tweets available per author
TOTAL_AUTHOR_LEN = 100

# Absolute path to root folder of the repository.
# Read back from the FND_ROOT environment variable through the IPython %env magic.
FND_ROOT=%env FND_ROOT

# Folder under the repository root that holds the dataset files loaded by this notebook.
DATA_PATH_PREFIX=f"{FND_ROOT}/datasets/datasets-fnd-bot"

**Initialize randomness sources with original seed, for full reproducibility of results.**

In [None]:
np.random.seed(OG_SEED)
torch.manual_seed(OG_SEED)
random.seed(OG_SEED)

**All other imports**

In [None]:
import csv
import gc
import glob
import json
import os
import pandas as pd
import re
import transformers

from ekphrasis.classes.preprocessor import TextPreProcessor
from ekphrasis.classes.tokenizer import SocialTokenizer
from ekphrasis.dicts.emoticons import emoticons

from sklearn.preprocessing import LabelEncoder
from sklearn.utils import resample
from sklearn import metrics
from sklearn.utils.random import sample_without_replacement

from torch import cuda
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler

from typing import List

from transformers import AutoModel, AutoTokenizer, AutoModelForSequenceClassification

print(f"Pytorch version: {torch.__version__}")
print(f"Transformers version: {transformers.__version__}")

**Setup papermill parameters.**
The cell below must be tagged with the 'parameters' tag.
See: https://papermill.readthedocs.io/en/latest/usage-parameterize.html

In [None]:
# papermill parameters

## Must be set to True when running via papermill.
PAPERMILL = False

## Seed to be used in CLM Building mode, where only 1 seed is used.
INITIAL_SEED = 0

## Name for saving model and tokenizer in huggingface hub.
## Requires previous login using HF_TOKEN
## Use command: huggingface-cli login --token $HF_TOKEN
## where HF_TOKEN must be set as an environment variable with your login token.
SAVE_CHECKPOINT_NAME="<user>/<model-tokenizer-name>"

## Whether to run as standalone task or as CLM building task.
## Options: ["standalone", "clm"]
# Standalone task:
#      Use train and validation sets separately, for training and validation.
#      Test with test set.
# CLM building task:
#      Merge train and validation sets for training, test with test set.
RUN_MODE = "standalone"

## Folder to store Excel result files
XLS_RESULTS_FOLDER = "./"

## Set to 1 to store results in S3, set to 0 otherwise.
# Defaults to 0.
STORE_RESULTS_S3 = 0

## Parameters for saving and uploading results.
EXPERIMENT_NAME = "EXP-TBD"
RUN_SETTING = "-1"

# Actual parameters for running the model

## Language to use, options: ["EN", "ES"] for English and Spanish respectively.
BOT_LANG = "EN"

## Starting transformers checkpoint.
# To use private checkpoints, user must be logged in
# using the hugging-face-cli login method.
CHECKPOINT = "bert-base-uncased" 

## Number of labels in the dataset
NUM_LABELS = 2

## Number of seeds to explore in standalone mode.
SEEDS_NUM = 1

## Maximum length for BERT tokens
MAX_LEN = 512

## Training parameters
TRAIN_BATCH_SIZE = 4
VALID_BATCH_SIZE = 4
EPOCHS = 2
LEARNING_RATE = 1e-05
DROPOUT = 0.1

## Number of tweets to extract from each author
# using the sampling strategy provided.
AUTHOR_LEN = 15

## Sampling strategy to use
# when extracting AUTHOR_LEN tweets for each author.
# Options: ["head", "tail", "head+tail"].
SAMPLING_STRATEGY = "head"

**Clear CUDA cache and perform garbage collection.**

In [None]:
torch.cuda.empty_cache()
gc.collect()

**Setup CUDA device if GPU is available.**

In [None]:
# Use the configured GPU when CUDA is available; otherwise fall back to the CPU.
device = f"cuda:{CUDA_DEVICE}" if cuda.is_available() else 'cpu'
print(device)
# Only query the GPU name when a GPU actually exists: asking for it on a
# CPU-only machine raises and would abort the notebook before training.
if cuda.is_available():
    print(torch.cuda.get_device_name(CUDA_DEVICE))

**Configure ekphrasis text preprocessor.**

In [None]:
text_processor = TextPreProcessor(
    
    # terms that will be normalized
    normalize=[
        'url',
        'email',
        'percent',
        'money',
        'phone',
        'user',
        'time',
        'date',
        'number'
    ],
    
    # terms that will be annotated
    # annotate={"hashtag", "allcaps", "elongated", "repeated", 'emphasis', 'censored'},
    fix_html=True,  # fix HTML tokens
    
    # corpus from which the word statistics are going to be used 
    # for word segmentation 
    segmenter="twitter", 
    
    # corpus from which the word statistics are going to be used 
    # for spell correction
    # corrector="twitter", 
    
    unpack_hashtags=True,  # perform word segmentation on hashtags
    # unpack_contractions=True,  # Unpack contractions (can't -> can not)
    spell_correct_elong=False,  # spell correction for elongated words
    
    # select a tokenizer. You can use SocialTokenizer, or pass your own
    # the tokenizer, should take as input a string and return a list of tokens
    tokenizer=SocialTokenizer(lowercase=True).tokenize,
    
    # list of dictionaries, for replacing tokens extracted from the text,
    # with other expressions. You can pass more than one dictionaries.
    dicts=[emoticons]
)

**BERT-specific settings.**

In [None]:
BERT_CONFIG = {
    "ignore_mismatched_sizes": True,
}

**Run-specific settings, taken from constants and papermill parameters.**

In [None]:
settings = {
    "OG_SEED": OG_SEED,
    "SEEDS_NUM": SEEDS_NUM,
    "INITIAL_SEED": INITIAL_SEED,
    "CHECKPOINT": CHECKPOINT,    
    "NUM_LABELS": NUM_LABELS,    
    "MAX_LEN": MAX_LEN,
    "DROPOUT": DROPOUT,
    "TRAIN_BATCH_SIZE": TRAIN_BATCH_SIZE,
    "VALID_BATCH_SIZE": VALID_BATCH_SIZE,
    "EPOCHS": EPOCHS,
    "LEARNING_RATE": LEARNING_RATE,
    "AUTHOR_LEN": AUTHOR_LEN,
    "SAMPLING_STRATEGY": SAMPLING_STRATEGY,
}

In [None]:
print(f"RUN SETTING: {RUN_SETTING}")

In [None]:
print(settings)

In [None]:
if PAPERMILL:
    print("Importing plain tqdm")
    from tqdm import tqdm    
else:
    print("Importing auto tqdm")
    from tqdm.auto import tqdm    

## Define custom dataset loader and labels

### Labels: Bot, Human

In [None]:
idx2label = ["bot", "human"]
encoded_labels = LabelEncoder().fit_transform(idx2label)
label2idx = dict(zip(idx2label, encoded_labels))
label2idx

In [None]:
def make_text_and_labels(jsonfile):
    """Load author texts and labels from a JSON-lines file.

    Every non-blank line of ``jsonfile`` must be a JSON object holding a
    ``"text"`` and a ``"label"`` key.

    Args:
        jsonfile: Path of the JSON-lines file to read.

    Returns:
        A ``(author_texts, labels)`` tuple of two lists with one entry per
        record, in file order.

    Raises:
        json.JSONDecodeError: if a non-blank line is not valid JSON.
        KeyError: if a record lacks the ``"text"`` or ``"label"`` key.
    """
    author_texts = []
    labels = []
    # JSON text is UTF-8 by specification; do not depend on the platform's
    # default encoding.
    with open(jsonfile, encoding="utf-8") as jsonf:
        for line in jsonf:
            # Tolerate blank lines (e.g. an empty line at the end of the file),
            # which json.loads would otherwise reject.
            if not line.strip():
                continue
            record = json.loads(line)
            author_texts.append(record["text"])
            labels.append(record["label"])

    return author_texts, labels

In [None]:
def make_strategy_handler(strategy):
    """Create an item builder based on the chosen strategy.

    The returned builder takes the list of tweets of one author, samples
    AUTHOR_LEN of them at random without repetition, preprocesses every
    sampled tweet with ``text_processor`` and keeps N = MAX_LEN // AUTHOR_LEN
    characters of each one according to ``strategy``:

    - head: the first N characters are extracted.
    - tail: the last N characters are extracted.
    - head+tail: the first N//2 characters are concatenated with the last
      N//2 characters.

    A tweet that is not longer than the requested length is kept whole.
    The extracted pieces are joined with single spaces.

    Args:
        strategy: One of "head", "tail" or "head+tail".

    Returns:
        A function mapping a list of tweets to the text of a single item.

    Raises:
        ValueError: if ``strategy`` is not a supported strategy name.
    """
    def sample_tweets(author_texts: List[str]):
        """Returns the sample of tweets to process.

        If AUTHOR_LEN < TOTAL_AUTHOR_LEN, the sampling is randomized without repetition.
        Otherwise, when AUTHOR_LEN == TOTAL_AUTHOR_LEN all tweets are returned,
        however their order is randomized.

        Authors with fewer than TOTAL_AUTHOR_LEN tweets are sampled from the
        tweets they do have, instead of failing with an IndexError.
        """
        # Never draw indices beyond the tweets that are actually available.
        population = min(TOTAL_AUTHOR_LEN, len(author_texts))
        if population == 0:
            raise ValueError("Cannot sample tweets: the author has no tweets.")
        n_samples = min(AUTHOR_LEN, population)
        sample = sample_without_replacement(population, n_samples)
        return [author_texts[idx] for idx in sample]

    def extract_head(text, n):
        """Return the first ``n`` characters of ``text``."""
        return text[:n]

    def extract_tail(text, n):
        """Return the last ``n`` characters of ``text`` (all of it if shorter)."""
        # Clamp the start at 0: a negative start index would wrap around and
        # return only a fragment of a text shorter than n.
        return text[max(len(text) - n, 0):]

    def extract_head_tail(text, n):
        """Return the first and last ``n // 2`` characters of ``text``."""
        # A text that already fits is kept whole; slicing it twice would
        # duplicate the overlapping characters.
        if len(text) <= n:
            return text
        half = n // 2
        return f"{extract_head(text, half)}{extract_tail(text, half)}"

    extractors = {
        "head": extract_head,
        "tail": extract_tail,
        "head+tail": extract_head_tail,
    }
    # Fail fast on an unknown strategy instead of when the first item is built.
    if strategy not in extractors:
        raise ValueError(
            f"Unknown sampling strategy {strategy!r}; expected one of {sorted(extractors)}"
        )
    extraction_function = extractors[strategy]

    def item_builder(author_texts: List[str]):
        """Build the text of one item from the tweets of one author."""
        # Character budget per sampled tweet; the globals are read at call time.
        N = MAX_LEN // AUTHOR_LEN
        sampled_tweets = sample_tweets(author_texts)
        extracted_tweets = [
            extraction_function(" ".join(text_processor.pre_process_doc(tweet)), N)
            for tweet in sampled_tweets
        ]
        return " ".join(extracted_tweets)

    return item_builder

In [None]:
class CustomDataset(Dataset):
    """Torch dataset producing one tokenized example per author.

    Each example is built from the author's raw text with the item builder
    selected by ``settings["SAMPLING_STRATEGY"]`` and then encoded by the
    given tokenizer to a fixed length of ``max_len``.
    """

    def __init__(self, author_texts, labels, tokenizer, max_len):
        self.author_texts = author_texts
        self.targets = labels
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.item_builder = make_strategy_handler(settings["SAMPLING_STRATEGY"])

    def __len__(self):
        return len(self.author_texts)

    def __getitem__(self, index):
        raw_text = str(self.author_texts[index])
        # The piece after the final "<sep>" is discarded.
        # Presumably every tweet is terminated by "<sep>" - confirm against the dataset files.
        tweets = raw_text.split("<sep>")[:-1]
        document = self.item_builder(tweets)

        encoding = self.tokenizer.encode_plus(
            document,
            None,
            add_special_tokens=True,
            truncation=True,
            padding='max_length',
            max_length=self.max_len,
            return_token_type_ids=True
        )

        def as_long(values):
            # All model inputs and the target are returned as int64 tensors.
            return torch.tensor(values, dtype=torch.long)

        return {
            'ids': as_long(encoding['input_ids']),
            'mask': as_long(encoding['attention_mask']),
            'token_type_ids': as_long(encoding["token_type_ids"]),
            'targets': as_long(self.targets[index]),
        }

In [None]:
# DataLoader keyword arguments sized by TRAIN_BATCH_SIZE.
train_params = {
    'batch_size': settings["TRAIN_BATCH_SIZE"],
    'shuffle': True,
    'num_workers': 0
}

# DataLoader keyword arguments sized by VALID_BATCH_SIZE.
# NOTE(review): shuffle=True is set here as well. The metrics computed in this
# notebook do not depend on batch order, so shuffling looks unnecessary for
# evaluation; it presumably also draws from the global torch RNG - confirm
# before changing, since that would alter the random stream of existing runs.
test_params = {
    'batch_size': settings["VALID_BATCH_SIZE"],
    'shuffle': True,
    'num_workers': 0
}

## Construct training, validation, and test datasets

In [None]:
# Select the train / validation / test files for the requested language.
assert BOT_LANG in ["EN", "ES"]
_files_by_language = {
    "EN": (
        "Loading dataset in English",
        f"{DATA_PATH_PREFIX}/1_PAN19_bot_ep_training/PAN19_bot_ep_training_en.json",
        f"{DATA_PATH_PREFIX}/2_PAN19_bot_ep_test/PAN19_bot_ep_test_en.json",
        f"{DATA_PATH_PREFIX}/3_PAN19_bot_ep_earlybirds/PAN19_bot_ep_earlybirds_en.json",
    ),
    "ES": (
        "Loading dataset in Spanish",
        f"{DATA_PATH_PREFIX}/1_PAN19_bot_ep_training/PAN19_bot_ep_training_es.json",
        f"{DATA_PATH_PREFIX}/2_PAN19_bot_ep_test/PAN19_bot_ep_test_es.json",
        f"{DATA_PATH_PREFIX}/3_PAN19_bot_ep_earlybirds/PAN19_bot_ep_earlybirds_es.json",
    ),
}
_banner, TRAIN_TEXTS_FILE, VALIDATION_TEXTS_FILE, TEST_TEXTS_FILE = _files_by_language[BOT_LANG]
print(_banner)

In [None]:
# Load the training split and report its size and label balance.
author_texts_train, labels_train = make_text_and_labels(TRAIN_TEXTS_FILE)
assert len(author_texts_train) == len(labels_train)
print(f"Total Training Tweets: {len(author_texts_train)}")
print(f"Total Training Tweets with label 0: {sum(1 for label in labels_train if label == 0)}")
print(f"Total Training Tweets with label 1: {sum(1 for label in labels_train if label == 1)}")

In [None]:
# Load the validation split and report its size and label balance.
author_texts_val, labels_val = make_text_and_labels(VALIDATION_TEXTS_FILE)
assert len(author_texts_val) == len(labels_val)
print(f"Total Validation Tweets: {len(author_texts_val)}")
print(f"Total Validation Tweets with label 0: {sum(1 for label in labels_val if label == 0)}")
print(f"Total Validation Tweets with label 1: {sum(1 for label in labels_val if label == 1)}")

In [None]:
# Load the test split and report its size and label balance.
author_texts_test, labels_test = make_text_and_labels(TEST_TEXTS_FILE)
assert len(author_texts_test) == len(labels_test)
print(f"Total Test Tweets: {len(author_texts_test)}")
print(f"Total Test Tweets with label 0: {sum(1 for label in labels_test if label == 0)}")
print(f"Total Test Tweets with label 1: {sum(1 for label in labels_test if label == 1)}")

#### CLM Building Mode Only: Merge train and validation data into train data.

In [None]:
# In CLM building mode the validation data is appended to the training data.
if RUN_MODE == "clm":
    author_texts_train = author_texts_train + author_texts_val
    labels_train = labels_train + labels_val
    assert len(author_texts_train) == len(labels_train)
    print(f"Total Merged Training Tweets: {len(author_texts_train)}")
    print(f"Total Merged Training Tweets with label 0: {sum(1 for label in labels_train if label == 0)}")
    print(f"Total Merged Training Tweets with label 1: {sum(1 for label in labels_train if label == 1)}")

## Define training, test, and validation procedures.

In [None]:
loss_fn = torch.nn.CrossEntropyLoss()

In [None]:
def train(epoch, model, training_loader, optimizer = None, scheduler = None):
    """Run one training epoch and return the mean batch loss.

    Args:
        epoch: Index of the current epoch. Not used inside the function; kept
            so existing callers keep working.
        model: Model called with ``input_ids``, ``attention_mask`` and
            ``token_type_ids`` keyword arguments; its output must expose the
            logits under the ``"logits"`` key.
        training_loader: Sized iterable of batches; every batch is a mapping
            with 'ids', 'mask', 'token_type_ids' and 'targets' tensors.
        optimizer: Optional optimizer; zeroed before and stepped after every
            batch when given.
        scheduler: Optional scheduler; stepped once per batch when given.

    Returns:
        The sum of the per-batch losses divided by the number of batches.

    Raises:
        ZeroDivisionError: if ``training_loader`` yields no batches.
    """
    model.train()
    loss_acum = 0
    N = 0

    for data in tqdm(training_loader, total = len(training_loader)):

        # Reset optimizer gradients.
        if optimizer:
            optimizer.zero_grad(set_to_none=True)

        # Get model input from custom dataset
        ids = data['ids'].to(device, dtype = torch.long)
        mask = data['mask'].to(device, dtype = torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
        targets = data['targets'].to(device, dtype = torch.long)

        # Make forward and backward passes in the model.
        # Inputs are passed by keyword: the positional order of forward()
        # differs between architectures, so positional arguments could be
        # bound to the wrong parameter for a non-BERT checkpoint.
        outputs = model(
            input_ids=ids,
            attention_mask=mask,
            token_type_ids=token_type_ids,
        )
        logits = outputs["logits"]
        loss = loss_fn(logits, targets)
        loss.backward()

        # Compute accumulated loss for reporting progress.
        loss_acum += loss.item()
        N = N + 1

        # Update optimizer and scheduler, if any.
        if optimizer:
            optimizer.step()
        if scheduler:
            scheduler.step()

    return loss_acum / N

In [None]:
def validation(model, testing_loader):
    """Evaluate the model on a loader without updating its weights.

    Args:
        model: Model called with ``input_ids``, ``attention_mask`` and
            ``token_type_ids`` keyword arguments; its output must expose the
            logits under the ``"logits"`` key.
        testing_loader: Sized iterable of batches; every batch is a mapping
            with 'ids', 'mask', 'token_type_ids' and 'targets' tensors.

    Returns:
        A ``(mean_loss, outputs, targets)`` tuple: the sum of per-batch losses
        divided by the number of batches, a NumPy array with the softmax of
        the logits for every example, and a NumPy array with the matching
        targets, both in the order the loader produced them.

    Raises:
        ZeroDivisionError: if ``testing_loader`` yields no batches.
    """
    model.eval()
    loss_acum=0
    fin_targets=[]
    fin_outputs=[]
    N = 0

    with torch.no_grad():
        for data in tqdm(testing_loader, total = len(testing_loader)):
            # Get model input from custom dataset
            ids = data['ids'].to(device, dtype = torch.long)
            mask = data['mask'].to(device, dtype = torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
            targets = data['targets'].to(device, dtype = torch.long)

            # Make forward pass for prediction.
            # Inputs are passed by keyword: the positional order of forward()
            # differs between architectures, so positional arguments could be
            # bound to the wrong parameter for a non-BERT checkpoint.
            outputs = model(
                input_ids=ids,
                attention_mask=mask,
                token_type_ids=token_type_ids,
            )
            logits = outputs["logits"]
            loss = loss_fn(logits, targets)

            # Compute accumulated loss for reporting progress.
            loss_acum += loss.item()
            N = N + 1

            # Compute expected outputs vs model outputs for reporting progress.
            fin_targets.extend(targets.cpu().detach().numpy().tolist())
            fin_outputs.extend(torch.softmax(logits, dim=1).cpu().detach().numpy())

    return loss_acum / N, np.array(fin_outputs), np.array(fin_targets)

## Training

**Generate `settings["SEEDS_NUM"]` seeds to explore model performance.**

In [None]:
seeds = np.random.randint(0, 42069, size=settings["SEEDS_NUM"])
seeds

**If running in CLM building mode, set initial seed.**

In [None]:
if RUN_MODE == "clm":
    seeds = [settings["INITIAL_SEED"]]

### Make dataframe to tabulate results

In [None]:
# Schema of the results table: run settings first, then run identifiers and metrics.
column_names = [
    *settings.keys(),
    "seed",
    "epoch",
    "train_loss",
    "val_loss", "val_accuracy", "val_f1_score_micro", "val_f1_score_macro",
    "test_loss", "test_accuracy", "test_f1_score_micro", "test_f1_score_macro",
    # Result rows also carry the run mode, so it belongs to the declared schema.
    # It is listed last, which is where pandas places it when it is missing.
    "run_mode",
]
column_names

In [None]:
run_results = pd.DataFrame(columns=column_names)
run_results

### Find best model across seeds

In [None]:
# Train, validate and test one freshly initialised model per seed, appending
# one row of metrics per (seed, epoch) to `run_results`.
best_valid_loss = float('inf')
best_valid_accuracy = 0
best_valid_f1_macro = 0
best_val_loss_epoch = None

_checkpoint = settings["CHECKPOINT"]
_num_labels = settings["NUM_LABELS"]
_max_length = settings["MAX_LEN"]
_learning_rate = settings["LEARNING_RATE"]
_epochs = settings["EPOCHS"]

for seed in seeds:

    # Seeds drawn with NumPy are numpy integers; random.seed() only accepts
    # plain Python ints (it raises TypeError on Python >= 3.11), so convert.
    seed = int(seed)

    # ###########################################
    # ! Setup randomness to use current seed
    print(f"Setting seed: {seed}")
    np.random.seed(seed)
    torch.manual_seed(seed)
    random.seed(seed)
    # ###########################################

    # Labels are stored as integer ids, so count them against the encoded ids
    # (comparing against the label names would always report 0).
    print("Train dataset statistics")
    print("Total Tweets: ", len(author_texts_train))
    print("Label 0 (bot): ", sum(1 for label in labels_train if label == label2idx["bot"]))
    print("Label 1 (human): ", sum(1 for label in labels_train if label == label2idx["human"]))

    print(f"Tokenizer: {_checkpoint}")
    tokenizer = AutoTokenizer.from_pretrained(_checkpoint)

    training_set = CustomDataset(author_texts_train, labels_train, tokenizer, _max_length)
    training_loader = DataLoader(training_set, **train_params)

    # Evaluation loaders use the evaluation parameters (VALID_BATCH_SIZE).
    validation_set = CustomDataset(author_texts_val, labels_val, tokenizer, _max_length)
    validation_loader = DataLoader(validation_set, **test_params)

    test_set = CustomDataset(author_texts_test, labels_test, tokenizer, _max_length)
    testing_loader = DataLoader(test_set, **test_params)

    # Load pretrained model
    print(f"Model from: {_checkpoint} with {_num_labels} labels")
    model = AutoModelForSequenceClassification.from_pretrained(
        _checkpoint,
        num_labels=_num_labels,
        **BERT_CONFIG
    )

    print("Model config:")
    print(model.config)

    model.to(device)
    optimizer = torch.optim.AdamW(
        params =  model.parameters(),
        lr=_learning_rate,
        weight_decay=0.01
    )

    # No learning-rate scheduler is used; train() accepts None.
    scheduler = None

    for epoch in range(_epochs):
        print(f"===== EPOCH {epoch} / SEED {seed}")

        # ######################################################################################
        # Train
        train_loss = train(epoch, model, training_loader, optimizer, scheduler)
        print(f"Train loss: {train_loss}")

        # ######################################################################################
        # Validation: only in standalone task mode.
        if RUN_MODE == "standalone":
            val_loss, val_outputs, val_targets = validation(model,validation_loader)

            val_outputs_bin = np.argmax(val_outputs, axis=1)
            val_accuracy = metrics.accuracy_score(val_targets, val_outputs_bin)
            val_f1_score_micro = metrics.f1_score(val_targets, val_outputs_bin, average='micro')
            val_f1_score_macro = metrics.f1_score(val_targets, val_outputs_bin, average='macro')

            if best_valid_loss > val_loss:
                best_valid_loss = val_loss
                best_val_loss_epoch = epoch
                print(f"Best val loss: {best_valid_loss} at epoch {epoch}")

            print(f"Validation Accuracy Score = {val_accuracy}")
            print(f"Validation F1 Score (Micro) = {val_f1_score_micro}")
            print(f"Validation F1 Score (Macro) = {val_f1_score_macro}")
            print(f'Validation loss:{val_loss}')
        else:
            # Validation is skipped outside standalone mode; -1 marks "not computed".
            val_loss = -1
            val_accuracy = -1
            val_f1_score_micro = -1
            val_f1_score_macro = -1

        # ######################################################################################
        # Test with full test set
        test_loss, test_outputs, test_targets = validation(model, testing_loader)

        test_outputs_bin = np.argmax(test_outputs, axis=1)
        test_accuracy = metrics.accuracy_score(test_targets, test_outputs_bin)
        test_f1_score_micro = metrics.f1_score(test_targets, test_outputs_bin, average='micro')
        test_f1_score_macro = metrics.f1_score(test_targets, test_outputs_bin, average='macro')

        print(f"TEST Accuracy Score = {test_accuracy}")
        print(f"TEST F1 Score (Micro) = {test_f1_score_micro}")
        print(f"TEST F1 Score (Macro) = {test_f1_score_macro}")
        print(f"TEST loss: {test_loss}")

        results_row = {
            **settings,
            'seed': seed,
            'epoch': epoch,
            'run_mode': RUN_MODE,
            'train_loss': train_loss,
            'val_loss': val_loss,
            'val_accuracy': val_accuracy,
            'val_f1_score_micro': val_f1_score_micro,
            'val_f1_score_macro': val_f1_score_macro,
            'test_loss': test_loss,
            'test_accuracy': test_accuracy,
            'test_f1_score_micro': test_f1_score_micro,
            'test_f1_score_macro': test_f1_score_macro,
        }

        # Appended after every epoch so partial results survive a later failure.
        run_results = pd.concat([run_results, pd.DataFrame([results_row])], ignore_index=True)

    # Save model to huggingface model hub. Only in CLM Building mode.
    if RUN_MODE == "clm":
        model.push_to_hub(SAVE_CHECKPOINT_NAME, private=True)
        tokenizer.push_to_hub(SAVE_CHECKPOINT_NAME, private=True)

    # Release this seed's model before the next one is loaded.
    del model
    del optimizer
    del scheduler
    torch.cuda.empty_cache()

In [None]:
run_results

In [None]:
if PAPERMILL:
    # Store results in Excel file
    EXCEL_OUTPUT = f"{EXPERIMENT_NAME}_SETTING_{RUN_SETTING:02d}_RUN_{RUN_SUFFIX}.xlsx"
    run_results.to_excel(f"{XLS_RESULTS_FOLDER}/{EXCEL_OUTPUT}")

    if STORE_RESULTS_S3 == 1:
        # Upload results to S3. Depends on environment variables.
        # We use $$ to force environment variable and to be able to combine it with local variable {EXCEL_OUTPUT}
        !aws s3 cp {XLS_RESULTS_FOLDER}/{EXCEL_OUTPUT} s3://$$S3_BUCKET/ --endpoint-url=$$S3_ENDPOINT

In [None]:
print("END")