**Manual setup of required environment variables.**

In [None]:
# Sets the FND_ROOT environment variable for this kernel; it is read back
# into a Python variable in the configuration cell below.
# NOTE(review): this is a hard-coded absolute path - adjust it per machine.
%env FND_ROOT=/workspace/fnd-building

**Define constants and import all randomness sources first.**

In [None]:
import torch
import random
import numpy as np

from datetime import datetime

**Configuration constants.**

In [None]:
# Index of the CUDA device used when a GPU is available.
CUDA_DEVICE = 0

# Global original seed for randomness reproducibility.
OG_SEED = 19012016

# Suffix for stored results: a fixed tag plus the timestamp at which this
# cell ran, so every run produces distinct artifact names.
RUN_SUFFIX = f'per_label_{datetime.now().strftime("%Y_%m_%d_%H_%M_%S")}'

# Absolute path to root folder of the repository, read from the FND_ROOT
# environment variable through the IPython %env magic.
FND_ROOT=%env FND_ROOT

# Folder holding the precomputed X_*.npy / y_*.npy embedding snapshots.
EMBEDDINGS_PREFIX=f"{FND_ROOT}/experiments/embeddings"

**Initialize randomness sources with original seed, for full reproducibility of results.**

In [None]:
# Seed every randomness source (NumPy, PyTorch, stdlib random) with the
# original seed.
for _seed_fn in (np.random.seed, torch.manual_seed, random.seed):
    _seed_fn(OG_SEED)

# Derive the per-run seed from the freshly seeded NumPy generator.
run_seed = np.random.randint(0, 42069, size=1)[0]
run_seed

**All other imports**

In [None]:
import gc
import glob
import os
import pandas as pd

from copy import deepcopy

from huggingface_hub import HfApi
from huggingface_hub import snapshot_download

from sklearn.model_selection import train_test_split
from sklearn import metrics

from torch import cuda
from torch.utils.data import Dataset, DataLoader

# Record the PyTorch version in the notebook output for provenance.
print(f"Pytorch version: {torch.__version__}")

**Setup papermill parameters.** The cell below must be tagged with the 'parameters' tag. See: https://papermill.readthedocs.io/en/latest/usage-parameterize.html

In [None]:
# papermill parameters
# Every assignment in this cell is a default that a papermill run may override.

## Must be set to True when running via papermill.
# Selects plain tqdm and enables snapshot saving and the Excel export.
PAPERMILL = False

# Run identifier embedded in the snapshot and Excel file names.
RUN_SETTING = "-1"

## Whether to load pretrained model.
LOAD_PRETRAINED_MODEL = False

# Specific pre-trained model to use if
# LOAD_PRETRAINED_MODEL is true.
PRETRAINED_MODEL = None

## Whether to save the model after specific epochs.
SAVE_EPOCH_SNAPSHOTS = False

# TODO: check how to pass the list via papermill parameters.
# Epoch snapshots when SAVE_EPOCH_SNAPSHOTS = True
# (epoch numbers are 0-based, matching the training loop counter).
EPOCH_SNAPSHOTS= [2, 4]
# Labels paired position-by-position with EPOCH_SNAPSHOTS.
EPOCH_SNAPSHOTS_LABELS = ["EPOCH", "EPOCH_DATA"]

# Free-form tag embedded in the Excel results file name.
EXPERIMENT_SUFFIX = ""

## Folder to store Excel result files
XLS_RESULTS_FOLDER = "./"

## Set to 1 to store results in S3, set to 0 otherwise.
# Defaults to 0.
STORE_RESULTS_S3 = 0

# Whether to hold out a validation set in addition to the test set.
USE_VALIDATION_SET = True
# Fractions handed to train_test_split as test_size.
VALIDATION_SET_SIZE = 0.15
TEST_SET_SIZE = 0.15

# Model-specific parameters

# Number of epochs to run.
# When saving snapshots, this value must be set
# to max(EPOCH_SNAPSHOTS) + 1, because epochs are counted from 0.
EPOCHS = 1000

TRAIN_BATCH_SIZE = 8
# Also used as the batch size of the test loader.
VALID_BATCH_SIZE = 8
LEARNING_RATE = 1e-04

## Number of GRU Layers
# 1: single GRU, 2+: stacked GRU
GRU_NUM_LAYERS = 2

## Whether the GRU is bidirectional or not.
BIDIRECTIONAL = True

## Number of hidden units
UNITS = 1024

## Embedding to use for training, validation, and test.
# Name fragment of the X_*.npy / y_*.npy snapshot files.
EMBEDDING_SNAPSHOT = "BASE0_RN_RAW_BERT_embedding"

# Dropout value for layer after GRU
DROPOUT = 0.2

# Dropout value inside GRU
INTERNAL_DROPOUT = 0.2

In [None]:
print(EPOCH_SNAPSHOTS)
# Map each snapshot epoch number to its label (paired by position; extra
# entries on either side are dropped by zip).
EPOCH_SNAPSHOTS_LABELS_BY_NUM = dict(zip(EPOCH_SNAPSHOTS, EPOCH_SNAPSHOTS_LABELS))
EPOCH_SNAPSHOTS_LABELS_BY_NUM

**Clear CUDA cache and perform garbage collection.**

In [None]:
# Release cached GPU memory held by PyTorch, then run Python's garbage
# collector; the cell displays the value returned by gc.collect().
torch.cuda.empty_cache()
gc.collect()

**Setup CUDA device if GPU is available.**

In [None]:
# Use the configured GPU when CUDA is available, otherwise fall back to CPU.
device = f"cuda:{CUDA_DEVICE}" if cuda.is_available() else 'cpu'
print(device)

# Only query the GPU name when CUDA exists: torch.cuda.get_device_name
# raises on CPU-only machines, which would defeat the fallback above.
if cuda.is_available():
    print(torch.cuda.get_device_name(CUDA_DEVICE))

**Run-specific settings, taken from constants and papermill parameters.**

In [None]:
# Snapshot of the run configuration. Its keys become the leading columns of
# the results table and its values are copied into every result row.
settings = {
    "OG_SEED": OG_SEED,
    "SAVE_EPOCH_SNAPSHOTS": SAVE_EPOCH_SNAPSHOTS,
    "EPOCH_SNAPSHOTS": EPOCH_SNAPSHOTS,
    "EPOCHS": EPOCHS,    
    "TRAIN_BATCH_SIZE": TRAIN_BATCH_SIZE,
    "VALID_BATCH_SIZE": VALID_BATCH_SIZE,
    "LEARNING_RATE": LEARNING_RATE,
    "GRU_NUM_LAYERS": GRU_NUM_LAYERS,
    "BIDIRECTIONAL": BIDIRECTIONAL,
    "UNITS": UNITS,
    "DROPOUT": DROPOUT,
    "INTERNAL_DROPOUT": INTERNAL_DROPOUT,   
}

In [None]:
# Echo the effective run configuration into the notebook output.
print(settings)

In [None]:
# Interactive sessions get the auto-detecting progress bar; papermill runs
# use the plain tqdm implementation.
if not PAPERMILL:
    print("Importing auto tqdm")
    from tqdm.auto import tqdm
else:
    print("Importing plain tqdm")
    from tqdm import tqdm

**Load X and y from precomputed embeddings and generate train, test, val split.**

In [None]:
# Reset first so a failed load cannot leave arrays from an earlier run behind.
X, y = None, None

print(f"Loading Xy from snapshot {EMBEDDING_SNAPSHOT}")

# Examples and labels are stored side by side as X_<snapshot>.npy / y_<snapshot>.npy.
_snapshot_paths = {
    part: f"{EMBEDDINGS_PREFIX}/{part}_{EMBEDDING_SNAPSHOT}.npy" for part in ("X", "y")
}
X = np.load(_snapshot_paths["X"])
y = np.load(_snapshot_paths["y"])

# Sanity checks: both arrays exist and hold one label row per example.
assert X is not None
assert y is not None
assert len(X) == len(y)

print(f"Loaded Xy from snapshot: {EMBEDDING_SNAPSHOT}")
for _loaded in (X, y):
    print(_loaded.shape)

In [None]:
# Split the data into train / (validation) / test sets.
# No random_state is passed, so the splits draw from NumPy's global RNG.
if USE_VALIDATION_SET:
    # First carve out the test set (TEST_SET_SIZE of all the data) ...
    X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=TEST_SET_SIZE)
    # ... then the validation set (VALIDATION_SET_SIZE of what remains).
    # The two constants were previously swapped, so each one sized the other split.
    X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=VALIDATION_SET_SIZE)
else:
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=TEST_SET_SIZE)

**Define custom dataset loader and specify loader parameters.**

In [None]:
class CustomDataset(Dataset):
    """Dataset pairing each example with its categorical label.

    Labels are given as score/one-hot rows; each item exposes the index of
    the row's maximum as an integer class label.
    """

    def __init__(self, examples, labels):
        """Store the parallel collections; they must have the same length."""
        assert len(examples) == len(labels)
        self.examples, self.labels = examples, labels

    def __len__(self):
        """Return the number of examples."""
        return len(self.examples)

    def __getitem__(self, index):
        """Return a dict with the 'example' tensor and its 'label' (torch.long)."""
        # From one-hot encoded to categorical label
        category = np.argmax(self.labels[index])

        item = {}
        item['example'] = torch.tensor(self.examples[index])
        item['label'] = torch.tensor(category, dtype=torch.long)
        return item

In [None]:
# X must be 3-D and y 2-D; the leading axis of each is dropped here.
tree_max_num_seq, emb_size = X.shape[1:]
(num_categories,) = y.shape[1:]

In [None]:
# Report the dimensions derived from the loaded arrays.
for _caption, _value in (
    ("tree_max_num_seq: ", tree_max_num_seq),
    ("emb size: ", emb_size),
    ("num_categories: ", num_categories),
):
    print(_caption, _value)

In [None]:
# The combined train+val arrays only exist when a validation set is used.
if USE_VALIDATION_SET:
    print(
        "Train+val shapes",
        f"X train+val: {X_train_val.shape}",
        f"y train+val: {y_train_val.shape}",
        sep="\n",
    )

In [None]:
# Shapes of the final training split.
print(
    "Train shapes",
    f"X train: {X_train.shape}",
    f"y train: {y_train.shape}",
    sep="\n",
)

In [None]:
# Shapes of the validation split, when one was created.
if USE_VALIDATION_SET:
    print(
        "Val shapes",
        f"X val: {X_val.shape}",
        f"y val: {y_val.shape}",
        sep="\n",
    )

In [None]:
# Shapes of the held-out test split.
print(
    "Test shapes",
    f"X test: {X_test.shape}",
    f"y test: {y_test.shape}",
    sep="\n",
)

In [None]:
# Keyword arguments for the three DataLoaders. All of them shuffle and load
# in the main process (num_workers=0).
# NOTE(review): shuffling the validation/test loaders does not change the
# aggregated metrics; it is kept here to preserve existing behavior.
train_params = dict(
    batch_size=settings["TRAIN_BATCH_SIZE"],
    shuffle=True,
    num_workers=0,
)

validate_params = dict(
    batch_size=settings["VALID_BATCH_SIZE"],
    shuffle=True,
    num_workers=0,
)

# The test loader reuses the validation batch size.
test_params = dict(
    batch_size=settings["VALID_BATCH_SIZE"],
    shuffle=True,
    num_workers=0,
)

**Bidirectional Stacked GRU for fake news detection.**

In [None]:
class FND_BI_GRU(torch.nn.Module):
    """
    (Optionally stacked, optionally bidirectional) GRU for fake news classification.

    Pytorch reimplementation of model in Providel & Mendoza (2020).

    The network is GRU -> dropout -> linear layer with a single output unit,
    i.e. it returns one raw logit per example (binary classification).

    Args:
        _emb_size: size of each input feature vector (last axis of the input).
        _units: number of hidden units per GRU direction.
        _num_layers: number of stacked GRU layers.
        _num_categories: stored as ``output_size`` for reference only; the
            linear head always has one output unit.
        _bidirectional: whether the GRU runs in both directions.
        _internal_dropout: dropout applied between stacked GRU layers.
        _dropout: dropout applied to the GRU output before the linear layer.
    """

    def __init__(
        self,
        _emb_size,
        _units=200,
        _num_layers=2,
        _num_categories=2,
        _bidirectional=True,
        _internal_dropout=0.1,
        _dropout=0.1,
    ):
        super().__init__()

        self.input_size = _emb_size
        self.hidden_size = _units
        self.num_layers = _num_layers # GRU_NUM_LAYERS
        self.output_size = _num_categories
        self.bidirectional = _bidirectional # BIDIRECTIONAL
        self.bidirectional_factor = 2 if self.bidirectional else 1

        # torch.nn.GRU only applies dropout *between* stacked layers. With a
        # single layer a non-zero value does nothing but trigger a
        # UserWarning, so it is disabled in that case.
        gru_dropout = _internal_dropout if self.num_layers > 1 else 0.0

        self.gru = torch.nn.GRU(
            input_size=self.input_size,
            hidden_size=self.hidden_size,
            num_layers=self.num_layers,
            batch_first=True,
            bidirectional=self.bidirectional,
            dropout=gru_dropout,
        )
        self.dropout = torch.nn.Dropout(_dropout)
        self.fc = torch.nn.Linear(
            self.hidden_size * self.bidirectional_factor, 1 # 1 because it's binary classification
        )

    def forward(self, x):
        """Return one logit per example, shape (batch, 1).

        Args:
            x: batch-first input of shape (batch, sequence, _emb_size).
        """
        # Apply GRU, starting from an all-zero hidden state on x's device.
        h0 = torch.zeros(
            self.num_layers * self.bidirectional_factor,
            x.size(0),
            self.hidden_size,
            device=x.device,
        )
        out, _ = self.gru(x, h0)

        # Keep only the output at the last time step, then apply dropout.
        # NOTE(review): for a bidirectional GRU the backward half of this
        # slice has processed only the final time step. Kept as-is so that
        # behavior matches previously trained weights.
        out = self.dropout(out[:, -1, :])

        # Apply linear layer.
        out = self.fc(out)

        return out

## Define training, test, and validation procedures.

In [None]:
# Binary cross-entropy computed on raw logits; the model's single output unit
# is passed to it without a sigmoid.
loss_fn = torch.nn.BCEWithLogitsLoss()

In [None]:
def train(epoch, model, training_loader, optimizer=None, scheduler=None):
    """Run one training pass over ``training_loader``.

    Args:
        epoch: current epoch number (not used inside the function).
        model: network to train; it is switched to training mode.
        training_loader: iterable of batches with "example" and "label" entries.
        optimizer: optional optimizer, zeroed before and stepped after each batch.
        scheduler: optional scheduler, stepped after each batch.

    Returns:
        The mean of the per-batch losses.
    """
    model.train()

    running_loss = 0
    batch_count = 0

    progress = tqdm(enumerate(training_loader, 0), total=len(training_loader))
    for _step, batch in progress:

        # Reset optimizer gradients.
        if optimizer:
            optimizer.zero_grad(set_to_none=True)

        # Forward pass: one logit per example, flattened to match the targets.
        inputs = batch["example"].to(device)
        expected = batch["label"].to(device)
        predictions = model(inputs).squeeze(1)

        # Backward pass.
        batch_loss = loss_fn(predictions, expected.float())
        batch_loss.backward()

        # Track the accumulated loss for progress reporting.
        running_loss += batch_loss.item()
        batch_count += 1

        # Update optimizer and scheduler, if any.
        if optimizer:
            optimizer.step()

        if scheduler:
            scheduler.step()

    return running_loss / batch_count

In [None]:
def validate(model, testing_loader):
    """Evaluate ``model`` on ``testing_loader`` without tracking gradients.

    Args:
        model: network to evaluate; it is switched to evaluation mode.
        testing_loader: iterable of batches with "example" and "label" entries.

    Returns:
        A tuple ``(mean_batch_loss, outputs, targets)`` where ``outputs`` holds
        the sigmoid of the logits and ``targets`` the expected labels, both as
        NumPy arrays covering every example seen.
    """
    model.eval()

    running_loss = 0
    batch_count = 0
    collected_outputs = []
    collected_targets = []

    with torch.no_grad():
        progress = tqdm(enumerate(testing_loader, 0), total=len(testing_loader))
        for _step, batch in progress:
            inputs = batch["example"].to(device)
            expected = batch["label"].to(device)

            # Forward pass for prediction.
            predictions = model(inputs).squeeze(1)

            batch_loss = loss_fn(predictions, expected.float())

            # Track the accumulated loss for progress reporting.
            running_loss += batch_loss.item()
            batch_count += 1

            # Collect model outputs and expected outputs for later metrics.
            # Sigmoid is used because this is a binary classification problem.
            probabilities = torch.sigmoid(predictions).cpu().detach().numpy()
            collected_outputs.extend(probabilities)
            collected_targets.extend(expected.cpu().detach().numpy().tolist())

    mean_loss = running_loss / batch_count
    return mean_loss, np.array(collected_outputs), np.array(collected_targets)

## Make dataframe to tabulate results

In [None]:
# Result columns: run settings first, then bookkeeping, then metrics.
column_names = [*settings.keys(), "seed", "epoch", "train_loss"]

# Validation metrics are only reported when a validation set is used.
if USE_VALIDATION_SET:
    column_names += ["val_loss", "val_accuracy", "val_f1_score_micro", "val_f1_score_macro"]

column_names += [
    "test_loss", "test_accuracy", "test_f1_score_micro", "test_f1_score_macro",
    "test_f1_score_label0", "test_f1_score_label1",
]
column_names

In [None]:
# Empty results table with the expected columns; the training loop appends
# one row per epoch.
run_results = pd.DataFrame(columns=column_names)
run_results

In [None]:
print(f"Seed: {run_seed}")

# Re-seed every randomness source with the per-run seed before building loaders.
for _seed_fn in (np.random.seed, torch.manual_seed, random.seed):
    _seed_fn(run_seed)

# Training data.
training_set = CustomDataset(X_train, y_train)
training_loader = DataLoader(training_set, **train_params)

# Validation data, only when a validation split exists.
if USE_VALIDATION_SET:
    validation_set = CustomDataset(X_val, y_val)
    validation_loader = DataLoader(validation_set, **validate_params)

# Test data.
test_set = CustomDataset(X_test, y_test)
testing_loader = DataLoader(test_set, **test_params)

In [None]:
# Create GRU with given settings
_units = settings["UNITS"]
_num_layers = settings["GRU_NUM_LAYERS"]
_bidirectional = settings["BIDIRECTIONAL"]
_internal_dropout = settings["INTERNAL_DROPOUT"]
_dropout = settings["DROPOUT"]

model = FND_BI_GRU(
    _emb_size=emb_size,
    _units=_units,
    _num_layers=_num_layers,
    _num_categories=num_categories,
    _bidirectional=_bidirectional,
    _internal_dropout=_internal_dropout,
    _dropout=_dropout
)

# ####### LOAD PRE-TRAINED WEIGHTS #######
# Only attempted when both the flag and a model name are given.
if LOAD_PRETRAINED_MODEL and PRETRAINED_MODEL:
    snapshot_download(f"eprovidel/{PRETRAINED_MODEL}", local_dir='./repo')
    pattern = f"./repo/{PRETRAINED_MODEL}*.pt"
    # sorted() makes the choice deterministic when several files match.
    for filename in sorted(glob.glob(pattern, recursive=False)):
        print(f"PRE-TRAINED WEIGHTS FOUND AT: {filename}")
        # The model is still on the CPU here. Mapping to "cpu" lets weights
        # that were saved from a GPU load on any machine; model.to(device)
        # below moves everything to the target device.
        model.load_state_dict(torch.load(filename, map_location="cpu"))
        break
    else:
        # Make a missing weights file visible instead of silently training
        # from randomly initialised weights.
        print(f"WARNING: no pre-trained weights found matching {pattern}; training from scratch")

print("Model")
print(model)
model.to(device)

_epochs = settings["EPOCHS"]
_epoch_snapshots = settings["EPOCH_SNAPSHOTS"]
_save_epoch_snapshots = settings["SAVE_EPOCH_SNAPSHOTS"]
_learning_rate = settings["LEARNING_RATE"]

# RUN_SETTING may be an int or a string such as "-1". The ':02d' format only
# accepts integers, so convert first and fall back to the raw text otherwise.
try:
    _run_setting_label = f"{int(RUN_SETTING):02d}"
except (TypeError, ValueError):
    _run_setting_label = str(RUN_SETTING)

optimizer = torch.optim.Adagrad(
    params=model.parameters(),
    lr=_learning_rate,
    weight_decay=0.01
)

scheduler = None

for epoch in range(_epochs):
    print(f"===== EPOCH {epoch} / SEED {run_seed}")

    # ######################################################################################
    # Train
    train_loss = train(epoch, model, training_loader, optimizer, scheduler)
    print(f"Train loss: {train_loss}")

    # ######################################################################################
    # Validation
    if USE_VALIDATION_SET:
        val_loss, val_outputs, val_targets = validate(model, validation_loader)

        # Threshold the sigmoid outputs at 0.5 to obtain binary predictions.
        val_outputs_bin = (val_outputs >= 0.5)
        val_accuracy = metrics.accuracy_score(val_targets, val_outputs_bin)
        val_f1_score_micro = metrics.f1_score(
            val_targets, val_outputs_bin, average="micro"
        )
        val_f1_score_macro = metrics.f1_score(
            val_targets, val_outputs_bin, average="macro"
        )

        print(f"Accuracy Score = {val_accuracy}")
        print(f"F1 Score (Micro) = {val_f1_score_micro}")
        print(f"F1 Score (Macro) = {val_f1_score_macro}")
        print(f"Validation loss:{val_loss}")

    # ######################################################################################
    # Test
    test_loss, test_outputs, test_targets = validate(model, testing_loader)

    test_outputs_bin = (test_outputs >= 0.5)
    test_accuracy = metrics.accuracy_score(test_targets, test_outputs_bin)
    test_f1_score_micro = metrics.f1_score(test_targets, test_outputs_bin, average='micro')
    test_f1_score_macro = metrics.f1_score(test_targets, test_outputs_bin, average='macro')

    # Per-class F1 scores (binary average with each label as the positive class).
    test_f1_score_macro_binary0 = metrics.f1_score(test_targets, test_outputs_bin, average='binary', pos_label=0)
    test_f1_score_macro_binary1 = metrics.f1_score(test_targets, test_outputs_bin, average='binary', pos_label=1)

    print(f"TEST Accuracy Score = {test_accuracy}")
    print(f"TEST F1 Score (Micro) = {test_f1_score_micro}")
    print(f"TEST F1 Score (Macro) = {test_f1_score_macro}")
    print(f"TEST F1 Score (Macro) for label 0 = {test_f1_score_macro_binary0}")
    print(f"TEST F1 Score (Macro) for label 1 = {test_f1_score_macro_binary1}")
    print(f"TEST loss: {test_loss}")

    # One result row per epoch: run settings plus this epoch's metrics.
    results_row = {
        **settings,
        'seed': run_seed,
        'epoch': epoch,
        'train_loss': train_loss,
        'test_loss': test_loss,
        'test_accuracy': test_accuracy,
        'test_f1_score_micro': test_f1_score_micro,
        'test_f1_score_macro': test_f1_score_macro,
        'test_f1_score_label0': test_f1_score_macro_binary0,
        'test_f1_score_label1': test_f1_score_macro_binary1
    }

    if USE_VALIDATION_SET:
        results_row.update({
            'val_loss': val_loss,
            'val_accuracy': val_accuracy,
            'val_f1_score_micro': val_f1_score_micro,
            'val_f1_score_macro': val_f1_score_macro,
        })

    # Appending every epoch keeps run_results usable if the run is interrupted.
    run_results = pd.concat([run_results, pd.DataFrame([results_row])], ignore_index=True)

    # ###############################################################
    # Save epoch snapshot when required (papermill runs only)
    if PAPERMILL and _save_epoch_snapshots and epoch in _epoch_snapshots:
        print(f"SAVING MODEL AT EPOCH SNAPSHOT: {epoch}")
        epoch_label = EPOCH_SNAPSHOTS_LABELS_BY_NUM.get(epoch, f"epoch_{epoch}")

        # Save model as native pytorch binary. torch.save serialises the
        # state dict immediately, so no defensive copy is needed first.
        new_model_repo_path = f"FND_GRU_RNN_{_run_setting_label}_{epoch_label}_{epoch}_{RUN_SUFFIX}"
        print(f"Saving model with path: {new_model_repo_path}")
        torch.save(model.state_dict(), f"./{new_model_repo_path}.pt")

### Upload all .pt files to S3

In [None]:
# Copy every .pt file found under the current directory to the S3 bucket.
# The command refers to S3_BUCKET and S3_ENDPOINT, which are not defined in
# this notebook - presumably they come from the shell environment; confirm.
# NOTE(review): the `{}` placeholder and `$NAME` tokens are also subject to
# IPython's own variable expansion on `!` lines; verify this still reaches the
# shell unchanged before relying on it.
if STORE_RESULTS_S3 == 1:
    !find ./ -name "*.pt" | xargs -I{} aws s3 cp ./{} s3://$S3_BUCKET/ --endpoint-url=$S3_ENDPOINT

### Upload all .pt files to the Hugging Face Hub as generic artifacts

In [None]:
# Upload every .pt file in the working directory to its own private
# Hugging Face repository named after the file.
if SAVE_EPOCH_SNAPSHOTS:
    hfAPI = HfApi()
    for filename in glob.glob('./*.pt', recursive=False):
        # splitext removes only the trailing extension, so a '.pt' elsewhere
        # in the name cannot truncate the repository name.
        filename_base, _ = os.path.splitext(os.path.basename(filename))
        try:
            print(f"Uploading model {filename_base}")
            # exist_ok=True lets this cell be re-run after a partial upload
            # without failing on repositories that already exist.
            repo = hfAPI.create_repo(f"eprovidel/{filename_base}", private=True, exist_ok=True)
            hfAPI.upload_file(path_or_fileobj=filename, path_in_repo=f"{filename_base}.pt", repo_id=repo.repo_id)
        except Exception as exc:
            # Best effort: report the failure and carry on with the other files.
            print(f"Upload of {filename_base} failed: {exc!r}")

## Save results to an Excel file and, optionally, upload it to S3

In [None]:
if PAPERMILL:
    # Store results in Excel file
    EXCEL_OUTPUT = f"FND_GRU_RNN_{EXPERIMENT_SUFFIX}_{RUN_SETTING}_RUN_{RUN_SUFFIX}.xlsx"
    # !python send_bot.py f"{EXCEL_OUTPUT} ready!"
    run_results.to_excel(f"{XLS_RESULTS_FOLDER}/{EXCEL_OUTPUT}")

    if STORE_RESULTS_S3 == 1:
        print(f"Uploading {EXCEL_OUTPUT} to S3")
        # Upload results to S3. Depends on environment variables.
        # We use $$ to force environment variable and to be able to combine it with local variable {EXCEL_OUTPUT}
        !aws s3 cp ./{EXCEL_OUTPUT} s3://$$S3_BUCKET/ --endpoint-url=$$S3_ENDPOINT    

### Remove .pt files to avoid filling the disk (TODO: not implemented; the cell below only prints an end marker)

In [None]:
# Marks the end of the run in the notebook output.
print("End")