# Embedding BERT

In [None]:
# Index of the CUDA device; used below to build the "cuda:<index>" device
# string and to query the device name.
CUDA_DEVICE = 0

In [None]:
import torch
from torch import cuda
from torch.utils.data import Dataset, DataLoader
torch.__version__

In [None]:
import gc

# Collect unreachable Python objects first: any CUDA tensors they still hold
# are only handed back to PyTorch's caching allocator once they are collected,
# so emptying the cache before collecting would leave that memory reserved.
gc.collect()
torch.cuda.empty_cache()

In [None]:
# Select the compute device. The device name is only queried when CUDA is
# actually available; asking for it on a CPU-only machine raises an error.
if cuda.is_available():
    device = f"cuda:{CUDA_DEVICE}"
    print(device)
    print(torch.cuda.get_device_name(CUDA_DEVICE))
else:
    device = 'cpu'
    print(device)

In [None]:
import numpy as np
import pandas as pd
import random

from datetime import datetime
from sklearn.model_selection import train_test_split
from sklearn import metrics

In [None]:
# "Constants"
# Short tag for this notebook variant.
TAG = 'RN'
# Timestamp captured once at start-up; reused in the names of every artifact
# written by this run (model checkpoint, Excel results).
RUN_PREFIX = f"{datetime.now():%Y_%m_%d_%H_%M_%S}"

In [None]:
# PAPERMILL PARAMETERS
# NOTE(review): presumably this is the cell papermill overrides when the
# notebook is run in batch mode — confirm the cell tag before editing names.
PAPERMILL = False  # switches tqdm flavour and enables the model upload / Excel + S3 export
EXPERIMENT_NAME = "EXP-TBD"  # embedded in the names of the uploaded model and result files
RUN_SETTING = "-1"  # embedded in the names of the uploaded model and result files

# Model-specific parameters
OG_SEED=42  # seeds the NumPy draw that produces the training seed
OG_SEED_1=15  # seeds the NumPy draw that produces `seed1` (set before the data split)
EMBEDDING_SNAPSHOT = "BASE0_RN_RAW_BERT_embedding"  # suffix of the embeddings/X_*.npy and y_*.npy files
DROPOUT = 0.2  # dropout applied to the GRU summary before the linear layer
TRAIN_BATCH_SIZE = 8
VALID_BATCH_SIZE = 8  # also used for the test loader
EPOCHS = 1000
LEARNING_RATE = 0.0001  # previously tried values: 0.00001, 0.001
GRU_NUM_LAYERS = 2  # 1: single GRU, 2+: stacked GRU
BIDIRECTIONAL = True
INTERNAL_DROPOUT = 0.2  # passed as the `dropout` argument of torch.nn.GRU
UNITS = 1024  # GRU hidden size
# NOTE(review): OG_SEED_1, EMBEDDING_SNAPSHOT and GRU_NUM_LAYERS are not copied
# into the `settings` dict, so they are not recorded in the results table.

In [None]:
# Hyper-parameters of this run, keyed by name. The same mapping is read back
# when building the data loaders / model and is copied into every results row.
settings = dict(
    OG_SEED=OG_SEED,
    DROPOUT=DROPOUT,
    TRAIN_BATCH_SIZE=TRAIN_BATCH_SIZE,
    VALID_BATCH_SIZE=VALID_BATCH_SIZE,
    EPOCHS=EPOCHS,
    LEARNING_RATE=LEARNING_RATE,
    BIDIRECTIONAL=BIDIRECTIONAL,
    INTERNAL_DROPOUT=INTERNAL_DROPOUT,
    UNITS=UNITS,
)

In [None]:
# Pick the tqdm flavour: the plain one for papermill runs, the auto-detecting
# one otherwise. The choice is announced before the import happens.
print("Importing plain tqdm" if PAPERMILL else "Importing auto tqdm")
if PAPERMILL:
    from tqdm import tqdm
else:
    from tqdm.auto import tqdm

In [None]:
# Derive the seed used before the data split: seed NumPy with OG_SEED_1 and
# draw a single integer in [0, 42069).
np.random.seed(OG_SEED_1)
(seed1,) = np.random.randint(0, 42069, size=1)
seed1

In [None]:
# Seed every RNG in use (NumPy, PyTorch, stdlib random) with the derived seed.
print(f"Seed: {seed1}")
np.random.seed(seed1)
torch.manual_seed(seed1)
# `seed1` is a NumPy integer; random.seed() only accepts built-in types
# (int, float, str, bytes, bytearray, None) on Python >= 3.11 and raises
# TypeError otherwise, so convert explicitly.
random.seed(int(seed1))

In [None]:
# Reset both arrays first so a failed load cannot leave stale data behind.
X, y = None, None

print(f"Loading Xy from snapshot {EMBEDDING_SNAPSHOT}")

# Features and labels live in two sibling .npy files keyed by the snapshot name.
X = np.load(f"embeddings/X_{EMBEDDING_SNAPSHOT}.npy")
y = np.load(f"embeddings/y_{EMBEDDING_SNAPSHOT}.npy")

# Sanity checks: both arrays exist and hold one label row per example.
assert X is not None
assert y is not None
assert len(X) == len(y)

print(f"Loaded Xy from snapshot: {EMBEDDING_SNAPSHOT}")
print(X.shape, y.shape, sep="\n")

In [None]:
# Split into train / validation / test sets: first hold out 15% of the data as
# the test set, then hold out 15% of the remainder as the validation set.
X_train_val, X_test, y_train_val, y_test = train_test_split(
    X,
    y,
    test_size=0.15,
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val,
    y_train_val,
    test_size=0.15,
)

In [None]:
class CustomDataset(Dataset):
    """Pair each example with the class index of its one-hot label row.

    Items are returned as ``{'example': tensor, 'label': long tensor}``, where
    the label is the arg-max position of the corresponding row in ``labels``.
    """

    def __init__(self, examples, labels):
        # Both containers must be aligned index by index.
        assert len(examples) == len(labels)
        self.examples, self.labels = examples, labels

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, index):
        # Collapse the one-hot encoded row into a categorical label.
        one_hot_row = self.labels[index]
        class_index = np.argmax(one_hot_row)

        return dict(
            example=torch.tensor(self.examples[index]),
            label=torch.tensor(class_index, dtype=torch.long),
        )

In [None]:
# Read the dataset dimensions from the loaded arrays. The exact-length
# unpacking also fails fast if X is not 3-dimensional or y is not 2-dimensional.
_, tree_max_num_seq, emb_size = X.shape
_, num_categories = y.shape

In [None]:
# Report the dimensions extracted from the loaded arrays.
for _caption, _value in (
    ("tree_max_num_seq: ", tree_max_num_seq),
    ("emb size: ", emb_size),
    ("num_categories: ", num_categories),
):
    print(_caption, _value)

In [None]:
# Shapes of the combined train+validation partition, one item per line.
print(
    "Train+val shapes",
    f"X train+val: {X_train_val.shape}",
    f"y train+val: {y_train_val.shape}",
    sep="\n",
)

In [None]:
# Shapes of the training partition, one item per line.
print(
    "Train shapes",
    f"X train: {X_train.shape}",
    f"y train: {y_train.shape}",
    sep="\n",
)

In [None]:
# Shapes of the validation partition, one item per line.
print(
    "Val shapes",
    f"X val: {X_val.shape}",
    f"y val: {y_val.shape}",
    sep="\n",
)

In [None]:
# Shapes of the test partition, one item per line.
print(
    "Test shapes",
    f"X test: {X_test.shape}",
    f"y test: {y_test.shape}",
    sep="\n",
)

In [None]:
# Keyword arguments for the three DataLoaders.
# Only the training loader shuffles: shuffling gives each epoch a different
# batch order, which is what SGD-style training wants.
train_params = {
    'batch_size': settings["TRAIN_BATCH_SIZE"],
    'shuffle': True,
    'num_workers': 0
}

# Evaluation loaders keep a fixed order. Shuffling them brings no benefit,
# consumes global RNG state, and changes which samples fall into the (possibly
# smaller) last batch, which makes per-batch-averaged losses vary between
# evaluations of the very same model.
validate_params = {
    'batch_size': settings["VALID_BATCH_SIZE"],
    'shuffle': False,
    'num_workers': 0
}

# The test loader reuses the validation batch size.
test_params = {
    'batch_size': settings["VALID_BATCH_SIZE"],
    'shuffle': False,
    'num_workers': 0
}

## BI-GRU torch

In [None]:
class FND_BI_GRU(torch.nn.Module):
    """(Bidirectional) GRU for fake news classification.

    Pytorch reimplementation of model in Providel&Mendoza (2020).

    The network is a (stacked, optionally bidirectional) GRU whose final
    hidden state is passed through dropout and a linear layer that emits a
    single logit per example (to be paired with ``BCEWithLogitsLoss``).

    Args:
        _tree_max_num_seq: Sequence length of the inputs. Accepted for
            interface compatibility; the GRU does not need it.
        _emb_size: Size of each input embedding vector.
        _num_categories: Number of label categories. Stored as
            ``output_size``; the head itself always emits one logit.
        _units: GRU hidden size.
        _dropout: Dropout probability applied to the GRU summary vector.
        _num_layers: Number of stacked GRU layers. Defaults to the module
            level ``GRU_NUM_LAYERS`` when ``None``.
        _bidirectional: Whether the GRU is bidirectional. Defaults to the
            module level ``BIDIRECTIONAL`` when ``None``.
        _internal_dropout: Dropout between stacked GRU layers. Defaults to
            the module level ``INTERNAL_DROPOUT`` when ``None``.
    """

    def __init__(
        self,
        _tree_max_num_seq,
        _emb_size,
        _num_categories,
        _units=200,
        _dropout=0.1,
        _num_layers=None,
        _bidirectional=None,
        _internal_dropout=None,
    ):
        super().__init__()

        # Fall back to the notebook-level hyper-parameters only when the
        # caller did not provide an explicit value.
        num_layers = GRU_NUM_LAYERS if _num_layers is None else _num_layers
        bidirectional = BIDIRECTIONAL if _bidirectional is None else _bidirectional
        internal_dropout = (
            INTERNAL_DROPOUT if _internal_dropout is None else _internal_dropout
        )

        self.input_size = _emb_size
        self.hidden_size = _units
        self.num_layers = num_layers
        self.output_size = _num_categories
        self.bidirectional = bidirectional
        self.bidirectional_factor = 2 if self.bidirectional else 1

        self.gru = torch.nn.GRU(
            input_size=self.input_size,
            hidden_size=self.hidden_size,
            num_layers=self.num_layers,
            batch_first=True,
            bidirectional=self.bidirectional,
            # Inter-layer dropout is only defined between stacked layers;
            # PyTorch warns if it is non-zero for a single-layer GRU.
            dropout=internal_dropout if self.num_layers > 1 else 0.0,
        )
        self.dropout = torch.nn.Dropout(_dropout)
        self.fc = torch.nn.Linear(
            self.hidden_size * self.bidirectional_factor, 1
        )

    def forward(self, x):
        """Return one logit per example for a batch-first input ``x``."""
        # The initial hidden state defaults to zeros when omitted.
        _, h_n = self.gru(x)

        # Summarise the sequence with the final hidden state of the top layer.
        # For a bidirectional GRU, the output at the last time step contains
        # the backward direction after it has seen only ONE element, so the
        # final states of both directions are concatenated instead: h_n[-2]
        # is the top-layer forward state, h_n[-1] the top-layer backward state.
        if self.bidirectional:
            summary = torch.cat((h_n[-2], h_n[-1]), dim=-1)
        else:
            summary = h_n[-1]

        return self.fc(self.dropout(summary))

## Loss function

In [None]:
# Binary cross-entropy on raw logits; the train/validate functions feed it the
# model's single output logit and the labels cast to float.
loss_fn = torch.nn.BCEWithLogitsLoss()

## Train & Validation

In [None]:
def train(epoch, model, training_loader, optimizer=None, scheduler=None):
    """Run one pass over ``training_loader`` and return the mean batch loss.

    Args:
        epoch: Current epoch index (not used inside the function).
        model: Module called on each batch's ``"example"`` tensor.
        training_loader: Iterable of dicts with ``"example"`` and ``"label"``.
        optimizer: Optional optimizer; zeroed before and stepped after each
            backward pass when given.
        scheduler: Optional scheduler; stepped once per batch when given.

    Returns:
        The sum of the per-batch losses divided by the number of batches.
    """
    model.train()
    running_loss = 0
    num_batches = 0

    for batch in tqdm(training_loader, total=len(training_loader)):
        # Reset the gradients so they do not accumulate across batches.
        if optimizer:
            optimizer.zero_grad(set_to_none=True)

        # Forward pass: one logit per example, flattened to a 1-D tensor.
        features = batch["example"].to(device)
        labels = batch["label"].to(device)
        predictions = model(features).squeeze(1)

        # Loss computation and back-propagation.
        batch_loss = loss_fn(predictions, labels.float())
        batch_loss.backward()

        if optimizer:
            optimizer.step()

        if scheduler:
            scheduler.step()

        # Here we could add support for weighted loss.
        running_loss += batch_loss.item()
        num_batches += 1

    return running_loss / num_batches

In [None]:
def validate(model, testing_loader):
    """Evaluate ``model`` on ``testing_loader`` without tracking gradients.

    Args:
        model: Module called on each batch's ``"example"`` tensor; expected
            to return one logit per example in a trailing dimension of size 1.
        testing_loader: Sized iterable of dicts with ``"example"`` and
            ``"label"`` tensors.

    Returns:
        A tuple ``(mean_loss, probabilities, targets)`` where ``mean_loss`` is
        the loss averaged over all samples, ``probabilities`` is a NumPy array
        with the sigmoid of each logit, and ``targets`` is a NumPy array with
        the corresponding labels.

    Raises:
        ValueError: If the loader yields no samples.
    """
    model.eval()
    total_loss = 0.0
    num_samples = 0
    batch_probabilities = []
    batch_targets = []

    with torch.no_grad():
        for data in tqdm(testing_loader, total=len(testing_loader)):
            x = data["example"].to(device)
            targets = data["label"].to(device)

            logits = model(x).squeeze(1)
            loss = loss_fn(logits, targets.float())

            # loss_fn returns the mean over the batch (its default reduction),
            # so weight it by the batch size: averaging the batch means
            # directly would over-weight a smaller final batch.
            batch_size = targets.size(0)
            total_loss += loss.item() * batch_size
            num_samples += batch_size

            # The model emits raw logits; convert them to probabilities here.
            batch_probabilities.append(torch.sigmoid(logits).cpu().numpy())
            batch_targets.append(targets.cpu().numpy())

    if num_samples == 0:
        raise ValueError("validate() received a loader with no samples")

    return (
        total_loss / num_samples,
        np.concatenate(batch_probabilities),
        np.concatenate(batch_targets),
    )

## Training

In [None]:
# Derive the training seed: seed NumPy with the configured OG_SEED and draw a
# single integer in [0, 42069).
np.random.seed(settings["OG_SEED"])
(seed,) = np.random.randint(0, 42069, size=1)
seed

## Make dataframe to tabulate results

In [None]:
# Columns of the results table: every setting name first, then run
# bookkeeping, then the training, validation and test metrics.
column_names = [
    *settings,
    "seed",
    "epoch",
    "train_loss",
    "val_loss",
    "val_accuracy",
    "val_f1_score_micro",
    "val_f1_score_macro",
    "test_loss",
    "test_accuracy",
    "test_f1_score_micro",
    "test_f1_score_macro",
    "test_f1_score_label0",
    "test_f1_score_label1",
]
column_names

In [None]:
# Empty results table with the columns defined above; one row per epoch is
# added to it by the training loop.
run_results = pd.DataFrame(columns=column_names)
run_results

In [None]:
from copy import deepcopy

In [None]:
# Best-so-far trackers. Only the validation loss drives model selection; the
# accuracy / F1 trackers are kept as placeholders.
best_valid_loss = float("inf")
best_valid_accuracy = 0
best_valid_f1_macro = 0
best_val_loss_epoch = None
best_model_state = None

_epochs = settings["EPOCHS"]
_learning_rate = settings["LEARNING_RATE"]
_units = settings["UNITS"]
_dropout = settings["DROPOUT"]


print(f"Seed: {seed}")
np.random.seed(seed)
torch.manual_seed(seed)
# `seed` is a NumPy integer; random.seed() raises TypeError for non built-in
# seed types on Python >= 3.11, so convert explicitly.
random.seed(int(seed))

training_set = CustomDataset(X_train, y_train)
training_loader = DataLoader(training_set, **train_params)

validation_set = CustomDataset(X_val, y_val)
validation_loader = DataLoader(validation_set, **validate_params)

test_set = CustomDataset(X_test, y_test)
testing_loader = DataLoader(test_set, **test_params)

# Build a fresh (untrained) model
print("Model")
model = FND_BI_GRU(
    tree_max_num_seq, emb_size, num_categories, _units=_units, _dropout=_dropout
)
print(model)

model.to(device)
print("Cargado correctamente para GPU")

# Alternative optimizer kept for reference:
# optimizer = torch.optim.AdamW(
#     params=model.parameters(),
#     lr=_learning_rate,
#     weight_decay=0.001
# )
optimizer = torch.optim.Adagrad(
    params=model.parameters(),
    lr=_learning_rate,
    weight_decay=0.01
)

scheduler = None

# Per-epoch rows are collected in a plain list and turned into a DataFrame
# once at the end: concatenating onto a DataFrame every epoch copies the whole
# table each time and concatenating onto an empty frame is deprecated in pandas.
results_rows = []

try:
    for epoch in range(_epochs):
        print(f"===== EPOCH {epoch} / SEED {seed}")

        # ##################################################################################
        # Train
        train_loss = train(epoch, model, training_loader, optimizer, scheduler)

        # ##################################################################################
        # Validation
        val_loss, val_outputs, val_targets = validate(model, validation_loader)
        # Probabilities are thresholded at 0.5 to obtain binary predictions.
        val_outputs_bin = (val_outputs >= 0.5)

        val_accuracy = metrics.accuracy_score(val_targets, val_outputs_bin)
        val_f1_score_micro = metrics.f1_score(
            val_targets, val_outputs_bin, average="micro"
        )
        val_f1_score_macro = metrics.f1_score(
            val_targets, val_outputs_bin, average="macro"
        )

        # Keep a copy of the weights whenever the validation loss improves.
        if best_valid_loss > val_loss:
            best_valid_loss = val_loss
            best_val_loss_epoch = epoch
            print(f"Best val loss: {best_valid_loss} at epoch {epoch} -- saving best model state")
            best_model_state = deepcopy(model.state_dict())

        print(f"Accuracy Score = {val_accuracy}")
        print(f"F1 Score (Micro) = {val_f1_score_micro}")
        print(f"F1 Score (Macro) = {val_f1_score_macro}")
        print(f"Train loss: {train_loss}\t Validation loss:{val_loss}")

        # ##################################################################################
        # Test (logged every epoch; it does not influence model selection)
        test_loss, test_outputs, test_targets = validate(model, testing_loader)
        test_outputs_bin = (test_outputs >= 0.5)
        test_accuracy = metrics.accuracy_score(test_targets, test_outputs_bin)
        test_f1_score_micro = metrics.f1_score(test_targets, test_outputs_bin, average='micro')
        test_f1_score_macro = metrics.f1_score(test_targets, test_outputs_bin, average='macro')

        # Per-class F1, treating each label in turn as the positive class.
        test_f1_score_macro_binary0 = metrics.f1_score(test_targets, test_outputs_bin, average='binary', pos_label=0)
        test_f1_score_macro_binary1 = metrics.f1_score(test_targets, test_outputs_bin, average='binary', pos_label=1)

        print(f"TEST Accuracy Score = {test_accuracy}")
        print(f"TEST F1 Score (Micro) = {test_f1_score_micro}")
        print(f"TEST F1 Score (Macro) = {test_f1_score_macro}")
        print(f"TEST loss: {test_loss}")

        results_rows.append({
            **settings,
            'seed': seed,
            'epoch': epoch,
            'train_loss': train_loss,
            'val_loss': val_loss,
            'val_accuracy': val_accuracy,
            'val_f1_score_micro': val_f1_score_micro,
            'val_f1_score_macro': val_f1_score_macro,
            'test_loss': test_loss,
            'test_accuracy': test_accuracy,
            'test_f1_score_micro': test_f1_score_micro,
            'test_f1_score_macro': test_f1_score_macro,
            'test_f1_score_label0': test_f1_score_macro_binary0,
            'test_f1_score_label1': test_f1_score_macro_binary1,
        })
finally:
    # Runs even if training is interrupted, so completed epochs are not lost.
    if results_rows:
        _new_results = pd.DataFrame(results_rows, columns=column_names)
        # Rows already present (e.g. from re-running this cell) are kept.
        if run_results.empty:
            run_results = _new_results
        else:
            run_results = pd.concat([run_results, _new_results], ignore_index=True)

## Save model to the Hugging Face Hub as a generic artifact

# Only batch (papermill) runs publish the model.
if PAPERMILL:
    # Store model in the Hugging Face Hub
    from huggingface_hub import HfApi
    # Repository / file name encodes experiment, best epoch, setting and run timestamp.
    new_model_repo_path = f"GRURNN-{EXPERIMENT_NAME}_EPOCH_{best_val_loss_epoch}_{RUN_SETTING}_{RUN_PREFIX}"
    new_model_repo_path
    # Save the weights captured at the best validation loss to a local file.
    # NOTE(review): best_model_state is still None if no epoch improved the
    # validation loss (e.g. zero epochs) — confirm whether that needs a guard.
    torch.save(best_model_state, f"./{new_model_repo_path}.pt")
    hfAPI = HfApi()
    # Create the repository, then upload the checkpoint file into it.
    repo = hfAPI.create_repo(f"eprovidel/{new_model_repo_path}")
    hfAPI.upload_file(path_or_fileobj=f"./{new_model_repo_path}.pt", path_in_repo=f"{new_model_repo_path}.pt", repo_id=repo.repo_id)

In [None]:
# Drop the references to the training objects, then ask PyTorch to release
# its cached CUDA memory.
del model, optimizer, scheduler
torch.cuda.empty_cache()

In [None]:
run_results

In [None]:
%env AWS_ACCESS_KEY_ID=0046bfa75805a4a0000000001
%env AWS_SECRET_ACCESS_KEY=K004HXx/xI+XSXFYYQAFY8UY+jsx5Yk
%env S3_ENDPOINT=https://s3.us-west-004.backblazeb2.com
%env S3_BUCKET=multitask-fnd

In [None]:
# Only batch (papermill) runs export and upload the results table.
if PAPERMILL:
    # Store results in Excel file; the name encodes experiment, setting and run timestamp.
    EXCEL_OUTPUT = f"{EXPERIMENT_NAME}_SETTING_{RUN_SETTING}_RUN_{RUN_PREFIX}.xlsx"
    run_results.to_excel(EXCEL_OUTPUT)

    print(f"Uploading {EXCEL_OUTPUT} to S3")

    # Upload results to S3. Depends on the S3_BUCKET / S3_ENDPOINT environment
    # variables (and on AWS credentials being available to the aws CLI).
    # We use $$ so the shell expands the environment variable, while {…} is
    # filled in from the notebook's Python variables.
    !aws s3 cp ./{EXCEL_OUTPUT} s3://$$S3_BUCKET/ --endpoint-url=$$S3_ENDPOINT
    # NOTE(review): presumably output_<experiment>_<setting>.txt is a log file
    # written by the process that launches this notebook — confirm it exists.
    !aws s3 cp ./output_{EXPERIMENT_NAME}_{RUN_SETTING}.txt s3://$$S3_BUCKET/ --endpoint-url=$$S3_ENDPOINT

In [None]:
print("End")