## Main Analysis Notebook

This notebook covers all pre-processing of the raw data and the analysis with HuggingFace.

### Pure HF Training

In [None]:
#!conda install -c conda-forge datasets evaluate ipykernel jupyter jupyterlab keras nb_conda_kernels openpyxl pytorch scikit-learn transformers tqdm wandb
#!ipython kernel install --user --name=cc2
#!pip install transformers -U
#!pip install tokenizers==0.12.1 #maybe

In [None]:
# TODO 1: Implement negex benchmark 
# TODO 2: Implement word overlap benchmark
# TODO 3: Implement benchmarks based on Vader (polarity detection)

# TODO 4: Implement final evaluation (all test stuff)


In [1]:
!pwd

/oak/stanford/groups/rbaltman/dnsosa/covid_lit_contra_claims/notebooks


In [2]:
import torch

# Use the GPU whenever PyTorch can see one; otherwise run on the CPU.
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)
print(device)

  from .autonotebook import tqdm as notebook_tqdm


cuda


In [4]:
import os

import torch
import wandb

from datasets import load_dataset, Dataset, DatasetDict
from torch.utils.data import DataLoader
from transformers import AdamW, AutoModelForSequenceClassification, AutoTokenizer, DataCollatorWithPadding, get_scheduler,  Trainer, TrainingArguments
from tqdm.notebook import tqdm

import evaluate


print("Packages loaded.")

# Seed PyTorch up front so the shuffled DataLoader order and any randomly
# initialised weights are repeatable across "Restart & Run All".
SEED = 42
torch.manual_seed(SEED)

# All input corpora are addressed relative to the repository root, i.e. the
# parent of the directory this notebook is run from.
root_dir = os.path.abspath("..")
mednli_train_path = os.path.join(root_dir, 'input/mednli/mli_train_v1.jsonl')
mednli_dev_path = os.path.join(root_dir, 'input/mednli/mli_dev_v1.jsonl')
mednli_test_path = os.path.join(root_dir, 'input/mednli/mli_test_v1.jsonl')
mancon_xml_path = os.path.join(root_dir, 'input/manconcorpus/ManConCorpus.xml')
roam_path = os.path.join(root_dir, 'input/cord-training/Roam_annotations_trainvaltest_split_V2.xlsx')

# Corpus to train on and the name of its validation split.
in_dataset = "mednli"
val_set_name = "val"
#val_set_mapper[{"multinli": "validation_matched"}]

# Hyperparameters and bookkeeping values; the same dict is recorded as the
# WandB run configuration.
config = dict(
    truncation = True,
    mancon_neutral_frac = 1,
    train_val_frac = 0.8,
    num_epochs = 8,
    batch_size = 8,
    wandb_log_interval = 10,
    dataset = in_dataset,
    learning_rate = 3e-5
)

# The notebook name has to be in the environment *before* wandb.init() runs.
# It is set through os.environ because `%env "NAME" "value"` keeps the quote
# characters as part of the variable name, so WandB never sees it.
os.environ["WANDB_NOTEBOOK_NAME"] = "Main CC Pipeline Analysis Notebook"
wandb.init(project='Contra Claims 10_22', config=config)

print("WandB initialized.")


    
# Pretrained checkpoint to fine-tune; the alternatives are kept for quick swapping.
checkpoint = "allenai/biomed_roberta_base"
#checkpoint = "bert-base-uncased"
#checkpoint = "microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext"
#checkpoint = "gsarti/biobert-nli"

tokenizer = AutoTokenizer.from_pretrained(checkpoint)

print("Tokenizer loaded.")

# NOTE(review): preprocess_nli_corpus_for_pytorch is neither defined nor imported
# in the visible part of this notebook -- presumably it came from a cell that is
# no longer here; confirm and import it explicitly.
tokenized_datasets = preprocess_nli_corpus_for_pytorch(
    config['dataset'],
    tokenizer=tokenizer,
    truncation=config['truncation'],
)

print(f"{in_dataset} tokenized.")

# Collator handed to the DataLoader to assemble (and pad) each batch.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# NOTE: Change from 100
# Only the first 1000 training examples are used here.
train_subset = tokenized_datasets["train"].select(range(1000))
train_dataloader = DataLoader(
    train_subset,
    shuffle=True,
    batch_size=config['batch_size'],
    collate_fn=data_collator,
)

# The dedicated validation loader is disabled; evaluation below therefore runs
# on the very same (shuffled) training subset.
#eval_dataloader = DataLoader(
#    tokenized_datasets[val_set_name], batch_size=config['batch_size'], collate_fn=data_collator
#)
eval_dataloader = train_dataloader

# Three-label sequence classifier built from the selected checkpoint.
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=3)
optimizer = AdamW(model.parameters(), lr=config['learning_rate'])
wandb.watch(model, log_freq=100)


print("Model loaded.")

# Train on the GPU when one is available, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

print(f"Using device {device}.")

# One scheduler step is taken per optimizer step, so the schedule spans
# (number of epochs) x (batches per epoch) steps, with no warm-up.
batches_per_epoch = len(train_dataloader)
num_training_steps = config['num_epochs'] * batches_per_epoch
lr_scheduler = get_scheduler(
    name="linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

#progress_bar = tqdm(range(num_training_steps))

print("Beginning training...")
print(f"# Epochs: {config['num_epochs']}")
model.train()

# Log the training loss to WandB once every `log_every` batches.
log_every = config['wandb_log_interval']

#for epoch in range(config['num_epochs']):
for epoch in tqdm(range(config['num_epochs'])):
    for batch_idx, batch in enumerate(train_dataloader):
        # Move every tensor of the collated batch onto the training device.
        batch = {name: tensor.to(device) for name, tensor in batch.items()}

        # Forward pass; the loss is read from the model output
        # (assumes the batch carries labels -- TODO confirm).
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()

        # Parameter update, schedule advance, then clear gradients for the next batch.
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        #progress_bar.update(1)

        if batch_idx % log_every != 0:
            continue
        wandb.log({"epoch": epoch, "training_loss": loss})

print("Training complete.")
print("Beginning evaluation...")

# Accuracy plus three metrics that are macro-averaged when computed below.
acc_metric = evaluate.load('accuracy')
f1_metric = evaluate.load('f1', average='macro')
precision_metric = evaluate.load('precision', average='macro')
recall_metric = evaluate.load('recall', average='macro')
all_metrics = (acc_metric, f1_metric, precision_metric, recall_metric)

model.eval()
#for batch_idx, batch in enumerate(eval_dataloader):
for batch_idx, batch in enumerate(tqdm(eval_dataloader)):
    batch = {name: tensor.to(device) for name, tensor in batch.items()}

    # Inference only: no gradients are tracked for the forward pass.
    with torch.no_grad():
        outputs = model(**batch)

    # Predicted class = index of the largest logit along the last dimension.
    logits = outputs.logits
    predictions = logits.argmax(dim=-1)
    for metric in all_metrics:
        metric.add_batch(predictions=predictions, references=batch["labels"])

# Accuracy seeds the results dict; the remaining metrics are merged into it.
results = acc_metric.compute()
for metric in all_metrics[1:]:
    results.update(metric.compute(average='macro'))

wandb.log(results)
#torch.onnx.export(model, batch, "model.onnx")
#wandb.save("model.onnx")

print(f"Results: {results}")
print("Evaluation complete.")



Packages loaded.


[34m[1mwandb[0m: Currently logged in as: [33mdnsosa[0m. Use [1m`wandb login --relogin`[0m to force relogin


env: "WANDB_NOTEBOOK_NAME"="Main CC Pipeline Analysis Notebook"
WandB initialized.
Tokenizer loaded.


Using custom data configuration default-7d9106e9c4160845
Found cached dataset json (/Users/dnsosa/.cache/huggingface/datasets/json/default-7d9106e9c4160845/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab)


  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/12 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

  0%|          | 0/11232 [00:00<?, ?ex/s]

  0%|          | 0/1395 [00:00<?, ?ex/s]

  0%|          | 0/1422 [00:00<?, ?ex/s]

  0%|          | 0/12 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

mednli tokenized.


Some weights of the model checkpoint at allenai/biomed_roberta_base were not used when initializing RobertaForSequenceClassification: ['lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.bias', 'roberta.pooler.dense.weight', 'roberta.pooler.dense.bias', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at allenai/biomed_roberta_base and are newly initialized: ['classi

Model loaded.
Using device cpu.
Beginning training...
# Epochs: 8




  0%|          | 0/8 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [3]:
import covid_lit_contra_claims as clcc
from covid_lit_contra_claims.data.constants import model_id_mapper
from covid_lit_contra_claims.data.DataLoader import load_train_datasets, load_additional_eval_datasets
from covid_lit_contra_claims.data.DataExperiments import prepare_training_data

from transformers import AutoTokenizer


#out_dir =
# Settings for this run.
SEED = 42
model = "biobert"
truncation = True
data_ratios = 2
train_prep_experiment = "sequential"
#train_datasets = "multinli_mednli_mancon_roam_roamAll_roamPH_roamDD_roamDDPH"
train_datasets = "roam_roamAll_roamPH"
# Evaluate on the same corpora that are used for training.
eval_datasets = train_datasets

# The tokenizer is created here because both data loading and model loading need it.
checkpoint = model_id_mapper[model]
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# Keyword arguments shared by both dataset-loading calls.
shared_load_kwargs = {"truncation": truncation, "SEED": SEED}

# Train / validation / test splits for every training corpus.
train_dataset_dict, val_dataset_dict, test_dataset_dict = load_train_datasets(
    train_datasets, tokenizer, **shared_load_kwargs
)

# Two versions of CovidNLI: One where test is a separate network from train
eval_dataset_dict = load_additional_eval_datasets(
    eval_datasets, tokenizer, **shared_load_kwargs
)

# Any input preprocessing for the various experiments is conducted next.
# Note: the data_ratio parameter is currently applied to the training data only, NOT to the val data.


====Creating roam Dataset object for train/val/test...====


  0%|                                                                                                                                                                                                                                            | 0/1 [00:01<?, ?ba/s]
  0%|                                                                                                                                                                                                                                            | 0/1 [00:00<?, ?ba/s]
  0%|                                                                                                                                                                                                                                            | 0/1 [00:00<?, ?ba/s]
Casting the dataset:   0%|                                                                                                                                                                                      

====...done.====
====Creating roamAll Dataset object for train/val/test...====


Casting the dataset:   0%|                                                                                                                                                                                                                       | 0/1 [00:00<?, ?ba/s]
Casting the dataset:   0%|                                                                                                                                                                                                                       | 0/1 [00:00<?, ?ba/s]
Casting the dataset:   0%|                                                                                                                                                                                                                       | 0/1 [00:00<?, ?ba/s]
  0%|                                                                                                                                                                                                           

====...done.====
====Creating roamPH Dataset object for train/val/test...====


Casting the dataset:   0%|                                                                                                                                                                                                                       | 0/1 [00:00<?, ?ba/s]
Casting the dataset:   0%|                                                                                                                                                                                                                       | 0/1 [00:00<?, ?ba/s]
Casting the dataset:   0%|                                                                                                                                                                                                                       | 0/1 [00:00<?, ?ba/s]
  0%|                                                                                                                                                                                                           

====...done.====
====Creating roam Dataset object for evaluation only...====


  0%|                                                                                                                                                                                                                                            | 0/1 [00:00<?, ?ba/s]
  0%|                                                                                                                                                                                                                                            | 0/1 [00:00<?, ?ba/s]
  0%|                                                                                                                                                                                                                                            | 0/1 [00:00<?, ?ba/s]
Casting the dataset:   0%|                                                                                                                                                                                      

====...done.====
====Creating roamAll Dataset object for evaluation only...====


Casting the dataset:   0%|                                                                                                                                                                                                                       | 0/1 [00:00<?, ?ba/s]
Casting the dataset:   0%|                                                                                                                                                                                                                       | 0/1 [00:00<?, ?ba/s]
Casting the dataset:   0%|                                                                                                                                                                                                                       | 0/1 [00:00<?, ?ba/s]
  0%|                                                                                                                                                                                                           

====...done.====
====Creating roamPH Dataset object for evaluation only...====


Casting the dataset:   0%|                                                                                                                                                                                                                       | 0/1 [00:00<?, ?ba/s]
Casting the dataset:   0%|                                                                                                                                                                                                                       | 0/1 [00:00<?, ?ba/s]
Casting the dataset:   0%|                                                                                                                                                                                                                       | 0/1 [00:00<?, ?ba/s]
  0%|                                                                                                                                                                                                           

====...done.====





In [4]:
# Run the selected preparation experiment over the training splits, then display the result.
prepared_train_dataset_dict = prepare_training_data(
    train_dataset_dict,
    train_prep_experiment,
    SEED=SEED,
    data_ratios=data_ratios,
)
prepared_train_dataset_dict

Sequential training data preparation. Dataset list will not be perturbed.


OrderedDict([('roam',
              Dataset({
                  features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
                  num_rows: 434
              })),
             ('roamAll',
              Dataset({
                  features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
                  num_rows: 740
              })),
             ('roamPH',
              Dataset({
                  features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
                  num_rows: 171
              }))])

In [5]:
import os

from covid_lit_contra_claims.models.Training import train_model

# Output directory handed to train_model: <repository root>/output, resolved
# relative to the notebooks/ working directory (same convention as the input
# paths earlier in this notebook) rather than a cluster-specific absolute path.
out_dir = os.path.join(os.path.abspath(".."), "output")

# Optimisation settings for this run.
epochs = 8
batch_size = 2
# NOTE(review): 1e-3 is far above the 3e-5 used for the pure-HF run earlier in
# this notebook -- confirm this is intentional.
learning_rate = 1e-3

# Everything describing the run, passed to train_model as one dict.
training_args = {'train_datasets': train_datasets,
                 'eval_datasets': eval_datasets,
                 'epochs': epochs,
                 'batch_size': batch_size,
                 'learning_rate': learning_rate,
                 'truncation': truncation,
                 'train_prep_experiment': train_prep_experiment,
                 'data_ratios': data_ratios}
trained_model, overall_results = train_model(model, tokenizer, prepared_train_dataset_dict, val_dataset_dict,
                                             training_args=training_args, out_dir=out_dir, SEED=SEED)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[34m[1mwandb[0m: Currently logged in as: [33mdnsosa[0m. Use [1m`wandb login --relogin`[0m to force relogin


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


WandB initialized.


Some weights of the model checkpoint at dmis-lab/biobert-base-cased-v1.2 were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification we

Model loaded.


NVIDIA A100-PCIE-40GB with CUDA capability sm_80 is not compatible with the current PyTorch installation.
The current PyTorch install supports CUDA capabilities sm_37 sm_50 sm_60 sm_70.
If you want to use the NVIDIA A100-PCIE-40GB GPU with PyTorch, please check the instructions at https://pytorch.org/get-started/locally/



Using device cuda.
Created a DataLoader for corpus 'roam'...
Created a learning rate scheduler for corpus 'roam'...
Beginning training...
# Epochs: 8


You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


RuntimeError: CUDA error: no kernel image is available for execution on the device
CUDA kernel errors might be asynchronously reported at some other API call,so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.

In [6]:
# Sanity check after the CUDA failure above. NOTE(review): True only means a CUDA
# device is visible; the warning above shows the installed PyTorch build can still
# lack kernels for that GPU's architecture (sm_80 here).
torch.cuda.is_available()

True

In [7]:
# Short model aliases mapped to their HuggingFace Hub checkpoint identifiers.
# NOTE(review): this rebinds the name imported from
# covid_lit_contra_claims.data.constants earlier in the notebook -- confirm the
# two definitions stay in sync.
model_id_mapper = dict(
    biobert="dmis-lab/biobert-base-cased-v1.2",
    bioclinbert="emilyalsentzer/Bio_ClinicalBERT",
    scibert="allenai/scibert_scivocab_uncased",
    pubmedbert="microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext",
    roberta="roberta-base",
)

In [15]:
import torch
import numpy as np
import wandb

from transformers import AdamW, AutoModelForSequenceClassification, DataCollatorWithPadding, get_scheduler

# Seed NumPy and PyTorch with the shared SEED.
np.random.seed(SEED)
torch.manual_seed(SEED)

model_id = "biobert"

# Extra WandB-only settings layered on top of the shared training arguments.
additional_configs = dict(
    mancon_neutral_frac=1,
    mancon_train_frac=.67,
    wandb_log_interval=10,
)
# Merge into a fresh dict: training_args.update(...) would mutate training_args
# in place and return None.
config = {**training_args, **additional_configs}
wandb.init(project='COVID Drug Contra Claims', config=config)
print("WandB initialized.")

# Resolve the alias to a checkpoint, then build the classifier and its optimizer.
# NOTE(review): `model` held the alias string "biobert" in an earlier cell and is
# rebound to the network object here.
checkpoint = model_id_mapper[model_id]
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=3)
optimizer = AdamW(model.parameters(), lr=config['learning_rate'])
wandb.watch(model, log_freq=1)
print("Model loaded.")

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


WandB initialized.


Some weights of the model checkpoint at dmis-lab/biobert-base-cased-v1.2 were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification we

NameError: name 'WANDB_LOG_FREQ' is not defined

In [16]:
# Display the merged run configuration (training arguments plus the WandB-only settings).
config

{'train_datasets': 'roam_roamAll_roamPH_roamDD_roamDDPH',
 'eval_datasets': 'roam_roamAll_roamPH_roamDD_roamDDPH',
 'epochs': 8,
 'batch_size': 2,
 'learning_rate': 0.001,
 'truncation': True,
 'train_prep_experiment': 'shuffled',
 'data_ratios': 2,
 'mancon_neutral_frac': 1,
 'mancon_train_frac': 0.67,
 'wandb_log_interval': 10}

In [14]:
# Display the base training arguments.
# NOTE(review): the saved output also lists the WandB-only keys (mancon_*,
# wandb_log_interval), so it appears to stem from an earlier in-place update --
# re-run to refresh.
training_args

{'train_datasets': 'roam_roamAll_roamPH_roamDD_roamDDPH',
 'eval_datasets': 'roam_roamAll_roamPH_roamDD_roamDDPH',
 'epochs': 8,
 'batch_size': 2,
 'learning_rate': 0.001,
 'truncation': True,
 'train_prep_experiment': 'shuffled',
 'data_ratios': 2,
 'mancon_neutral_frac': 1,
 'mancon_train_frac': 0.67,
 'wandb_log_interval': 10}