# Onclusive Machine Learning Challenge
## Build an ML system to verify the veracity of claims in dataset PUBHEALTH.
### Outlining the steps and demonstrate the performance of the final model.  

In [3]:
import numpy as np
import pandas as pd
from datasets import load_dataset, load_metric
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, DataCollatorWithPadding
import torch
import mlflow
import gc

In [2]:
# torch.cuda.is_available = lambda : False
# device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [4]:
dataset = load_dataset("health_fact")

Using custom data configuration default
Reusing dataset health_fact (C:\Users\david\.cache\huggingface\datasets\health_fact\default\1.1.0\99503637e4255bd805f84d57031c18fe4dd88298f00299d56c94fc59ed68ec19)


  0%|          | 0/3 [00:00<?, ?it/s]

In [5]:
dataset = dataset.filter(lambda example: example['label'] != -1)

Loading cached processed dataset at C:\Users\david\.cache\huggingface\datasets\health_fact\default\1.1.0\99503637e4255bd805f84d57031c18fe4dd88298f00299d56c94fc59ed68ec19\cache-41180d96da5bd5f3.arrow
Loading cached processed dataset at C:\Users\david\.cache\huggingface\datasets\health_fact\default\1.1.0\99503637e4255bd805f84d57031c18fe4dd88298f00299d56c94fc59ed68ec19\cache-e37781bb9e63676d.arrow
Loading cached processed dataset at C:\Users\david\.cache\huggingface\datasets\health_fact\default\1.1.0\99503637e4255bd805f84d57031c18fe4dd88298f00299d56c94fc59ed68ec19\cache-bd791c5953b7d1ab.arrow


In [6]:
tokenizer = AutoTokenizer.from_pretrained("allenai/longformer-base-4096")
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [7]:
def tokenize_function(examples):
    return tokenizer(examples["main_text"], truncation=True)  # , padding="max_length"

tokenized_datasets = dataset.map(tokenize_function, batched=True)



  0%|          | 0/10 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

In [8]:
small_train_dataset = tokenized_datasets["train"].shuffle(seed=42).select(range(100))
small_eval_dataset = tokenized_datasets["test"].shuffle(seed=42).select(range(100))

In [9]:
model = AutoModelForSequenceClassification.from_pretrained("allenai/longformer-base-4096",num_labels=4)

Some weights of the model checkpoint at allenai/longformer-base-4096 were not used when initializing LongformerForSequenceClassification: ['lm_head.decoder.weight', 'lm_head.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.bias']
- This IS expected if you are initializing LongformerForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing LongformerForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of LongformerForSequenceClassification were not initialized from the model checkpoint at allenai/longformer-base-4096 and are newly initialized: ['classifier.out_proj.bias', 'classifier.out_proj.weig

In [10]:
torch.cuda.memory_allocated()/1024**3,torch.cuda.memory_reserved()/1024**3

(0.0, 0.0)

In [11]:
# del model
# gc.collect()
# torch.cuda.memory_allocated()/1024**3,torch.cuda.memory_reserved()/1024**3

In [12]:
# device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
# device = torch.device("cpu")
# model.to(device)

In [13]:
metric = load_metric("f1")
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

In [14]:
batch_size = 1
training_args = TrainingArguments(
    output_dir="models/long",
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    fp16=True
    )

In [15]:
trainer = Trainer(
    model=model,  # .half()
    args=training_args,
    train_dataset=small_train_dataset,
    eval_dataset=small_eval_dataset,
    # train_dataset=tokenized_datasets["train"],
    # eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

Using amp fp16 backend


In [16]:
torch.cuda.memory_allocated()/1024**3,torch.cuda.memory_reserved()/1024**3

(0.5545644760131836, 0.609375)

In [17]:
old_collator = trainer.data_collator
trainer.data_collator = lambda data: dict(old_collator(data))

In [18]:
mlflow.end_run()
print(torch.cuda.memory_allocated()/1024**3,torch.cuda.memory_reserved()/1024**3)
torch.cuda.empty_cache()
print(torch.cuda.memory_allocated()/1024**3,torch.cuda.memory_reserved()/1024**3)

0.5545644760131836 0.609375
0.5545644760131836 0.609375


In [19]:
trainer.train()
# print(torch.cuda.memory_allocated()/1024**3,torch.cuda.memory_reserved()/1024**3)

The following columns in the training set  don't have a corresponding argument in `LongformerForSequenceClassification.forward` and have been ignored: subjects, claim_id, sources, date_published, claim, fact_checkers, explanation, main_text.
***** Running training *****
  Num examples = 100
  Num Epochs = 3
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 1
  Gradient Accumulation steps = 1
  Total optimization steps = 300


  0%|          | 0/300 [00:00<?, ?it/s]

Initializing global attention on CLS token...
Input ids are automatically padded from 1697 to 2048 to be a multiple of `config.attention_window`: 512
Initializing global attention on CLS token...
Input ids are automatically padded from 596 to 1024 to be a multiple of `config.attention_window`: 512
Initializing global attention on CLS token...
Input ids are automatically padded from 1886 to 2048 to be a multiple of `config.attention_window`: 512
Initializing global attention on CLS token...
Input ids are automatically padded from 2080 to 2560 to be a multiple of `config.attention_window`: 512


RuntimeError: CUDA out of memory. Tried to allocate 62.00 MiB (GPU 0; 6.00 GiB total capacity; 5.19 GiB already allocated; 0 bytes free; 5.29 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [7]:
trainer.save_model()

NameError: name 'trainer' is not defined

In [23]:
# pipe = pipeline("text-classification",model="./test_trainer/")

loading configuration file ./test_trainer/config.json
Model config DistilBertConfig {
  "_name_or_path": "distilbert-base-uncased",
  "activation": "gelu",
  "architectures": [
    "DistilBertForSequenceClassification"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3"
  },
  "initializer_range": 0.02,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2,
    "LABEL_3": 3
  },
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "problem_type": "single_label_classification",
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "torch_dtype": "float32",
  "transformers_version": "4.11.3",
  "vocab_size": 30522
}

loading configuration file ./test_trainer/config.json
Model config DistilBertConfig {
  "_name_or_path": "di

In [24]:
pred = pipe(dataset['test']['main_text'])

Disabling tokenizer parallelism, we're using DataLoader multithreading already
