In [None]:
!pip install pandas
!pip install datasets
!pip install transformers
!pip install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu113

## Preparing our Data

In this notebook, we'll start by using a local dataset (instead of using a dataset stored at Hugging Face).
Let's load data for our classification task.

### Loading dataset

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# Importing the dataset
oparticles = pd.read_excel('/content/drive/Shareddrives/PLN/Assignment 2/data/OpArticles.xlsx')
oparticles = oparticles.drop(columns=['article_id', 'title', 'authors', 'meta_description','keywords', 'publish_date', 'url_canonical'])
oparticles = oparticles.rename(columns={'body': 'tokens', 'topics':'label'})

le = LabelEncoder()
oparticles['label'] = le.fit_transform(oparticles['label'])

print(oparticles.info())
print(oparticles.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 373 entries, 0 to 372
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   tokens  373 non-null    object
 1   label   373 non-null    int64 
dtypes: int64(1), object(1)
memory usage: 6.0+ KB
None
                                              tokens  label
0  O poeta espanhol António Machado escrevia, uns...      6
1  “O mais excelente quadro posto a uma luz logo ...      6
2  1. As sociedades humanas parecem ser regidas p...      6
3  Este foi um Mundial incrível. Vimos actuações ...      6
4  O futebol sempre foi um jogo aparentemente sim...      6


In [None]:
adus = pd.read_excel('/content/drive/Shareddrives/PLN/Assignment 2/data/OpArticles_ADUs.xlsx')
adus = adus.drop(columns=['article_id', 'annotator', 'node','ranges'])
adus['label'].replace(['Value', 'Value(+)', 'Value(-)', 'Fact', 'Policy'],[0,1,2,3,4], inplace=True)

print(adus.info())
print(adus.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16743 entries, 0 to 16742
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   tokens  16743 non-null  object
 1   label   16743 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 261.7+ KB
None
                                              tokens  label
0           O facto não é apenas fruto da ignorância      0
1  havia no seu humor mais jornalismo (mais inves...      0
2                              É tudo cómico na FIFA      0
3  o que todos nós permitimos que esta organizaçã...      0
4            não nos fazem rir à custa dos poderosos      0


For ease of usage with Transformer models, we convert the dataset into a Hugging Face dataset and split it into train, validation and test sets.

In [None]:
from datasets import Dataset

oparticles_hf = Dataset.from_pandas(oparticles)

In [None]:
from datasets import DatasetDict

# 90% train, 10% test+validation
train_test = oparticles_hf.train_test_split(test_size=0.1, shuffle=True, seed=42)

# Split the 10% test+validation set in half test, half validation
valid_test = train_test['test'].train_test_split(test_size=0.5, shuffle=True, seed=42)

# gather everyone if you want to have a single DatasetDict
train_valid_test_oparticles = DatasetDict({
    'train': train_test['train'],
    'validation': valid_test['train'],
    'test': valid_test['test']
})

In [None]:
train_valid_test_oparticles

DatasetDict({
    train: Dataset({
        features: ['tokens', 'label'],
        num_rows: 335
    })
    validation: Dataset({
        features: ['tokens', 'label'],
        num_rows: 19
    })
    test: Dataset({
        features: ['tokens', 'label'],
        num_rows: 19
    })
})

In [None]:
adus_hf = Dataset.from_pandas(adus)

# 90% train, 10% test+validation
train_test = adus_hf.train_test_split(test_size=0.1, shuffle=True, seed=42)

# Split the 10% test+validation set in half test, half validation
valid_test = train_test['test'].train_test_split(test_size=0.5, shuffle=True, seed=42)

# gather everyone if you want to have a single DatasetDict
train_valid_test_adus = DatasetDict({
    'train': train_test['train'],
    'validation': valid_test['train'],
    'test': valid_test['test']
})

In [None]:
train_valid_test_adus

DatasetDict({
    train: Dataset({
        features: ['tokens', 'label'],
        num_rows: 15068
    })
    validation: Dataset({
        features: ['tokens', 'label'],
        num_rows: 837
    })
    test: Dataset({
        features: ['tokens', 'label'],
        num_rows: 838
    })
})

## Fine-tuning a pretrained model

### Tokenizer

We first load the tokenizer for our model:

In [None]:
from transformers import AutoTokenizer

def get_tokenizer(name):
    return AutoTokenizer.from_pretrained(name)

Now we need to [preprocess](https://huggingface.co/docs/transformers/preprocessing) our data.

Obtaining the length of the longest sequences in our data splits

In [None]:
def find_max_length(dataset):
    return len(max(dataset, key=lambda x: len(x.split())).split())

train_max_length = find_max_length(train_valid_test_oparticles["train"]["tokens"])
val_max_length = find_max_length(train_valid_test_oparticles["validation"]["tokens"])
test_max_length = find_max_length(train_valid_test_oparticles["test"]["tokens"])

print(f"Longest sequence in train set has {train_max_length} words")
print(f"Longest sequence in val set has {val_max_length} words")
print(f"Longest sequence in test set has {test_max_length} words")

Longest sequence in train set has 2932 words
Longest sequence in val set has 1382 words
Longest sequence in test set has 1208 words


Tokenize entire dataset

In [None]:
def tokenize_dataset(sample, max_length_):
    return tokenizer(sample["tokens"], truncation=True, max_length=max_length_, padding="max_length")

def get_tokenized_data(dataset, max_length):
    return dataset.map(lambda batch: tokenize_dataset(batch, max_length), batched=True)

### Loading the model

Since we want to use the model for classification, we should load it with an appropriate classification head:

In [None]:
from transformers import AutoModelForSequenceClassification
import torch

def get_model(name, num_labels_):
    model = AutoModelForSequenceClassification.from_pretrained(name, num_labels=num_labels_, ignore_mismatched_sizes=True)
    model.cuda() # Use GPU

    return model

### Fine-tuning

The next step is to [fine-tune](https://huggingface.co/docs/transformers/training) the model with our train data. To do so, we can make use of a [Trainer](https://huggingface.co/docs/transformers/main_classes/trainer).
There are several aspects of training that you can specify via [TrainingArguments](https://huggingface.co/docs/transformers/main_classes/trainer#transformers.TrainingArguments).

In [None]:
from transformers import TrainingArguments, Trainer
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
import numpy as np
from transformers import DataCollatorWithPadding

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='macro')
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

def get_trainingArgs():
    return TrainingArguments(
        output_dir="./results",
        learning_rate=2e-5,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        num_train_epochs=5,
        weight_decay=0.01,
        data_seed=42,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        metric_for_best_model="f1"
    )

def get_trainer(model_, args_, dataset_, tokenizer_, data_collator_, compute_metrics_):
    return Trainer(
        model=model_,
        args=args_,
        train_dataset=dataset_["train"],
        eval_dataset=dataset_["validation"],
        tokenizer=tokenizer_,
        data_collator=data_collator_,
        compute_metrics=compute_metrics_
    )

In [None]:
model_name = "neuralmind/bert-base-portuguese-cased"

In [None]:
tokenizer = get_tokenizer(model_name)
tokenized_dataset = get_tokenized_data(train_valid_test_oparticles, 510)
model = get_model(model_name, 8)

trainer = get_trainer(
    model,
    get_trainingArgs(),
    tokenized_dataset,
    tokenizer,
    DataCollatorWithPadding(tokenizer=tokenizer),
    compute_metrics
    )

# Train Model
display(trainer.train())

# Check performance in validation set
display(trainer.evaluate())

# Check how the model fares in our test set.
display(trainer.predict(test_dataset=tokenized_dataset["test"]))

# Save model for future use
trainer.save_model('/content/drive/Shareddrives/PLN/Assignment 2/models/domain/' + model_name)

Downloading:   0%|          | 0.00/43.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/647 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/205k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

Downloading:   0%|          | 0.00/418M [00:00<?, ?B/s]

Some weights of the model checkpoint at neuralmind/bert-base-portuguese-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the

Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,No log,1.909428,0.368421,0.304762,0.508333,0.302083
2,No log,1.613191,0.736842,0.67381,0.729167,0.708333
3,No log,1.407847,0.736842,0.664583,0.71875,0.708333
4,No log,1.331093,0.736842,0.664583,0.71875,0.708333
5,No log,1.292226,0.736842,0.664583,0.71875,0.708333


The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: tokens. If tokens are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 19
  Batch size = 16
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
Saving model checkpoint to ./results/checkpoint-21
Configuration saved in ./results/checkpoint-21/config.json
Model weights saved in ./results/checkpoint-21/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-21/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-21/special_tokens_map.json
The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: tokens. If tokens are not expected by `BertForSequenceClassification.forward`,  you 

TrainOutput(global_step=105, training_loss=1.5165142240978422, metrics={'train_runtime': 198.3464, 'train_samples_per_second': 8.445, 'train_steps_per_second': 0.529, 'total_flos': 439013139372000.0, 'train_loss': 1.5165142240978422, 'epoch': 5.0})

The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: tokens. If tokens are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 19
  Batch size = 16


  _warn_prf(average, modifier, msg_start, len(result))


{'epoch': 5.0,
 'eval_accuracy': 0.7368421052631579,
 'eval_f1': 0.6738095238095239,
 'eval_loss': 1.6131914854049683,
 'eval_precision': 0.7291666666666666,
 'eval_recall': 0.7083333333333333,
 'eval_runtime': 0.7234,
 'eval_samples_per_second': 26.264,
 'eval_steps_per_second': 2.765}

The following columns in the test set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: tokens. If tokens are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 19
  Batch size = 16
  _warn_prf(average, modifier, msg_start, len(result))


PredictionOutput(predictions=array([[-2.18317270e-01,  7.13726163e-01,  1.80869848e-01,
        -4.96376038e-01, -5.70772812e-02,  1.49075389e-01,
         1.35978581e-02, -2.21554443e-01],
       [ 6.38670623e-01, -3.86216998e-01,  3.30334663e-01,
         3.22024822e-02, -2.67550886e-01,  1.28340736e-01,
        -1.60645247e-02, -4.22625303e-01],
       [-2.01503590e-01,  1.11880212e-03, -1.88627746e-02,
        -2.23736659e-01, -4.87186611e-02,  1.40093699e-01,
        -8.25822949e-02,  2.13240132e-01],
       [-3.22720796e-01,  1.05734095e-01, -2.80375838e-01,
         4.77276951e-01, -6.89464062e-02, -4.03653860e-01,
        -4.56289172e-01,  2.84725606e-01],
       [ 2.97685057e-01,  3.81348506e-02,  3.23991328e-01,
        -1.63957521e-01, -9.11994725e-02,  5.91799542e-02,
         4.57213640e-01, -4.89210635e-01],
       [ 5.19497097e-01, -1.15277529e-01,  7.54323125e-01,
         7.05788136e-02, -2.75892317e-01,  2.00394273e-01,
        -1.03676356e-01, -7.29891062e-01],
     

Saving model checkpoint to /content/drive/Shareddrives/PLN/Assignment 2/models/domain/neuralmind/bert-base-portuguese-cased
Configuration saved in /content/drive/Shareddrives/PLN/Assignment 2/models/domain/neuralmind/bert-base-portuguese-cased/config.json
Model weights saved in /content/drive/Shareddrives/PLN/Assignment 2/models/domain/neuralmind/bert-base-portuguese-cased/pytorch_model.bin
tokenizer config file saved in /content/drive/Shareddrives/PLN/Assignment 2/models/domain/neuralmind/bert-base-portuguese-cased/tokenizer_config.json
Special tokens file saved in /content/drive/Shareddrives/PLN/Assignment 2/models/domain/neuralmind/bert-base-portuguese-cased/special_tokens_map.json


In [None]:
!rm -rf ./results/

In [None]:
tokenizer = get_tokenizer('/content/drive/Shareddrives/PLN/Assignment 2/models/domain/' + model_name)
tokenized_dataset = get_tokenized_data(train_valid_test_adus, 81)
model = get_model('/content/drive/Shareddrives/PLN/Assignment 2/models/domain/' + model_name, 5)

trainer = get_trainer(
    model,
    get_trainingArgs(),
    tokenized_dataset,
    tokenizer,
    DataCollatorWithPadding(tokenizer=tokenizer),
    compute_metrics
    )

# Train Model
display(trainer.train())

# Check performance in validation set
display(trainer.evaluate())

# Check how the model fares in our test set.
display(trainer.predict(test_dataset=tokenized_dataset["test"]))

Didn't find file /content/drive/Shareddrives/PLN/Assignment 2/models/domain/neuralmind/bert-base-portuguese-cased/added_tokens.json. We won't load it.
loading file /content/drive/Shareddrives/PLN/Assignment 2/models/domain/neuralmind/bert-base-portuguese-cased/vocab.txt
loading file /content/drive/Shareddrives/PLN/Assignment 2/models/domain/neuralmind/bert-base-portuguese-cased/tokenizer.json
loading file None
loading file /content/drive/Shareddrives/PLN/Assignment 2/models/domain/neuralmind/bert-base-portuguese-cased/special_tokens_map.json
loading file /content/drive/Shareddrives/PLN/Assignment 2/models/domain/neuralmind/bert-base-portuguese-cased/tokenizer_config.json


  0%|          | 0/16 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

loading configuration file /content/drive/Shareddrives/PLN/Assignment 2/models/domain/neuralmind/bert-base-portuguese-cased/config.json
Model config BertConfig {
  "_name_or_path": "/content/drive/Shareddrives/PLN/Assignment 2/models/domain/neuralmind/bert-base-portuguese-cased",
  "architectures": [
    "BertForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "directionality": "bidi",
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2,
    "LABEL_3": 3,
    "LABEL_4": 4
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_past": true,
  "pad_token_id": 0,
  "pooler_f

Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,1.0826,0.980855,0.578256,0.519845,0.618552,0.479499
2,0.767,1.002711,0.600956,0.568294,0.579099,0.576142
3,0.6093,1.090517,0.594982,0.572454,0.56557,0.587142
4,0.5037,1.232254,0.598566,0.572706,0.574938,0.578156
5,0.4219,1.339704,0.585424,0.571945,0.561964,0.587846


The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: tokens. If tokens are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 837
  Batch size = 16
Saving model checkpoint to ./results/checkpoint-942
Configuration saved in ./results/checkpoint-942/config.json
Model weights saved in ./results/checkpoint-942/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-942/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-942/special_tokens_map.json
The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: tokens. If tokens are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 837
  Batch size = 16
Sa

TrainOutput(global_step=4710, training_loss=0.6693539111224456, metrics={'train_runtime': 1284.4222, 'train_samples_per_second': 58.657, 'train_steps_per_second': 3.667, 'total_flos': 3136111306859880.0, 'train_loss': 0.6693539111224456, 'epoch': 5.0})

The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: tokens. If tokens are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 837
  Batch size = 16


{'epoch': 5.0,
 'eval_accuracy': 0.5985663082437276,
 'eval_f1': 0.5727055470053468,
 'eval_loss': 1.2322536706924438,
 'eval_precision': 0.5749383605410278,
 'eval_recall': 0.578156330115093,
 'eval_runtime': 4.5344,
 'eval_samples_per_second': 184.59,
 'eval_steps_per_second': 11.688}

The following columns in the test set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: tokens. If tokens are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 838
  Batch size = 16


PredictionOutput(predictions=array([[ 3.2541964 ,  1.3051281 , -2.185289  , -2.0068898 ,  0.2792441 ],
       [ 4.9965925 , -1.8325574 ,  0.06497824,  0.548347  , -2.697576  ],
       [ 4.048762  , -2.2640746 ,  0.9949892 , -0.94218725, -0.67046696],
       ...,
       [ 3.7981615 , -3.1667192 ,  1.5487077 ,  2.0239959 , -3.4811769 ],
       [ 1.0810051 , -3.400689  ,  4.1544876 ,  0.9413587 , -2.4644291 ],
       [ 2.1570187 , -3.8541389 ,  2.9824128 ,  1.9183352 , -2.9619596 ]],
      dtype=float32), label_ids=array([0, 0, 0, 0, 2, 0, 0, 0, 3, 1, 3, 2, 3, 2, 3, 1, 0, 0, 2, 0, 0, 2,
       1, 0, 2, 0, 2, 2, 0, 0, 3, 0, 3, 0, 0, 0, 4, 2, 0, 0, 0, 3, 3, 0,
       0, 2, 0, 3, 3, 2, 2, 4, 0, 0, 1, 3, 0, 3, 0, 0, 3, 0, 0, 0, 3, 0,
       0, 0, 0, 0, 1, 2, 0, 0, 2, 4, 3, 4, 0, 0, 1, 0, 0, 0, 3, 0, 2, 0,
       0, 2, 1, 2, 3, 4, 0, 4, 0, 3, 3, 0, 0, 0, 0, 2, 0, 2, 3, 0, 0, 0,
       0, 0, 1, 3, 3, 0, 2, 0, 2, 2, 0, 1, 3, 0, 0, 0, 0, 0, 4, 0, 2, 0,
       3, 3, 0, 0, 0, 2, 1, 1, 0, 0, 0, 2, 0

In [None]:
!rm -rf ./results/