In [None]:
"""

Reference:
1. https://colab.research.google.com/github/NielsRogge/Transformers-Tutorials/blob/master/BERT/Fine_tuning_BERT_(and_friends)_for_multi_label_text_classification.ipynb#scrollTo=KXmFds8js6P8
2. https://huggingface.co/docs/transformers/training

"""

In [1]:
from transformers import AutoTokenizer
import numpy as np
import pandas as pd
import torch
import torch.utils.data as data_utils
from sklearn.model_selection import train_test_split
from datasets import Dataset
from transformers import AutoTokenizer


tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
df = pd.read_csv('cleaned_non-vectorized_data.csv')

In [12]:
df.head()

Unnamed: 0,Text,tokens,disgust,joy,anger,surprised,sad,fear,neutral
0,come mert ’ today let u take care lunch enjoy ...,"['come', 'mert', '’', 'today', 'let', 'u', 'ta...",0,0,0,0,0,0,1
1,nxt gt lay 20 staff tech 's latest cutback rb_...,"['nxt', 'gt', 'lay', '20', 'staff', 'tech', ""'...",0,0,0,0,0,0,1
2,layoff 20 workforce 100 employee sf bay area h...,"['layoff', '20', 'workforce', '100', 'employee...",0,0,0,0,0,0,1
3,today ’ lunch special smoked pork sausage onio...,"['today', '’', 'lunch', 'special', 'smoked', '...",0,0,0,0,0,0,1
4,come mert ’ today grab salmon cake two home co...,"['come', 'mert', '’', 'today', 'grab', 'salmon...",0,0,0,0,0,0,1


### 1. Preprocessing

In [2]:
z1 = tokenizer(df.loc[0, 'Text'], padding="max_length", truncation=True, max_length=128)['input_ids']
list_out = []
for i in range(1, len(df)):
    try:
        z2 = tokenizer(df.loc[i, 'Text'], padding="max_length", truncation=True, max_length=128)['input_ids']

        z1 = np.vstack((z1, z2))
    except:
        list_out.append(i)
#         print(i)
        continue

text = torch.tensor(z1)
df2 = df.copy()

df2.drop(df2.index[list_out], inplace=True)
df2 = df2.reset_index()

labels = list(df.columns[2:])
id2label = {idx:label for idx, label in enumerate(labels)}
label2id = {label:idx for idx, label in enumerate(labels)}

# arr_label = torch.tensor(df2.loc[:, labels].values)

# train = data_utils.TensorDataset(text, arr_label)
# eval_data = data_utils.TensorDataset(text, arr_label)
# train_loader = data_utils.DataLoader(train, batch_size=100, shuffle=True)
# eval_loader = data_utils.DataLoader(eval_data, batch_size=100, shuffle=True)

In [3]:
df2 = df2.drop(columns=['tokens'])

In [4]:
train, test = train_test_split(df2, test_size=0.1)
train_dataset = Dataset.from_dict(train)
test_dataset = Dataset.from_dict(test)

### 2. Tokenizer and prepare dataset

In [5]:

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

'''
rerference: https://discuss.huggingface.co/t/sending-a-dataset-or-datasetdict-to-a-gpu/17208/4

'''

def preprocess_data(examples):
    # take a batch of texts
    text = examples["Text"]
    
    # encode them
    encoding = tokenizer(text, padding="max_length", truncation=True, max_length=128)
#     print(examples)

    # add labels
    labels_batch = {k: examples[k] for k in examples.keys() if k in labels}
    
    # create numpy array of shape (batch_size, num_labels)
    labels_matrix = np.zeros((len(text), len(labels)))
    
    # fill numpy array
    for idx, label in enumerate(labels):
        labels_matrix[:, idx] = labels_batch[label]

    encoding["labels"] = labels_matrix.tolist()

    return encoding

In [6]:
encoded_dataset = train_dataset.map(preprocess_data, batched=True, remove_columns=train_dataset.column_names)
endoded_valid = test_dataset.map(preprocess_data, batched=True, remove_columns=test_dataset.column_names)



  0%|          | 0/9 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

### 3. Train

In [7]:
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", 
                                                           problem_type="multi_label_classification", 
                                                           num_labels=len(labels),
                                                           id2label=id2label,
                                                           label2id=label2id)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [8]:
batch_size = 8
metric_name = "f1"

from transformers import TrainingArguments, Trainer

args = TrainingArguments(
    f"bert-finetuned-sem_eval-english",
    evaluation_strategy = "epoch",
    save_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=5,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model=metric_name,
    #push_to_hub=True,
)

In [9]:
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score
from transformers import EvalPrediction
    
# source: https://jesusleal.io/2021/04/21/Longformer-multilabel-classification/
def multi_label_metrics(predictions, labels, threshold=0.5):
    # first, apply sigmoid on predictions which are of shape (batch_size, num_labels)
    sigmoid = torch.nn.Sigmoid()
    probs = sigmoid(torch.Tensor(predictions))
    # next, use threshold to turn them into integer predictions
    y_pred = np.zeros(probs.shape)
    y_pred[np.where(probs >= threshold)] = 1
    # finally, compute metrics
    y_true = labels
    f1_micro_average = f1_score(y_true=y_true, y_pred=y_pred, average='micro')
    roc_auc = roc_auc_score(y_true, y_pred, average = 'micro')
    accuracy = accuracy_score(y_true, y_pred)
    # return as dictionary
    metrics = {'f1': f1_micro_average,
               'roc_auc': roc_auc,
               'accuracy': accuracy}
    return metrics

def compute_metrics(p: EvalPrediction):
    preds = p.predictions[0] if isinstance(p.predictions, 
            tuple) else p.predictions
    result = multi_label_metrics(
        predictions=preds, 
        labels=p.label_ids)
    return result

In [10]:
trainer = Trainer(
    model,
    args,
    train_dataset=encoded_dataset,
    eval_dataset=endoded_valid,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)



In [11]:
trainer.train()

***** Running training *****
  Num examples = 8351
  Num Epochs = 5
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 5220
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  


[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  


[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········································


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: C:\Users\Chih-Shen Hsu/.netrc


Epoch,Training Loss,Validation Loss,F1,Roc Auc,Accuracy
1,0.2519,0.252223,0.647348,0.775749,0.518319
2,0.2103,0.236006,0.69814,0.810345,0.561422
3,0.1552,0.254439,0.699906,0.814277,0.568966
4,0.1284,0.273046,0.705882,0.819412,0.58944
5,0.1046,0.283851,0.697343,0.810981,0.588362


***** Running Evaluation *****
  Num examples = 928
  Batch size = 8
Saving model checkpoint to bert-finetuned-sem_eval-english\checkpoint-1044
Configuration saved in bert-finetuned-sem_eval-english\checkpoint-1044\config.json
Model weights saved in bert-finetuned-sem_eval-english\checkpoint-1044\pytorch_model.bin
tokenizer config file saved in bert-finetuned-sem_eval-english\checkpoint-1044\tokenizer_config.json
Special tokens file saved in bert-finetuned-sem_eval-english\checkpoint-1044\special_tokens_map.json
***** Running Evaluation *****
  Num examples = 928
  Batch size = 8
Saving model checkpoint to bert-finetuned-sem_eval-english\checkpoint-2088
Configuration saved in bert-finetuned-sem_eval-english\checkpoint-2088\config.json
Model weights saved in bert-finetuned-sem_eval-english\checkpoint-2088\pytorch_model.bin
tokenizer config file saved in bert-finetuned-sem_eval-english\checkpoint-2088\tokenizer_config.json
Special tokens file saved in bert-finetuned-sem_eval-english\chec

TrainOutput(global_step=5220, training_loss=0.17672351384071555, metrics={'train_runtime': 873.9789, 'train_samples_per_second': 47.776, 'train_steps_per_second': 5.973, 'total_flos': 2746673829984000.0, 'train_loss': 0.17672351384071555, 'epoch': 5.0})

### 4. Save

In [None]:
trainer.save_model('fine_tune_bert')