In [3]:
"""

Reference:
1.  https://discuss.huggingface.co/t/sending-a-dataset-or-datasetdict-to-a-gpu/17208/4
2. https://colab.research.google.com/github/NielsRogge/Transformers-Tutorials/blob/master/BERT/Fine_tuning_BERT_(and_friends)_for_multi_label_text_classification.ipynb#scrollTo=KXmFds8js6P8
3. https://huggingface.co/docs/transformers/training
4. https://jesusleal.io/2021/04/21/Longformer-multilabel-classification/

"""

'\nReference:\n1. https://colab.research.google.com/github/NielsRogge/Transformers-Tutorials/blob/master/BERT/Fine_tuning_BERT_(and_friends)_for_multi_label_text_classification.ipynb#scrollTo=KXmFds8js6P8\n2. https://huggingface.co/docs/transformers/training\n'

## 0. Import api

In [34]:
from transformers import AutoTokenizer
import numpy as np
import pandas as pd
import torch
import torch.utils.data as data_utils
from sklearn.model_selection import train_test_split
from datasets import Dataset
from transformers import AutoTokenizer

loading configuration file https://huggingface.co/bert-base-uncased/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/3c61d016573b14f7f008c02c4e51a366c67ab274726fe2910691e2a761acf43e.37395cee442ab11005bcd270f3c34464dc1704b715b5d7d52b1a461abe3b9e4e
Model config BertConfig {
  "_name_or_path": "bert-base-uncased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.21.1",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

loading file https://huggingface.co/bert-base-uncased/resolve/ma

## 1. Preprocessing

First, we need to read the tweet data of our crawler, transformer training has reference to the method of torch data composition, so we organized into the form of dataset and dataloader for analysis.


In [35]:
#　You can find the csv file in our github.
df = pd.read_csv('cleaned_non-vectorized_data_2.csv')
z1 = tokenizer(df.loc[0, 'Text'], padding="max_length", truncation=True, max_length=128)['input_ids']
list_out = []
for i in range(1, len(df)):
    try:
        z2 = tokenizer(df.loc[i, 'Text'], padding="max_length", truncation=True, max_length=128)['input_ids']

        z1 = np.vstack((z1, z2))
    except:
        list_out.append(i)
#         print(i)
        continue

text = torch.tensor(z1)
df2 = df.copy()

df2.drop(df2.index[list_out], inplace=True)
df2 = df2.reset_index()

labels = list(df.columns[2:])
id2label = {idx:label for idx, label in enumerate(labels)}
label2id = {label:idx for idx, label in enumerate(labels)}

# arr_label = torch.tensor(df2.loc[:, labels].values)

# train = data_utils.TensorDataset(text, arr_label)
# eval_data = data_utils.TensorDataset(text, arr_label)
# train_loader = data_utils.DataLoader(train, batch_size=100, shuffle=True)
# eval_loader = data_utils.DataLoader(eval_data, batch_size=100, shuffle=True)

df2 = df2.drop(columns=['tokens'])

train, test = train_test_split(df2, test_size=0.01)
train_dataset = Dataset.from_dict(train)
test_dataset = Dataset.from_dict(test)

## 2. Tokenizer and prepare dataset
Since bert's analysis requires tokenizer, we first obtained it from the hugging face and then tokenized our collated dataloader.

In [36]:

'''
rerference: https://discuss.huggingface.co/t/sending-a-dataset-or-datasetdict-to-a-gpu/17208/4

'''
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

def preprocess_data(examples):

    text = examples["Text"]
    encoding = tokenizer(text, padding="max_length", truncation=True, max_length=128)
    labels_batch = {k: examples[k] for k in examples.keys() if k in labels}
    labels_matrix = np.zeros((len(text), len(labels)))
    for idx, label in enumerate(labels):
        labels_matrix[:, idx] = labels_batch[label]

    encoding["labels"] = labels_matrix.tolist()

    return encoding

In [37]:
encoded_dataset = train_dataset.map(preprocess_data, batched=True, remove_columns=train_dataset.column_names)
endoded_valid = test_dataset.map(preprocess_data, batched=True, remove_columns=test_dataset.column_names)

  0%|          | 0/10 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

## 3. Train

We obtain the pretrained weight from the hugging face, then define the loss function and the training parameters, and finally start the training.

In [38]:
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", 
                                                           problem_type="multi_label_classification", 
                                                           num_labels=len(labels),
                                                           id2label=id2label,
                                                           label2id=label2id)

loading configuration file https://huggingface.co/bert-base-uncased/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/3c61d016573b14f7f008c02c4e51a366c67ab274726fe2910691e2a761acf43e.37395cee442ab11005bcd270f3c34464dc1704b715b5d7d52b1a461abe3b9e4e
Model config BertConfig {
  "_name_or_path": "bert-base-uncased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "disgust",
    "1": "joy",
    "2": "anger",
    "3": "surprised",
    "4": "sad",
    "5": "fear",
    "6": "neutral"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "anger": 2,
    "disgust": 0,
    "fear": 5,
    "joy": 1,
    "neutral": 6,
    "sad": 4,
    "surprised": 3
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "n

In [39]:
batch_size = 8
metric_name = "f1"

from transformers import TrainingArguments, Trainer

args = TrainingArguments(
    f"bert-finetuned-multilabel-english",
    evaluation_strategy = "epoch",
    save_strategy = "epoch",
    learning_rate=3e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=5,
    weight_decay=0.015,
    load_best_model_at_end=True,
    metric_for_best_model=metric_name,
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [40]:
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score
from transformers import EvalPrediction
    
# source: https://jesusleal.io/2021/04/21/Longformer-multilabel-classification/
def multi_label_metrics(predictions, labels, threshold=0.75):
    sigmoid = torch.nn.Sigmoid()
    probs = sigmoid(torch.Tensor(predictions))
    y_pred = np.zeros(probs.shape)
    y_pred[np.where(probs >= threshold)] = 1
    y_true = labels
    f1_micro_average = f1_score(y_true=y_true, y_pred=y_pred, average='micro')
    roc_auc = roc_auc_score(y_true, y_pred, average = 'micro')
    accuracy = accuracy_score(y_true, y_pred)
    metrics = {'f1': f1_micro_average,
               'roc_auc': roc_auc,
               'accuracy': accuracy}
    return metrics

def compute_metrics(p: EvalPrediction):
    preds = p.predictions[0] if isinstance(p.predictions, 
            tuple) else p.predictions
    result = multi_label_metrics(
        predictions=preds, 
        labels=p.label_ids)
    return result

In [41]:
trainer = Trainer(
    model,
    args,
    train_dataset=encoded_dataset,
    eval_dataset=endoded_valid,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)



In [42]:
trainer.train()

***** Running training *****
  Num examples = 9133
  Num Epochs = 5
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 5710


Epoch,Training Loss,Validation Loss,F1,Roc Auc,Accuracy
1,0.2559,0.218342,0.473684,0.65868,0.322581
2,0.209,0.203539,0.651685,0.755907,0.505376
3,0.1452,0.244627,0.687179,0.789812,0.537634
4,0.1087,0.254192,0.673575,0.780637,0.537634
5,0.0887,0.266485,0.69697,0.798064,0.537634


***** Running Evaluation *****
  Num examples = 93
  Batch size = 8
Saving model checkpoint to bert-finetuned-multilabel-english/checkpoint-1142
Configuration saved in bert-finetuned-multilabel-english/checkpoint-1142/config.json
Model weights saved in bert-finetuned-multilabel-english/checkpoint-1142/pytorch_model.bin
tokenizer config file saved in bert-finetuned-multilabel-english/checkpoint-1142/tokenizer_config.json
Special tokens file saved in bert-finetuned-multilabel-english/checkpoint-1142/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 93
  Batch size = 8
Saving model checkpoint to bert-finetuned-multilabel-english/checkpoint-2284
Configuration saved in bert-finetuned-multilabel-english/checkpoint-2284/config.json
Model weights saved in bert-finetuned-multilabel-english/checkpoint-2284/pytorch_model.bin
tokenizer config file saved in bert-finetuned-multilabel-english/checkpoint-2284/tokenizer_config.json
Special tokens file saved in bert-finetuned-multi

TrainOutput(global_step=5710, training_loss=0.16345449370803433, metrics={'train_runtime': 797.427, 'train_samples_per_second': 57.265, 'train_steps_per_second': 7.161, 'total_flos': 3003876432672000.0, 'train_loss': 0.16345449370803433, 'epoch': 5.0})

## 4. Save model

In [43]:
trainer.save_model('fine_tune_bert_1')

Saving model checkpoint to fine_tune_bert_1
Configuration saved in fine_tune_bert_1/config.json
Model weights saved in fine_tune_bert_1/pytorch_model.bin
tokenizer config file saved in fine_tune_bert_1/tokenizer_config.json
Special tokens file saved in fine_tune_bert_1/special_tokens_map.json


## 5. Load model and evaluate

In [None]:
from transformers import AutoModelForSequenceClassification
from transformers import AutoTokenizer
import torch
import numpy as np

id2label = {0: 'disgust', 1: 'joy', 2: 'anger', 
            3: 'surprised', 4: 'sad', 
            5: 'fear', 6: 'neutral'} 
label2id = {'disgust': 0, 'joy': 1, 'anger': 2, 'surprised': 3, 'sad': 4, 'fear': 5, 'neutral': 6}
labels = ['disgust', 'joy', 'anger', 'surprised', 'sad', 'fear', 'neutral']
tokenizer = AutoTokenizer.from_pretrained("./fine_tune_bert_2")
model = AutoModelForSequenceClassification.from_pretrained("./fine_tune_bert_2", 
                                                           problem_type="multi_label_classification", 
                                                           num_labels=len(labels),
                                                           id2label=id2label,
                                                           label2id=label2id)
# model = model.to('cpu')

df = pd.read_csv('serve_data.csv')
list_company = ['postmates', 'instacart', 'grubhub', 'ubereats', 'doordash', 'nan', ' DoorDash', ' Grubhub', 'Instacart', ' Ubereats']
for i in range(0, len(df)):
    if str(df.loc[i]['company']) in list_company:
        continue
    else:
        if str(df.loc[i]['company'])[-4:] == '2022':
                continue
        else:
            df.loc[i]['text'] = str(df.loc[i]['text']) + ' ' + str(df.loc[i]['company'])
            df.loc[i]['company'] = ''
            print(i, str(df.loc[i]['company']))
print('-' * 100)

list_emotion = []
for i in range(0, len(df)):
    try:
        text = df.loc[i, 'text']

        encoding = tokenizer(text, return_tensors="pt")
        encoding = {k: v.to('cpu') for k,v in encoding.items()}

        outputs = model(**encoding)
        logits = outputs.logits
        sigmoid = torch.nn.Sigmoid()
        probs = sigmoid(logits.squeeze().cpu())
        predictions = np.zeros(probs.shape)
        predictions[np.where(probs >= 0.5)] = 1
        # turn predicted id's into actual label names
        predicted_labels = [id2label[idx] for idx, label in enumerate(predictions) if label == 1.0]
    #     print(predicted_labels)
        if len(predicted_labels) == 0:
            print(i)
            predictions[np.where(probs >= 0.2)] = 1
            predicted_labels = [id2label[idx] for idx, label in enumerate(predictions) if label == 1.0]
            
            if len(predicted_labels) == 0:
                print(i, 'last')
        list_emotion.append(predicted_labels)
    except:
        print(i, 'out')
        list_emotion.append(['nan'])

In [None]:
df2 = df.copy()
df2['emotion'] = ''
df2['emotion'] = list_emotion
df2.to_csv('serve_data_emotion_v5.csv')