In [1]:
# Training script for fine-tuning BERT (initialised with FinBERT encoder weights)
# on unaltered GPT-labelled data combined with manually labelled data.
# For better performance/generalization, use the augmented dataset instead.
# See README.md for further comments and details.

In [2]:
# Topic labels the classifier predicts; len(classes) sets the size of the
# classification head when the model is created.
# NOTE(review): presumably a label id is the position in this list -- confirm
# against the labelled data files.
classes = [
    "banking",
    "valuation",
    "household",
    "real estate",
    "corporate",
    "external",
    "sovereign",
    "technology",
    "climate",
    "energy",
    "health",
    "eu",
]


In [3]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments
import torch
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support,top_k_accuracy_score
import math
import pickle
from datasets import Dataset

In [4]:
# Load the pretrained artefacts used below:
#   model     - bert-base-uncased with a classification head sized to `classes`
#   finbert   - ProsusAI/finbert, the source of the weights transferred later
#   tokenizer - the ProsusAI/finbert tokenizer (fast implementation requested)
model = AutoModelForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=len(classes),
)
finbert = AutoModelForSequenceClassification.from_pretrained(
    'ProsusAI/finbert',
)
tokenizer = AutoTokenizer.from_pretrained(
    'ProsusAI/finbert',
    use_fast=True,
)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [5]:
# Weight transfer: take every FinBERT weight except the pooler and the
# classification head, which are kept from the bert-base-uncased model
# (its head is sized to len(classes)).
finbert_weights = finbert.state_dict()
model_weights = model.state_dict()

# Assigning to an existing key overwrites it, so no prior `del` is needed;
# load_state_dict matches tensors by key name, not by position.
finbert_weights.update({
    param_name: model_weights[param_name]
    for param_name in (
        "bert.pooler.dense.weight",
        "bert.pooler.dense.bias",
        "classifier.weight",
        "classifier.bias",
    )
})

model.load_state_dict(finbert_weights)


<All keys matched successfully>

In [6]:
# Load the labelled datasets. `train` is used exactly as stored; the two GPT
# files each hold a list of lists, which are flattened and then concatenated
# into `mixed`.

def _load_pickle(path):
    """Return the object stored in the pickle file at `path`.

    NOTE(security): pickle.load can execute arbitrary code while
    deserialising, so only point this at files from a trusted source.
    """
    with open(path, 'rb') as file:
        return pickle.load(file)

# manual labelled
train = _load_pickle('train_data.pickle')

# gpt labelled p1
gpt = [item for sublist in _load_pickle('gpt.pickle') for item in sublist]

# gpt labelled p2
gpt2 = [item for sublist in _load_pickle('gpt_p2.pickle') for item in sublist]

mixed = gpt + gpt2

In [7]:
# Sanity check: number of GPT-labelled examples after flattening and joining both parts.
print(len(mixed))

2458


In [8]:
# `sample` is the number of passes made over each dataset. Labels are taken
# with argmax, so every pass yields the same pairs and one pass is enough.
# To treat the GPT-3 "dist" values as probabilities instead, replace the argmax
# with a sampling step and raise `sample`.
sample = 1


def _collect_texts_and_labels(items, passes):
    """Return (texts, labels) for `items`, repeated `passes` times.

    Each label is the argmax index of the item's "dist" entry.
    """
    texts, labels = [], []
    for pass_idx in range(passes):
        for entry in items:
            texts.append(entry["text"])
            labels.append(np.argmax(entry["dist"]))
    return texts, labels


text_max, label_max = _collect_texts_and_labels(train, sample)
text_max_mixed, label_max_mixed = _collect_texts_and_labels(mixed, sample)

# manually labelled examples first, GPT-labelled examples after
comb_text = text_max + text_max_mixed
comb_label = label_max + label_max_mixed

In [9]:
# Every text must have exactly one label before the data is shuffled and split.
assert len(comb_text) == len(comb_label), (
    f"text/label count mismatch: {len(comb_text)} texts vs {len(comb_label)} labels"
)

In [10]:
# random shuffle, seeded so the train/test split that follows (and therefore
# the reported metrics) is reproducible across kernel restarts
import random

SHUFFLE_SEED = 42

shuffled_pairs = list(zip(comb_text, comb_label))
# A dedicated Random instance is used so the global `random` state is untouched.
random.Random(SHUFFLE_SEED).shuffle(shuffled_pairs)
# zip(*...) yields tuples, so comb_text / comb_label are tuples from here on.
comb_text, comb_label = zip(*shuffled_pairs)

In [11]:
# Sanity check: total number of examples (manually labelled + GPT-labelled) after shuffling.
print(len(comb_text))

2719


In [12]:
# 90/10 train/test split. The cut index is rounded up, so any fractional
# example goes to the training side.
text_cut = math.ceil(len(comb_text) * 0.9)
label_cut = math.ceil(len(comb_label) * 0.9)

train_text = comb_text[:text_cut]
test_text = comb_text[text_cut:]
train_label = comb_label[:label_cut]
test_label = comb_label[label_cut:]

In [13]:
# Wrap each split in a `datasets.Dataset` with a "text" and a "label" column.
train_dataset = Dataset.from_dict(
    {
        "text": train_text,
        "label": train_label,
    }
)
test_dataset = Dataset.from_dict(
    {
        "text": test_text,
        "label": test_label,
    }
)

In [14]:
def tokenize(batch):
    """Tokenize the 'text' column of `batch` with the module-level tokenizer.

    padding=True pads to the longest sequence in the batch; truncation=True
    lets the tokenizer cut over-long sequences.
    """
    texts = batch['text']
    return tokenizer(texts, padding=True, truncation=True)


def _to_model_inputs(dataset):
    """Tokenize `dataset` as one single batch and expose the model columns as torch tensors."""
    # batch_size == dataset size, so every row is padded to the same length
    tokenized = dataset.map(tokenize, batched=True, batch_size=len(dataset))
    tokenized.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])
    return tokenized


train_dataset = _to_model_inputs(train_dataset)
test_dataset = _to_model_inputs(test_dataset)

100%|██████████| 1/1 [00:00<00:00,  7.41ba/s]
100%|██████████| 1/1 [00:00<00:00, 62.49ba/s]


In [15]:
# Peek at the tokenized training inputs (returned as a torch tensor because of set_format).
train_dataset["input_ids"]

tensor([[  101,  2445,  2008,  ...,     0,     0,     0],
        [  101,  6847, 10665,  ...,     0,     0,     0],
        [  101,  9308,  1010,  ...,     0,     0,     0],
        ...,
        [  101,  1998,  2117,  ...,     0,     0,     0],
        [  101,  2021,  1045,  ...,     0,     0,     0],
        [  101,  3098, 12629,  ...,     0,     0,     0]])

In [16]:
def compute_metrics(pred):
    """Compute the evaluation metrics reported by the Trainer.

    Args:
        pred: object exposing `label_ids` (true class ids) and `predictions`
            (per-class scores with one column per class on the last axis).

    Returns:
        dict with 'accuracy', macro-averaged 'f1' / 'precision' / 'recall',
        and the top-k accuracies 'top3' and 'top2'.
    """
    labels = pred.label_ids
    scores = pred.predictions
    preds = scores.argmax(-1)

    # zero_division=0 gives the same numbers as the default setting but
    # without emitting a warning on every evaluation when a class is never
    # predicted.
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds, average='macro', zero_division=0
    )
    acc = accuracy_score(labels, preds)

    # One class id per score column. Passing them explicitly keeps
    # top_k_accuracy_score from raising when the evaluation labels happen not
    # to contain every class.
    class_ids = np.arange(scores.shape[-1])
    top3 = top_k_accuracy_score(labels, scores, k=3, labels=class_ids)
    top2 = top_k_accuracy_score(labels, scores, k=2, labels=class_ids)

    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall,
        'top3': top3,
        'top2': top2,
    }

training_args = TrainingArguments(
    # where checkpoints and logs are written
    output_dir='./results',
    logging_dir='./logs',
    # optimisation schedule
    learning_rate=2e-5,
    weight_decay=0.01,
    warmup_steps=500,
    num_train_epochs=20,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    # evaluate and checkpoint once per epoch
    evaluation_strategy='epoch',
    save_strategy="epoch",
    save_total_limit=1,  # checkpoint retention limit; older checkpoints are deleted
    # when training ends, reload the checkpoint with the best eval accuracy
    metric_for_best_model="accuracy",
    load_best_model_at_end=True,
)
    
# Wire model, data and metrics together; test_dataset doubles as the
# per-epoch evaluation set.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
)

In [17]:
# Run fine-tuning; per training_args, evaluation and checkpointing happen once per epoch.
trainer.train()

The following columns in the training set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text.
***** Running training *****
  Num examples = 2448
  Num Epochs = 20
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 3060
  5%|▌         | 153/3060 [00:17<05:24,  8.95it/s]The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text.
***** Running Evaluation *****
  Num examples = 271
  Batch size = 64
  _warn_prf(average, modifier, msg_start, len(result))

  5%|▌         | 153/3060 [00:17<05:24,  8.95it/s]Saving model checkpoint to ./results\checkpoint-153
Configuration saved in ./results\checkpoint-153\config.json


{'eval_loss': 2.1546473503112793, 'eval_accuracy': 0.31365313653136534, 'eval_f1': 0.09809622869556318, 'eval_precision': 0.10405982905982906, 'eval_recall': 0.11748707771104683, 'eval_top3': 0.6273062730627307, 'eval_top2': 0.5129151291512916, 'eval_runtime': 0.3567, 'eval_samples_per_second': 759.785, 'eval_steps_per_second': 14.018, 'epoch': 1.0}


Model weights saved in ./results\checkpoint-153\pytorch_model.bin
Deleting older checkpoint [results\checkpoint-612] due to args.save_total_limit
 10%|█         | 306/3060 [00:36<05:02,  9.10it/s]The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text.
***** Running Evaluation *****
  Num examples = 271
  Batch size = 64
  _warn_prf(average, modifier, msg_start, len(result))

 10%|█         | 306/3060 [00:36<05:02,  9.10it/s]Saving model checkpoint to ./results\checkpoint-306
Configuration saved in ./results\checkpoint-306\config.json


{'eval_loss': 1.6896413564682007, 'eval_accuracy': 0.4870848708487085, 'eval_f1': 0.1992874550961087, 'eval_precision': 0.18733516162912825, 'eval_recall': 0.2299918161652701, 'eval_top3': 0.7306273062730627, 'eval_top2': 0.6642066420664207, 'eval_runtime': 0.3191, 'eval_samples_per_second': 849.336, 'eval_steps_per_second': 15.67, 'epoch': 2.0}


Model weights saved in ./results\checkpoint-306\pytorch_model.bin
Deleting older checkpoint [results\checkpoint-765] due to args.save_total_limit
 13%|█▎        | 397/3060 [00:48<04:35,  9.68it/s]

In [None]:
# Evaluate on the Trainer's eval dataset (test_dataset) and show the compute_metrics results.
# NOTE(review): the recorded output reports 62 evaluation steps, which does not
# match the 271-example eval set logged during training -- it looks stale from
# another run; re-run this cell after training completes.
trainer.evaluate()

100%|██████████| 62/62 [00:09<00:00,  6.72it/s]


{'eval_loss': 1.0243563652038574,
 'eval_accuracy': 0.5766404864454016,
 'eval_f1': 0.5316538589564456,
 'eval_precision': 0.546277783485254,
 'eval_recall': 0.5233829825328863,
 'eval_top3': 0.8826957182670383,
 'eval_top2': 0.7907271345325564,
 'eval_runtime': 9.3876,
 'eval_samples_per_second': 420.447,
 'epoch': 20.0}