In [1]:
import pandas as pd
import numpy as np
import math 
import tensorflow as tf

In [3]:
df = pd.read_csv("../data/sample/news_headlines_train.csv")
df

Unnamed: 0,text,sentiment
0,"In addition , a further 29 employees can be la...",-1
1,The authorisation is in force until the end of...,0
2,The value of the deal was not disclosed .,0
3,You need to be ready when the window opens up ...,0
4,Major Order in India Comptel Corporation has r...,1
...,...,...
3188,The Insolvency Act regulates the amount of deb...,0
3189,We have also cut our price projections for pap...,-1
3190,"Tyrvaan Sanomat , published twice a week by Ty...",0
3191,"pct lower at 4,442.10 .",0


In [4]:
def normalise(text):
    text = text.lower()
    return text

df['text'] = df['text'].apply(normalise)

In [5]:
df.rename(columns = {'sentiment':'labels'}, inplace = True)

In [6]:
df.loc[df['labels'] == 0, 'labels'] = 2
df.loc[df['labels'] == 1, 'labels'] = 0
df.loc[df['labels'] == -1, 'labels'] = 1
#0: Positive
#1: Negative
#2: Neutral 

In [7]:
df.labels.value_counts()

2    1898
0     908
1     387
Name: labels, dtype: int64

In [8]:
import sklearn
from sklearn.model_selection import train_test_split

In [9]:
train_df, valid_df = train_test_split(df, test_size=0.2, random_state=42, stratify=df["labels"])

In [10]:
#!pip install datasets
from datasets import Dataset

In [11]:
train_dataset = Dataset.from_pandas(train_df)
train_dataset = Dataset.from_pandas(valid_df)

In [12]:
train_dataset

Dataset({
    features: ['text', 'labels', '__index_level_0__'],
    num_rows: 639
})

In [13]:
#!pip install transformers
from transformers import (
    AutoTokenizer,
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer,
    AutoModelForSequenceClassification
)

In [14]:
model_name = "ProsusAI/finbert"

In [15]:
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [16]:
def tokenize(examples):
    return tokenizer(examples['text'], truncation=True)

tokenized_train_dataset = train_dataset.map(tokenize, batched=True)
tokenized_test_dataset = train_dataset.map(tokenize, batched=True)

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [17]:
tokenized_train_dataset

Dataset({
    features: ['text', 'labels', '__index_level_0__', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 639
})

In [18]:
def model_init():
  return AutoModelForSequenceClassification.from_pretrained(model_name)

In [19]:
MODEL_PATH = "../results/models/FinBERT_v1.0"

In [20]:
train_args = TrainingArguments(
    MODEL_PATH,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    learning_rate=2e-5,
    weight_decay=0.01,
    warmup_ratio = 0.1,
    evaluation_strategy='epoch',
    seed=42
)

In [21]:
from sklearn.metrics import accuracy_score

def evaluation(eval_preds):
    logits, labels = eval_preds
    preds = np.argmax(logits, axis=-1)
    return {'accuracy': accuracy_score(labels, preds)}

In [22]:
data_collator = DataCollatorWithPadding(tokenizer)

trainer = Trainer(
    model_init=model_init,
    args=train_args,
    train_dataset= tokenized_train_dataset,
    eval_dataset= tokenized_test_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=evaluation,
)

loading configuration file config.json from cache at /Users/teoweiming/.cache/huggingface/hub/models--ProsusAI--finbert/snapshots/54bddcea2cca580dd1df6a88d33242dcf4c61a71/config.json
Model config BertConfig {
  "_name_or_path": "ProsusAI/finbert",
  "architectures": [
    "BertForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "positive",
    "1": "negative",
    "2": "neutral"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "negative": 1,
    "neutral": 2,
    "positive": 0
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.23.1",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_s

In [None]:
trainer.train()

The following columns in the training set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text, __index_level_0__. If text, __index_level_0__ are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 639
  Num Epochs = 5
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 200
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.114998,0.971831
2,No log,0.033156,0.99687
3,No log,0.010784,1.0
4,No log,0.006853,1.0
5,No log,0.005564,1.0


The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text, __index_level_0__. If text, __index_level_0__ are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 639
  Batch size = 16
The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text, __index_level_0__. If text, __index_level_0__ are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 639
  Batch size = 16
The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text, __index_level_0__. If text, __index_level_0__ are not expected by `BertForSequenceClassification.forward`,  you can 

TrainOutput(global_step=200, training_loss=0.09375311851501465, metrics={'train_runtime': 4978.8429, 'train_samples_per_second': 0.642, 'train_steps_per_second': 0.04, 'total_flos': 92717160038874.0, 'train_loss': 0.09375311851501465, 'epoch': 5.0})

In [None]:
trainer.save_model(MODEL_PATH)

Saving model checkpoint to ./results/models/FinBERT_v1.0
Configuration saved in ./results/models/FinBERT_v1.0/config.json
Model weights saved in ./results/models/FinBERT_v1.0/pytorch_model.bin
tokenizer config file saved in ./results/models/FinBERT_v1.0/tokenizer_config.json
Special tokens file saved in ./results/models/FinBERT_v1.0/special_tokens_map.json


In [None]:
trained_finbert = AutoModelForSequenceClassification.from_pretrained(MODEL_PATH)

loading configuration file ./results/models/FinBERT_v1.0/config.json
Model config BertConfig {
  "_name_or_path": "./results/models/FinBERT_v1.0",
  "architectures": [
    "BertForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "positive",
    "1": "negative",
    "2": "neutral"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "negative": 1,
    "neutral": 2,
    "positive": 0
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "problem_type": "single_label_classification",
  "torch_dtype": "float32",
  "transformers_version": "4.23.1",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

loading w

In [None]:
tokenized_test_dataset

Dataset({
    features: ['text', 'labels', '__index_level_0__', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 639
})

In [None]:
trained_model = Trainer(
    trained_finbert,
    tokenizer=tokenizer,
)
output = trained_model.predict(
    test_dataset=tokenized_test_dataset
)

No `TrainingArguments` passed, using `output_dir=tmp_trainer`.
PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
The following columns in the test set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text, __index_level_0__. If text, __index_level_0__ are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 639
  Batch size = 8


In [None]:
output

PredictionOutput(predictions=array([[ 3.198282  , -2.0638375 , -2.492503  ],
       [-1.3433713 , -2.8260388 ,  4.231693  ],
       [-2.3078167 , -1.7832673 ,  4.581837  ],
       ...,
       [ 3.305683  , -2.9761944 , -1.9258841 ],
       [-2.3671315 ,  3.6954017 , -0.52519596],
       [-1.4758866 , -2.708682  ,  4.298575  ]], dtype=float32), label_ids=array([0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 2, 0, 2, 2, 2, 2, 2, 2, 0,
       2, 2, 0, 2, 1, 2, 2, 1, 1, 0, 2, 2, 2, 2, 2, 0, 0, 0, 2, 2, 0, 1,
       2, 2, 2, 2, 1, 0, 2, 2, 2, 2, 2, 1, 0, 2, 2, 2, 2, 2, 2, 2, 2, 1,
       1, 2, 2, 0, 2, 2, 0, 2, 2, 1, 2, 0, 2, 0, 2, 2, 2, 0, 0, 2, 2, 2,
       1, 2, 2, 0, 0, 1, 2, 0, 2, 0, 2, 2, 2, 0, 0, 2, 0, 1, 1, 0, 0, 1,
       2, 1, 2, 2, 0, 2, 2, 2, 0, 0, 2, 0, 2, 2, 0, 2, 0, 0, 2, 2, 0, 0,
       1, 2, 2, 2, 2, 2, 0, 2, 0, 0, 1, 2, 2, 2, 1, 0, 2, 2, 0, 2, 2, 2,
       2, 2, 0, 2, 2, 2, 2, 2, 1, 0, 2, 0, 2, 2, 2, 2, 2, 0, 2, 2, 2, 1,
       0, 2, 2, 2, 2, 2, 1, 2, 2, 2, 0, 2, 0, 2, 2, 0, 2, 2,

In [None]:
from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder()
df['labels'] = encoder.fit_transform(df['labels'])
encoder.inverse_transform([np.argmax(i) for i in output.predictions])
preds = [np.argmax(i) for i in output.predictions]

In [None]:
accuracy_score(valid_df.labels, preds)

1.0