In [1]:
import numpy as np
import pandas as pd
from transformers import BertTokenizer, Trainer, BertForSequenceClassification, TrainingArguments
from datasets import Dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import torch
import transformers

## LOAD DATASET

In [3]:
# Read the labelled training CSV into memory and preview the first rows.
train_csv_path = 'my-dataset-train.csv'
df = pd.read_csv(train_csv_path)
df.head()

Unnamed: 0,text,label
0,$BYND - JPMorgan reels in expectations on Beyo...,0
1,$CCL $RCL - Nomura points to bookings weakness...,0
2,"$CX - Cemex cut at Credit Suisse, J.P. Morgan ...",0
3,$ESS: BTIG Research cuts to Neutral https://t....,0
4,$FNKO - Funko slides after Piper Jaffray PT cu...,0


In [4]:
# Drop rows with a missing 'text' or 'label': 'text' is tokenized later and
# 'label' drives both the stratified split and the training target.
required_columns = ['text', 'label']
df = df.dropna(subset=required_columns)

In [5]:
# Two-stage stratified split: first hold out 10% of all rows as the test set,
# then hold out 10% of what remains as the validation set. Stratifying on
# 'label' keeps the class proportions comparable across the three splits, and
# the fixed random_state makes the partition repeatable.
df_train, df_test = train_test_split(
    df,
    test_size=0.1,
    random_state=30,
    stratify=df['label'],
)
df_train, df_val = train_test_split(
    df_train,
    test_size=0.1,
    random_state=30,
    stratify=df_train['label'],
)
print(df_train.shape, df_test.shape, df_val.shape)

(7729, 2) (955, 2) (859, 2)


## LOAD PRE-TRAINED MODEL

In [6]:
# Load the classifier and its matching tokenizer from the same checkpoint so
# the vocabulary and the embedding matrix stay in sync.
# NOTE(review): the checkpoint's classification head is reused as-is; confirm
# that the number of classes in df['label'] matches the head's output size.
checkpoint = 'ProsusAI/finbert'
model = BertForSequenceClassification.from_pretrained(checkpoint)
tokenizer = BertTokenizer.from_pretrained(checkpoint)

  return self.fget.__get__(instance, owner)()


In [7]:
def tokenize_function(dataset):
    """Tokenize the ``text`` field of a (batched) dataset slice.

    Args:
        dataset: A batch as passed by ``Dataset.map(..., batched=True)``; only
            its ``"text"`` entry is read.

    Returns:
        The tokenizer output for the batch, padded with ``padding="max_length"``
        and truncated.
    """
    # NOTE(review): "max_length" pads every example to the tokenizer's maximum
    # length, which is costly for short texts; consider an explicit, smaller
    # max_length if training speed matters.
    return tokenizer(dataset["text"], padding="max_length", truncation=True)


# Columns handed to the model as torch tensors; every other column stays out of the batches.
_MODEL_COLUMNS = ['input_ids', 'token_type_ids', 'attention_mask', 'label']


def _prepare_split(frame):
    """Turn one pandas split into a tokenized, torch-formatted ``Dataset``."""
    split = Dataset.from_pandas(frame)
    split = split.map(tokenize_function, batched=True)
    split.set_format(type='torch', columns=_MODEL_COLUMNS)
    return split


# All three splits go through the same pipeline. The test split used to be left
# un-tokenized, so it had no 'input_ids' for the model when it was later passed
# to trainer.predict().
dataset_train = _prepare_split(df_train)
dataset_val = _prepare_split(df_val)
dataset_test = _prepare_split(df_test)

Map: 100%|██████████| 7729/7729 [00:02<00:00, 3048.32 examples/s]
Map: 100%|██████████| 859/859 [00:00<00:00, 2905.45 examples/s]


In [8]:
def compute_metric(eval_pred):
    """Compute classification accuracy for a Trainer evaluation pass.

    Args:
        eval_pred: A ``(predictions, labels)`` pair, where ``predictions`` holds
            one row of per-class scores per example and ``labels`` holds the
            reference class ids.

    Returns:
        A dict with a single ``'accuracy'`` entry.
    """
    logits, label = eval_pred
    # Pick the highest-scoring class for every example.
    predictions = np.argmax(logits, axis=1)
    # scikit-learn metrics take (y_true, y_pred); keep that order so swapping in
    # an asymmetric metric later (precision, recall, ...) stays correct.
    return {'accuracy': accuracy_score(label, predictions)}

# Remove the MPS allocator cap before training. A fraction of 0 means unlimited
# allocations, which avoids the allocator's out-of-memory guard but can
# destabilise the machine if memory truly runs out. The call only makes sense
# on an MPS backend, so skip it elsewhere to keep the cell runnable on
# CPU/CUDA machines.
if torch.backends.mps.is_available():
    torch.mps.set_per_process_memory_fraction(0.0)

# Evaluate and checkpoint once per epoch so that, with load_best_model_at_end,
# the checkpoint with the best 'accuracy' (the key returned by compute_metric)
# is restored when training finishes.
training_args = TrainingArguments(
    output_dir="test_trainer",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=5,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model='accuracy',
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset_train,
    eval_dataset=dataset_val,
    compute_metrics=compute_metric,
)

trainer.train()

  0%|          | 2/1210 [01:18<13:15:44, 39.52s/it]

KeyboardInterrupt: 

In [None]:
# Put the model in inference mode, run it over the held-out test split and
# display the resulting metrics dictionary.
model.eval()
test_output = trainer.predict(dataset_test)
test_output.metrics

In [None]:
# Persist the fine-tuned weights and config. The Trainer was created without a
# tokenizer, so save_model() alone would not write the tokenizer files; save
# them to the same directory so it can be reloaded with from_pretrained().
finetuned_dir = 'finbert-fintuned/'
trainer.save_model(finetuned_dir)
tokenizer.save_pretrained(finetuned_dir)