In [4]:
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# CSV file name of each split inside the gtfintechlab/finer-ord dataset repository.
splits = {
    'train': 'train.csv',
    'validation': 'val.csv',
    'test': 'test.csv',
}
# Read the training split directly through the hf:// path.
df = pd.read_csv(f"hf://datasets/gtfintechlab/finer-ord/{splits['train']}")

#### Baseline model: **FinBERT**

In [5]:
# Hub checkpoint shared by the baseline tokenizer and model.
finbert_checkpoint = "yiyanghkust/finbert-pretrain"

tokenizer = AutoTokenizer.from_pretrained(finbert_checkpoint)
# NOTE(review): a later cell rebinds `model` to AutoModelForTokenClassification,
# so this sequence-classification head is replaced before training.
model = AutoModelForSequenceClassification.from_pretrained(finbert_checkpoint)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at yiyanghkust/finbert-pretrain and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
def read_conllu_like_iob2(filepath):
    """Parse a tab-separated, blank-line-delimited IOB2 file.

    Each token line must have at least three tab-separated columns; the
    second column is taken as the word and the third as its tag. Lines that
    start with '#' and token lines with fewer than three columns are ignored.

    Args:
        filepath: Path of the UTF-8 encoded file to read.

    Returns:
        A ``(sentences, labels)`` tuple of parallel lists: ``sentences[i]`` is
        the list of words of sentence ``i`` and ``labels[i]`` its tags.
    """
    all_words, all_tags = [], []
    current_words, current_tags = [], []

    def _flush():
        # Store the sentence collected so far (if any) and start a new one.
        nonlocal current_words, current_tags
        if current_words:
            all_words.append(current_words)
            all_tags.append(current_tags)
            current_words, current_tags = [], []

    with open(filepath, 'r', encoding='utf-8') as handle:
        for raw_line in handle:
            row = raw_line.strip()
            if not row:
                # A blank line closes the current sentence.
                _flush()
                continue
            if row.startswith('#'):
                # Metadata/comment line.
                continue
            columns = row.split('\t')
            if len(columns) < 3:
                # Not enough columns to hold both a word and a tag.
                continue
            current_words.append(columns[1])
            current_tags.append(columns[2])

    # The file may end without a trailing blank line.
    _flush()

    return all_words, all_tags

# Local IOB2 files for the training and development splits.
train_path = 'en_ewt-ud-train.iob2'
dev_path = 'en_ewt-ud-dev.iob2'

# Parallel word/tag lists: (sentences, tags) for training, (sentences2, tags2) for development.
sentences, tags = read_conllu_like_iob2(train_path)
sentences2, tags2 = read_conllu_like_iob2(dev_path)

In [2]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder

# Vocabulary
# Indices 0 and 1 are reserved for 'PAD' and 'UNK'; real words start at 2.
# The set is sorted before enumeration so that every run assigns the same
# index to the same word (set iteration order changes between kernel restarts).
word_vocab = {word for sentence in sentences for word in sentence}
word2idx = {word: idx + 2 for idx, word in enumerate(sorted(word_vocab))}
word2idx['PAD'] = 0
word2idx['UNK'] = 1

# Tags
# LabelEncoder keeps its classes_ sorted, so tag ids follow sorted tag order.
all_tags = [tag for tag_list in tags for tag in tag_list]
tag_encoder = LabelEncoder()
tag_encoder.fit(all_tags)
tag2idx = {tag: idx for idx, tag in enumerate(tag_encoder.classes_)}

# Encode
X = [[word2idx.get(word, word2idx['UNK']) for word in sentence] for sentence in sentences]
y = [[tag2idx[tag] for tag in label] for label in tags]

# Padding
# pad_sequences pads with 0 by default, which is also the id of the first tag
# class; pad the tag sequences with the 'O' (outside) id instead so padded
# positions are not labelled as an entity. Falls back to 0 if there is no 'O' tag.
pad_tag_idx = tag2idx.get('O', 0)
max_len = max(len(seq) for seq in X)  # You may also fix a max_len
X = pad_sequences(X, maxlen=max_len, padding='post', value=word2idx['PAD'])
y = pad_sequences(y, maxlen=max_len, padding='post', value=pad_tag_idx)

In [3]:
import tensorflow as tf

batch_size = 32

# Pair each row of X with the matching row of y, shuffle with a buffer as
# large as the whole set, then group the pairs into batches.
example_pairs = tf.data.Dataset.from_tensor_slices((X, y))
shuffled_pairs = example_pairs.shuffle(len(X))
dataset = shuffled_pairs.batch(batch_size)

In [14]:
from transformers import  AutoModelForTokenClassification, Trainer, TrainingArguments
import torch

# Same checkpoint for the tokenizer and the token-classification model.
finbert_checkpoint = "yiyanghkust/finbert-pretrain"

tokenizer = AutoTokenizer.from_pretrained(finbert_checkpoint)
# One output label per distinct tag found in the training data.
model = AutoModelForTokenClassification.from_pretrained(
    finbert_checkpoint,
    num_labels=len(tag2idx),
)




Some weights of BertForTokenClassification were not initialized from the model checkpoint at yiyanghkust/finbert-pretrain and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
pip install datasets

In [16]:
def tokenize_and_align_labels(examples):
    """Tokenize pre-split sentences and align word-level tags with sub-word tokens.

    Args:
        examples: Batch mapping with 'sentence' (a list of word lists) and
            'labels' (a list of per-word tags, given either as tag strings
            present in ``tag2idx`` or as integer tag ids).

    Returns:
        The tokenizer output with an added 'labels' entry holding one list of
        ints per sentence, the same length as that sentence's ``input_ids``.
        The first sub-token of every word carries the word's tag id; special
        tokens, padding and continuation sub-tokens carry -100.
    """
    # -100 is the label value that the token-classification loss skips.
    ignore_index = -100

    tokenized_inputs = tokenizer(examples['sentence'], truncation=True, padding=True, is_split_into_words=True)

    aligned_labels = []
    for batch_index, word_labels in enumerate(examples['labels']):
        # word_ids() gives, for each token, the index of the word it came from
        # (None for special/padding tokens). It requires a fast tokenizer.
        word_ids = tokenized_inputs.word_ids(batch_index=batch_index)
        previous_word_id = None
        label_ids = []
        for word_id in word_ids:
            if word_id is None or word_id == previous_word_id:
                label_ids.append(ignore_index)
            else:
                tag = word_labels[word_id]
                # The datasets built below store tag strings; map them to ids.
                label_ids.append(tag2idx[tag] if isinstance(tag, str) else tag)
            previous_word_id = word_id
        aligned_labels.append(label_ids)

    tokenized_inputs["labels"] = aligned_labels
    return tokenized_inputs

from datasets import Dataset
from transformers import DataCollatorForTokenClassification

# Convert dictionaries to Dataset, then tokenize them so the Trainer receives
# input_ids / attention_mask / integer labels instead of raw word and tag strings.
train_dataset = Dataset.from_dict({'sentence': sentences, 'labels': tags}).map(
    tokenize_and_align_labels, batched=True
)
eval_dataset = Dataset.from_dict({'sentence': sentences2, 'labels': tags2}).map(
    tokenize_and_align_labels, batched=True
)

# training
training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=2,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    logging_dir='./logs',            # directory for storing logs
    evaluation_strategy="epoch",
    learning_rate=0.0001
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    # Examples tokenized in different map batches have different lengths; this
    # collator pads inputs and labels (with -100) to a common length per batch.
    data_collator=DataCollatorForTokenClassification(tokenizer),
)

trainer.train()


ModuleNotFoundError: No module named 'datasets'

In [None]:
from transformers import DataCollatorForTokenClassification, Trainer, TrainingArguments

# NOTE(review): this cell rebuilds the Trainer with the same `model` and the
# same hyper-parameters as the previous training cell; running both cells
# trains `model` twice.
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=0.0001,
    per_device_train_batch_size=16,
    num_train_epochs=2,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    # Passing only `tokenizer` selects a collator that pads the tokenizer's own
    # fields but not per-token `labels`; this collator pads labels with -100 too.
    data_collator=DataCollatorForTokenClassification(tokenizer),
)

trainer.train()
