In [1]:
from datasets import load_dataset

ds = load_dataset("gtfintechlab/finer-ord")

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Access the training split
train_dataset = ds["train"]

# Access the validation split
validation_dataset = ds["validation"]

# Access the test split
test_dataset = ds["test"]

In [3]:
unique_labels = set(ds["train"]["gold_label"])
print(unique_labels)

{0, 1, 2, 3, 4, 5, 6}


In [11]:
import pandas as pd

# Convert the training dataset to a pandas DataFrame
train_df = pd.DataFrame(train_dataset)
val_df = pd.DataFrame(validation_dataset)

In [13]:
label_mapping = {0:'O', 1: 'PER_B', 2:'PER_I', 3:'LOC_B', 4:'LOC_I', 5:'ORG_B', 6:'ORG_I'}

def convert_labels(example):
    example["entity_label"] = label_mapping[example["gold_label"]]
    return example

train_df["entity_label"] = train_df["gold_label"].map(label_mapping)
val_df["entity_label"] = val_df["gold_label"].map(label_mapping)

id2label = {v: k for k, v in label_mapping.items()}

# View the first few rows
val_df.tail(10)

Unnamed: 0,gold_label,gold_token,doc_idx,sent_idx,entity_label
10223,0,individuals,155,19,O
10224,0,along,155,19,O
10225,3,Florida,155,19,LOC_B
10226,0,’s,155,19,O
10227,3,First,155,19,LOC_B
10228,4,Coast,155,19,LOC_I
10229,0,.,155,19,O
10230,0,#,155,20,O
10231,0,#,155,20,O
10232,0,#,155,20,O


In [14]:
trainds = train_dataset.map(convert_labels)
valds = validation_dataset.map(convert_labels)

Map: 100%|██████████| 10233/10233 [00:01<00:00, 8316.66 examples/s]


In [7]:
def group_by_punctuation(dataset, punctuations={".", "?", "!"}):
    sentences = []  
    labels = []
    current_sentence, current_labels = [], []

    for entry in dataset:
        word = entry["gold_token"]
        label = entry["entity_label"]

        current_sentence.append(word)
        current_labels.append(label)

        if word in punctuations:
            sentences.append(current_sentence)
            labels.append(current_labels)
            current_sentence, current_labels = [], []

    # Capture the last sentence if it didn't end with punctuation
    if current_sentence:
        sentences.append(current_sentence)
        labels.append(current_labels)

    return sentences, labels

sentences, labels = group_by_punctuation(trainds)


In [8]:
def create_label_mapping(labels_list):
    unique_labels = set(label for labels in labels_list for label in labels)
    label2id = {label: i for i, label in enumerate(sorted(unique_labels))}
    id2label = {i: label for label, i in label2id.items()}
    return label2id, id2label

In [49]:
import torch
from torch.utils.data import Dataset

class NERDataset(Dataset):
    def __init__(self, sentences, labels, tokenizer, label2id, max_len=128):
        self.sentences = sentences
        self.labels = labels
        self.tokenizer = tokenizer
        self.label2id = label2id
        self.max_len = max_len

    def __len__(self):
        return len(self.sentences)

    def __getitem__(self, idx):
        # Ensure tokens are strings and convert to list if needed
        tokens = [str(token) for token in self.sentences[idx]]
        label_tags = self.labels[idx]

        # Debug print to verify input
        print(f"Tokens type: {type(tokens)}, first few: {tokens[:5]}")
        print(f"Labels type: {type(label_tags)}, first few: {label_tags[:5]}")

        # Truncate to max length
        tokens = tokens[:self.max_len]
        label_tags = label_tags[:self.max_len]

        # Ensure labels match tokens length
        assert len(tokens) == len(label_tags), f"Tokens and labels length mismatch: {len(tokens)} vs {len(label_tags)}"

        # Tokenize with careful handling
        try:
            encoding = self.tokenizer(
                tokens,
                is_split_into_words=True,
                padding='max_length',
                truncation=True,
                max_length=self.max_len,
                return_tensors='pt'
            )
        except Exception as e:
            print(f"Tokenization error: {e}")
            print(f"Problematic tokens: {tokens}")
            raise

        # Initialize labels with ignore index
        labels = torch.full((len(encoding['input_ids'][0]),), -100, dtype=torch.long)
        
        # Map labels correctly
        word_ids = encoding.word_ids(batch_index=0)
        previous_word_idx = None
        for i, word_idx in enumerate(word_ids):
            if word_idx is None:
                continue
            
            # Ensure label exists for the word index
            if word_idx < len(label_tags):
                # Only label the first subword of each word
                if word_idx != previous_word_idx:
                    try:
                        labels[i] = self.label2id[label_tags[word_idx]]
                    except KeyError:
                        # Handle unknown labels
                        labels[i] = -100
                        print(f"Warning: Unknown label {label_tags[word_idx]}")
            
            previous_word_idx = word_idx
        
        # Add labels to encoding
        encoding['labels'] = labels
        
        # Squeeze and convert to dictionary
        return {key: val.squeeze(0) for key, val in encoding.items()}

In [50]:
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer, AutoModelForMaskedLM

# Load data
train_sentences, train_labels = group_by_punctuation(trainds)
dev_sentences, dev_labels = group_by_punctuation(valds)

# Label mapping
label2id, id2label = create_label_mapping(train_labels + dev_labels)
num_labels = len(label2id)

# Prepare datasets
# Model
tokenizer = AutoTokenizer.from_pretrained("FinanceInc/finbert-pretrain")
tokenizer.model_max_length = 512
model = AutoModelForTokenClassification.from_pretrained(
    "FinanceInc/finbert-pretrain", 
    num_labels=num_labels,  # Number of unique entity labels
    id2label=id2label,      # Mapping of label IDs to label names
    label2id=label2id,      # Mapping of label names to label IDs
    problem_type="token_classification"  # Explicitly specify token classification
)
#model = AutoModelForMaskedLM.from_pretrained("FinanceInc/finbert-pretrain") 

train_dataset = NERDataset(train_sentences, train_labels, tokenizer, label2id)
dev_dataset = NERDataset(dev_sentences, dev_labels, tokenizer, label2id)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at FinanceInc/finbert-pretrain and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [51]:
import torch

training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=0.0001,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=2,
    weight_decay=0.01,
    save_strategy="epoch",
    logging_dir="./logs",
    logging_steps=10,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=dev_dataset,
    tokenizer=tokenizer
)

trainer.train()

  trainer = Trainer(


Tokens type: <class 'list'>, first few: ['“', 'The', 'two', 'companies', 'know']
Labels type: <class 'list'>, first few: ['O', 'O', 'O', 'O', 'O']
Tokens type: <class 'list'>, first few: ['Ouch', '.']
Labels type: <class 'list'>, first few: ['O', 'O']
Tokens type: <class 'list'>, first few: ['TOKYO', '—', 'The', 'Trans', '-']
Labels type: <class 'list'>, first few: ['LOC_B', 'O', 'O', 'O', 'O']
Tokens type: <class 'list'>, first few: ['Defending', 'the', 'double', '-', 'digit']
Labels type: <class 'list'>, first few: ['O', 'O', 'O', 'O', 'O']
Tokens type: <class 'list'>, first few: ['Fund', 'assets', 'not', 'invested', 'in']
Labels type: <class 'list'>, first few: ['O', 'O', 'O', 'O', 'O']
Tokens type: <class 'list'>, first few: ['The', 'average', 'whistleblower', 'is', 'an']
Labels type: <class 'list'>, first few: ['O', 'O', 'O', 'O', 'O']
Tokens type: <class 'list'>, first few: ['The', 'central', 'bank', 'probably', 'wo']
Labels type: <class 'list'>, first few: ['O', 'O', 'O', 'O', '

Epoch,Training Loss,Validation Loss
1,0.0587,0.103709
2,0.0217,0.110741


Tokens type: <class 'list'>, first few: ['It', "'s", 'just', 'the', 'latest']
Labels type: <class 'list'>, first few: ['O', 'O', 'O', 'O', 'O']
Tokens type: <class 'list'>, first few: ['The', 'company', "'s", 'stock', 'is']
Labels type: <class 'list'>, first few: ['O', 'O', 'O', 'O', 'O']
Tokens type: <class 'list'>, first few: ['"', 'With', 'a', 'development', 'finance']
Labels type: <class 'list'>, first few: ['O', 'O', 'O', 'O', 'O']
Tokens type: <class 'list'>, first few: ['The', 'case', 'has', 'been', 'remanded']
Labels type: <class 'list'>, first few: ['O', 'O', 'O', 'O', 'O']
Tokens type: <class 'list'>, first few: ['This', 'acquisition', '-', 'hungry', 'strategy']
Labels type: <class 'list'>, first few: ['O', 'O', 'O', 'O', 'O']
Tokens type: <class 'list'>, first few: ['He', 'has', 'since', 'risen', 'through']
Labels type: <class 'list'>, first few: ['O', 'O', 'O', 'O', 'O']
Tokens type: <class 'list'>, first few: ['According', 'to', 'Dr', 'Shinozawa', ',']
Labels type: <class 

TrainOutput(global_step=190, training_loss=0.07641489560666838, metrics={'train_runtime': 1347.219, 'train_samples_per_second': 4.498, 'train_steps_per_second': 0.141, 'total_flos': 395882481392640.0, 'train_loss': 0.07641489560666838, 'epoch': 2.0})

In [None]:
tokens

In [52]:
trainer.save_model("./saved_model")  # Saves model weights
tokenizer.save_pretrained("./saved_model")  # Saves tokenizer

('./saved_model\\tokenizer_config.json',
 './saved_model\\special_tokens_map.json',
 './saved_model\\vocab.txt',
 './saved_model\\added_tokens.json',
 './saved_model\\tokenizer.json')

In [None]:
config = model.config

# Modify the configuration (for example, changing the problem_type)
config.problem_type = 'multi_label_classification'  # For some reason won't accept token token_classification 
model.save_pretrained("./saved_model", config=config)
tokenizer.save_pretrained("./saved_model")

multi_label_classification


('./saved_model\\tokenizer_config.json',
 './saved_model\\special_tokens_map.json',
 './saved_model\\vocab.txt',
 './saved_model\\added_tokens.json',
 './saved_model\\tokenizer.json')

### Reloading model

In [67]:
model = AutoModelForTokenClassification.from_pretrained("./saved_model")
print(model.config.problem_type)
model.config.problem_type = 'token_classification'
print(model.config.problem_type)
tokenizer = AutoTokenizer.from_pretrained("./saved_model")

multi_label_classification
token_classification


### NER Attempt

In [None]:
tweets = pd.read_csv("test_data/proc_data.csv")
tweets['Tweet Text'] = tweets['Tweet Text'].str[2:-1]
tweet = tweets['Tweet Text']

from transformers import pipeline

ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")  # "simple" groups subwords

Device set to use cpu


In [83]:
entities_list = []
k = 10
for sentence in tweet[:k]:
    entities = ner_pipeline(sentence)
    extracted_entities = [{"entity": ent["entity_group"], "word": ent["word"], "start": ent["start"], "end": ent["end"]} for ent in entities]
    entities_list.append(extracted_entities)

# Print results
for i, (sent, ents) in enumerate(zip(tweet[:k], entities_list)):
    print(f"Sentence {i+1}: {sent}")
    print("Entities:", ents)
    print()

Sentence 1: It all begins today! I will see you at 11:00 A.M. for the swearing-in. THE MOVEMENT CONTINUES - THE WORK BEGINS!
Entities: []

Sentence 2: Today we are not merely transferring power from one Administration to another or from one party to another ‚Äì but we are transferring...
Entities: []

Sentence 3: power from Washington D.C. and giving it back to you the American People. # InaugurationDay
Entities: [{'entity': 'LOC_B', 'word': 'washington', 'start': 11, 'end': 21}, {'entity': 'LOC_I', 'word': 'd. c.', 'start': 22, 'end': 26}]

Sentence 4: What truly matters is not which party controls our government but whether our government is controlled by the people.
Entities: []

Sentence 5: January 20th 2017 will be remembered as the day the people became the rulers of this nation again.
Entities: []

Sentence 6: The forgotten men and women of our country will be forgotten no longer. From this moment on it‚Äôs going to be # AmericaFirst
Entities: [{'entity': 'ORG_B', 'word': 'ao', 