In [10]:
import torch
from transformers import BertTokenizerFast, BertForTokenClassification, BertConfig, Trainer, TrainingArguments
import pandas as pd
from sklearn.model_selection import train_test_split

In [None]:
# Setting the device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

In [2]:
# Load the tokenizer
tokenizer = BertTokenizerFast.from_pretrained('dbmdz/bert-large-cased-finetuned-conll03-english')

# Load the configuration of the pre-trained BERT model
config = BertConfig.from_pretrained('dbmdz/bert-large-cased-finetuned-conll03-english')
config.num_labels = 3  # Update the number of labels to 3 for 'O', 'B-ORG', 'I-ORG'

model = BertForTokenClassification(config)


In [17]:
# Load and prepare data
df = pd.read_csv('../data/stock/stock.csv')
df['sentence'] = "Our analysis focuses on " + df['Company Name'] + "."
train_df, val_df = train_test_split(df, test_size=0.1, random_state=42)

sentences = ["Our analysis focuses on " + name + "." for name in df['Company Name']]

In [18]:
def prepare_labels(sentences):
    labels_aligned = []

    for sentence in sentences:
        # Tokenize the sentence
        tokens = tokenizer.tokenize(sentence)
        labels = ['O'] * len(tokens)  
        start_index = 4  

        if start_index < len(labels):
            labels[start_index] = 'B-ORG'
            for i in range(start_index + 1, len(tokens)):
                if tokens[i].startswith('##') or tokens[i] in {',', '.'}:
                    labels[i] = 'I-ORG'
                else:
                    break

        labels_aligned.append(labels)

    return labels_aligned

In [19]:
# Prepare labels
labels = prepare_labels(sentences)

# Convert labels to IDs
label_dict = {'O': 0, 'B-ORG': 1, 'I-ORG': 2}
labels_ids = [[label_dict[label] for label in sent_labels] for sent_labels in labels]

# Tokenize inputs and align labels with tokens
encoding = tokenizer(sentences, truncation=True, padding=True, return_tensors="pt")
input_ids = encoding['input_ids']
attention_mask = encoding['attention_mask']

In [20]:
labels_aligned = []
for i, label in enumerate(labels_ids):
    label_aligned = []
    word_ids = encoding.word_ids(batch_index=i)
    previous_word_idx = None
    for word_idx in word_ids:
        if word_idx is None or word_idx != previous_word_idx:
            label_aligned.append(label[word_idx] if word_idx is not None else -100)
        else:
            label_aligned.append(-100)
        previous_word_idx = word_idx
    labels_aligned.append(label_aligned)

In [21]:

class CompanyDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)


In [None]:
def encode_and_align_labels(df, tokenizer):
    sentences = df['sentence'].tolist()
    labels = prepare_labels(sentences, tokenizer)  
    label_dict = {'O': 0, 'B-ORG': 1, 'I-ORG': 2}

    tokenized_inputs = tokenizer(sentences, truncation=True, padding=True, return_tensors="pt")
    labels_ids = [[label_dict[label] for label in sent_labels] for sent_labels in labels]

    labels_aligned = []
    for i, (label, encoding) in enumerate(zip(labels_ids, tokenized_inputs.encodings)):
        word_ids = encoding.word_ids  #  word IDs 
        label_aligned = []
        previous_word_idx = None
        for word_idx in word_ids:
            if word_idx is None or word_idx != previous_word_idx:
                label_aligned.append(label[word_idx] if word_idx is not None else -100)
            else:
                label_aligned.append(-100)
            previous_word_idx = word_idx
        labels_aligned.append(torch.tensor(label_aligned, dtype=torch.long).to(device))

    input_ids = tokenized_inputs['input_ids'].to(device)
    attention_mask = tokenized_inputs['attention_mask'].to(device)
    return CompanyDataset(input_ids, attention_mask, labels_aligned)

In [22]:
train_dataset = CompanyDataset(encoding, labels_aligned)

In [23]:
# training arguments
training_args = TrainingArguments(
    output_dir='./results',         
    num_train_epochs=3,              
    per_device_train_batch_size=16, 
    warmup_steps=500,                
    weight_decay=0.01,              
    logging_dir='./logs',           
    logging_steps=10,
    evaluation_strategy="epoch",     
)


In [24]:
train_dataset = encode_and_align_labels(train_df, tokenizer)
eval_sentences = encode_and_align_labels(val_df, tokenizer)

# prepare labels and encode them
eval_labels = prepare_labels(eval_sentences)
eval_labels_ids = [[label_dict[label] for label in sent_labels] for sent_labels in eval_labels]

# Encode the eval data
eval_encoding = tokenizer(eval_sentences, truncation=True, padding=True, return_tensors="pt")

# Align labels for eval dataset
eval_labels_aligned = []
for i, label in enumerate(eval_labels_ids):
    label_aligned = []
    word_ids = eval_encoding.word_ids(batch_index=i)
    previous_word_idx = None
    for word_idx in word_ids:
        if word_idx is None or word_idx != previous_word_idx:
            label_aligned.append(label[word_idx] if word_idx is not None else -100)
        else:
            label_aligned.append(-100)
        previous_word_idx = word_idx
    eval_labels_aligned.append(label_aligned)

# eval dataset
eval_dataset = CompanyDataset(eval_encoding, eval_labels_aligned)


In [25]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset 
)

In [16]:
# Start training
trainer.train()

  0%|          | 0/39 [00:00<?, ?it/s]

  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


{'loss': 0.7613, 'grad_norm': 13.175254821777344, 'learning_rate': 1.0000000000000002e-06, 'epoch': 0.77}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 0.6071951389312744, 'eval_runtime': 4.8727, 'eval_samples_per_second': 0.616, 'eval_steps_per_second': 0.205, 'epoch': 1.0}
{'loss': 0.7139, 'grad_norm': 8.446364402770996, 'learning_rate': 2.0000000000000003e-06, 'epoch': 1.54}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 0.502784252166748, 'eval_runtime': 5.3477, 'eval_samples_per_second': 0.561, 'eval_steps_per_second': 0.187, 'epoch': 2.0}
{'loss': 0.5651, 'grad_norm': 5.8338775634765625, 'learning_rate': 3e-06, 'epoch': 2.31}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 0.6190041303634644, 'eval_runtime': 6.1238, 'eval_samples_per_second': 0.49, 'eval_steps_per_second': 0.163, 'epoch': 3.0}
{'train_runtime': 255.2692, 'train_samples_per_second': 2.339, 'train_steps_per_second': 0.153, 'train_loss': 0.6130123688624456, 'epoch': 3.0}


TrainOutput(global_step=39, training_loss=0.6130123688624456, metrics={'train_runtime': 255.2692, 'train_samples_per_second': 2.339, 'train_steps_per_second': 0.153, 'total_flos': 24906482314614.0, 'train_loss': 0.6130123688624456, 'epoch': 3.0})