In [None]:
import numpy as np
import pandas as pd
import random
import evaluate
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from seqeval.metrics import precision_score, recall_score, f1_score, accuracy_score

import transformers
from transformers import AutoTokenizer, DataCollatorForTokenClassification
from transformers import AutoModelForTokenClassification, Trainer, TrainingArguments, AdamW
from datasets import load_dataset, DatasetDict

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence

from huggingface_hub import login

In [None]:
# Load every available split of the BTC dataset (dataset id: 'tner/btc')
btc_dataset = load_dataset(path='tner/btc')

In [None]:
train_dataset = btc_dataset['train']
test_dataset = btc_dataset['test']

# Slice the rows first (`dataset[:3]`) so only three examples are read,
# instead of pulling the whole column and then discarding most of it.
preview = train_dataset[:3]

# Display example tweets (each tweet is a list of tokens)
print("Example tweets: \n")
for example in preview['tokens']:
    print(example)

# Display the tag sequences belonging to the same example tweets
print("Example labels: \n")
for label in preview['tags']:
    print(label)

# Check train/test splits
train_size = len(train_dataset)
test_size = len(test_dataset)
print(f"\nTrain Size: {train_size}")
print(f"Test Size: {test_size}")

# Visualize the distribution of labels in the test split.
# 'tags' holds one sequence per tweet, so it is a ragged list of lists.
# Passing that directly to plt.hist would plot every tweet as its own
# dataset; flatten it and count the occurrences of each tag instead.
labels = test_dataset['tags']
flat_labels = [tag for tweet_tags in labels for tag in tweet_tags]
tag_ids, tag_counts = np.unique(flat_labels, return_counts=True)

# One bar per distinct tag; explicit positions keep the ticks aligned with
# the bars regardless of the tag value type.
bar_positions = np.arange(len(tag_ids))
plt.bar(bar_positions, tag_counts, edgecolor='black')
plt.xticks(bar_positions, tag_ids)
plt.title('Distribution of Labels')
plt.xlabel('Labels')
plt.ylabel('Frequency')
plt.show()

In [None]:
### Hold out part of 'train' as a 'validation' set and bundle all splits in one DatasetDict

# 70% of the original training rows stay in 'train'; the fixed seed keeps the
# partition reproducible between runs.
split_datasets = btc_dataset['train'].train_test_split(train_size=0.7, seed=42)

btc_with_validation_dataset = DatasetDict(
    train=split_datasets['train'],
    validation=split_datasets['test'],  # held-out portion of 'train' acts as validation data
    test=btc_dataset['test'],           # original test split, carried over unchanged
)
btc_with_validation_dataset

In [None]:
# Create a function to sub-sample each dataset split
def sub_sample_dataset(dataset_dict, percentage=0.1, seed=42,
                       splits=('train', 'validation', 'test')):
    """Return a ``DatasetDict`` with a random subset of each requested split.

    Args:
        dataset_dict: Mapping from split name to a dataset object exposing
            ``num_rows`` and ``select(indices)``.
        percentage: Fraction of rows to keep from every split, in ``[0, 1]``.
            The per-split sample size is ``int(num_rows * percentage)``, so
            it is rounded down.
        seed: Seed for the sampling RNG; the same seed and inputs always
            select the same rows.
        splits: Names of the splits to sub-sample, processed in the given
            order. Defaults to the three standard splits.

    Returns:
        A ``DatasetDict`` holding one sub-sampled dataset per entry of
        ``splits``.

    Raises:
        ValueError: If ``percentage`` is outside ``[0, 1]``.
        KeyError: If a requested split is missing from ``dataset_dict``.
    """
    if not 0 <= percentage <= 1:
        raise ValueError(f"percentage must be between 0 and 1, got {percentage!r}")

    # A private generator draws the same index sequence as seeding the global
    # `random` module would, but leaves the global RNG state untouched.
    rng = random.Random(seed)
    subsets = {}

    for split in splits:
        num_rows = dataset_dict[split].num_rows
        size = int(num_rows * percentage)
        indices = rng.sample(range(num_rows), size)
        subsets[split] = dataset_dict[split].select(indices)

    return DatasetDict(subsets)

# Keep a reproducible 10% sample of every split for faster experimentation
subset_dataset_dict = sub_sample_dataset(
    btc_with_validation_dataset,
    percentage=0.1,
    seed=42,
)
subset_dataset_dict

In [None]:
# Name of the pretrained checkpoint whose tokenizer is loaded below
model_checkpoint = "distilbert/distilbert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

# Check if the tokenizer is fast (word_ids() below relies on a fast tokenizer)
print(tokenizer.is_fast)

# Print tokens and word IDs of an example of the training set.
# The input is already a list of words, hence is_split_into_words=True.
inputs = tokenizer(subset_dataset_dict["train"][0]["tokens"], is_split_into_words=True)
print(inputs.tokens(), '\n', inputs.word_ids())