In [1]:
%pip install datasets transformers

Collecting datasets
  Using cached datasets-2.18.0-py3-none-any.whl.metadata (20 kB)
Collecting transformers
  Using cached transformers-4.39.3-py3-none-any.whl.metadata (134 kB)
Collecting filelock (from datasets)
  Downloading filelock-3.13.4-py3-none-any.whl.metadata (2.8 kB)
Collecting pyarrow>=12.0.0 (from datasets)
  Downloading pyarrow-15.0.2-cp310-cp310-win_amd64.whl.metadata (3.1 kB)
Collecting pyarrow-hotfix (from datasets)
  Using cached pyarrow_hotfix-0.6-py3-none-any.whl.metadata (3.6 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Using cached dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-win_amd64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.2.0,>=2023.1.0 (from fsspec[http]<=2024.2.0,>=2023.1.0->datasets)
  Using cached fsspec-2024.2.0-py3-none-any.whl.metadata (6.8 kB)
Collecting aioh



## Prepare Dataset for BERT 

In [43]:
from datasets import load_dataset

# Fetch the "raw" configuration of DUC-2001 from the Hub, then keep only its
# "test" split as the working dataset.
duc2001_raw = load_dataset("midas/duc2001", "raw")
dataset = duc2001_raw["test"]

def preprocess_dataset_direct_use(dataset):
    """Turn dataset records into ``{'tokens', 'labels'}`` dictionaries.

    Args:
        dataset: Iterable of mappings that each provide a ``'document'``
            sequence of string tokens and a ``'doc_bio_tags'`` sequence of tags.

    Returns:
        A list with one dict per record. ``'tokens'`` holds the lower-cased
        tokens; ``'labels'`` is the record's ``'doc_bio_tags'`` value passed
        through unchanged (not copied).
    """
    return [
        {
            # Only the tokens are normalised; the tags are used as provided.
            'tokens': [word.lower() for word in record['document']],
            'labels': record['doc_bio_tags'],
        }
        for record in dataset
    ]

# Run the preprocessing on the loaded split (assumes its fields can be used as-is).
processed_dataset = preprocess_dataset_direct_use(dataset)

# Preview the first ten tokens and labels of the first record only.
print("Sample of processed data:")
first_records = processed_dataset[:1]
for data in first_records:
    head_tokens = data['tokens'][:10]
    head_labels = data['labels'][:10]
    print("Tokens:", head_tokens)
    print("Labels:", head_labels)


Sample of processed data:
Tokens: ['here', ',', 'at', 'a', 'glance', ',', 'are', 'developments', 'today', 'involving']
Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']


In [53]:
%pip install pytorch_pretrained_bert

Collecting pytorch_pretrained_bertNote: you may need to restart the kernel to use updated packages.

  Downloading pytorch_pretrained_bert-0.6.2-py3-none-any.whl.metadata (86 kB)
     ---------------------------------------- 0.0/86.7 kB ? eta -:--:--
     ---------------------------------------- 86.7/86.7 kB 2.5 MB/s eta 0:00:00
Collecting boto3 (from pytorch_pretrained_bert)
  Downloading boto3-1.34.82-py3-none-any.whl.metadata (6.6 kB)
Collecting botocore<1.35.0,>=1.34.82 (from boto3->pytorch_pretrained_bert)
  Downloading botocore-1.34.82-py3-none-any.whl.metadata (5.7 kB)
Collecting jmespath<2.0.0,>=0.7.1 (from boto3->pytorch_pretrained_bert)
  Downloading jmespath-1.0.1-py3-none-any.whl.metadata (7.6 kB)
Collecting s3transfer<0.11.0,>=0.10.0 (from boto3->pytorch_pretrained_bert)
  Downloading s3transfer-0.10.1-py3-none-any.whl.metadata (1.7 kB)
Downloading pytorch_pretrained_bert-0.6.2-py3-none-any.whl (123 kB)
   ---------------------------------------- 0.0/123.8 kB ? eta -:--:--



In [56]:
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer


# Tunable settings, kept in one place instead of being repeated as literals.
MAX_LEN = 75         # every sequence is truncated / padded to this many tokens
BATCH_SIZE = 32
SEED = 2018          # fixes the train/validation split
VAL_FRACTION = 0.1   # share of documents held out for validation

# Recompute the token/label records from the raw dataset (same call as the
# earlier preprocessing cell; requires that cell's function and `dataset`).
processed_dataset = preprocess_dataset_direct_use(dataset)

# Initialize the BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
pad_id = tokenizer.pad_token_id

# Extract tokens and labels
tokens = [data['tokens'] for data in processed_dataset]
labels = [data['labels'] for data in processed_dataset]

# Map labels into integers; `tags_vals` is the inverse lookup (index -> tag),
# derived from `tag2idx` so the two can never drift apart.
tag2idx = {'B': 0, 'I': 1, 'O': 2}
tags_vals = sorted(tag2idx, key=tag2idx.get)

# Convert tokens to BERT input IDs.
# NOTE(review): the dataset's word-level tokens are looked up directly in the
# vocabulary (no WordPiece splitting, no [CLS]/[SEP]); presumably any word that
# is not a vocabulary entry maps to [UNK] -- confirm this is intended.
# NOTE(review): truncating="post" keeps only the first MAX_LEN tokens of each
# document; tags beyond that point are discarded.
input_ids = [tokenizer.convert_tokens_to_ids(txt) for txt in tokens]
input_ids = pad_sequences(input_ids, maxlen=MAX_LEN, dtype="long", value=pad_id,
                          truncating="post", padding="post")

# Convert labels to indices. Index strictly so an unexpected tag raises a
# KeyError here instead of silently becoming None inside the label matrix.
tags = [[tag2idx[l] for l in lab] for lab in labels]
# NOTE(review): padded positions receive the 'O' label, so they are
# indistinguishable from real 'O' tokens in `tags`; whether they are excluded
# from the loss depends on the training loop -- confirm.
tags = pad_sequences(tags, maxlen=MAX_LEN, value=tag2idx["O"], padding="post",
                     dtype="long", truncating="post")

assert input_ids.shape == tags.shape, "input ids and tags must align position by position"

# 1.0 for real tokens, 0.0 for padding.
attention_masks = (input_ids != pad_id).astype("float32")

# Split inputs, tags and masks in ONE call so the three stay row-aligned by
# construction rather than by two calls happening to share a seed.
(tr_inputs, val_inputs,
 tr_tags, val_tags,
 tr_masks, val_masks) = train_test_split(
    input_ids, tags, attention_masks,
    random_state=SEED, test_size=VAL_FRACTION,
)

# Convert all of our data into torch tensors, the required datatype for our model
tr_inputs = torch.tensor(tr_inputs)
val_inputs = torch.tensor(val_inputs)
tr_tags = torch.tensor(tr_tags)
val_tags = torch.tensor(val_tags)
tr_masks = torch.tensor(tr_masks)
val_masks = torch.tensor(val_masks)

# Create the DataLoader for our training set (shuffled every epoch)
train_data = TensorDataset(tr_inputs, tr_masks, tr_tags)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=BATCH_SIZE)

# Create the DataLoader for our validation set (fixed order)
valid_data = TensorDataset(val_inputs, val_masks, val_tags)
valid_sampler = SequentialSampler(valid_data)
valid_dataloader = DataLoader(valid_data, sampler=valid_sampler, batch_size=BATCH_SIZE)


In [58]:
%pip install torch torchvision torchaudio

Collecting torchvision
  Downloading torchvision-0.17.2-cp310-cp310-win_amd64.whl.metadata (6.6 kB)
Collecting torchaudio
  Downloading torchaudio-2.2.2-cp310-cp310-win_amd64.whl.metadata (6.4 kB)
Downloading torchvision-0.17.2-cp310-cp310-win_amd64.whl (1.2 MB)
   ---------------------------------------- 0.0/1.2 MB ? eta -:--:--
   ----- ---------------------------------- 0.2/1.2 MB 3.1 MB/s eta 0:00:01
   ------------ --------------------------- 0.4/1.2 MB 3.7 MB/s eta 0:00:01
   ----------------- ---------------------- 0.5/1.2 MB 3.7 MB/s eta 0:00:01
   ----------------------- ---------------- 0.7/1.2 MB 3.7 MB/s eta 0:00:01
   ------------------------------ --------- 0.9/1.2 MB 4.0 MB/s eta 0:00:01
   ---------------------------------------  1.2/1.2 MB 4.1 MB/s eta 0:00:01
   ---------------------------------------- 1.2/1.2 MB 3.7 MB/s eta 0:00:00
Downloading torchaudio-2.2.2-cp310-cp310-win_amd64.whl (2.4 MB)
   ---------------------------------------- 0.0/2.4 MB ? eta -:--:--
   

In [59]:
from transformers import BertTokenizer, BertForTokenClassification, AdamW
from transformers import get_linear_schedule_with_warmup
import numpy as np
import torch
from sklearn.model_selection import train_test_split
from tqdm import trange
import pandas as pd

# Requires `tag2idx` and `train_dataloader` from the data-preparation cell.
# If this cell fails with "BertForTokenClassification requires the PyTorch
# library", restart the kernel and run the notebook again from the top, as the
# error message itself suggests.

# Train on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Define the model
model = BertForTokenClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=len(tag2idx),  # One output class per entry in tag2idx (the B / I / O tags).
    output_attentions=False,  # Whether the model returns attentions weights.
    output_hidden_states=False,  # Whether the model returns all hidden-states.
)

model.to(device)

# Setting custom optimization parameters.
# torch.optim.AdamW replaces the deprecated transformers.AdamW. weight_decay is
# set to 0.0 explicitly because torch's default (0.01) differs from the 0.0
# default of the transformers implementation used previously.
optimizer = torch.optim.AdamW(model.parameters(), lr=3e-5, eps=1e-8, weight_decay=0.0)

epochs = 4
max_grad_norm = 1.0  # gradient-clipping threshold; not used in this cell itself

# Total number of training steps: one optimizer step per batch, per epoch.
total_steps = len(train_dataloader) * epochs

# Create the learning rate scheduler.
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,  # No warmup steps
    num_training_steps=total_steps
)


ImportError: 
BertForTokenClassification requires the PyTorch library but it was not found in your environment. Checkout the instructions on the
installation page: https://pytorch.org/get-started/locally/ and follow the ones that match your environment.
Please note that you may need to restart your runtime after installation.
