In [3]:
%pip install datasets transformers --quiet

Note: you may need to restart the kernel to use updated packages.


In [4]:
%pip install pytorch_pretrained_bert --quiet

Note: you may need to restart the kernel to use updated packages.


## Prepare Dataset for BERT 

In [5]:
from datasets import load_dataset

# Load the dataset
dataset = load_dataset("midas/duc2001", "raw")["test"]

def preprocess_dataset_direct_use(dataset):
    processed_data = []

    for item in dataset:
        # Directly use tokens and BIO tags from the dataset
        # apply lower to token
        tokens = [token.lower() for token in item['document']]
        bio_tags = item['doc_bio_tags'] 
        
        processed_data.append({'tokens': tokens, 'labels': bio_tags})

    return processed_data

processed_dataset = preprocess_dataset_direct_use(dataset)

# Display a sample of the processed data
print("Sample of processed data:")
for data in processed_dataset[:1]:  # Displaying the first sample
    print("Tokens:", data['tokens'][:10])
    print("Labels:", data['labels'][:10])


  from .autonotebook import tqdm as notebook_tqdm


Sample of processed data:
Tokens: ['here', ',', 'at', 'a', 'glance', ',', 'are', 'developments', 'today', 'involving']
Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']


In [6]:
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer

# Initialize the BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Extract tokens and labels
tokens = [data['tokens'] for data in processed_dataset]
labels = [data['labels'] for data in processed_dataset]

# Map labels into integers
tag2idx = {'B': 0, 'I': 1, 'O': 2}
tags_vals = ['B', 'I', 'O']

# Convert tokens to BERT input IDs and attention masks, and labels to indices
# Padding value(ensure all sequences of tags uniform length) = 'O' => does not affect
input_ids = [tokenizer.convert_tokens_to_ids(txt) for txt in tokens]
input_ids = pad_sequences(input_ids, maxlen=75, dtype="long", truncating="post", padding="post")
tags = [[tag2idx.get(l) for l in lab] for lab in labels]
tags = pad_sequences(tags, maxlen=75, value=tag2idx["O"], padding="post", dtype="long", truncating="post")

# to focus on the meaningful part of the input
attention_masks = [[float(i>0) for i in ii] for ii in input_ids]

# Split the dataset into training and validation sets
tr_inputs, val_inputs, tr_tags, val_tags = train_test_split(input_ids, tags, random_state=2018, test_size=0.1)
tr_masks, val_masks, _, _ = train_test_split(attention_masks, input_ids, random_state=2018, test_size=0.1)

# Convert all of our data into torch tensors, the required datatype for our model
# Ensure labels are long type
tr_inputs = torch.tensor(tr_inputs, dtype=torch.long)
val_inputs = torch.tensor(val_inputs, dtype=torch.long)
tr_tags = torch.tensor(tr_tags, dtype=torch.long)  
val_tags = torch.tensor(val_tags, dtype=torch.long)
tr_masks = torch.tensor(tr_masks, dtype=torch.long)
val_masks = torch.tensor(val_masks, dtype=torch.long)

# Create the DataLoader for our training set
train_data = TensorDataset(tr_inputs, tr_masks, tr_tags)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=32)

# Create the DataLoader for our validation set
valid_data = TensorDataset(val_inputs, val_masks, val_tags)
valid_sampler = SequentialSampler(valid_data)
valid_dataloader = DataLoader(valid_data, sampler=valid_sampler, batch_size=32)


In [7]:
from transformers import BertForTokenClassification, AdamW
from transformers import get_linear_schedule_with_warmup
import numpy as np
import torch

# Define the model
model = BertForTokenClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=len(tag2idx),  # The number of output labels. 3 for our case of B,I,O
    # output_attentions=False,  # Whether the model returns attentions weights.
    # output_hidden_states=False,  # Whether the model returns all hidden-states.
)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Setting custom optimization parameters.
optimizer = AdamW(model.parameters(), lr=3e-5, eps=1e-8)

epochs = 4
max_grad_norm = 1.0

# Total number of training steps
total_steps = len(train_dataloader) * epochs

# Create the learning rate scheduler.
scheduler = get_linear_schedule_with_warmup(
    optimizer,    
    num_warmup_steps=0,  # Default value
    num_training_steps=total_steps
)


Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
from tqdm import trange

# Function to calculate the accuracy of predictions vs labels
def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=2).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

# Training loop
for epoch in trange(epochs, desc="Epoch"):
    model.train()
    total_loss = 0
    
    for step, batch in enumerate(train_dataloader):
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        
        model.zero_grad()
        
        outputs = model(b_input_ids, attention_mask=b_input_mask, labels=b_labels)
        loss = outputs.loss
        
        total_loss += loss.item()
        loss.backward()
        
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
        
        optimizer.step()
        scheduler.step()
        
    avg_train_loss = total_loss / len(train_dataloader)            
    print(f'\nAverage Training Loss: {avg_train_loss:.2f}')
    
    # ========================================
    #               Validation
    # ========================================
    # After the completion of each training epoch, measure the model's performance on our validation set.
    
    # Put the model in evaluation mode
    model.eval()
    eval_loss, eval_accuracy, nb_eval_steps = 0, 0, 0
    
    for batch in valid_dataloader:
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        
        with torch.no_grad():
            outputs = model(b_input_ids, attention_mask=b_input_mask, labels=b_labels)
        
        logits = outputs.logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()
        
        tmp_eval_accuracy = flat_accuracy(logits, label_ids)
        
        eval_accuracy += tmp_eval_accuracy
        nb_eval_steps += 1
    
    print(f'Validation Accuracy: {eval_accuracy/nb_eval_steps:.2f}')

Epoch:   0%|          | 0/4 [00:00<?, ?it/s]


Average Training Loss: 0.47


Epoch:  25%|██▌       | 1/4 [02:19<06:57, 139.12s/it]

Validation Accuracy: 0.90

Average Training Loss: 0.33


Epoch:  50%|█████     | 2/4 [05:23<05:31, 165.53s/it]

Validation Accuracy: 0.90

Average Training Loss: 0.30


Epoch:  75%|███████▌  | 3/4 [08:30<02:55, 175.69s/it]

Validation Accuracy: 0.90

Average Training Loss: 0.27


Epoch: 100%|██████████| 4/4 [11:44<00:00, 176.14s/it]

Validation Accuracy: 0.90





In [9]:
print("Validation Accuracy: {}".format(eval_accuracy/nb_eval_steps))


Validation Accuracy: 0.8963440860215054


In [10]:
# Save the model and the tokenizer
model.save_pretrained('./model_save/')
tokenizer.save_pretrained('./model_save/')

# Load the model and the tokenizer
model = BertForTokenClassification.from_pretrained('./model_save/')
tokenizer = BertTokenizer.from_pretrained('./model_save/')


In [11]:
def keywordextract(sentence, model, tokenizer, device):
    # Tokenize input
    tokens = tokenizer.encode_plus(sentence, return_tensors="pt", max_length=512, truncation=True, padding='max_length')
    input_ids = tokens['input_ids'].to(device)
    attention_mask = tokens['attention_mask'].to(device)

    # Model inference
    model.eval()
    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)
    
    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1).squeeze().tolist()  # Convert to list for easier processing

    # Display tokens and their corresponding predicted tags
    tokenized_sentence = tokenizer.convert_ids_to_tokens(input_ids.squeeze().tolist())
    print("Tokens and their predicted tags:")
    for token, prediction in zip(tokenized_sentence, predictions):
        print(f"{token}: {tags_vals[prediction]}")
    return predictions


In [12]:
text = """The articles have been grouped into 30 clusters 
manually by NIST annotators for multidocument summarization, and the documents 
within each cluster were topic-related or relevant.
The manually labeled clusters were considered as
the ground truth clusters or gold clusters. In order
to investigate existing clustering algorithms, the
documents in the clusters were mixed together to
form the whole document set for automatic clustering."""
print(keywordextract(text, model, tokenizer, device))

Tokens and their predicted tags:
[CLS]: O
the: O
articles: O
have: O
been: O
grouped: O
into: O
30: O
clusters: O
manually: O
by: O
ni: O
##st: O
ann: O
##ota: O
##tors: O
for: O
multi: O
##do: O
##cum: O
##ent: O
sum: O
##mar: O
##ization: O
,: O
and: O
the: O
documents: O
within: O
each: O
cluster: O
were: O
topic: O
-: O
related: O
or: O
relevant: O
.: O
the: O
manually: O
labeled: O
clusters: O
were: O
considered: O
as: O
the: O
ground: O
truth: O
clusters: O
or: O
gold: O
clusters: O
.: O
in: O
order: O
to: O
investigate: O
existing: O
cluster: O
##ing: O
algorithms: O
,: O
the: O
documents: O
in: O
the: O
clusters: O
were: O
mixed: O
together: O
to: O
form: O
the: O
whole: O
document: O
set: O
for: O
automatic: O
cluster: O
##ing: O
.: O
[SEP]: O
[PAD]: O
[PAD]: O
[PAD]: O
[PAD]: O
[PAD]: O
[PAD]: O
[PAD]: O
[PAD]: O
[PAD]: O
[PAD]: O
[PAD]: O
[PAD]: O
[PAD]: O
[PAD]: O
[PAD]: O
[PAD]: O
[PAD]: O
[PAD]: O
[PAD]: O
[PAD]: O
[PAD]: O
[PAD]: O
[PAD]: O
[PAD]: O
[PAD]: O
[PAD]: O
[PA

In [13]:
from collections import Counter

# Flatten the list of labels to get the overall distribution
all_labels = [label for sublist in labels for label in sublist]

# Count each type of label
label_counter = Counter(all_labels)
print("Label distribution:", label_counter)

Label distribution: Counter({'O': 253311, 'B': 4365, 'I': 3273})
