In [1]:
%pip install datasets transformers

Note: you may need to restart the kernel to use updated packages.


## Prepare Dataset for BERT 

In [2]:
from datasets import load_dataset

# Load the dataset
dataset = load_dataset("midas/duc2001", "raw")["test"]

def preprocess_dataset_direct_use(dataset):
    processed_data = []

    for item in dataset:
        # Directly use tokens and BIO tags from the dataset
        # but only apply lower to token
        tokens = [token.lower() for token in item['document']]
        bio_tags = item['doc_bio_tags'] 
        
        processed_data.append({'tokens': tokens, 'labels': bio_tags})

    return processed_data

# Assuming direct compatibility
processed_dataset = preprocess_dataset_direct_use(dataset)

# Display a sample of the processed data
print("Sample of processed data:")
for data in processed_dataset[:1]:  # Displaying the first sample
    print("Tokens:", data['tokens'][:10])
    print("Labels:", data['labels'][:10])


  from .autonotebook import tqdm as notebook_tqdm


Sample of processed data:
Tokens: ['here', ',', 'at', 'a', 'glance', ',', 'are', 'developments', 'today', 'involving']
Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']


In [3]:
%pip install pytorch_pretrained_bert

Note: you may need to restart the kernel to use updated packages.


In [7]:
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer


# Load the preprocessed dataset
processed_dataset = preprocess_dataset_direct_use(dataset)  # This assumes the function is defined as before

# Initialize the BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Extract tokens and labels
tokens = [data['tokens'] for data in processed_dataset]
labels = [data['labels'] for data in processed_dataset]

# Map labels into integers
tag2idx = {'B': 0, 'I': 1, 'O': 2}
tags_vals = ['B', 'I', 'O']

# Convert tokens to BERT input IDs and attention masks, and labels to indices
input_ids = [tokenizer.convert_tokens_to_ids(txt) for txt in tokens]
input_ids = pad_sequences(input_ids, maxlen=75, dtype="long", truncating="post", padding="post")
tags = [[tag2idx.get(l) for l in lab] for lab in labels]
tags = pad_sequences(tags, maxlen=75, value=tag2idx["O"], padding="post", dtype="long", truncating="post")

attention_masks = [[float(i>0) for i in ii] for ii in input_ids]

# Split the dataset into training and validation sets
tr_inputs, val_inputs, tr_tags, val_tags = train_test_split(input_ids, tags, random_state=2018, test_size=0.1)
tr_masks, val_masks, _, _ = train_test_split(attention_masks, input_ids, random_state=2018, test_size=0.1)

# Convert all of our data into torch tensors, the required datatype for our model
# Ensure labels are long type
tr_inputs = torch.tensor(tr_inputs, dtype=torch.long)
val_inputs = torch.tensor(val_inputs, dtype=torch.long)
tr_tags = torch.tensor(tr_tags, dtype=torch.long)  
val_tags = torch.tensor(val_tags, dtype=torch.long)
tr_masks = torch.tensor(tr_masks, dtype=torch.long)
val_masks = torch.tensor(val_masks, dtype=torch.long)

# Create the DataLoader for our training set
train_data = TensorDataset(tr_inputs, tr_masks, tr_tags)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=32)

# Create the DataLoader for our validation set
valid_data = TensorDataset(val_inputs, val_masks, val_tags)
valid_sampler = SequentialSampler(valid_data)
valid_dataloader = DataLoader(valid_data, sampler=valid_sampler, batch_size=32)


In [8]:
from transformers import BertTokenizer, BertForTokenClassification, AdamW
from transformers import get_linear_schedule_with_warmup
import numpy as np
import torch
from sklearn.model_selection import train_test_split
from tqdm import trange
import pandas as pd

# Assuming the processed_dataset is already defined and loaded as before

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Define the model
model = BertForTokenClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=len(tag2idx),  # The number of output labels. 2 for binary classification.
    output_attentions=False,  # Whether the model returns attentions weights.
    output_hidden_states=False,  # Whether the model returns all hidden-states.
)

model.to(device)

# Setting custom optimization parameters.
optimizer = AdamW(model.parameters(), lr=3e-5, eps=1e-8)

epochs = 4
max_grad_norm = 1.0

# Total number of training steps
total_steps = len(train_dataloader) * epochs

# Create the learning rate scheduler.
scheduler = get_linear_schedule_with_warmup(
    optimizer, 
    num_warmup_steps=0,  # Default value
    num_training_steps=total_steps
)


Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
# Function to calculate the accuracy of predictions vs labels
def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=2).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

# Training loop
for epoch in trange(epochs, desc="Epoch"):
    model.train()
    total_loss = 0
    
    for step, batch in enumerate(train_dataloader):
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        
        model.zero_grad()
        
        outputs = model(b_input_ids, attention_mask=b_input_mask, labels=b_labels)
        loss = outputs.loss
        
        total_loss += loss.item()
        loss.backward()
        
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
        
        optimizer.step()
        scheduler.step()
        
    avg_train_loss = total_loss / len(train_dataloader)            
    print(f'\nAverage Training Loss: {avg_train_loss:.2f}')
    
    # ========================================
    #               Validation
    # ========================================
    # After the completion of each training epoch, measure the model's performance on our validation set.
    
    # Put the model in evaluation mode
    model.eval()
    eval_loss, eval_accuracy, nb_eval_steps = 0, 0, 0
    
    for batch in valid_dataloader:
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        
        with torch.no_grad():
            outputs = model(b_input_ids, attention_mask=b_input_mask, labels=b_labels)
        
        logits = outputs.logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()
        
        tmp_eval_accuracy = flat_accuracy(logits, label_ids)
        
        eval_accuracy += tmp_eval_accuracy
        nb_eval_steps += 1
    
    print(f'Validation Accuracy: {eval_accuracy/nb_eval_steps:.2f}')

Epoch:   0%|          | 0/4 [00:00<?, ?it/s]


Average Training Loss: 0.53


Epoch:  25%|██▌       | 1/4 [01:58<05:55, 118.58s/it]

Validation Accuracy: 0.90

Average Training Loss: 0.35


Epoch:  50%|█████     | 2/4 [03:49<03:47, 114.00s/it]

Validation Accuracy: 0.90

Average Training Loss: 0.32


Epoch:  75%|███████▌  | 3/4 [05:38<01:51, 111.97s/it]

Validation Accuracy: 0.90

Average Training Loss: 0.30


Epoch: 100%|██████████| 4/4 [07:36<00:00, 114.10s/it]

Validation Accuracy: 0.90





In [21]:
print("Validation Accuracy: {}".format(eval_accuracy/nb_eval_steps))


Validation Accuracy: 0.8963440860215054


In [10]:
# Save the model and the tokenizer
model.save_pretrained('./model_save/')
tokenizer.save_pretrained('./model_save/')

# Load the model and the tokenizer
model = BertForTokenClassification.from_pretrained('./model_save/')
tokenizer = BertTokenizer.from_pretrained('./model_save/')


In [34]:
def keywordextract(sentence, model, tokenizer, device):
    # Tokenize input
    tkns = tokenizer.tokenize(sentence)
    indexed_tokens = tokenizer.convert_tokens_to_ids(tkns)
    # Prepare for model input
    input_ids = torch.tensor([indexed_tokens]).to(device)
    attention_mask = torch.tensor([[1]*len(indexed_tokens)]).to(device)  # Assuming all tokens are not padding

    # Model inference
    model.eval()
    with torch.no_grad():
        outputs = model(input_ids, token_type_ids=None, attention_mask=attention_mask)
    
    logits = outputs.logits
    predictions = torch.argmax(logits, dim=2)

    keywords = []
    for token_id, prediction_idx in zip(input_ids[0], predictions[0]):
        # Extracting tokens classified as 'B' or 'I'
        if prediction_idx.item() in [tag2idx['B'], tag2idx['I']]:
            keywords.append(tokenizer.convert_ids_to_tokens([token_id.item()])[0])

    return keywords

# Example use
text = "The solution is based upon an abstract representation of the mobile object system."
extracted_keywords = keywordextract(text, model, tokenizer, device)
print("Extracted Keywords:", extracted_keywords)


Extracted Keywords: []


In [35]:
def keyword_extract(sentence):
    # Tokenize the input sentence and create the attention mask
    inputs = tokenizer.encode_plus(sentence, return_tensors="pt", max_length=512, truncation=True, padding="max_length")
    input_ids = inputs["input_ids"].to(device)
    attention_mask = inputs["attention_mask"].to(device)
    
    # Predict
    model.eval()
    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)
    logits = outputs.logits
    
    # Convert logits to list of predicted tag indices
    predictions = torch.argmax(logits, dim=2)
    
    # Decode the ids to tokens and tags
    tokens = tokenizer.convert_ids_to_tokens(input_ids[0].tolist())
    tag_indices = predictions[0].tolist()
    
    # Extract keywords based on the tag indices
    keywords = [tokens[i] for i, tag_idx in enumerate(tag_indices) if tag_idx in [tag2idx['B'], tag2idx['I']]]

    return keywords

# Example usage
text = "The solution is based upon an abstract representation of the mobile object system."
keywords = keyword_extract(text)
print("Extracted Keywords:", keywords)

Extracted Keywords: []


In [38]:
def keywordextract(sentence, model, tokenizer, device):
    tkns = tokenizer.tokenize(sentence)
    indexed_tokens = tokenizer.convert_tokens_to_ids(tkns)
    input_ids = torch.tensor([indexed_tokens]).to(device)
    attention_mask = torch.tensor([[1] * len(indexed_tokens)]).to(device)

    model.eval()
    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)

    logits = outputs.logits
    predictions = torch.argmax(logits, dim=2)

    # Debug: Print raw predictions for inspection
    print("Raw predictions:", predictions)
    print("Token-wise predictions:", list(zip(tkns, predictions[0].tolist())))

    keywords = [tkns[i] for i, label_idx in enumerate(predictions[0]) if label_idx in [tag2idx['B'], tag2idx['I']]]

    return keywords

# Assuming model, tokenizer, device, and tag2idx are already defined
text = "Wine is an alcoholic drink made from fermented fruit. Yeast consumes the sugar in the fruits and converts it to ethanol and carbon dioxide, releasing heat in the process. Though wine can be made from a variety of fruit crops such as plum, cherry, pomegranate, blueberry, currant and elderberry, it is most often made from grapes, and the term wine generally refers to grape wine when used without a qualifier."
extracted_keywords = keywordextract(text, model, tokenizer, device)
print("Extracted Keywords:", extracted_keywords)


Raw predictions: tensor([[2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
         2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
         2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
         2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]])
Token-wise predictions: [('wine', 2), ('is', 2), ('an', 2), ('alcoholic', 2), ('drink', 2), ('made', 2), ('from', 2), ('fe', 2), ('##rm', 2), ('##ented', 2), ('fruit', 2), ('.', 2), ('yeast', 2), ('consume', 2), ('##s', 2), ('the', 2), ('sugar', 2), ('in', 2), ('the', 2), ('fruits', 2), ('and', 2), ('converts', 2), ('it', 2), ('to', 2), ('ethanol', 2), ('and', 2), ('carbon', 2), ('dioxide', 2), (',', 2), ('releasing', 2), ('heat', 2), ('in', 2), ('the', 2), ('process', 2), ('.', 2), ('though', 2), ('wine', 2), ('can', 2), ('be', 2), ('made', 2), ('from', 2), ('a', 2), ('variety', 2), ('of', 2), ('fruit', 2), ('crops', 2), ('such', 2), ('as', 2), ('plum', 2), (',', 2), (

In [39]:
def keyword_extract(sentence):
    inputs = tokenizer(sentence, return_tensors="pt", max_length=512, truncation=True, padding="max_length")
    input_ids = inputs["input_ids"].to(device)
    attention_mask = inputs["attention_mask"].to(device)

    # Model prediction
    model.eval()
    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)
    logits = outputs.logits

    predictions = torch.argmax(logits, dim=2)

    tokens = tokenizer.convert_ids_to_tokens(input_ids[0].tolist())
    tag_indices = predictions[0].tolist()

    # Debug: Print raw predictions and their corresponding tokens
    print("Tokens and Predicted Tags:")
    for token, tag_idx in zip(tokens, tag_indices):
        print(f"{token}: {tag_idx}")

    # Extract keywords
    keywords = [tokens[i] for i, tag_idx in enumerate(tag_indices) if tag_idx != tag2idx['O']]
    return keywords

text = "Wine is an alcoholic drink made from fermented fruit. Yeast consumes the sugar in the fruits and converts it to ethanol and carbon dioxide, releasing heat in the process. Though wine can be made from a variety of fruit crops such as plum, cherry, pomegranate, blueberry, currant and elderberry, it is most often made from grapes, and the term wine generally refers to grape wine when used without a qualifier."
keywords = keyword_extract(text)
print("Extracted Keywords:", keywords)


Tokens and Predicted Tags:
[CLS]: 2
wine: 2
is: 2
an: 2
alcoholic: 2
drink: 2
made: 2
from: 2
fe: 2
##rm: 2
##ented: 2
fruit: 2
.: 2
yeast: 2
consume: 2
##s: 2
the: 2
sugar: 2
in: 2
the: 2
fruits: 2
and: 2
converts: 2
it: 2
to: 2
ethanol: 2
and: 2
carbon: 2
dioxide: 2
,: 2
releasing: 2
heat: 2
in: 2
the: 2
process: 2
.: 2
though: 2
wine: 2
can: 2
be: 2
made: 2
from: 2
a: 2
variety: 2
of: 2
fruit: 2
crops: 2
such: 2
as: 2
plum: 2
,: 2
cherry: 2
,: 2
po: 2
##me: 2
##gra: 2
##nate: 2
,: 2
blue: 2
##berry: 2
,: 2
curran: 2
##t: 2
and: 2
elder: 2
##berry: 2
,: 2
it: 2
is: 2
most: 2
often: 2
made: 2
from: 2
grapes: 2
,: 2
and: 2
the: 2
term: 2
wine: 2
generally: 2
refers: 2
to: 2
grape: 2
wine: 2
when: 2
used: 2
without: 2
a: 2
qualifier: 2
.: 2
[SEP]: 2
[PAD]: 2
[PAD]: 2
[PAD]: 2
[PAD]: 2
[PAD]: 2
[PAD]: 2
[PAD]: 2
[PAD]: 2
[PAD]: 2
[PAD]: 2
[PAD]: 2
[PAD]: 2
[PAD]: 2
[PAD]: 2
[PAD]: 2
[PAD]: 2
[PAD]: 2
[PAD]: 2
[PAD]: 2
[PAD]: 2
[PAD]: 2
[PAD]: 2
[PAD]: 2
[PAD]: 2
[PAD]: 2
[PAD]: 2
[PAD]: 