## Prepare Dataset for BERT 

In [2]:
duc2001_dataset = load_dataset("midas/duc2001", "raw")["test"]
duc2001_dataset

Dataset({
    features: ['id', 'document', 'doc_bio_tags', 'extractive_keyphrases', 'abstractive_keyphrases', 'other_metadata'],
    num_rows: 308
})

In [28]:
from datasets import load_dataset, Dataset, DatasetDict
import pandas as pd
from transformers import BertTokenizer, BertForTokenClassification, AdamW, get_linear_schedule_with_warmup
import torch
from torch.utils.data import DataLoader, TensorDataset, RandomSampler, SequentialSampler
from sklearn.model_selection import train_test_split
import numpy as np
from tqdm import tqdm
import torch.nn.functional as F

# Load datasets
duc2001_dataset = load_dataset("midas/duc2001", "raw")["test"]
# inspec_dataset = load_dataset("midas/inspec", "raw")["test"]
# nus_dataset = load_dataset("midas/nus", "raw")["test"]

def dataset_to_dataframe(dataset):
    return pd.DataFrame({
        'document': [item['document'] for item in dataset],
        'doc_bio_tags': [item['doc_bio_tags'] for item in dataset]
    })

# Convert datasets to dataframes
duc2001_df = dataset_to_dataframe(duc2001_dataset)
inspec_df = dataset_to_dataframe(inspec_dataset)
nus_df = dataset_to_dataframe(nus_dataset)

# Concatenate dataframes
combined_df = pd.concat([duc2001_df, inspec_df, nus_df], ignore_index=True)

# Convert the combined DataFrame back to a Hugging Face dataset
combined_dataset = Dataset.from_pandas(combined_df)

# Initialize the BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Constants
MAX_LEN = 75
BATCH_SIZE = 32

# Prepare mapping for labels
tag2idx = {'B': 0, 'I': 1, 'O': 2}

# Adjust these weights based on your specific dataset and class imbalance
class_weights = torch.tensor([10.0, 15.0, 0.1])  # Example weights for 'B', 'I', 'O'
# class_weights = torch.tensor([10.0, 15.0, 0.1]).cuda()  # Example weights for 'B', 'I', 'O' if GPU applicable

# Tokenization and encoding for BERT
input_ids = []
attention_masks = []
labels = []

for i, item in enumerate(combined_dataset):
    # Join tokens into a single string
    text = ' '.join([t.lower() for t in item['document']])
    tags = item['doc_bio_tags']

    # Encode text
    encoded_dict = tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        max_length=MAX_LEN,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt'
    )

    # Prepare labels
    tag_ids = [tag2idx[tag] for tag in tags] + [tag2idx['O']] * (MAX_LEN - len(tags))
    tag_ids = tag_ids[:MAX_LEN]  # Ensure label length matches input length

    input_ids.append(encoded_dict['input_ids'][0])
    attention_masks.append(encoded_dict['attention_mask'][0])
    labels.append(torch.tensor(tag_ids))

# Convert lists to tensors
input_ids = torch.stack(input_ids)
attention_masks = torch.stack(attention_masks)
labels = torch.stack(labels)

# Split into training and validation sets
train_inputs, val_inputs, train_labels, val_labels, train_masks, val_masks = train_test_split(
    input_ids, labels, attention_masks, test_size=0.1, random_state=2018
)

# Create the DataLoader for our training set
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=BATCH_SIZE)

# Create the DataLoader for our validation set
valid_data = TensorDataset(val_inputs, val_masks, val_labels)
valid_sampler = SequentialSampler(valid_data)
valid_dataloader = DataLoader(valid_data, sampler=valid_sampler, batch_size=BATCH_SIZE)

# Load BERT for token classification
model = BertForTokenClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=len(tag2idx),
    output_attentions=False,
    output_hidden_states=False,
)

# Set up the optimizer
# optimizer = AdamW(model.parameters(), lr=3e-5, eps=1e-8)
optimizer = AdamW(model.parameters(), lr=1e-4, eps=1e-8)  # increased learning rate


# Create the learning rate scheduler
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=len(train_dataloader) * 4)

# Move model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Function to calculate the accuracy of predictions vs labels
def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=2).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

# We have a class imbalance which is hindering our model performance
# Apply focal loss to focus more on hard-to-classify examples by down-weighting the loss contributed by well-classified examples(easy-classify)
def hybrid_loss(logits, labels, weights, alpha=0.8, gamma=2.0):
    # Softmax and cross entropy loss
    ce_loss = torch.nn.functional.cross_entropy(logits, labels, reduction='none', weight=weights)
    
    # Calculate probabilities of the true class
    p_t = torch.exp(-ce_loss)
    
    # Calculate focal component
    focal_loss = (alpha * (1 - p_t) ** gamma * ce_loss).mean()
    
    return focal_loss

# Training loop
for epoch in tqdm(range(4), desc="Epoch"):
    model.train()
    total_loss = 0
    
    for batch in train_dataloader:
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        
        model.zero_grad()
        
        outputs = model(b_input_ids, attention_mask=b_input_mask, labels=b_labels)
        loss = hybrid_loss(outputs.logits.view(-1, 3), b_labels.view(-1), class_weights)

        # # Apply class weights
        # log_probs = torch.nn.functional.log_softmax(outputs.logits, dim=-1)
        # weighted_loss = torch.nn.functional.nll_loss(log_probs.view(-1, model.num_labels), b_labels.view(-1), weight=class_weights)

        # weighted_loss.backward()
        # torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()
        
        # total_loss += weighted_loss.item()
        total_loss += loss.item()


    print(f'Epoch {epoch+1}: Average Training Loss: {total_loss / len(train_dataloader):.2f}')

    # Validation step
    model.eval()
    eval_loss, eval_accuracy, nb_eval_steps = 0, 0, 0
    
    for batch in valid_dataloader:
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        
        with torch.no_grad():
            outputs = model(b_input_ids, attention_mask=b_input_mask)
        
        logits = outputs.logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()
        
        tmp_eval_accuracy = flat_accuracy(logits, label_ids)
        eval_accuracy += tmp_eval_accuracy
        nb_eval_steps += 1
    
    print(f'Validation Accuracy: {eval_accuracy / nb_eval_steps:.2f}')

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch:   0%|          | 0/4 [00:00<?, ?it/s]

Epoch 1: Average Training Loss: 0.84


Epoch:  25%|██▌       | 1/4 [05:53<17:39, 353.17s/it]

Validation Accuracy: 0.08
Epoch 2: Average Training Loss: 0.71


Epoch:  50%|█████     | 2/4 [11:59<12:02, 361.14s/it]

Validation Accuracy: 0.08
Epoch 3: Average Training Loss: 0.57


Epoch:  75%|███████▌  | 3/4 [18:27<06:13, 373.17s/it]

Validation Accuracy: 0.08
Epoch 4: Average Training Loss: 0.44


Epoch: 100%|██████████| 4/4 [24:36<00:00, 369.15s/it]

Validation Accuracy: 0.08





In [29]:
import numpy as np

def update_metrics(preds, labels, metrics):
    preds_flat = np.argmax(preds, axis=2).flatten()
    labels_flat = labels.flatten()

    # Calculate true positives, false positives, and false negatives
    tp = np.sum((preds_flat == labels_flat) & (labels_flat != tag2idx['O']))
    fp = np.sum((preds_flat != labels_flat) & (preds_flat != tag2idx['O']))
    fn = np.sum((preds_flat != labels_flat) & (labels_flat != tag2idx['O']))

    metrics['tp'] += tp
    metrics['fp'] += fp
    metrics['fn'] += fn

    return metrics

def calculate_scores(metrics):
    precision = metrics['tp'] / (metrics['tp'] + metrics['fp']) if metrics['tp'] + metrics['fp'] > 0 else 0
    recall = metrics['tp'] / (metrics['tp'] + metrics['fn']) if metrics['tp'] + metrics['fn'] > 0 else 0
    f1 = 2 * precision * recall / (precision + recall) if precision + recall > 0 else 0
    return precision, recall, f1

# Modified validation loop
def validate_model(valid_dataloader, model):
    model.eval()
    eval_metrics = {'tp': 0, 'fp': 0, 'fn': 0}

    for batch in valid_dataloader:
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        
        with torch.no_grad():
            outputs = model(b_input_ids, attention_mask=b_input_mask)
        
        logits = outputs.logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()
        
        eval_metrics = update_metrics(logits, label_ids, eval_metrics)

    precision, recall, f1 = calculate_scores(eval_metrics)
    return precision, recall, f1

# Example of calling the validate_model function
precision, recall, f1 = validate_model(valid_dataloader, model)
print(f"Validation Precision: {precision:.2f}")
print(f"Validation Recall: {recall:.2f}")
print(f"Validation F1 Score: {f1:.2f}")


Validation Precision: 0.08
Validation Recall: 0.65
Validation F1 Score: 0.15


In [33]:
print("Validation Accuracy: {}".format(eval_accuracy/nb_eval_steps))


Validation Accuracy: 0.08131944444444444


In [34]:
# Save the model and the tokenizer
model.save_pretrained('./model_save_v5/')
tokenizer.save_pretrained('./model_save_v5/')

# Load the model and the tokenizer
model = BertForTokenClassification.from_pretrained('./model_save_v5/')
tokenizer = BertTokenizer.from_pretrained('./model_save_v5/')


In [39]:
def keywordextract(text, model, tokenizer, device):
    # Tokenize input
    encoded_dict = tokenizer.encode_plus(
        text,                      # Document to encode.
        add_special_tokens=True,   # Add '[CLS]' and '[SEP]'
        max_length=64,             # Pad or truncate.
        padding='max_length',      # Pad to max_length.
        truncation=True,           # Truncate to max_length.
        return_attention_mask=True,# Construct attention masks.
        return_tensors='pt',       # Return PyTorch tensors.
    )
    
    # Move tensors to the correct device
    input_ids = encoded_dict['input_ids'].to(device)
    attention_mask = encoded_dict['attention_mask'].to(device)

    # Model inference
    model.eval()
    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)

    # Decode predictions
    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)
    predictions = predictions[0].tolist()  # Remove the batch dimension and convert to list

    # Convert input_ids to tokens
    tokens = tokenizer.convert_ids_to_tokens(input_ids[0])

    # print("Tokens and Predictions:")  # Debugging output
    # for token, prediction in zip(tokens, predictions):
    #     print(f"{token}: {prediction}")

    # Extract keywords based on the 'B' and 'I' predictions
    keywords = []
    current_keyword = []
    for token, pred in zip(tokens, predictions):
        if pred == 1:  # Corresponds to 'B'
            if current_keyword:  # Save the previous keyword if it exists
                keywords.append("".join(current_keyword).replace("##", ""))
            current_keyword = [token]  # Start a new keyword
        elif pred == 2 and current_keyword:  # Corresponds to 'I'
            current_keyword.append(token)
        else:
            if current_keyword:
                keywords.append("".join(current_keyword).replace("##", ""))
                current_keyword = []
    
    # Check if the last token was part of a keyword
    if current_keyword:
        keywords.append("".join(current_keyword).replace("##", ""))

    return keywords


In [41]:
text = """Machine learning (ML) is a field of study in artificial intelligence 
concerned with the development and study of statistical algorithms that 
can learn from data and generalize to unseen data, and thus 
perform tasks without explicit instructions."""
keywords = keywordextract(text, model, tokenizer, device)
print("Extracted Keywords:", keywords)


Extracted Keywords: ['machine', 'learning', 'ml', ')', 'field', 'artificial', 'intelligence', 'development', 'study', 'of', 'statistical', 'algorithms', 'ize', 'thus', 'perform', 'without', 'explicit', 'instructions', '.', '[SEP]']


In [45]:
text = """Machine learning (ML) is a field of study in artificial intelligence 
concerned with the development and study of statistical algorithms that 
can learn from data and generalize to unseen data, and thus 
perform tasks without explicit instructions."""
keywords = keywordextract(text, model, tokenizer, device)
print("Extracted Keywords:", keywords)


Extracted Keywords: ['machine', 'learning', 'ml', 'artificial', 'intelligence', 'statistical', 'algorithms', 'learn', 'data', 'explicit', 'instructions']


In [7]:
text = """Machine learning (ML) is a field of study in artificial intelligence 
concerned with the development and study of statistical algorithms that 
can learn from data and generalize to unseen data, and thus 
perform tasks without explicit instructions."""
keywords = keywordextract(text, model, tokenizer, device)
print("Extracted Keywords:", keywords)


Extracted Keywords: ['machine', 'learning', '(', 'ml', ')', 'field', 'in', 'artificial', 'intelligence', 'with', 'development', 'study', 'of', 'statistical', 'algorithms', 'data', 'and', 'ize', 'to', 'data', ',', 'and', 'thus', 'perform', 'tasks', 'without', 'instructions', '.', '[SEP]']


In [13]:
text = """Machine learning (ML) is a field of study in artificial intelligence 
concerned with the development and study of statistical algorithms that 
can learn from data and generalize to unseen data, and thus 
perform tasks without explicit instructions."""
keywords = keywordextract(text, model, tokenizer, device)
print("Extracted Keywords:", keywords)


Extracted Keywords: ['machine', 'learning', 'ml', 'artificial', 'intelligence', 'statistical', 'algorithms', 'learn', 'data', 'explicit', 'instructions']


different dataset.
optimizing loss function -> according to diff feature
                            

augmenting the weight to each class

add dropout layer => 3



In [19]:
import numpy as np

def dcg_at_k(relevance_scores, k, method=1):
    """Calculate discounted cumulative gain (DCG) at rank k.

    Args:
        relevance_scores (list of float): The list of relevance scores.
        k (int): The number of results to consider.
        method (int): The method to compute DCG, 0 or 1.

    Returns:
        float: The DCG score.

    Raises:
        ValueError: If the method is not 0 or 1.
    """
    relevance_scores = np.asfarray(relevance_scores)[:k]
    if relevance_scores.size:
        if method == 0:
            return relevance_scores[0] + np.sum(relevance_scores[1:] / np.log2(np.arange(2, relevance_scores.size + 1)))
        elif method == 1:
            return np.sum(relevance_scores / np.log2(np.arange(2, relevance_scores.size + 2)))
    return 0.0

def ndcg_at_k(relevance_scores, k, method=1):
    """Calculate normalized discounted cumulative gain (NDCG) at rank k.

    Args:
        relevance_scores (list of float): The list of relevance scores.
        k (int): The number of results to consider.
        method (int): The method to compute DCG, 0 or 1.

    Returns:
        float: The NDCG score.
    """
    dcg_max = dcg_at_k(sorted(relevance_scores, reverse=True), k, method)
    if not dcg_max:
        return 0.0
    return dcg_at_k(relevance_scores, k, method) / dcg_max

def mean_reciprocal_rank(ranking_lists):
    """Calculate the mean reciprocal rank (MRR).

    Args:
        ranking_lists (list of list of int): Each inner list is a set of binary values (0 or 1)
            indicating the absence or presence of relevant items.

    Returns:
        float: The MRR score.
    """
    first_relevant = (np.asarray(rankings).nonzero()[0] for rankings in ranking_lists)
    return np.mean([1.0 / (ranking[0] + 1) if ranking.size else 0 for ranking in first_relevant])

def calculate_relevance_scores(true_keywords, predicted_keywords):
    """Calculates relevance scores where 1 indicates relevance and 0 indicates irrelevance.
   
    Args:
        true_keywords (list of str): The list of true keywords.
        predicted_keywords (list of tuples): List of predicted keywords with their scores.
   
    Returns:
        list of int: Relevance scores (1 or 0) for each predicted keyword.
    """
    return [1 if keyword in true_keywords else 0 for keyword, _ in predicted_keywords]

def evaluate_keyword_extraction(true_data, predictions):
    """Evaluates the keyword extraction algorithm using NDCG and MRR scoring metrics.
   
    Args:
        true_data (list of list of str): List of lists containing true keywords for each document.
        predictions (list of list of tuples): List of lists, each containing tuples of keywords and their confidence scores.
   
    Returns:
        tuple of (float, float): Mean NDCG score and Mean MRR score.
    """
    ndcg_scores = []
    mrr_scores = []

    for true_keywords, predicted_keywords_with_scores in zip(true_data, predictions):
        predicted_keywords_with_scores.sort(key=lambda x: x[1], reverse=True)  # Sort by confidence score descending
        predicted_keywords = [kw for kw, _ in predicted_keywords_with_scores]
        relevance_scores = calculate_relevance_scores(true_keywords, predicted_keywords_with_scores)

        print("Predicted Keywords with Scores After Sorting:", predicted_keywords_with_scores)


        print("True Keywords:", true_keywords)
        print("Predicted Keywords:", predicted_keywords)
        print("Relevance Scores:", relevance_scores)

        # Compute NDCG
        ndcg_score = ndcg_at_k(relevance_scores, k=len(relevance_scores))
        ndcg_scores.append(ndcg_score)
       
        # Compute MRR
        rs = [[1 if keyword in true_keywords else 0 for keyword in predicted_keywords]]
        mrr_score = mean_reciprocal_rank(rs)
        mrr_scores.append(mrr_score)
   
    mean_ndcg = np.mean(ndcg_scores)
    mean_mrr = np.mean(mrr_scores)
    return mean_ndcg, mean_mrr





In [36]:
def keywordextract(text, model, tokenizer, device):
    # Tokenize input
    encoded_dict = tokenizer.encode_plus(
        text,                      # Document to encode.
        add_special_tokens=True,   # Add '[CLS]' and '[SEP]'
        max_length=64,             # Pad or truncate.
        padding='max_length',      # Pad to max_length.
        truncation=True,           # Truncate to max_length.
        return_attention_mask=True,# Construct attention masks.
        return_tensors='pt',       # Return PyTorch tensors.
    )
    input_ids = encoded_dict['input_ids'].to(device)
    attention_mask = encoded_dict['attention_mask'].to(device)

    # Model inference
    model.eval()
    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)

    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)
    predictions = predictions[0].tolist()  # Remove the batch dimension

    tokens = tokenizer.convert_ids_to_tokens(input_ids[0])

    # Extract keywords based on predictions
    keywords = []
    current_keyword = ""
    for token, pred in zip(tokens, predictions):
        # If the token is part of a keyword
        if pred in [1, 2]:
            # Remove the BERT's subword prefix if applicable
            if token.startswith("##"):
                token = token[2:]
            current_keyword += token
        # If the token marks the end of a keyword
        elif current_keyword:
            keywords.append(current_keyword)
            current_keyword = ""
    
    return keywords


In [37]:
import numpy as np

def dcg_at_k(relevance_scores, k, method=1):
    """Calculate discounted cumulative gain (DCG) at rank k.

    Args:
        relevance_scores (list of float): The list of relevance scores.
        k (int): The number of results to consider.
        method (int): The method to compute DCG, 0 or 1.

    Returns:
        float: The DCG score.

    Raises:
        ValueError: If the method is not 0 or 1.
    """
    relevance_scores = np.asfarray(relevance_scores)[:k]
    if relevance_scores.size:
        if method == 0:
            return relevance_scores[0] + np.sum(relevance_scores[1:] / np.log2(np.arange(2, relevance_scores.size + 1)))
        elif method == 1:
            return np.sum(relevance_scores / np.log2(np.arange(2, relevance_scores.size + 2)))
    return 0.0

def ndcg_at_k(relevance_scores, k, method=1):
    """Calculate normalized discounted cumulative gain (NDCG) at rank k.

    Args:
        relevance_scores (list of float): The list of relevance scores.
        k (int): The number of results to consider.
        method (int): The method to compute DCG, 0 or 1.

    Returns:
        float: The NDCG score.
    """
    dcg_max = dcg_at_k(sorted(relevance_scores, reverse=True), k, method)
    if not dcg_max:
        return 0.0
    return dcg_at_k(relevance_scores, k, method) / dcg_max

def mean_reciprocal_rank(ranking_lists):
    """Calculate the mean reciprocal rank (MRR).

    Args:
        ranking_lists (list of list of int): Each inner list is a set of binary values (0 or 1)
            indicating the absence or presence of relevant items.

    Returns:
        float: The MRR score.
    """
    first_relevant = (np.asarray(rankings).nonzero()[0] for rankings in ranking_lists)
    return np.mean([1.0 / (ranking[0] + 1) if ranking.size else 0 for ranking in first_relevant])

def calculate_relevance_scores(true_keywords, predicted_keywords):
    """Calculates relevance scores where 1 indicates relevance and 0 indicates irrelevance.
   
    Args:
        true_keywords (list of str): The list of true keywords.
        predicted_keywords (list of tuples): List of predicted keywords with their scores.
   
    Returns:
        list of int: Relevance scores (1 or 0) for each predicted keyword.
    """
    return [1 if keyword in true_keywords else 0 for keyword, _ in predicted_keywords]

def evaluate_keyword_extraction(true_data, predictions):
    """Evaluates the keyword extraction algorithm using NDCG and MRR scoring metrics.
   
    Args:
        true_data (list of list of str): List of lists containing true keywords for each document.
        predictions (list of list of tuples): List of lists, each containing tuples of keywords and their confidence scores.
   
    Returns:
        tuple of (float, float): Mean NDCG score and Mean MRR score.
    """
    ndcg_scores = []
    mrr_scores = []

    for true_keywords, predicted_keywords_with_scores in zip(true_data, predictions):
        predicted_keywords_with_scores.sort(key=lambda x: x[1], reverse=True)  # Sort by confidence score descending
        predicted_keywords = [kw for kw, _ in predicted_keywords_with_scores]
        relevance_scores = calculate_relevance_scores(true_keywords, predicted_keywords_with_scores)

        # Compute NDCG
        ndcg_score = ndcg_at_k(relevance_scores, k=len(relevance_scores))
        ndcg_scores.append(ndcg_score)
       
        # Compute MRR
        rs = [[1 if keyword in true_keywords else 0 for keyword in predicted_keywords]]
        mrr_score = mean_reciprocal_rank(rs)
        mrr_scores.append(mrr_score)
   
    mean_ndcg = np.mean(ndcg_scores)
    mean_mrr = np.mean(mrr_scores)
    return mean_ndcg, mean_mrr


In [23]:
def keywordextract(text, model, tokenizer, device):
    # Tokenize input
    encoded_dict = tokenizer.encode_plus(
        text,                      # Document to encode.
        add_special_tokens=True,   # Add '[CLS]' and '[SEP]'
        max_length=64,             # Pad or truncate.
        padding='max_length',      # Pad to max_length.
        truncation=True,           # Truncate to max_length.
        return_attention_mask=True,# Construct attention masks.
        return_tensors='pt',       # Return PyTorch tensors.
    )
    input_ids = encoded_dict['input_ids'].to(device)
    attention_mask = encoded_dict['attention_mask'].to(device)

    # Model inference
    model.eval()
    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)

    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)
    predictions = predictions[0].tolist()  # Remove the batch dimension

    tokens = tokenizer.convert_ids_to_tokens(input_ids[0])

    # Extract keywords based on predictions
    keywords = []
    current_keyword = []
    keyword_scores = []
    for token, pred in zip(tokens, predictions):
        # Remove the BERT's subword prefix if applicable
        if token.startswith("##"):
            token = token[2:]
        else:
            # Append and reset the current keyword when encountering a new starting token without '##'
            if current_keyword:
                keywords.append("".join(current_keyword).replace("##", ""))
                keyword_scores.append(max(keyword_scores))
                current_keyword = []
                keyword_scores = []

        # Calculate confidence
        confidence = torch.softmax(logits, dim=-1)[0, :, pred].max().item()

        if pred == 1:  # 'B' for beginning of keyword
            if current_keyword:
                keywords.append("".join(current_keyword).replace("##", ""))
                keyword_scores.append(max(keyword_scores))
            current_keyword = [token]
            keyword_scores = [confidence]
        elif pred == 2 and current_keyword:  # 'I' for continuation
            current_keyword.append(token)
            keyword_scores.append(confidence)
        else:
            if current_keyword:
                keywords.append("".join(current_keyword).replace("##", ""))
                keyword_scores.append(max(keyword_scores))
                current_keyword = []
                keyword_scores = []

    if current_keyword:
        keywords.append("".join(current_keyword).replace("##", ""))
        keyword_scores.append(max(keyword_scores))

    return list(zip(keywords, keyword_scores))


In [21]:
def keywordextract(text, model, tokenizer, device):
    # Tokenize input
    encoded_dict = tokenizer.encode_plus(
        text,                      # Document to encode.
        add_special_tokens=True,   # Add '[CLS]' and '[SEP]'
        max_length=64,             # Pad or truncate.
        padding='max_length',      # Pad to max_length.
        truncation=True,           # Truncate to max_length.
        return_attention_mask=True,# Construct attention masks.
        return_tensors='pt',       # Return PyTorch tensors.
    )
    input_ids = encoded_dict['input_ids'].to(device)
    attention_mask = encoded_dict['attention_mask'].to(device)

    # Model inference
    model.eval()
    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)

    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)
    predictions = predictions[0].tolist()  # Remove the batch dimension

    tokens = tokenizer.convert_ids_to_tokens(input_ids[0])

    # Extract keywords based on predictions
    keywords = []
    current_keyword = ""
    for token, pred in zip(tokens, predictions):
        # If the token is part of a keyword
        if pred in [1, 2]:
            # Remove the BERT's subword prefix if applicable
            if token.startswith("##"):
                token = token[2:]
            current_keyword += token
        # If the token marks the end of a keyword
        elif current_keyword:
            keywords.append(current_keyword)
            current_keyword = ""
    
    return keywords


In [40]:
# Prepare data for evaluation
true_keywords_list = [['machine', 'learning', 'ml', 'artificial', 'intelligence', 'statistical', 'algorithms', 'learn', 'data', 'explicit', 'instructions']]  # Note the double brackets

text = """Machine learning (ML) is a field of study in artificial intelligence 
concerned with the development and study of statistical algorithms that 
can learn from data and generalize to unseen data, and thus 
perform tasks without explicit instructions."""

predicted_keywords_with_scores = keywordextract(text, model, tokenizer, device)
predicted_keywords_with_scores_list = [predicted_keywords_with_scores]  # Note the double brackets

# Evaluation
mean_ndcg, mean_mrr = evaluate_keyword_extraction(true_keywords_list, predicted_keywords_with_scores_list)
print(f"Mean NDCG: {mean_ndcg:.3f}")
print(f"Mean MRR: {mean_mrr:.3f}")


IndexError: string index out of range

In [42]:
import torch
from transformers import BertTokenizer, BertForTokenClassification

# Load pre-trained model and tokenizer
model_name = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForTokenClassification.from_pretrained(model_name)

# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Define the text
text = "Machine learning (ML) is a field of study in artificial intelligence concerned with the development and study of statistical algorithms that can learn from data and generalize to unseen data, and thus perform tasks without explicit instructions"

# Predict keywords
predicted_keywords_with_scores = ['machine', 'learning', 'ml', 'artificial', 'intelligence', 'statistical', 'algorithms', 'learn', 'data', 'explicit', 'instructions']

# Define the true keywords
true_keywords = ['machine', 'learning', 'ml', 'artificial', 'intelligence', 'statistical', 'algorithms', 'learn', 'data', 'explicit', 'instructions']

# Calculate relevance scores
relevance_scores = calculate_relevance_scores(true_keywords, predicted_keywords_with_scores)

# Evaluate keyword extraction
mean_ndcg, mean_mrr = evaluate_keyword_extraction([true_keywords], [predicted_keywords_with_scores])

print("Mean NDCG:", mean_ndcg)
print("Mean MRR:", mean_mrr)


Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Extracted Keywords: ['[CLS]', 'machine', 'ml', 'development', 'study', 'of', 'statistical', 'data', 'general', 'ize', 'unseen', 'data', 'thus', 'perform', 'tasks', 'explicit', 'instructions', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]']


ValueError: too many values to unpack (expected 2)