In [1]:
%pip install datasets transformers

Collecting datasets
  Using cached datasets-2.18.0-py3-none-any.whl.metadata (20 kB)
Collecting transformers
  Using cached transformers-4.39.3-py3-none-any.whl.metadata (134 kB)
Collecting filelock (from datasets)
  Downloading filelock-3.13.4-py3-none-any.whl.metadata (2.8 kB)
Collecting pyarrow>=12.0.0 (from datasets)
  Downloading pyarrow-15.0.2-cp310-cp310-win_amd64.whl.metadata (3.1 kB)
Collecting pyarrow-hotfix (from datasets)
  Using cached pyarrow_hotfix-0.6-py3-none-any.whl.metadata (3.6 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Using cached dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-win_amd64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.2.0,>=2023.1.0 (from fsspec[http]<=2024.2.0,>=2023.1.0->datasets)
  Using cached fsspec-2024.2.0-py3-none-any.whl.metadata (6.8 kB)
Collecting aioh



In [10]:
from datasets import load_dataset

dataset = load_dataset("midas/duc2001", "raw")

In [16]:
dataset

DatasetDict({
    test: Dataset({
        features: ['id', 'document', 'doc_bio_tags', 'extractive_keyphrases', 'abstractive_keyphrases', 'other_metadata'],
        num_rows: 308
    })
})

In [23]:
from nltk.tokenize import sent_tokenize
from transformers import BertTokenizer
import re
from datasets import load_dataset
from abbreviation import limits

# Load the dataset
dataset = load_dataset("midas/duc2001", "raw")

# Initialize the BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def expand_contractions(text, contraction_mapping=limits):
    """Expand contractions in the text."""
    contractions_pattern = re.compile('({})'.format('|'.join(contraction_mapping.keys())), 
                                      flags=re.IGNORECASE|re.DOTALL)
    def expand_match(contraction):
        match = contraction.group(0)
        first_char = match[0]
        expanded_contraction = contraction_mapping.get(match)\
                               if contraction_mapping.get(match)\
                               else contraction_mapping.get(match.lower())
        expanded_contraction = first_char+expanded_contraction[1:]
        return expanded_contraction
        
    expanded_text = contractions_pattern.sub(expand_match, text)
    expanded_text = re.sub("'", "", expanded_text)
    return expanded_text

def clean_text(text):
    """Perform both contraction expansion and basic text cleaning."""
    text = expand_contractions(text)
    # Additional cleaning steps as before
    text = re.sub('[^a-zA-Z]', ' ', text)
    text = re.sub(r"\s+[a-zA-Z]\s+", ' ', text)
    text = re.sub(r'\s+', ' ', text)
    return text

def preprocess_duc2001_dataset(data):
    processed_sentences = []
    processed_labels = []

    for item in data:
        document = item.get('document', '')
        document = clean_text(document)
        keyphrases = item.get('abstractive_keyphrases', [])
        
        # Tokenize the document into sentences
        sentences = sent_tokenize(document)
        
        for sent in sentences:
            bert_tokens = tokenizer.tokenize(sent)
            bert_tokens_lower = [token.lower() for token in bert_tokens]
            token_labels = ['O'] * len(bert_tokens)
            
            for kp in keyphrases:
                kp_tokens = kp.lower().split()
                kp_len = len(kp_tokens)
                
                for i in range(len(bert_tokens_lower)):
                    if bert_tokens_lower[i:i+kp_len] == kp_tokens:
                        token_labels[i] = 'B'  # Mark the beginning of a keyphrase
                        for j in range(1, kp_len):
                            if (i + j) < len(token_labels):
                                token_labels[i + j] = 'I'  # Mark inside a keyphrase
            
            processed_sentences.append(bert_tokens)
            processed_labels.append(token_labels)

    return processed_sentences, processed_labels

# Assuming 'dataset' is loaded as before
sentences, labels = preprocess_duc2001_dataset(dataset['test'])

if sentences and labels:  # Only proceed if both lists are non-empty
    print(sentences[0])  # Example processed sentence (tokens)
    print(labels[0])     # Corresponding labels
else:
    print("No sentences or labels were processed.")


No sentences or labels were processed.


In [24]:
from datasets import load_dataset
import pandas as pd
from nltk.tokenize import sent_tokenize, word_tokenize
from transformers import BertTokenizer
import re

# Load the dataset
dataset = load_dataset("midas/duc2001", "raw")

# Convert to pandas DataFrame
df = pd.DataFrame(dataset['test'])

# Initialize the BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def clean_text(text):
    """Basic cleaning of texts."""
    if not isinstance(text, str):
        return ""  # Return empty string if text is not a string
    text = re.sub('[^a-zA-Z]', ' ', text)
    text = re.sub(r"\s+[a-zA-Z]\s+", ' ', text)
    text = re.sub(r'\s+', ' ', text)
    return text

def preprocess_documents(dataframe):
    processed_sentences = []
    processed_labels = []

    for _, row in dataframe.iterrows():
        document = clean_text(row['document'])
        keyphrases = row['abstractive_keyphrases']
        
        sentences = sent_tokenize(document)
        for sent in sentences:
            tokens = tokenizer.tokenize(sent)
            token_labels = ['O'] * len(tokens)
            
            for word in word_tokenize(sent):
                is_keyword = any(word.lower() in kp.lower().split() for kp in keyphrases)
                if is_keyword:
                    idx = tokens.index(word) if word in tokens else -1
                    if idx != -1:  # Found the word in tokens
                        token_labels[idx] = 'B'
                        # Extend to 'I' tags if the keyword is more than one token long
                        for k in range(1, len(tokenizer.tokenize(word))):
                            if idx + k < len(token_labels):
                                token_labels[idx + k] = 'I'
            
            if set(token_labels) != {'O'}:  # Ensure we have keywords in the sentence
                processed_sentences.append(tokens)
                processed_labels.append(token_labels)

    return processed_sentences, processed_labels

# Preprocess the dataset
sentences, labels = preprocess_documents(df)

if sentences and labels:
    for i in range(min(5, len(sentences))):  # Print first 5 or fewer processed sentences
        print(f"Sentence {i+1}: {sentences[i]}")
        print(f"Labels {i+1}: {labels[i]}\n")
else:
    print("No sentences or labels were processed.")


No sentences or labels were processed.


In [7]:
first_row = datasets["test"][0]
print(first_row["document"])  # This will print the 'document' content of the first row.
print(first_row["abstractive_keyphrases"])  # Similarly for 'abstractive_keyphrases'.


['Here', ',', 'at', 'a', 'glance', ',', 'are', 'developments', 'today', 'involving', 'the', 'crash', 'of', 'Pan', 'American', 'World', 'Airways', 'Flight', '103', 'Wednesday', 'night', 'in', 'Lockerbie', ',', 'Scotland', ',', 'that', 'killed', 'all', '259', 'people', 'aboard', 'and', 'more', 'than', '20', 'people', 'on', 'the', 'ground', ':']
['terrorist threats', 'widespread wreckage', 'radical palestinian faction', 'terrorist bombing', 'bomb threat', 'sabotage']
