In [1]:
import pandas as pd
import numpy as np
import warnings
from transformers import AutoTokenizer
from transformers import AutoModel
from datasets import load_dataset
import torch
import faiss
warnings.filterwarnings("ignore")

  from .autonotebook import tqdm as notebook_tqdm


## Load the Dataset (PubMed training split)

In [2]:
# Load the "train" split of the "pubmed" configuration of the "scientific_papers" dataset.
dataset = load_dataset("scientific_papers", "pubmed", split="train")

## Tokenize

In [10]:
# Tokenizer for the 'stanford-crfm/BioMedLM' checkpoint; read as a global by the tokenization function.
tokenizer = AutoTokenizer.from_pretrained('stanford-crfm/BioMedLM')

In [11]:
def _chunk_tokens(tokens, chunk_size):
    """Split ``tokens`` into consecutive slices of at most ``chunk_size`` items."""
    return [tokens[start:start + chunk_size] for start in range(0, len(tokens), chunk_size)]


def biomedical_tokenize_text_chunks(batch, chunk_size=1024, text_tokenizer=None):
    """Tokenize each text column of a batch and split the tokens into chunks.

    Args:
        batch: Mapping with 'article', 'abstract' and 'section_names' keys, each
            holding a list of strings. Rows are paired positionally with ``zip``,
            so processing stops at the shortest column.
        chunk_size: Maximum number of tokens per chunk. Must be at least 1.
        text_tokenizer: Optional object exposing ``tokenize(text)`` that returns a
            sliceable sequence of tokens. When omitted, the notebook-level
            ``tokenizer`` is used, which keeps existing call sites unchanged.

    Returns:
        A dict with the keys 'article_tokenized_chunks',
        'abstract_tokenized_chunks' and 'section_names_tokenized_chunks'. Each
        value has one entry per row, and each entry is a list of token chunks
        (an empty list when the text produces no tokens).

    Raises:
        ValueError: If ``chunk_size`` is smaller than 1.
    """
    # A negative step makes range() empty, which would silently drop every token;
    # zero already raised ValueError from range(). Reject both up front.
    if chunk_size < 1:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size!r}")

    active_tokenizer = tokenizer if text_tokenizer is None else text_tokenizer

    # Column order fixes both the output key order and the per-row tokenize order.
    columns = ('article', 'abstract', 'section_names')
    tokenized_batch = {f'{column}_tokenized_chunks': [] for column in columns}

    for row in zip(*(batch[column] for column in columns)):
        for column, text in zip(columns, row):
            tokens = active_tokenizer.tokenize(text)
            tokenized_batch[f'{column}_tokenized_chunks'].append(_chunk_tokens(tokens, chunk_size))

    return tokenized_batch


In [12]:
# Run the chunking tokenizer over the dataset in batches of 16 rows; the keys it
# returns show up as the three extra *_tokenized_chunks columns.
tokenized_dataset = dataset.map(biomedical_tokenize_text_chunks, batched=True, batch_size=16)

In [13]:
# Display the dataset summary to confirm the tokenized columns were added.
tokenized_dataset

Dataset({
    features: ['article', 'abstract', 'section_names', 'article_tokenized_chunks', 'abstract_tokenized_chunks', 'section_names_tokenized_chunks'],
    num_rows: 119924
})

In [14]:
import hashlib

def generate_id(text):
    """Return the SHA-256 hex digest of ``text`` encoded as UTF-8.

    Identical input strings always produce the same 64-character identifier.
    """
    hasher = hashlib.sha256()
    hasher.update(text.encode("utf-8"))
    return hasher.hexdigest()

# NOTE: rebinds `dataset` to the tokenized dataset, so from here on the name no
# longer refers to the raw dataset loaded at the top of the notebook.
dataset = tokenized_dataset

def add_id(example):
    """Store a hash-based 'ID' on ``example`` and return the same object.

    The identifier is derived from the 'abstract' field alone (any other field
    combination could be hashed instead), so two examples with identical
    abstracts receive identical IDs.
    """
    source_text = example['abstract']
    identifier = generate_id(source_text)
    example['ID'] = identifier
    return example

# Apply add_id through Dataset.map so every example gains an 'ID' field.
dataset_with_id = dataset.map(add_id)

In [None]:
# Persist the tokenized dataset (including the 'ID' field) to a JSON file under data/.
# NOTE(review): the relative path presumably requires the data/ directory to exist — confirm.
dataset_with_id.to_json('data/tokenized_dataset.json')