In [101]:
import pandas as pd
from transformers import AutoTokenizer, AutoModel
import torch

# Split each abstract into chunks of a few sentences, embed every chunk with
# a pre-trained BERT model (mean of the last hidden state over tokens), and
# write one CSV row per chunk: id, title, chunk text, embedding vector.

# Load pre-trained BERT model and tokenizer
bert_model_name = 'bert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(bert_model_name)
bert_model = AutoModel.from_pretrained(bert_model_name)

# Input CSV file path
input_csv_path = 'data/New master/master_new.csv'

# Output CSV file path
output_csv_path = 'data/Master_embedded/master_with_embeddings.csv'

# Tunables: how many leading rows of the input to embed, how many rows are
# handled per outer-loop batch, and how many sentences form one chunk.
max_rows = 100
batch_size = 100
sentences_per_chunk = 3

output_columns = ['id', 'title', 'chunk', 'chunk_embedded']

# Read the CSV file into a DataFrame, dropping every row that has a missing
# value in any column, then keep only the first `max_rows` rows.
master_df = pd.read_csv(input_csv_path).dropna().reset_index(drop=True)
master_df = master_df[:max_rows]

# One DataFrame per batch; concatenated once at the end.  This replaces
# DataFrame.append, which was removed in pandas 2.0 and re-copied the whole
# accumulated frame on every call.
batch_frames = []

# Process abstracts in batches
for batch_start in range(0, len(master_df), batch_size):
    batch_df = master_df.iloc[batch_start:batch_start + batch_size]

    # Parallel lists: entry k of each list describes the same chunk.
    chunks = []
    new_ids = []
    embeddings = []
    titles = []

    for _, row in batch_df.iterrows():
        title = row['Title']

        # Naive sentence split on '.', discarding empty/whitespace pieces.
        abstract_sentences = [
            sentence.strip()
            for sentence in row['Abstract'].split('.')
            if sentence.strip()
        ]
        # Group consecutive sentences into chunks; the last may be shorter.
        abstract_chunks = [
            '. '.join(abstract_sentences[start:start + sentences_per_chunk])
            for start in range(0, len(abstract_sentences), sentences_per_chunk)
        ]

        for chunk_number, chunk in enumerate(abstract_chunks, start=1):
            # NOTE(review): PMID and the 1-based chunk number are joined with
            # no separator, so different (PMID, chunk) pairs can produce the
            # same id (e.g. PMID 12 chunk 34 vs PMID 123 chunk 4).  Format is
            # kept as-is; confirm what downstream consumers expect before
            # changing it.
            new_id = f'{row["PMID"]}{chunk_number}'

            # truncation=True cuts chunks that exceed the tokenizer's
            # maximum length; padding is a no-op for a single sequence.
            tokenized_chunk = tokenizer(chunk, return_tensors='pt', truncation=True, padding=True)

            # Inference only: skip gradient tracking.
            with torch.no_grad():
                model_output = bert_model(**tokenized_chunk)
                # Average over the token dimension -> one vector per chunk.
                embedding = model_output.last_hidden_state.mean(dim=1).tolist()[0]

            embeddings.append(embedding)
            chunks.append(chunk)
            new_ids.append(new_id)
            titles.append(title)

    # Collect this batch's rows.  The per-batch lists are rebound on the next
    # iteration, so no explicit `del` is needed (and a `del` of the
    # inner-loop names would raise NameError for a batch with no chunks).
    batch_frames.append(
        pd.DataFrame({'id': new_ids, 'title': titles, 'chunk': chunks, 'chunk_embedded': embeddings})
    )

# pd.concat raises on an empty list, so fall back to an empty frame that
# still carries the expected header when the input had no rows.
if batch_frames:
    new_df = pd.concat(batch_frames, ignore_index=True)
else:
    new_df = pd.DataFrame(columns=output_columns)

# Write all chunk rows to a single CSV file
new_df.to_csv(output_csv_path, index=False)

print(f"Data extracted and saved to {output_csv_path}")


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Data extracted and saved to data/Master_embedded/master_with_embeddings.csv
