In [4]:
import pandas as pd
import pickle
import random

In [35]:
# I'm taking out 100 papers from 2023 as my held-out test set, separate from the train-test split later on

df = pd.read_csv('gordonramsay_data_processed.csv')
df['date'] = pd.to_datetime(df['date'], format='%m/%d/%y')

# Chose this range bc o1 train cutoff is October
start_date = pd.to_datetime('2023-04-01')
end_date = pd.to_datetime('2023-10-31')

filtered_df = df[(df['date'] >= start_date) & (df['date'] <= end_date)]

pmids_series_date_filtered = filtered_df['PMID']
pmid_list_date_filtered = pmids_series_date_filtered.tolist()

print(f"\nNumber of PMIDs from date range: {len(pmid_list_date_filtered)}")

# Load the biobert_embeddings_background pickle file
try:
    with open('biobert_embeddings_background.pkl', 'rb') as f:
        biobert_embeddings_background = pickle.load(f)
except FileNotFoundError:
    print("Error: biobert_embeddings_background.pkl file not found. Please check the file path.")
    exit()

keys_in_pickle = biobert_embeddings_background.keys()
print(f"\nNumber of keys in biobert_embeddings_background.pkl: {len(keys_in_pickle)}")
keys_in_pickle_set = set(keys_in_pickle)
pmid_list_date_filtered_set = set(pmid_list_date_filtered)
pmids_in_date_and_pickle_set = pmid_list_date_filtered_set.intersection(keys_in_pickle_set)

pmid_list_final = list(pmids_in_date_and_pickle_set)

num_samples_final = min(100, len(pmid_list_final))  # Ensure we don't sample more than available
if pmid_list_final: # Only sample if there are PMIDs to sample from
    sampled_pmid_list_final = random.sample(pmid_list_final, num_samples_final)
else:
    sampled_pmid_list_final = []

    
pmid_list = sampled_pmid_list_final

print("\nFinal list of PMIDs (in date range AND pickle keys):")
print(pmid_list)
print(f"\nNumber of PMIDs in final list: {len(pmid_list)}")


Number of PMIDs from date range: 5062

Number of keys in biobert_embeddings_background.pkl: 11965

Final list of PMIDs (in date range AND pickle keys):
[37693474, 37503175, 37873372, 37397995, 37333385, 37398218, 37693450, 37131581, 37461514, 37645910, 37645984, 37333098, 37090601, 37502863, 37503048, 37066362, 37986845, 37163100, 37425947, 37425858, 37546986, 37425862, 37205393, 37503180, 37131640, 37904983, 37662226, 37503169, 37425961, 37292708, 37732194, 37577515, 37503223, 37645992, 37333072, 37961580, 37873401, 37066187, 37745351, 37425902, 37398038, 37873081, 37873229, 37873190, 37693601, 37781575, 37333142, 37662295, 37131582, 37425822, 37662341, 37745432, 37214825, 37808647, 37425758, 37214901, 37609260, 37645724, 37873175, 37215046, 37205432, 37905116, 37732274, 37873400, 37066370, 37693504, 37662217, 37745537, 37693549, 37662344, 37503031, 37732176, 37090679, 37398500, 37873491, 37732275, 37609345, 37503164, 37162924, 37034748, 37066302, 37873383, 37904955, 37425755, 372930

In [42]:
## Doing the train test split foor the hypothesis... I initially named everything as biobert but then refactored to be either

pickle_name = 'llama3.1_embeddings_hypothesis'
new_name = 'llama_hypothesis'

with open(f'{pickle_name}.pkl', 'rb') as f:
    biobert_embeddings_hypothesis = pickle.load(f)

len_biobert_embeddings_hypothesis = len(biobert_embeddings_hypothesis)
print(f"Length of {pickle_name}.pkl: {len_biobert_embeddings_hypothesis}")

biobert_hypothesis_test_dict = {}

found_pmids = 0 # Counter for found PMIDs

for pmid in pmid_list:
    if pmid in biobert_embeddings_hypothesis:
        biobert_hypothesis_test_dict[pmid] = biobert_embeddings_hypothesis[pmid]
        found_pmids += 1
    else:
        print(f"  NOT FOUND PMID: {pmid}")

print(f"\nNumber of PMIDs found in {pickle_name}: {found_pmids}") # Summary of found PMIDs

with open(f'{new_name}_test.pkl', 'wb') as f:
    pickle.dump(biobert_hypothesis_test_dict, f)

with open(f'{new_name}_test.pkl', 'rb') as f:
    biobert_hypothesis_test = pickle.load(f)

len_biobert_hypothesis_test = len(biobert_hypothesis_test)
print(f"\nLength of {new_name}_test.pkl: {len_biobert_hypothesis_test}")

biobert_hypothesis_train_dict = {}
pmid_set_test = set(biobert_hypothesis_test_dict.keys())

for pmid, embedding in biobert_embeddings_hypothesis.items():
    if pmid not in pmid_set_test:
        biobert_hypothesis_train_dict[pmid] = embedding

with open(f'{new_name}_train.pkl', 'wb') as f:
    pickle.dump(biobert_hypothesis_train_dict, f)

len_biobert_hypothesis_train = len(biobert_hypothesis_train_dict)
print(f"Length of {new_name}_train.pkl: {len_biobert_hypothesis_train}")

Length of llama3.1_embeddings_hypothesis.pkl: 11218

Number of PMIDs found in llama3.1_embeddings_hypothesis: 100

Length of llama_hypothesis_test.pkl: 100
Length of llama_hypothesis_train.pkl: 11118


In [43]:
# It's super important that the split worked correctly, so double checking here

with open('llama_hypothesis_test.pkl', 'rb') as f:
    biobert_hypothesis_test = pickle.load(f)

keys_in_test_pickle = set(biobert_hypothesis_test.keys())
pmid_list_set = set(pmid_list)

same_keys = keys_in_test_pickle == pmid_list_set
no_repeats_in_pickle_keys = len(keys_in_test_pickle) == len(biobert_hypothesis_test)

print(f"Are keys in this test pkl the same as pmid_list?: {same_keys}")
print(f"Are there repeats in keys of this test pkl?: {not no_repeats_in_pickle_keys}")

Are keys in biobert_hypothesis_test.pkl the same as pmid_list?: True
Are there repeats in keys of biobert_hypothesis_test.pkl?: False


In [1]:
# Generating random embeddings based on the same PMIDs from actual embeddings

import pickle
import numpy as np
import pandas as pd

output_background_pkl = 'random_background_train.pkl'
output_hypothesis_pkl = 'random_hypothesis_train.pkl'
input_biobert_background_pkl = 'biobert_background_train.pkl'


def generate_random_embeddings_from_biobert(input_biobert_pkl_file, output_pkl_file):
    """
    Generates a dictionary of random embeddings using PMIDs and embedding dimension
    from an existing BioBERT embeddings pickle file.
    """
    try:
        with open(input_biobert_pkl_file, 'rb') as f:
            biobert_embeddings_dict = pickle.load(f)
    except FileNotFoundError:
        print(f"Error: Input BioBERT embeddings file not found: {input_biobert_pkl_file}")
        return None

    pmid_list = list(biobert_embeddings_dict.keys())
    if not pmid_list:
        print(f"Error: No PMIDs found in input BioBERT embeddings file: {input_biobert_pkl_file}")
        return None

    example_embedding = biobert_embeddings_dict[pmid_list[0]]
    embedding_dim = len(example_embedding)
    print(f"Inferred embedding dimension from {input_biobert_pkl_file}: {embedding_dim}")
    print(f"Number of PMIDs from {input_biobert_pkl_file}: {len(pmid_list)}")

    print("Generating random embeddings...")
    random_embeddings_dict = {}
    for pmid in pmid_list:
        random_embedding = np.random.rand(embedding_dim).tolist()
        random_embeddings_dict[pmid] = random_embedding
    print(f"Generated {len(random_embeddings_dict)} random embeddings.")

    print(f"Saving random embeddings to: {output_pkl_file}")
    with open(output_pkl_file, 'wb') as f:
        pickle.dump(random_embeddings_dict, f)

    print(f"Random embedding file generated successfully: {output_pkl_file}")


if __name__ == '__main__':
    print("Generating random background test set embeddings...")
    generate_random_embeddings_from_biobert(input_biobert_background_pkl, output_background_pkl)

    print("Generating random hypothesis test set embeddings...")
    generate_random_embeddings_from_biobert(input_biobert_background_pkl, output_hypothesis_pkl)

    print("Random embedding files for test set generated successfully!")

Generating random background test set embeddings...
Inferred embedding dimension from biobert_background_train.pkl: 1024
Number of PMIDs from biobert_background_train.pkl: 11865
Generating random embeddings...
Generated 11865 random embeddings.
Saving random embeddings to: random_background_train.pkl
Random embedding file generated successfully: random_background_train.pkl
Generating random hypothesis test set embeddings...
Inferred embedding dimension from biobert_background_train.pkl: 1024
Number of PMIDs from biobert_background_train.pkl: 11865
Generating random embeddings...
Generated 11865 random embeddings.
Saving random embeddings to: random_hypothesis_train.pkl
Random embedding file generated successfully: random_hypothesis_train.pkl
Random embedding files for test set generated successfully!
