In [1]:
import torch
import numpy as np
import json
import os
import re
import pandas as pd
import shutil

from datasets import Dataset, load_from_disk, concatenate_datasets, DatasetDict

seed = 42
torch.manual_seed(seed)
np.random.seed(seed)

  from .autonotebook import tqdm as notebook_tqdm


helper functions and constants

In [2]:
def dataset_for_prompt(df):
    # number of rows to sample
    num_rows = min(len(df), np.random.randint(5, 25))
    df = df.sample(num_rows)
    #number of genes to sample
    num_genes = 2000 // num_rows

    use_d = True
    if len(df['diseases'].unique()) == 1:
        use_d = False

    s = ""
    sampled_indices = []
    for index, row in df.iterrows():
        cur_num_genes = min(num_genes+np.random.randint(-20, 20), 1000)
        cell_sentence = ' '.join(row[f'gene_{i}'] for i in range(1, cur_num_genes))
        if use_d:
            s += f"Cell Type: {row['cell_type']}, Tissue: {row['tissue']}, Disease: {row['diseases']}, Gene Expression: {cell_sentence}\n"
        else:
            s += f"Cell Type: {row['cell_type']}, Tissue: {row['tissue']}, Gene Expression: {cell_sentence}\n"
        sampled_indices.append(index)
        
    return s, sampled_indices

prompt_prefix = """This is a scientific manuscript and an example of the single-cell RNA sequencing dataset it is associated with. The example dataset contains one representative cell from each type and the corresponding cell sentence(ranked gene names by expression). You need to create question-answer pairs for Large Language Models to learn analyzing scRNA-seq datasets when given pieces of data converted into cell sentences. I would like questions and answers about the given cell sentence in terms of cells, tissues, disease, or other relevant biological information. 
The questions must be answerable by looking at the given cell sentences alone. The question-answer pairs should follow a line of reasoning mentioned in the manuscript or results arrived by the researchers. When creating questions, you should first read the given manuscript, consider what the analysis the researchers did in the manuscript and the biological context of this study. Then look at the example cell sentences(or a part of them), think about what question the researchers asked or would ask about the cell sentence, what they wanted to and can infer, study, or understand. Imitate the interpretation of data in the manuscript. Then answer the question, quote specific cell sentences and genes as evidence, and include detailed explanations and reasonings resemble a chain of thoughts. You can borrow reasoning and analysis, conclusions from the manuscript but don't directly use or quote the manuscript in question or answer. Avoid questions about experiment design or procedures, avoid vague questions. Vary the questions, answers and their wording as much as possible to cover a diverse range of topics. Include both open ended questions and yes or no questions.
Provide Context for the question, this is only the essential background information required to answer the question, do not directly summarize or describe the dataset. Find context from the manuscript. Provide Answer no less than 30 words, including detailed reasonings. Refer to specific cell sentences as [type] cell gene expression. Provide Keywords of the answer being the most essential parts of the correct answer. The keywords should be yes or no(must include if it is a yes or no question), or important gene names, or biological information in question. These keywords will be used during my training to validate my model's response. 
Give 20 question answer pairs as one list, don't give any other word. Strictly format like this: 
<|Context|>the context<|Question|>the content of the question<|Answer|>the content of the answer<|Keyword|>the answer keywords
<|Context|>the context<|Question|>the content of the question<|Answer|>the content of the answer<|Keyword|>the answer keywords
...
"""

loads all datasets and meta data \
assumes summary datasets are in a directory as csv files, assuming the format processed by Harry \
all files I'm looking at can be found in my C2S-RL dev github repo branch

In [3]:
# TODO: change to your own file path
datasets_directory = "/users/zlyu12/Desktop/c2s-RL/Dec19_dataset" # all summary datasets
meta_data_path = "/users/zlyu12/Desktop/c2s-RL/Create_Dataset/meta_data.json" # dataset name, filename, url etc.
hf_dataset_output_path = "/users/zlyu12/Desktop/c2s-RL/Create_Dataset/temp_hf_dataset" # temporary output directory

datasets_files_paths = [os.path.join(datasets_directory, f) for f in os.listdir(datasets_directory) if f.endswith('.csv')]
dataset_numbers = np.sort([int(re.search(r'\((\d+)\)', f).group(1)) for f in datasets_files_paths])

meta_data = json.load(open(meta_data_path))
dataset_numbers_iterator = iter(dataset_numbers[20:])

try:
    hf_dataset = load_from_disk(hf_dataset_output_path)
except:
    hf_dataset = Dataset.from_dict({})

print(hf_dataset)

Dataset({
    features: ['Context', 'Summary_Dataset', 'Question', 'Answer', 'Keyword', 'full_QA_pair', 'Dataset_Name', 'Publication_URL', 'Dataset_Index', 'Used_Rows'],
    num_rows: 365
})


iterates through all datasets in your directory \
this prepares the prompt \
copy paste, add the publication text at the end and run our favorite LLM \
run only once for a row, we rely on some variables defined here in later cells

In [99]:
dataset_index = str(next(dataset_numbers_iterator))
dataset_name = [k for k,v in meta_data.items() if f'({dataset_index})' in v.get('filename', '')]
if len(dataset_name) != 1:
    print("dataset index: ", dataset_index)
    print(dataset_name)
    print(f"dataset \"{dataset_name}\" not in meta_data!")
else:
    dataset_name = dataset_name[0]
    cur_url = meta_data[dataset_name]['url']
    dataset_file_path = [path for path in datasets_files_paths if f"({dataset_index})" in path][0]
    dataset_df = pd.read_csv(dataset_file_path)
    dataset_in_prompt, sampled_indices = dataset_for_prompt(dataset_df)
    print("Dataset Index: ", dataset_index)
    print("\nDataset Name: ", dataset_name)
    print("\nPublication URL: ", cur_url)
    print("\nPrompt:\n", prompt_prefix + "Dataset: \n" + dataset_in_prompt + "Manuscript:\n")


Dataset Index:  50

Dataset Name:  Microglia - 24 months old wild-type and Rag1-KO

Publication URL:  https://cellxgene.cziscience.com/collections/0faa1af6-e504-4b88-a47b-69347e1bace5

Prompt:
 This is a scientific manuscript and an example of the single-cell RNA sequencing dataset it is associated with. The example dataset contains one representative cell from each type and the corresponding cell sentence(ranked gene names by expression). You need to create question-answer pairs for Large Language Models to learn analyzing scRNA-seq datasets when given pieces of data converted into cell sentences. I would like questions and answers about the given cell sentence in terms of cells, tissues, disease, or other relevant biological information. 
The questions must be answerable by looking at the given cell sentences alone. The question-answer pairs should follow a line of reasoning mentioned in the manuscript or results arrived by the researchers. When creating questions, you should first r

Just copy paste the output into the next cell, shouldn't need to do additional processing

In [103]:
output = """
<|Context|>This microglial cell from white matter displays high expression of genes linked to phagocytosis and immune activation, such as TREM2, TYROBP, and CX3CR1.<|Question|>Does the presence of TREM2 in the white matter microglial cell gene expression suggest an activated microglial phenotype associated with clearance of debris?<|Answer|>Yes. TREM2 is well-known for its role in promoting microglial activation and debris uptake. In the [white matter] cell gene expression, TREM2 appears together with genes like TYROBP and CX3CR1, which collectively indicate an immune-active state. The chain of thought is that TREM2+TYROBP interactions facilitate phagocytosis, pointing to an activated microglial phenotype in white matter.<|Keyword|>yes, TREM2, TYROBP, microglia

<|Context|>In this white matter microglial cell, genes such as APOE, CST3, and FTH1 appear prominently alongside other immune-related markers.<|Question|>What is the potential significance of the high APOE expression in this microglia’s profile?<|Answer|>APOE is often associated with lipid metabolism and immune regulation in the brain. In the [white matter] cell gene expression, APOE is listed near genes like FTH1 and CST3, which are involved in iron handling and cysteine protease regulation, respectively. This suggests a microglial state managing lipid and iron turnover, potentially in the context of myelin debris clearance or response to injury.<|Keyword|>APOE, lipid metabolism, microglia

<|Context|>The white matter microglial cell's gene expression includes TYROBP, TREM2, and ITGB5, among others.<|Question|>Are TYROBP and TREM2 co-expression patterns relevant to microglial function in response to altered myelin?<|Answer|>Yes. TYROBP (DAP12) is the intracellular adaptor for TREM2, driving signaling cascades that enhance microglial activation and phagocytosis. In the [white matter] cell gene expression, the co-expression of TYROBP and TREM2 strongly indicates microglia specialized in reacting to myelin pathology, supporting debris clearance in white matter.<|Keyword|>yes, TYROBP, TREM2

<|Context|>Microglial cells in diseased or aging brain tissues often upregulate interferon-responsive genes such as IFNGR1, STAT1, or ISG-related genes.<|Question|>Does the presence of STAT1 in the white matter microglia gene list suggest a response to interferons?<|Answer|>Yes. STAT1 is a key transcription factor activated by interferons. In the [white matter] cell gene expression, STAT1 is detected along with other immune genes such as CX3CR1, indicating a microglial population under IFN-related signaling, possibly in response to immune challenges or aging processes.<|Keyword|>yes, STAT1, interferon

<|Context|>The gray matter microglial cell gene expression also contains TREM2 and TYROBP, but certain IFN-stimulated genes are less prominent.<|Question|>Comparatively, do the gray matter and white matter microglia appear to share TREM2-mediated pathways, but differ in the degree of IFN-related responses?<|Answer|>Yes. Both [white matter] and [gray matter] cell gene expression profiles include TREM2 and TYROBP, suggesting shared TREM2-mediated phagocytic functions. However, the white matter microglia shows more IFN-related genes (e.g., STAT1), indicating a stronger interferon response in white matter compared to gray matter.<|Keyword|>yes, TREM2, IFN response

<|Context|>This white matter microglial profile features phagocytic markers (CTSB, CTSL), major histocompatibility complex genes (H2-D1, H2-K1), and interferon-response genes.<|Question|>How might co-expression of CTSB, CTSL, and MHC class I genes reflect microglia’s role in antigen processing?<|Answer|>CTSB and CTSL are proteases that degrade internalized proteins, while MHC class I genes (e.g., H2-D1, H2-K1) present peptide fragments to T cells. In the [white matter] cell gene expression, their co-expression suggests microglia are actively processing debris, then displaying antigens, potentially influencing T cell activity in that region.<|Keyword|>CTSB, CTSL, H2-K1, antigen processing

<|Context|>In the white matter microglial cell, we see genes like SPP1, CD68, and MARCKS scattered among canonical microglial markers.<|Question|>What can SPP1 (osteopontin) and CD68 tell us about the functional status of these white matter microglia?<|Answer|>SPP1 is often elevated in reactive or disease-associated microglia, aiding in cell adhesion and chemotaxis, while CD68 marks lysosomal/phagocytic capacity. Their presence in the [white matter] cell gene expression indicates an activated microglia with enhanced phagocytosis and migratory functions, presumably responding to injured or aged white matter.<|Keyword|>SPP1, CD68, phagocytosis

<|Context|>Looking at the white matter microglial cell sentence, we see strong expression of CX3CR1, a receptor important for neuron-microglia communication.<|Question|>Does CX3CR1 co-expression with other immune genes imply a supportive or damaging effect on nearby cells?<|Answer|>It can imply both. CX3CR1 helps microglia maintain homeostatic interactions with neurons, but when combined with upregulated immune genes (e.g., CTSS, TREM2), it also indicates an activated state. In the [white matter] cell gene expression, this might reflect a dual role—supporting neuronal health while also potentially removing damaged myelin if inflammation is sustained.<|Keyword|>CX3CR1, homeostatic, activated state

<|Context|>The white matter microglia’s transcript list includes TREM2, CTSZ, and APOE, which are known to be upregulated in microglia responding to myelin debris.<|Question|>Why would these three genes co-occur in microglia dealing with aging or injury in white matter?<|Answer|>TREM2 enhances debris sensing, CTSZ is a lysosomal protease used for digesting engulfed material, and APOE is involved in lipid transport from phagocytosed myelin. In the [white matter] cell gene expression, they likely collaborate to clear damaged myelin, making them crucial for white matter maintenance in aging or injury contexts.<|Keyword|>TREM2, CTSZ, APOE, debris clearance

<|Context|>The gray matter microglial cell sentence still shows robust microglial markers (CST3, C1QA, HEXB) but somewhat lower expression of TREM2.<|Question|>Could lower TREM2 in gray matter microglia indicate a reduced phagocytic requirement compared to white matter?<|Answer|>Yes. TREM2 is closely tied to phagocytic activation. Seeing it less in the [gray matter] cell gene expression suggests these microglia might be in a more homeostatic or lower-activity state, reflecting reduced debris-clearing demands in gray matter as compared to white matter regions.<|Keyword|>yes, TREM2, gray matter

<|Context|>Within the white matter microglia sentence, we notice expression of C1QA, C1QB, and C1QC, complement components often linked to synapse pruning or debris opsonization.<|Question|>What does the presence of these complement genes suggest about microglial activity in white matter?<|Answer|>C1QA, C1QB, and C1QC initiate the complement cascade, marking targets for microglial clearance. In the [white matter] cell gene expression, they point to ongoing opsonization and removal of cellular or myelin debris, underscoring an active cleanup function in aging or disease states.<|Keyword|>C1QA, C1QB, C1QC, complement

<|Context|>The microglial cell in white matter has genes like H2-D1 (MHC-I) and IFNGR1 side by side.<|Question|>Does co-expression of MHC-I and IFN receptors indicate potential interplay between microglia and T cells in the white matter?<|Answer|>Yes. H2-D1 allows antigen presentation, and IFNGR1 mediates response to interferons secreted by T cells. Their co-expression in the [white matter] cell gene expression points to microglia’s readiness to interact with T cells, possibly in an inflammatory or surveillance context.<|Keyword|>yes, MHC-I, IFNGR1, T cell interaction

<|Context|>Among the top-ranked genes for this white matter microglial cell are pro-inflammatory genes (CTSS, CTSD) and housekeeping genes (ACTB, GAPDH).<|Question|>How do the elevated proteases CTSS and CTSD fit into the microglia’s role in tissue maintenance?<|Answer|>CTSS and CTSD degrade extracellular proteins and damaged cell components. In the [white matter] cell gene expression, these proteases suggest microglia engaging in cleaning up degenerating myelin or other debris, which aligns with increased phagocytic or inflammatory roles in aging white matter.<|Keyword|>CTSS, CTSD, proteolysis

<|Context|>Within the white matter microglial sentence, the presence of CD81 is noted, along with immune genes such as CD9 and TREM2.<|Question|>Could CD81 be involved in microglial cellular communication or exosome formation in this context?<|Answer|>Yes. CD81 is a tetraspanin that participates in membrane organization and exosome biogenesis. Its co-expression with CD9 and TREM2 in the [white matter] cell gene expression hints that microglia might use these pathways for intercellular communication or delivering signals to surrounding cells during injury or immunosurveillance.<|Keyword|>CD81, exosome, communication

<|Context|>In the gray matter microglial cell gene set, there are fewer MHC class I genes but still many phagocytic markers (HEX, CTSS).<|Question|>Does having fewer MHC class I genes necessarily mean gray matter microglia are less immunocompetent?<|Answer|>Not necessarily. While MHC class I expression can indicate antigen presentation, microglia can still use other immune functions. The [gray matter] cell gene expression includes robust phagocytic markers like CTSS and HEXB, so they remain active in clearance. However, they might exhibit a reduced antigen-presenting profile compared to their white matter counterparts.<|Keyword|>no, MHC class I, phagocytosis

<|Context|>White matter microglial expression of CSF1R is quite prominent, alongside genes for survival signaling.<|Question|>Why is CSF1R particularly important for microglial viability in white matter?<|Answer|>CSF1R is essential for microglial survival and proliferation, regulated by colony-stimulating factors. In the [white matter] cell gene expression, high CSF1R implies microglia rely on CSF1 or IL-34 for survival signals, especially in regions undergoing more dynamic turnover of cells and myelin, like white matter.<|Keyword|>CSF1R, survival, microglia

<|Context|>The presence of heat shock genes like HSP90AB1 and HSPA8 in the white matter microglial sentence suggests stress responses.<|Question|>How do elevated heat shock proteins support microglial function in potentially inflamed or damaged white matter?<|Answer|>Heat shock proteins like HSP90AB1 and HSPA8 help refold misfolded proteins and manage stress responses. In the [white matter] cell gene expression, their upregulation reflects microglia coping with heightened metabolic and inflammatory stress, aiding cell stability and continuous debris-clearing activity.<|Keyword|>HSP90AB1, HSPA8, stress response

<|Context|>We see B2M (beta-2-microglobulin) in both the white matter and gray matter microglial gene sets, but it’s more emphasized in white matter.<|Question|>What might higher B2M imply about immune-related pathways in white matter microglia?<|Answer|>B2M is a component of MHC class I molecules. Its higher presence in the [white matter] cell gene expression indicates enhanced antigen presentation or immune surveillance. This aligns with the idea that white matter microglia frequently engage with potential immune triggers, reflecting more pronounced inflammatory reactivity.<|Keyword|>B2M, MHC I, immune surveillance

<|Context|>Some genes in the white matter microglial list, such as IFI27L2A, appear associated with interferon-stimulated states.<|Question|>Could IFI27L2A expression suggest that these microglia respond to interferon signals in aging or disease?<|Answer|>Yes. IFI27L2A is an interferon-inducible gene. Its presence in the [white matter] cell gene expression implies microglia under IFN signaling, likely responding to IFN-γ or other inflammatory cues. This can shape microglial activity toward a reactive, potentially neuroprotective or neurotoxic, function in aging brains.<|Keyword|>yes, IFI27L2A, IFN signaling

<|Context|>Genes such as CYBA, NCF1, and NCF2 in the white matter microglial sentence contribute to reactive oxygen species (ROS) generation.<|Question|>Why would microglia produce more ROS in white matter, and can this be harmful?<|Answer|>These genes code for NADPH oxidase components, which generate ROS to kill pathogens and degrade debris. In the [white matter] cell gene expression, their upregulation can help clear damaged myelin but might also cause oxidative stress to nearby cells. This dual effect can be damaging if prolonged or excessive.<|Keyword|>ROS, CYBA, NCF1, harmful

<|Context|>Analyzing the white matter microglial profile, we see SPP1 and TREM2 expression aligned with robust cathepsins and MHC-I genes.<|Question|>What does this combination imply regarding microglial adaptation in chronic or age-related white matter stress?<|Answer|>It suggests a specialized immune-reactive microglial phenotype. SPP1 and TREM2 facilitate recognition and uptake of myelin debris, while cathepsins (CTSB, CTSL) degrade it. MHC-I genes signal readiness for antigen presentation. Together, this points to a microglia state adapted to chronic cleanup and immune interactions commonly found in aged or stressed white matter.<|Keyword|>SPP1, TREM2, cathepsins, chronic stress
"""

run the following cell to convert the output into a hf dataset

In [104]:
qa_dict_list = []
for QA_pair in output.split("<|Context|>"):
    if not QA_pair:
        continue
    try:
        context = QA_pair.split("<|Question|>")[0].strip()
        question = QA_pair.split("<|Question|>")[1].split("<|Answer|>")[0].strip()
        answer = QA_pair.split("<|Answer|>")[1].split("<|Keyword|>")[0].strip()
        label = QA_pair.split("<|Keyword|>")[1].strip()
    except:
        print("Error in the following QA pair:")
        print(QA_pair)
        continue

    entry = {
        "Context": context,
        "Summary_Dataset": dataset_in_prompt,
        "Question": question, 
        "Answer": answer,
        "Keyword": label,
        "full_QA_pair": "<|Question|>"+QA_pair.strip(), 
        "Dataset_Name": dataset_name,
        "Publication_URL": cur_url,
        "Dataset_Index": dataset_index,
        "Used_Rows": sampled_indices,
    }
    qa_dict_list.append(entry)

qa_dict = {key: [d[key] for d in qa_dict_list] for key in qa_dict_list[0].keys()} # list of dicts to dict of lists

new_hf_dataset = Dataset.from_dict(qa_dict)
print("Example:")
new_hf_dataset[0]

Error in the following QA pair:


Example:


{'Context': 'This microglial cell from white matter displays high expression of genes linked to phagocytosis and immune activation, such as TREM2, TYROBP, and CX3CR1.',
 'Summary_Dataset': 'Cell Type: microglial cell, Tissue: brain white matter, Gene Expression: CST3 MALAT1 TMSB4X C1QA C1QB CTSS HEXB ACTB C1QC CTSD APOE FTH1 ITM2B RPS29 MT-CO3 EEF1A1 RPLP1 FAU MT-ATP6 MT-CO1 TPT1 MT-CO2 RPS24 TYROBP RPS12 RPL23 CD81 RPL13 RPL39 MT-CYTB RPL27A RPS21 FCER1G RPL37A CX3CR1 RPS9 TREM2 RPS8 RPL30 FTL1 LGMN RPL35A B2M RPL37 RPS27A RPL21 RPL41 RPL18A RPS11 RPS4X CTSB CD9 CTSZ RPS10 RPS3A1 RPS15A SPARC RPL32 RPS16 RPL19 RPS23 RPL6 RPS14 RPS20 CSF1R RPL17 MT-ND4 RPS27 RPL10 MARCKS RPL34 LY86 PSAP RPL26 RPL11 LAPTM5 RPS28 MT-ND1 RPLP2 RPL9 RPS13 RPLP0 RPS7 CTSL SERINC3 RPS3 RPL28 RPS5 RPS19 RPS18 RPL12 PPIA RPS25 RPL38 RPL3 RPS2 RPSA RPL10A RPS26 MT-ND2 LAMP1 RPL18 CYBA RPL7 RPL15 LYZ2 SELPLG RPL24 RPL36 DDX5 P2RY12 NPC2 RGS10 RPL8 RPS15 H2-D1 UBB RPL22 GPR34 TRF PTMA RPL5 CTSH SELENOP RPL29 RPL3

run the following cell every time to save updates \
sometimes save_to_disk will fail because it doesn't automatically overwrites, just delete the old file adn run it again

In [105]:
hf_dataset = concatenate_datasets([hf_dataset, new_hf_dataset])
hf_dataset


Dataset({
    features: ['Context', 'Summary_Dataset', 'Question', 'Answer', 'Keyword', 'full_QA_pair', 'Dataset_Name', 'Publication_URL', 'Dataset_Index', 'Used_Rows'],
    num_rows: 866
})

In [106]:
hf_dataset.save_to_disk(hf_dataset_output_path+"_new")
print("current hf_dataset: ")
hf_dataset

Saving the dataset (1/1 shards): 100%|██████████| 866/866 [00:00<00:00, 18776.26 examples/s]

current hf_dataset: 





Dataset({
    features: ['Context', 'Summary_Dataset', 'Question', 'Answer', 'Keyword', 'full_QA_pair', 'Dataset_Name', 'Publication_URL', 'Dataset_Index', 'Used_Rows'],
    num_rows: 866
})