In [1]:
import torch
import numpy as np
import json
import os
import re
import pandas as pd
import shutil

from datasets import Dataset, load_from_disk, concatenate_datasets, DatasetDict

# Seed the torch and NumPy global RNGs once, up front.
seed = 42
torch.manual_seed(seed)
# np.random.randint in dataset_for_prompt reads this global NumPy state.
np.random.seed(seed)

  from .autonotebook import tqdm as notebook_tqdm


helper functions and constants

In [2]:
def dataset_for_prompt(df):
    """Render a random subset of a summary dataset as prompt text.

    A random number of rows (between 5 and 24, capped at ``len(df)``) is sampled
    and each sampled row becomes one line of the form
    ``Cell Type: ..., Tissue: ..., [Disease: ...,] Gene Expression: ...``.
    The disease field is only written when the sampled rows hold more than one
    distinct value in ``diseases``.

    Args:
        df: DataFrame with ``cell_type``, ``tissue`` and ``diseases`` columns plus
            ranked gene columns named ``gene_1``, ``gene_2``, ...

    Returns:
        A ``(text, sampled_indices)`` tuple: the newline-terminated row
        descriptions, and the index labels of the sampled rows in the order
        they appear in the text. An empty ``df`` yields ``("", [])``.
    """
    if len(df) == 0:
        # Nothing to sample; also avoids dividing the gene budget by zero below.
        return "", []

    # number of rows to sample
    num_rows = min(len(df), np.random.randint(5, 25))
    sampled = df.sample(num_rows)
    # number of genes to sample: a budget of 2000 genes split across the rows
    num_genes = 2000 // num_rows

    # only mention the disease when it actually varies between the sampled rows
    use_d = len(sampled['diseases'].unique()) != 1

    available_columns = set(sampled.columns)
    lines = []
    sampled_indices = []
    for index, row in sampled.iterrows():
        # jitter the per-row gene count, capped at 1000
        cur_num_genes = min(num_genes + np.random.randint(-20, 20), 1000)
        genes = []
        # range(1, n) visits gene_1 .. gene_{n-1}; bound kept as in the original code
        for i in range(1, cur_num_genes):
            column = f'gene_{i}'
            if column not in available_columns:
                # the file has fewer ranked gene columns than requested
                continue
            gene = row[column]
            if pd.isna(gene):
                # this cell has fewer ranked genes than the file has columns
                continue
            genes.append(str(gene))
        cell_sentence = ' '.join(genes)
        if use_d:
            lines.append(f"Cell Type: {row['cell_type']}, Tissue: {row['tissue']}, Disease: {row['diseases']}, Gene Expression: {cell_sentence}\n")
        else:
            lines.append(f"Cell Type: {row['cell_type']}, Tissue: {row['tissue']}, Gene Expression: {cell_sentence}\n")
        sampled_indices.append(index)

    return ''.join(lines), sampled_indices

# Instruction block placed in front of the sampled dataset text when the prompt is printed.
# The <|Context|> / <|Question|> / <|Chain of Thoughts|> / <|Answer|> / <|Keyword|> tags requested here
# are the same tags the parsing cell splits on, so the two must be kept in sync.
prompt_prefix = """This is a scientific manuscript and an example of the single-cell RNA sequencing dataset it is associated with. The example dataset contains one representative cell from each type and the corresponding cell sentence(ranked gene names by expression). You need to create question-answer pairs for Large Language Models to learn analyzing scRNA-seq datasets when given pieces of data converted into cell sentences. I would like questions and answers about the given cell sentence in terms of cells, tissues, disease, or other relevant biological information. 
The questions must be answerable by looking at the given cell sentences alone. The question-answer pairs should follow a line of reasoning mentioned in the manuscript or results arrived by the researchers. When creating questions, you should first read the given manuscript, consider what the analysis the researchers did in the manuscript and the biological context of this study. Then look at the example cell sentences(or a part of them), think about what question the researchers asked or would ask about the cell sentence, what they wanted to and can infer, study, or understand. Imitate the interpretation of data in the manuscript. Then answer the question, quote specific cell sentences and genes as evidence, and include detailed explanations and reasonings resemble a chain of thoughts. You can borrow reasoning and analysis, conclusions from the manuscript but don't directly use or quote the manuscript in question or answer. Avoid questions about experiment design or procedures, avoid vague questions. Vary the questions, answers and their wording as much as possible to cover a diverse range of topics. Include both open ended questions and yes or no questions.
Provide Context for the question, this is only the essential background information required to answer the question, do not directly summarize or describe the dataset. Find context from the manuscript. Provide Answer no less than 30 words, including detailed reasonings. Refer to specific cell sentences as [type] cell gene expression. Also include a chain of thoughts as intermediate reasoning and analysis steps, think about the important gene names and functions, gene sets. Provide Keywords of the answer being the most essential parts of the correct answer. The keywords should be yes or no(must include if it is a yes or no question), or important gene names, or biological information in question. These keywords will be used during my training to validate my model's response. 
Give 20 question answer pairs as one list, don't give any other word. Strictly format like this: 
<|Context|>the context<|Question|>the content of the question<|Chain of Thoughts|>the intermediate reasonings<|Answer|>the content of the answer<|Keyword|>the answer keywords
<|Context|>the context<|Question|>the content of the question<|Chain of Thoughts|>the intermediate reasonings<|Answer|>the content of the answer<|Keyword|>the answer keywords
...
"""

loads all datasets and metadata \
assumes the summary datasets are CSV files in a single directory, in the format produced by Harry's processing \
all the files I'm using can be found in my C2S-RL dev GitHub repo branch

In [14]:
# TODO: change to your own file path
datasets_directory = "/users/zlyu12/Desktop/c2s-RL/Dec19_dataset" # all summary datasets
meta_data_path = "/users/zlyu12/Desktop/c2s-RL/Create_Dataset/meta_data.json" # dataset name, filename, url etc.
hf_dataset_output_path = "/users/zlyu12/Desktop/c2s-RL/Create_Dataset/temp_hf_dataset" # temporary output directory

datasets_files_paths = [os.path.join(datasets_directory, f) for f in os.listdir(datasets_directory) if f.endswith('.csv')]

# Each dataset number is the "(N)" marker in the CSV file name. Match on the base name only,
# so parentheses in a parent directory cannot be picked up, and skip CSVs that carry no marker
# instead of failing on `None.group(1)`.
dataset_numbers = np.sort([
    int(match.group(1))
    for match in (re.search(r'\((\d+)\)', os.path.basename(path)) for path in datasets_files_paths)
    if match is not None
])

# Close the metadata file as soon as it has been parsed.
with open(meta_data_path) as meta_data_file:
    meta_data = json.load(meta_data_file)

# NOTE(review): 190 is a hard-coded resume offset into the sorted dataset numbers;
# presumably the datasets before it were already processed - confirm before reuse.
dataset_numbers_iterator = iter(dataset_numbers[190:])

try:
    hf_dataset = load_from_disk(hf_dataset_output_path)
except FileNotFoundError:
    # Nothing has been saved at this path yet: start from an empty dataset.
    # Any other failure (e.g. a corrupt save) is left to surface instead of being hidden.
    hf_dataset = Dataset.from_dict({})

print(hf_dataset)

Dataset({
    features: [],
    num_rows: 0
})


iterates through all datasets in your directory, one per run of this cell \
this prepares the prompt \
copy-paste the printed prompt, add the publication text at the end, and run it through our favorite LLM \
run this cell only once per dataset: each run advances to the next dataset, and later cells rely on the variables defined here

In [141]:
# Advance to the next dataset number; every run of this cell moves the iterator forward.
dataset_index = str(next(dataset_numbers_iterator))

# meta_data entries are matched through the "(N)" marker inside their 'filename' field.
matching_names = [k for k, v in meta_data.items() if f'({dataset_index})' in v.get('filename', '')]
if len(matching_names) != 1:
    # Stop here instead of only printing: the later cells read dataset_name, cur_url,
    # dataset_in_prompt and sampled_indices, and would otherwise silently reuse the
    # values left over from the previously processed dataset.
    raise ValueError(
        f"dataset index {dataset_index}: expected exactly one meta_data entry "
        f"whose filename contains '({dataset_index})', found {matching_names}"
    )

dataset_name = matching_names[0]
cur_url = meta_data[dataset_name]['url']
# the CSV for this dataset is the one whose path carries the same "(N)" marker
dataset_file_path = [path for path in datasets_files_paths if f"({dataset_index})" in path][0]
dataset_df = pd.read_csv(dataset_file_path)
dataset_in_prompt, sampled_indices = dataset_for_prompt(dataset_df)
print("Dataset Index: ", dataset_index)
print("\nDataset Name: ", dataset_name)
print("\nPublication URL: ", cur_url)
# The manuscript text is appended by hand after the trailing "Manuscript:" header.
print("\nPrompt:\n", prompt_prefix + "Dataset: \n" + dataset_in_prompt + "Manuscript:\n")


Dataset Index:  250

Dataset Name:  Central Cornea

Publication URL:  https://cellxgene.cziscience.com/collections/63d03351-06be-478e-a0db-f7a653b6b19b

Prompt:
 This is a scientific manuscript and an example of the single-cell RNA sequencing dataset it is associated with. The example dataset contains one representative cell from each type and the corresponding cell sentence(ranked gene names by expression). You need to create question-answer pairs for Large Language Models to learn analyzing scRNA-seq datasets when given pieces of data converted into cell sentences. I would like questions and answers about the given cell sentence in terms of cells, tissues, disease, or other relevant biological information. 
The questions must be answerable by looking at the given cell sentences alone. The question-answer pairs should follow a line of reasoning mentioned in the manuscript or results arrived by the researchers. When creating questions, you should first read the given manuscript, consid

Just copy paste the output into the next cell, shouldn't need to do additional processing

In [142]:
# Raw LLM response, pasted verbatim; the parsing cell splits it on the <|...|> tags.
# NOTE(review): none of the pairs below contains a <|Chain of Thoughts|> tag, which the parsing
# cell indexes into - presumably this sample predates that tag; confirm before re-running.
output = """
<|Context|>These cells contribute to the stromal structure in the cornea, helping maintain its clarity and tensile strength. They are associated with matrix production and remodeling.<|Question|>Are the [fibroblast] cell gene expression profiles suggestive of extracellular matrix organization in the corneal stroma?<|Answer|>Yes. The [fibroblast] cell gene expression includes “COL5A2,” “COL6A3,” and “COL4A3,” which encode various collagen components supporting stromal structure. Also, “ITGBL1” and “DCN” are known to modulate cell–matrix interactions. Together, these gene signals indicate that fibroblasts likely contribute actively to corneal extracellular matrix assembly and homeostasis.<|Keyword|>yes, COL5A2, COL6A3, COL4A3, ITGBL1, DCN

<|Context|>Transit amplifying cells exist in the basal layers of the corneal epithelium and rapidly proliferate to replenish epithelial cells.<|Question|>Do [transit amplifying cell gene expression] profiles indicate a state of active cell cycling?<|Answer|>Yes. The [transit amplifying cell gene expression] list features markers like “MKI67,” “TOP2A,” and “BIRC5,” which are well-known regulators of proliferation and mitosis. Their concurrent expression strongly supports that these transit amplifying cells are actively cycling to supply new epithelial cells.<|Keyword|>yes, MKI67, TOP2A, BIRC5

<|Context|>Corneal endothelial cells form a single layer that maintains fluid balance within the cornea to preserve transparency.<|Question|>Which genes in [corneal endothelial cell gene expression] might regulate corneal fluid balance?<|Answer|>Genes such as “SLC4A4” and “SLC4A11” from [corneal endothelial cell gene expression] suggest involvement in bicarbonate transport and fluid regulation. Additionally, “CA12” could assist in maintaining ionic gradients for fluid homeostasis. Together, these genes support endothelial function in corneal hydration control.<|Keyword|>SLC4A4, SLC4A11, CA12

<|Context|>Basal cells in the corneal epithelium continually transition into wing and superficial cells, maintaining a healthy epithelial surface.<|Question|>Which expressed genes in [basal cell gene expression] hint at basal epithelial adhesion to the basement membrane?<|Answer|>The [basal cell gene expression] list includes “LAMA3,” “ITGA6,” and “COL17A1,” which facilitate interactions with the basement membrane. Their combined presence likely reinforces stable adhesion of basal epithelial cells to underlying corneal structures, underscoring this cell type’s role in epithelial anchoring.<|Keyword|>LAMA3, ITGA6, COL17A1

<|Context|>Leukocytes in the cornea can indicate immune surveillance or inflammation.<|Question|>Which immune-related genes from [leukocyte] cell gene expression suggest inflammatory responses in the cornea?<|Answer|>The [leukocyte] cell gene expression profile shows “PTPRC,” “FYN,” and “IKZF1,” which are known immune regulators. “PTPRC” (CD45) is expressed widely by leukocytes, “FYN” is involved in T-cell signaling, and “IKZF1” influences lymphocyte development. Their presence signals active immune or surveillance roles.<|Keyword|>PTPRC, FYN, IKZF1

<|Context|>Corneal fibroblasts (stromal keratocytes) can undergo changes in response to injury and contribute to fibrotic processes.<|Question|>Which genes in [fibroblast] cell gene expression could drive scar formation in corneal wound healing?<|Answer|>“COL6A2,” “COL12A1,” and “TGFBI” from [fibroblast] cell gene expression are key fibrotic markers. Elevated collagen transcripts (“COL6A2,” “COL12A1”) imply an increased extracellular matrix deposition, while “TGFBI” can mediate fibrotic pathways. Altogether, these genes suggest a fibrotic remodeling response.<|Keyword|>COL6A2, COL12A1, TGFBI

<|Context|>Transit amplifying cells differentiate toward mature epithelial phenotypes and keep the tissue surface robust.<|Question|>How does [transit amplifying cell gene expression] underscore their transitional state between basal and terminally differentiated corneal cells?<|Answer|>The coexpression of proliferation markers (e.g., “MKI67”) with early differentiation genes like “KRT5” in [transit amplifying cell gene expression] indicates these cells are actively dividing yet beginning to acquire corneal-specific structural proteins. This aligns with a transitional, rapidly dividing phenotype.<|Keyword|>MKI67, KRT5

<|Context|>The corneal endothelium is integral to preventing corneal edema by regulating ion pumps.<|Question|>Which [corneal endothelial cell gene expression] features suggest an active fluid pump mechanism for corneal deturgescence?<|Answer|>Genes such as “ATP1A2” and “SLC4A4” found in [corneal endothelial cell gene expression] are central to ion transport. “ATP1A2” drives active ion exchange, and “SLC4A4” regulates bicarbonate flux. Together, they likely maintain the fluid pump mechanism critical for preventing excessive fluid accumulation.<|Keyword|>ATP1A2, SLC4A4

<|Context|>Corneal epithelial basal cells anchor tightly to the basement membrane for continual renewal.<|Question|>Which transcripts from [basal cell gene expression] might be essential for epithelial integrity and renewal?<|Answer|>“KRT14” and “PAX6” in [basal cell gene expression] are major players. “KRT14” helps form the cytoskeletal framework in basal epithelial layers, while “PAX6” is crucial for ocular epithelial development and regeneration. Their high expression supports structural integrity and ongoing renewal.<|Keyword|>KRT14, PAX6

<|Context|>Leukocytes in the cornea can reflect disease states such as infection or autoimmunity.<|Question|>From [leukocyte] cell gene expression, do you see indications of T-cell or B-cell lineage involvement?<|Answer|>Yes. “IKZF1” and “CD69” in [leukocyte] cell gene expression are suggestive of T-cell activity, as IKZF1 is involved in lymphocyte differentiation, and CD69 is an early activation marker. This pattern supports T-cell involvement in immune responses within the cornea.<|Keyword|>yes, IKZF1, CD69

<|Context|>Corneal fibroblasts synthesize collagen to maintain corneal strength and transparency.<|Question|>Which [fibroblast] cell gene expression markers might relate to collagen fibril organization supporting corneal transparency?<|Answer|>“COL5A2” and “COL1A2” in [fibroblast] cell gene expression coordinate fibril diameter and spacing. “COL5A2” helps initiate fibril assembly, while “COL1A2” contributes to structural tensile strength. Their combined expression suggests a role in proper collagen architecture for clarity.<|Keyword|>COL5A2, COL1A2

<|Context|>Transit amplifying cells are a rapidly dividing subset in corneal epithelium, replacing worn-out surface cells.<|Question|>What genes in [transit amplifying cell gene expression] imply rapid cell turnover supporting epithelial surface upkeep?<|Answer|>Markers “BIRC5” and “UBE2C” from [transit amplifying cell gene expression] are associated with cell cycle progression and survival. Their high expression underscores sustained proliferative renewal, ensuring the corneal surface remains replenished and functional.<|Keyword|>BIRC5, UBE2C

<|Context|>Corneal endothelial cell dysfunction can lead to diseases such as Fuchs’ endothelial dystrophy.<|Question|>Which genes from [corneal endothelial cell gene expression] might be connected to corneal endothelial dystrophy risk?<|Answer|>“COL8A1” and “COL4A3” in [corneal endothelial cell gene expression] have been linked to endothelial dystrophies. “COL8A1,” for instance, may be involved in Descemet’s membrane structure. Alterations here could predispose cells to degenerative conditions like Fuchs’ dystrophy, impacting endothelial function.<|Keyword|>COL8A1, COL4A3

<|Context|>Basal epithelial cells of the cornea transition into specialized superficial layers that form a protective barrier.<|Question|>Which transcription factors in [basal cell gene expression] might regulate early steps of epithelial cell differentiation?<|Answer|>“PAX6” and “TFAP2B” from [basal cell gene expression] can govern the early differentiation of corneal epithelial cells. “PAX6” is an overarching eye-specific regulator, while “TFAP2B” modulates key epithelial programs, guiding basal cells toward their specialized surface identity.<|Keyword|>PAX6, TFAP2B

<|Context|>Leukocytes can help monitor and modulate corneal immune privilege.<|Question|>What elements in [leukocyte] cell gene expression suggest a regulatory role rather than a strictly proinflammatory function?<|Answer|>The presence of “SAT1” and “RASA2” in [leukocyte] cell gene expression indicates possible immunomodulatory actions. “SAT1” has roles in polyamine metabolism affecting immune responses; “RASA2” helps regulate Ras signaling. These findings suggest leukocytes here may balance vigilance with immune tolerance.<|Keyword|>SAT1, RASA2

<|Context|>Fibroblasts in the cornea can respond to external signals for matrix remodeling.<|Question|>Which signals in [fibroblast] cell gene expression are linked to remodeling and matrix turnover?<|Answer|>“MMP” family members are notably absent in the snippet, but “PITPNC1,” “DCN,” and “FRMD4A” appear in [fibroblast] cell gene expression. “DCN” modulates collagen fibril assembly, while “PITPNC1” may regulate phosphoinositide signaling. Together, they hint at dynamic matrix turnover and remodeling.<|Keyword|>PITPNC1, DCN, FRMD4A

<|Context|>Transit amplifying cells must maintain a balance between proliferation and commitment to terminal differentiation.<|Question|>Which genes in [transit amplifying cell gene expression] signal a shift toward differentiation rather than continued proliferation?<|Answer|>While many are proliferative, the detection of “KRT12” in [transit amplifying cell gene expression] indicates these cells also initiate cornea-specific differentiation. So, even with high proliferation markers, some cells begin expressing corneal-epithelium–specific genes, suggesting progression toward maturity.<|Keyword|>KRT12

<|Context|>The corneal endothelium is vital to nutrient exchange for the avascular cornea.<|Question|>From [corneal endothelial cell gene expression], which transporters are likely key to nutrient supply in the cornea?<|Answer|>“SLC4A4” and “GLS” appear in [corneal endothelial cell gene expression]. “SLC4A4” helps with bicarbonate transport crucial for pH balance, while “GLS” could aid in glutamine metabolism. These genes suggest an active role in nutrient and metabolite supply to corneal tissues.<|Keyword|>SLC4A4, GLS

<|Context|>Basal corneal epithelial cells are crucial for maintaining a functional barrier against pathogens.<|Question|>Which [basal cell gene expression] factors may support barrier integrity to resist microbial invasion?<|Answer|>“KRT14” ensures a robust cytoskeletal network, and “LAMA3” helps attach cells to basement membranes. Both in [basal cell gene expression] reinforce epithelial tightness, reducing gaps for pathogens. This structural cohesion strengthens the corneal epithelial barrier against microbial challenges.<|Keyword|>KRT14, LAMA3

<|Context|>Leukocytes in the cornea might be recruited in conditions like microbial keratitis or autoimmune disease.<|Question|>Do [leukocyte] cell gene expression profiles hint at potential chemotactic behavior in corneal inflammation?<|Answer|>Yes. Genes like “CCND3” and “DOCK8” in [leukocyte] cell gene expression can be associated with cell migration and proliferation. “DOCK8” influences cytoskeletal rearrangement, promoting movement toward chemokines. This indicates leukocytes might actively migrate into inflamed or infected corneas.<|Keyword|>yes, CCND3, DOCK8


"""

run the following cell to convert the output into a hf dataset

In [143]:
# Split the pasted LLM output into one record per QA pair.
# Every pair starts with <|Context|>, so splitting on it yields one chunk per pair
# (plus the text in front of the first tag).
qa_dict_list = []
for QA_pair in output.split("<|Context|>"):
    # Skip empty and whitespace-only chunks, e.g. the newline before the first pair,
    # so they are not reported as malformed pairs.
    if not QA_pair.strip():
        continue
    try:
        context = QA_pair.split("<|Question|>")[0].strip()
        question = QA_pair.split("<|Question|>")[1].split("<|Chain of Thoughts|>")[0].strip()
        chain_of_thoughts = QA_pair.split("<|Chain of Thoughts|>")[1].split("<|Answer|>")[0].strip()
        answer = QA_pair.split("<|Answer|>")[1].split("<|Keyword|>")[0].strip()
        label = QA_pair.split("<|Keyword|>")[1].strip()
    except IndexError:
        # A `[1]` lookup fails exactly when one of the expected tags is missing from the chunk.
        print("Error in the following QA pair:")
        print(QA_pair)
        continue

    entry = {
        "Context": context,
        "Summary_Dataset": dataset_in_prompt,
        "Question": question, 
        "Chain of Thoughts": chain_of_thoughts,
        "Answer": answer,
        "Keyword": label,
        # The chunk begins with the context text (the split removed its tag), so the tag to
        # restore is <|Context|>; the previous "<|Question|>" prefix produced a record with
        # two question tags and no context tag.
        "full_QA_pair": "<|Context|>"+QA_pair.strip(), 
        "Dataset_Name": dataset_name,
        "Publication_URL": cur_url,
        "Dataset_Index": dataset_index,
        "Used_Rows": sampled_indices,
    }
    qa_dict_list.append(entry)

if not qa_dict_list:
    # Fail with an explanation instead of an IndexError on qa_dict_list[0] below.
    raise ValueError(
        "No QA pair could be parsed from `output`; every pair needs the <|Context|>, "
        "<|Question|>, <|Chain of Thoughts|>, <|Answer|> and <|Keyword|> tags."
    )

qa_dict = {key: [d[key] for d in qa_dict_list] for key in qa_dict_list[0].keys()} # list of dicts to dict of lists

new_hf_dataset = Dataset.from_dict(qa_dict)
print("Example:")
new_hf_dataset[0]

Error in the following QA pair:


Example:


{'Context': 'These cells contribute to the stromal structure in the cornea, helping maintain its clarity and tensile strength. They are associated with matrix production and remodeling.',
 'Summary_Dataset': 'Cell Type: fibroblast, Tissue: cornea, Gene Expression: MALAT1 NEAT1 PDE4D ITGBL1 RORA ALDH3A1 COL5A2 ZBTB20 FTX AKAP13 DCN COL12A1 CEMIP CH507-528H12 MSI2 ZFHX3 NEBL COL6A3 DOCK5 WWOX SNX29 IMMP2L PTPRM SASH1 ABCA6 DLG2 WSB1 COL4A3 TCF12 UTRN RERE PLXDC2 RIPOR2 LSAMP AUTS2 TMTC1 SVIL ITSN1 PTPRG BICC1 FKBP5 MACF1 SIPA1L1 C1ORF21 ELF2 SPOCK1 DDX17 AC003090 REV3L AOPEP COLEC12 COL1A2 ABI3BP USP34 TTC28 COL6A2 CACNA2D3 FNDC3B BIRC6 FRMD4A PITPNC1 ASPH NFIA MME MKLN1 PSD3 SH3D19 TNRC6A MYCBP2 TFAP2B ENO1 C20ORF194 ITGB4 TNRC6B COL6A1 MIR99AHG CFH CHD9 MYO1D MAP4K3 AKT3 EFNA5 CNTN3 ZBTB16 EIF4G3 ARID1B PTK2 CARMIL1 MAMDC2 PLPP3 COG5 TPCN1 CDON PNISR COL4A4 PDGFD SLC20A1 DLG1 FNDC3A AFF3 BCAS3 STAG1 TGFBI GLUL LUC7L3 PLCB4 STXBP5 GPHN PIAS1 AKAP9 SRSF11 PTPN13 SBF2 APBB2 MAGI2 INPP4B I

run the following cells every time to save updates \
sometimes save_to_disk will fail because it doesn't automatically overwrite an existing saved dataset; just delete the old one and run it again

In [144]:
# Append the freshly parsed QA pairs to the accumulated dataset.
# NOTE: this rebinds hf_dataset to the concatenation, so running the cell again without
# rebuilding new_hf_dataset appends the same rows a second time.
hf_dataset = concatenate_datasets([hf_dataset, new_hf_dataset])
hf_dataset


  block_group = [InMemoryTable(cls._concat_blocks(list(block_group), axis=axis))]
  table = cls._concat_blocks(blocks, axis=0)


Dataset({
    features: ['Context', 'Summary_Dataset', 'Question', 'Answer', 'Keyword', 'full_QA_pair', 'Dataset_Name', 'Publication_URL', 'Dataset_Index', 'Used_Rows'],
    num_rows: 489
})

In [145]:
# Write to a sibling "<hf_dataset_output_path>_new" directory, not to hf_dataset_output_path itself.
# NOTE(review): the loading cell reads hf_dataset_output_path, so the "_new" copy presumably has
# to be moved over it by hand before the next session picks up these rows - confirm.
hf_dataset.save_to_disk(hf_dataset_output_path+"_new")
print("current hf_dataset: ")
hf_dataset

Saving the dataset (1/1 shards): 100%|██████████| 489/489 [00:00<00:00, 13968.73 examples/s]

current hf_dataset: 





Dataset({
    features: ['Context', 'Summary_Dataset', 'Question', 'Answer', 'Keyword', 'full_QA_pair', 'Dataset_Name', 'Publication_URL', 'Dataset_Index', 'Used_Rows'],
    num_rows: 489
})