In [1]:
import torch
import numpy as np
import json
import os
import re
import pandas as pd
import shutil

from datasets import Dataset, load_from_disk, concatenate_datasets, DatasetDict

seed = 42
torch.manual_seed(seed)
np.random.seed(seed)

  from .autonotebook import tqdm as notebook_tqdm


helper functions and constants

In [6]:
def dataset_for_prompt(df):
    # number of rows to sample
    num_rows = min(len(df), np.random.randint(5, 25))
    df = df.sample(num_rows)
    #number of genes to sample
    num_genes = 2000 // num_rows

    use_d = True
    if len(df['diseases'].unique()) == 1:
        use_d = False

    s = ""
    sampled_indices = []
    for index, row in df.iterrows():
        cur_num_genes = min(num_genes+np.random.randint(-20, 20), 1000)
        cell_sentence = ' '.join(row[f'gene_{i}'] for i in range(1, cur_num_genes))
        if use_d:
            s += f"Cell Type: {row['cell_type']}, Tissue: {row['tissue']}, Disease: {row['diseases']}, Gene Expression: {cell_sentence}\n"
        else:
            s += f"Cell Type: {row['cell_type']}, Tissue: {row['tissue']}, Gene Expression: {cell_sentence}\n"
        sampled_indices.append(index)
        
    return s, sampled_indices

prompt_prefix = """This is a scientific manuscript and an example of the single-cell RNA sequencing dataset it is associated with. The example dataset contains one representative cell from each type and the corresponding cell sentence(ranked gene names by expression). You need to create question-answer pairs for Large Language Models to learn analyzing scRNA-seq datasets when given pieces of data converted into cell sentences. I would like questions and answers about the given cell sentence in terms of cells, tissues, disease, or other relevant biological information. 
The questions must be answerable by looking at the given cell sentences alone. The question-answer pairs should follow a line of reasoning mentioned in the manuscript or results arrived by the researchers. When creating questions, you should first read the given manuscript, consider what the analysis the researchers did in the manuscript and the biological context of this study. Then look at the example cell sentences(or a part of them), think about what question the researchers asked or would ask about the cell sentence, what they wanted to and can infer, study, or understand. Imitate the interpretation of data in the manuscript. Then answer the question, quote specific cell sentences and genes as evidence, and include a chain of thoughts. You can borrow reasoning and analysis, conclusions from the manuscript but don't directly use or quote the manuscript in question or answer. Avoid questions about experiment design or procedures, avoid vague questions. Try to vary the type of things the question asks and the wording, include both open ended questions and yes or no questions.
Provide Context for the question, this is only the essential background information required to answer the question. Find context from the manuscript. Provide Answer, including detailed reasonings. Refer to specific cell sentences as [type] cell gene expression. Provide Keywords of the answer being the most essential parts of the correct answer. The keywords should be yes or no(must include if it is a yes or no question), or important gene names, or biological information in question. These keywords will be used during my training to validate my model's response. 
Give 20 question answer pairs as one list, don't give any other word. Strictly format like this: 
<|Context|>the context<|Question|>the content of the question<|Answer|>the content of the answer<|Keyword|>the answer keywords
<|Context|>the context<|Question|>the content of the question<|Answer|>the content of the answer<|Keyword|>the answer keywords
...
"""

loads all datasets and meta data \
assumes summary datasets are in a directory as csv files, assuming the format processed by Harry \
all files I'm looking at can be found in my C2S-RL dev github repo branch

In [7]:
# TODO: change to your own file path
datasets_directory = "/users/zlyu12/Desktop/c2s-RL/Dec19_dataset" # all summary datasets
meta_data_path = "/users/zlyu12/Desktop/c2s-RL/Create_Dataset/meta_data.json" # dataset name, filename, url etc.
hf_dataset_output_path = "/users/zlyu12/Desktop/c2s-RL/Create_Dataset/temp_hf_dataset" # temporary output directory

datasets_files_paths = [os.path.join(datasets_directory, f) for f in os.listdir(datasets_directory) if f.endswith('.csv')]
dataset_numbers = [re.search(r'\((\d+)\)', f).group(1) for f in datasets_files_paths]

meta_data = json.load(open(meta_data_path))
dataset_numbers_iterator = iter(dataset_numbers)

try:
    hf_dataset = load_from_disk(hf_dataset_output_path)
except:
    hf_dataset = Dataset.from_dict({})


iterates through all datasets in your directory \
this prepares the prompt \
copy paste, add the publication text at the end and run our favorite LLM \
run only once for a row, we rely on some variables defined here in later cells

In [9]:
dataset_index = next(dataset_numbers_iterator)
dataset_name = [k for k,v in meta_data.items() if f'({dataset_index})' in v.get('filename', '')]
if len(dataset_name) != 1:
    print(dataset_name)
    print(f"dataset \"{dataset_name}\" not in meta_data!")
else:
    dataset_name = dataset_name[0]
    cur_url = meta_data[dataset_name]['url']
    dataset_file_path = [path for path in datasets_files_paths if f"({dataset_index})" in path][0]
    dataset_df = pd.read_csv(dataset_file_path)
    dataset_in_prompt, sampled_indices = dataset_for_prompt(dataset_df)
    print("Dataset Index: ", dataset_index)
    print("\nDataset Name: ", dataset_name)
    print("\nPublication URL: ", cur_url)
    print("\nPrompt:\n", prompt_prefix + "Dataset: \n" + dataset_in_prompt + "Manuscript:\n")



Dataset Name:  UMAP of Endothelial cells

Publication URL:  https://cellxgene.cziscience.com/collections/a18474f4-ff1e-4864-af69-270b956cee5b

Prompt:
 This is a scientific manuscript and an example of the single-cell RNA sequencing dataset it is associated with. The example dataset contains one representative cell from each type and the corresponding cell sentence(ranked gene names by expression). You need to create question-answer pairs for Large Language Models to learn analyzing scRNA-seq datasets when given pieces of data converted into cell sentences. I would like questions and answers about the given cell sentence in terms of cells, tissues, disease, or other relevant biological information. 
The questions must be answerable by looking at the given cell sentences alone. The question-answer pairs should follow a line of reasoning mentioned in the manuscript or results arrived by the researchers. When creating questions, you should first read the given manuscript, consider what t

Just copy paste the output into the next cell, shouldn't need to do additional processing

In [10]:
output = """
<|Context|>In this study, endothelial cells in both normal and metaplastic tissues can show altered gene expression patterns reflecting changes in their microenvironment. Researchers are interested in identifying whether certain cell states in intestinal metaplasia resemble gastric or intestinal phenotypes and if there is an association with inflammation or tissue remodeling.
<|Question|>Do the vein endothelial cells from the lower esophagus in a normal state exhibit a gene expression profile that suggests a stable, noninflamed microenvironment compared to metaplastic conditions?
<|Answer|>Yes. The [vein endothelial cell (lower esophagus, normal) gene expression] includes stable housekeeping genes (e.g., RPL10, EEF1A1), common endothelial markers (e.g., VIM), and limited stress-responsive genes. Crucially, while it contains broad markers like MALAT1 and B2M, it lacks strong inflammatory markers (e.g., fewer interferon-inducible genes like IFITM2 or IFI27) that appear more frequently in diseased states. This suggests a steady endothelial phenotype without evident inflammatory signals.
<|Keyword|>yes, stable microenvironment, MALAT1, B2M, VIM

<|Context|>Intestinal metaplasia can induce a mosaic phenotype in surrounding cells, including endothelial cells. Researchers analyze gene expression to detect subtle shifts toward either a more gastric- or intestinal-like profile.
<|Question|>In the endothelial cell of the artery from the duodenum (normal), do we see any genes associated with a more mature intestinal environment, reflecting its native intestinal tissue location?
<|Answer|>Yes. The [endothelial cell of artery (duodenum, normal) gene expression] shows transcripts like PLVAP and IGFBP7, which are associated with fenestrated and nutrient-absorptive vascular beds common in intestinal tissues. The presence of genes such as APOA4 and APOA1 also aligns with nutrient handling in the intestinal vasculature. These expression patterns support a phenotype consistent with a well-defined intestinal environment.
<|Keyword|>yes, APOA4, APOA1, PLVAP, IGFBP7, intestinal environment

<|Context|>In metaplastic lesions, certain endothelial cells may express genes linked to inflammation or epithelial remodeling.
<|Question|>Does the endothelial cell of the artery in the lower esophagus with Barrett esophagus show increased expression of genes related to immune response compared to normal esophageal endothelial cells?
<|Answer|>Yes. The [endothelial cell of artery (lower esophagus, Barrett esophagus) gene expression] includes IFI27, HLA-DRB1, and HLA-A, reflecting heightened antigen presentation and immune signaling. These differ from the normal lower esophagus endothelial profile, which had fewer such immune-related genes. The presence of IFI27 and multiple HLA genes suggests an environment more engaged with immune surveillance and possible inflammatory processes.
<|Keyword|>yes, IFI27, HLA-DRB1, HLA-A, immune response

<|Context|>Intestinal metaplasia often involves shifts in cellular phenotypes, including endothelial adaptation to a changing tissue environment.
<|Question|>In the vein endothelial cell from the colon (normal), do we observe markers indicating a strong baseline intestinal vascular identity?
<|Answer|>Yes. The [vein endothelial cell (colon, normal) gene expression] includes PLVAP, FABP5, and IGFBP7, which are commonly associated with intestinal endothelium that supports nutrient absorption and metabolic activity. Additionally, genes like IFI27 are less prominent, suggesting less active inflammation compared to metaplastic conditions. This pattern is consistent with a stable intestinal vascular environment.
<|Keyword|>yes, PLVAP, FABP5, IGFBP7, intestinal identity

<|Context|>Metaplastic changes can lead to the presence of stress or remodeling signals in cells.
<|Question|>Does the endothelial cell of artery in the body of the stomach with a stomach disorder show increased expression of transcriptional regulators related to cellular stress or remodeling compared to normal stomach endothelium?
<|Answer|>Yes. The [endothelial cell of artery (body of stomach, stomach disorder) gene expression] includes JUNB, FOS, and ZFP36. JUNB and FOS are immediate early genes tied to stress responses and tissue remodeling. Their presence suggests a shift toward a microenvironment undergoing adaptive changes, unlike stable normal stomach endothelium that shows fewer stress-associated transcription factors.
<|Keyword|>yes, JUNB, FOS, ZFP36, stress response

<|Context|>Gastritis and other inflammatory conditions can shape endothelial gene expression toward more immunomodulatory profiles.
<|Question|>Do capillary endothelial cells from the body of the stomach with gastritis show gene changes indicating an activated inflammatory environment?
<|Answer|>Yes. The [capillary endothelial cell (body of stomach, gastritis) gene expression] includes IFI27 and IFITM3, which are interferon-stimulated genes linked to immune activation. Their presence, alongside stress-related markers like JUNB and FOS, indicates that these cells encounter and respond to inflammatory signals within the gastric mucosa affected by gastritis.
<|Keyword|>yes, IFI27, IFITM3, JUNB, FOS, inflammation

<|Context|>Atrophic changes and metaplasia can alter the typical endothelial gene expression patterns, introducing intestinal-like features in the esophagus.
<|Question|>In vein endothelial cells from the lower esophagus with Barrett esophagus, is there any sign of partial intestinal-like signaling based on gene expression?
<|Answer|>Yes. The [vein endothelial cell (lower esophagus, Barrett esophagus) gene expression] includes genes like IFI27, IFITM3, and TFF1. Although not classic intestinal epithelial markers, TFF1 is often associated with mucosal protection seen in intestinal and gastric epithelia. The co-presence of stress response (IFI27) and a protective mucosal factor (TFF1) suggests subtle shifts toward a more intestinal-like microenvironment compared to strictly normal esophageal endothelium.
<|Keyword|>yes, TFF1, IFI27, intestinal-like

<|Context|>In normal tissues, endothelial cells support stable tissue function, whereas in metaplasia, they may show expression of extracellular matrix or angiogenic signals.
<|Question|>Do the capillary endothelial cells from the submucosal esophageal gland (normal) show a simple housekeeping and structural gene profile rather than an activated, remodeling profile?
<|Answer|>Yes. The [capillary endothelial cell (submucosal esophageal gland, normal) gene expression] is dominated by ribosomal proteins (RPL41, RPL32), metabolic regulators (TPT1, GAPDH), and basic vascular markers (CLDN5). There are fewer immediate early genes (like FOS or JUN) or interferon-related genes (IFI27), indicating a stable, non-remodeling state typical of normal, healthy tissue.
<|Keyword|>yes, RPL41, RPL32, CLDN5, stable state

<|Context|>Chronic inflammation and metaplasia can trigger expression of genes linked to tissue remodeling or altered vascular permeability.
<|Question|>In vein endothelial cells from the submucosal esophageal gland (normal), is there evidence of an active remodeling phenotype?
<|Answer|>No. The [vein endothelial cell (submucosal esophageal gland, normal) gene expression] largely consists of housekeeping genes (RPS27, RPS28, RPL32), with minimal immediate early genes or inflammatory genes. This lack of stress markers (e.g., IFI27, FOS) suggests the absence of a remodeling phenotype.
<|Keyword|>no, RPS27, RPL32, no remodeling

<|Context|>In metaplastic and inflamed tissues, endothelial cells may upregulate immune-related genes, including HLA class I and II molecules.
<|Question|>Do endothelial cells from the artery in the body of the stomach under stomach disorders express more HLA genes suggestive of enhanced antigen presentation?
<|Answer|>Yes. The [endothelial cell of artery (body of stomach, stomach disorder) gene expression] includes HLA-B, HLA-A, and HLA-C. The enrichment of multiple HLA genes indicates increased antigen presentation capability, reflecting a more interactive role with the immune system in this diseased state.
<|Keyword|>yes, HLA-B, HLA-A, HLA-C, antigen presentation

<|Context|>Intestinal metaplasia is associated with a mosaic environment blending features of gastric and intestinal identity, influencing stromal and endothelial responses.
<|Question|>Do the capillary endothelial cells from the lower esophagus (normal) show any significant markers of intestinal-like adaptation?
<|Answer|>No. The [capillary endothelial cell (lower esophagus, normal) gene expression] includes general markers like MALAT1, B2M, and structural components like RPL10, but lacks distinct intestinal-associated markers (e.g., APOA4, APOA1, or strong interferon-induced genes). This suggests an absence of intestinal-like adaptation in the normal esophagus.
<|Keyword|>no, MALAT1, B2M, no intestinal adaptation

<|Context|>As diseases progress, endothelial cells may express cytokine-related or stress-response genes tied to tissue injury.
<|Question|>In capillary endothelial cells from submucosal esophageal gland (normal), is there evidence of cytokine-driven remodeling (e.g., strong expression of immediate early genes)?
<|Answer|>No. The [capillary endothelial cell (submucosal esophageal gland, normal) gene expression] is mainly housekeeping and structural. There is no marked expression of immediate early genes like FOS or JUN. The profile is more akin to a stable, quiescent endothelium, without prominent cytokine-driven remodeling signals.
<|Keyword|>no, stable endothelium, no FOS, no JUN

<|Context|>Disease states often upregulate stress response genes, which can serve as markers of early pathological changes.
<|Question|>Do vein endothelial cells from the body of the stomach (normal) express fewer stress-related transcription factors than cells from stomach disorder conditions?
<|Answer|>Yes. The [vein endothelial cell (body of stomach, normal) gene expression] relies heavily on ribosomal and metabolic genes (RPL41, RPS27, RPL10), without the pronounced presence of immediate early genes (FOS, JUN) or interferon-induced genes (IFI27). Compared to stomach disorder conditions, these normal endothelial cells are less stress-activated.
<|Keyword|>yes, RPL41, RPS27, no stress factors

<|Context|>In pathologic states, endothelial cells may reflect the presence of growth factors or angiogenic signals.
<|Question|>Do endothelial cells of the artery in the lower esophagus under Barrett esophagus express markers associated with angiogenic remodeling (e.g., endothelial growth factors)?
<|Answer|>Yes. The [endothelial cell of artery (lower esophagus, Barrett esophagus) gene expression] includes TIE1 (an endothelial receptor), and RAMP2 (related to angiogenic pathways), suggesting a shift toward a vasculature supportive of altered tissue states. TIE1 in particular can indicate active remodeling in metaplastic conditions.
<|Keyword|>yes, TIE1, RAMP2, angiogenesis

<|Context|>Metaplasia involves interplay of different cell types, with endothelial cells potentially reflecting changes in nutrient transport and tissue integrity.
<|Question|>In vein endothelial cells from the colon (normal), are there markers that indicate efficient nutrient handling characteristic of a healthy intestinal setting?
<|Answer|>Yes. The [vein endothelial cell (colon, normal) gene expression] has FABP5, a fatty acid-binding protein, and IGFBP7, involved in metabolic regulation. These suggest a well-adapted nutrient exchange system, consistent with a stable and functional intestinal vasculature.
<|Keyword|>yes, FABP5, IGFBP7, nutrient handling

<|Context|>Intestinal metaplasia and associated conditions can alter the balance of endothelial surface molecules, potentially affecting leukocyte trafficking.
<|Question|>Does the endothelial cell of artery in stomach disorder states express immune-related genes that would facilitate immune cell adhesion or interaction?
<|Answer|>Yes. The [endothelial cell of artery (body of stomach, stomach disorder) gene expression] includes HLA-B, HLA-C, and IFI27. This combination suggests an environment that is more immune-interactive, potentially enhancing leukocyte-endothelium interactions common in inflamed or metaplastic tissues.
<|Keyword|>yes, HLA-B, HLA-C, IFI27, immune interaction

<|Context|>As tissues progress towards metaplasia, even supportive cells like endothelium can express genes indicating microenvironmental shifts.
<|Question|>In the capillary endothelial cell from the body of the stomach with gastritis, do we see any subtle changes toward intestinal-like features, despite its gastric origin?
<|Answer|>Yes. The [capillary endothelial cell (body of stomach, gastritis) gene expression] includes IFI27 and IFITM3, which can appear in inflamed intestinal conditions as well, and a mix of metabolic genes like FABP5. While not overtly intestinal, the stress/inflammatory profile resembles patterns seen in intestinalized environments undergoing remodeling.
<|Keyword|>yes, IFI27, IFITM3, FABP5, intestinal-like signals

<|Context|>Fibrotic or remodeling signals often appear in disease states, potentially reflected by changes in gene sets of endothelial cells.
<|Question|>In the endothelial cell of artery from duodenum (normal), is there less evidence of fibrotic or desmoplastic gene expression compared to gastric or esophageal metaplastic states?
<|Answer|>Yes. The [endothelial cell of artery (duodenum, normal) gene expression] mainly features typical vascular and metabolic genes (PLVAP, IGFBP7) without strong immediate early genes (FOS, JUN) or ECM-related markers (POSTN) that appear in fibrotic conditions. This indicates a lack of desmoplastic or fibrotic signals in the normal duodenum endothelium.
<|Keyword|>yes, PLVAP, IGFBP7, no POSTN, no fibrosis

<|Context|>Inflammatory and metaplastic processes can lead to increased expression of certain immunomodulatory and tissue remodeling genes in endothelial cells.
<|Question|>Do vein endothelial cells from the lower esophagus normal tissue show fewer immunomodulatory genes than Barrett esophagus conditions?
<|Answer|>Yes. The [vein endothelial cell (lower esophagus, normal) gene expression] lacks strong immunomodulatory genes like IFI27 or multiple HLA class II variants. In contrast, Barrett esophagus samples feature those genes. This suggests that normal endothelial cells in the lower esophagus have a more quiescent immune profile.
<|Keyword|>yes, fewer HLA, fewer IFI27, quiescent immune profile

<|Context|>In metaplastic tissues, endothelial cells can reflect features of both gastric and intestinal microenvironments.
<|Question|>In capillary endothelial cells from the cardia of the stomach with a stomach disorder, do we see gene expression patterns suggesting a transitional environment influenced by both gastric and intestinal signals?
<|Answer|>Yes. The [capillary endothelial cell (cardia of stomach, stomach disorder) gene expression] includes intestinal-like factors (e.g., IFI27), gastric-endothelial markers (TIE1), and metabolic genes (GSN, FABP5). The coexistence of these markers indicates a transitional state, reflecting both gastric origin and intestinal-like signals, consistent with a mosaic phenotype in metaplastic conditions.
<|Keyword|>yes, IFI27, TIE1, FABP5, mosaic environment

"""

run the following cell to convert the output into a hf dataset

In [11]:
qa_dict_list = []
for QA_pair in output.split("<|Context|>"):
    if not QA_pair:
        continue
    try:
        context = QA_pair.split("<|Question|>")[0].strip()
        question = QA_pair.split("<|Question|>")[1].split("<|Answer|>")[0].strip()
        answer = QA_pair.split("<|Answer|>")[1].split("<|Keyword|>")[0].strip()
        label = QA_pair.split("<|Keyword|>")[1].strip()
    except:
        print("Error in the following QA pair:")
        print(QA_pair)
        continue

    entry = {
        "Context": context,
        "Summary_Dataset": dataset_in_prompt,
        "Question": question, 
        "Answer": answer,
        "Keyword": label,
        "full_QA_pair": "<|Question|>"+QA_pair.strip(), 
        "Dataset_Name": dataset_name,
        "Publication_URL": cur_url,
        "Dataset_Index": dataset_index,
        "Used_Rows": sampled_indices,
    }
    qa_dict_list.append(entry)

qa_dict = {key: [d[key] for d in qa_dict_list] for key in qa_dict_list[0].keys()} # list of dicts to dict of lists

new_hf_dataset = Dataset.from_dict(qa_dict)
print("Example:")
new_hf_dataset[0]

Error in the following QA pair:


Example:


{'Context': 'In this study, endothelial cells in both normal and metaplastic tissues can show altered gene expression patterns reflecting changes in their microenvironment. Researchers are interested in identifying whether certain cell states in intestinal metaplasia resemble gastric or intestinal phenotypes and if there is an association with inflammation or tissue remodeling.',
 'Summary_Dataset': 'Cell Type: vein endothelial cell, Tissue: lower esophagus, Disease: normal, Gene Expression: MALAT1 B2M TMSB4X RPL10 EEF1A1 RPS18 RPL13 RPS6 RPLP1 RPL41 RPL13A PTMA RPS2 RPS14 RPL32 MT-CO1 KRT13 RPS27A RPL34 RPL3 RPS19 RPL11 S100A9 RPS4X RPS3 RPS8 RPL19 RPL21 RPS12 ACTB RPL18A S100A8 RPL8 RPS27 MT-CO3 RPLP2 MT-ND4 TMSB10 MT-CO2 RPS23 RPS15A MT-CYB TPT1 RPS15 RPL15 RPS9 HLA-B RPL12 RPL9 HLA-A MT-ND2 RPS7 RPL28 RPS3A SPRR3 VIM RPL27A RPL18 ITM2B RPL35A RPL30 RPS5 RPL10A EIF1 RPS16 CSTB RPL29 FTH1 RPL7 RPS20 RPS13 RPL6 RPL7A RPL23A RPL26 FAU RPL35 HLA-C RPL31 H3-3B RPS28 ACTG1 MT-ATP6 IGFBP7 

run the following cell every time to save updates \
sometimes save_to_disk will fail because it doesn't automatically overwrites, just delete the old file adn run it again

In [12]:
hf_dataset = concatenate_datasets([hf_dataset, new_hf_dataset])
if os.path.exists(hf_dataset_output_path):
    shutil.rmtree(hf_dataset_output_path)
hf_dataset.save_to_disk(hf_dataset_output_path)
print("current hf_dataset: ")
hf_dataset

Saving the dataset (1/1 shards): 100%|██████████| 20/20 [00:00<00:00, 1120.41 examples/s]

current hf_dataset: 





Dataset({
    features: ['Context', 'Summary_Dataset', 'Question', 'Answer', 'Keyword', 'full_QA_pair', 'Dataset_Name', 'Publication_URL', 'Dataset_Index', 'Used_Rows'],
    num_rows: 20
})