In [1]:
import torch
import numpy as np
import json
import os
import re
import pandas as pd
import shutil

from datasets import Dataset, load_from_disk, concatenate_datasets, DatasetDict

seed = 42
torch.manual_seed(seed)
np.random.seed(seed)

  from .autonotebook import tqdm as notebook_tqdm


helper functions and constants

In [3]:
def dataset_for_prompt(df):
    # number of rows to sample
    num_rows = min(len(df), np.random.randint(5, 25))
    df = df.sample(num_rows)
    #number of genes to sample
    num_genes = 2000 // num_rows

    use_d = True
    if len(df['diseases'].unique()) == 1:
        use_d = False

    s = ""
    sampled_indices = []
    for index, row in df.iterrows():
        cur_num_genes = min(num_genes+np.random.randint(-20, 20), 1000)
        cell_sentence = ' '.join(row[f'gene_{i}'] for i in range(1, cur_num_genes))
        if use_d:
            s += f"Cell Type: {row['cell_type']}, Tissue: {row['tissue']}, Disease: {row['diseases']}, Gene Expression: {cell_sentence}\n"
        else:
            s += f"Cell Type: {row['cell_type']}, Tissue: {row['tissue']}, Gene Expression: {cell_sentence}\n"
        sampled_indices.append(index)
        
    return s, sampled_indices

prompt_prefix = """This is a scientific manuscript and an example of the single-cell RNA sequencing dataset it is associated with. The example dataset contains one representative cell from each type and the corresponding cell sentence(ranked gene names by expression). You need to create question-answer pairs for Large Language Models to learn analyzing scRNA-seq datasets when given pieces of data converted into cell sentences. I would like questions and answers about the given cell sentence in terms of cells, tissues, disease, or other relevant biological information. 
The questions must be answerable by looking at the given cell sentences alone. The question-answer pairs should follow a line of reasoning mentioned in the manuscript or results arrived by the researchers. When creating questions, you should first read the given manuscript, consider what the analysis the researchers did in the manuscript and the biological context of this study. Then look at the example cell sentences(or a part of them), think about what question the researchers asked or would ask about the cell sentence, what they wanted to and can infer, study, or understand. Imitate the interpretation of data in the manuscript. Then answer the question, quote specific cell sentences and genes as evidence, and include detailed explanations and reasonings resemble a chain of thoughts. You can borrow reasoning and analysis, conclusions from the manuscript but don't directly use or quote the manuscript in question or answer. Avoid questions about experiment design or procedures, avoid vague questions. Vary the questions, answers and their wording as much as possible to cover a diverse range of topics. Include both open ended questions and yes or no questions.
Provide Context for the question, this is only the essential background information required to answer the question, do not directly summarize or describe the dataset. Find context from the manuscript. Provide Answer no less than 30 words, including detailed reasonings. Refer to specific cell sentences as [type] cell gene expression. Also include a chain of thoughts as intermediate reasoning and analysis steps, think about the important gene names and functions, gene sets. Provide Keywords of the answer being the most essential parts of the correct answer. The keywords should be yes or no(must include if it is a yes or no question), or important gene names, or biological information in question. These keywords will be used during my training to validate my model's response. 
Give 30 question answer pairs as one list, don't give any other word. Strictly format like this: 
<|Context|>the context<|Question|>the content of the question<|Chain of Thoughts|>the intermediate reasonings<|Answer|>the content of the answer<|Keyword|>the answer keywords
<|Context|>the context<|Question|>the content of the question<|Chain of Thoughts|>the intermediate reasonings<|Answer|>the content of the answer<|Keyword|>the answer keywords
...
"""

loads all datasets and meta data \
assumes summary datasets are in a directory as csv files, assuming the format processed by Harry \
all files I'm looking at can be found in my C2S-RL dev github repo branch

In [5]:
# TODO: change to your own file path
datasets_directory = "/users/zlyu12/Desktop/c2s-RL/Dec19_dataset" # all summary datasets
meta_data_path = "/users/zlyu12/Desktop/c2s-RL/Create_Dataset/meta_data.json" # dataset name, filename, url etc.
hf_dataset_output_path = "/users/zlyu12/Desktop/c2s-RL/Create_Dataset/temp_hf_dataset_new" # temporary output directory

datasets_files_paths = [os.path.join(datasets_directory, f) for f in os.listdir(datasets_directory) if f.endswith('.csv')]
dataset_numbers = np.sort([int(re.search(r'\((\d+)\)', f).group(1)) for f in datasets_files_paths])

meta_data = json.load(open(meta_data_path))
dataset_numbers_iterator = iter(dataset_numbers[500:])

try:
    hf_dataset = load_from_disk(hf_dataset_output_path)
except:
    hf_dataset = Dataset.from_dict({})

print(hf_dataset)

Dataset({
    features: [],
    num_rows: 0
})


iterates through all datasets in your directory \
this prepares the prompt \
copy paste, add the publication text at the end and run our favorite LLM \
run only once for a row, we rely on some variables defined here in later cells

In [65]:
dataset_index = str(next(dataset_numbers_iterator))
dataset_name = [k for k,v in meta_data.items() if f'({dataset_index})' in v.get('filename', '')]
if len(dataset_name) != 1:
    print("dataset index: ", dataset_index)
    print(dataset_name)
    print(f"dataset \"{dataset_name}\" not in meta_data!")
else:
    dataset_name = dataset_name[0]
    cur_url = meta_data[dataset_name]['url']
    dataset_file_path = [path for path in datasets_files_paths if f"({dataset_index})" in path][0]
    dataset_df = pd.read_csv(dataset_file_path)
    dataset_in_prompt, sampled_indices = dataset_for_prompt(dataset_df)
    print("Dataset Index: ", dataset_index)
    print("\nDataset Name: ", dataset_name)
    print("\nPublication URL: ", cur_url)
    print("\nPrompt:\n", prompt_prefix + "Dataset: \n" + dataset_in_prompt + "Manuscript:\n")


Dataset Index:  537

Dataset Name:  Nasopharynx

Publication URL:  https://cellxgene.cziscience.com/collections/35d0b748-3eed-43a5-a1c4-1dade5ec5ca0

Prompt:
 This is a scientific manuscript and an example of the single-cell RNA sequencing dataset it is associated with. The example dataset contains one representative cell from each type and the corresponding cell sentence(ranked gene names by expression). You need to create question-answer pairs for Large Language Models to learn analyzing scRNA-seq datasets when given pieces of data converted into cell sentences. I would like questions and answers about the given cell sentence in terms of cells, tissues, disease, or other relevant biological information. 
The questions must be answerable by looking at the given cell sentences alone. The question-answer pairs should follow a line of reasoning mentioned in the manuscript or results arrived by the researchers. When creating questions, you should first read the given manuscript, consider 

Just copy paste the output into the next cell, shouldn't need to do additional processing

In [61]:
output = """



<|Context|>Natural killer cells in the large intestine (normal) and small intestine (Crohn disease) show differences in certain immune-related genes.<|Question|>Do the natural killer cells from the small intestine with Crohn disease display elevated markers that hint at heightened immune activation compared to the natural killer cells from the large intestine with a normal condition?<|Chain of Thoughts|>Comparing the [natural killer cell, large intestine, normal, gene expression] with [natural killer cell, small intestine, Crohn disease, gene expression] reveals upregulation of TMSB4X, B2M, and immune-related genes like NKG7. This indicates a potential heightened immune response in the Crohn disease context.<|Answer|>Yes. The natural killer cell from the Crohn disease small intestine expresses multiple genes (for instance, TMSB4X, B2M, and NKG7) that are notably present and can be associated with increased immune reactivity. In particular, NKG7 has been implicated in cell-mediated cytotoxicity, and together with other immune-related genes, suggests enhanced immune activation in the diseased tissue. By contrast, the natural killer cell from the normal large intestine still expresses immune genes such as MALAT1 but lacks pronounced upregulation of certain inflammation-related markers.<|Keyword|>yes, NKG7, B2M, TMSB4X, heightened immune activation

<|Context|>Group 3 innate lymphoid cells (ILC3s) in the large intestine (normal) have characteristic expression patterns associated with immunoregulation.<|Question|>Which genes might indicate that these group 3 innate lymphoid cells contribute to gut immune homeostasis?<|Chain of Thoughts|>By examining the [group 3 innate lymphoid cell, large intestine, normal, gene expression], we see expression of IL7R, JUN, DUSP1, and RORC signaling components that can be tied to regulatory immune responses.<|Answer|>Genes such as IL7R, JUN, and DUSP1, observed in the group 3 innate lymphoid cell gene expression, suggest capabilities in modulating immune pathways and providing homeostatic signals in the gut. IL7R is critical for lymphocyte development; JUN and DUSP1 are linked to cellular stress and inflammation regulation. These collectively point to ILC3s playing an essential immunoregulatory role in maintaining intestinal balance.<|Keyword|>IL7R, JUN, DUSP1, immune homeostasis

<|Context|>Oligodendrocyte precursor cells can express typical ribosomal and structural protein genes in the large intestine (normal).<|Question|>What might the presence of cytoskeletal genes like ACTG1 and tubulin-related genes indicate about the oligodendrocyte precursor cell in the gut environment?<|Chain of Thoughts|>Looking at [oligodendrocyte precursor cell, large intestine, normal, gene expression], we see ACTG1, TUBA1A, and other structural proteins. These might imply cytoskeletal readiness for potential morphological changes or migration within the gut.<|Answer|>The presence of genes such as ACTG1 and tubulin-related genes (for example, TUBA1A) in the oligodendrocyte precursor cell gene expression profile suggests a supportive cytoskeletal framework for cellular processes like shape alteration and motility. Even though oligodendrocyte precursor cells are best known in the central nervous system, their expression of structural components here might assist in local adaptation and potential guidance or support roles within the intestinal milieu, reflecting a capacity for varied morphogenetic functions.<|Keyword|>ACTG1, TUBA1A, cytoskeletal support

<|Context|>T cells in the small intestine (normal) exhibit a range of ribosomal genes alongside markers such as B2M and HLA-related genes.<|Question|>How do these HLA and ribosomal genes suggest a T cell’s involvement in antigen processing in the small intestine?<|Chain of Thoughts|>From [T cell, small intestine, normal, gene expression], we see HLA-B, HLA-A, and a robust set of ribosomal components. HLA molecules enable antigen presentation, while ribosomal genes reflect active protein synthesis.<|Answer|>The T cell’s expression of HLA-B and HLA-A (notably present in the T cell gene expression list) is key to presenting antigens to other immune cells, while the robust presence of ribosomal genes (such as RPL and RPS family members) indicates high protein translation capacity. Together, these features highlight the T cell’s active role in sampling and responding to antigens in the gut, ensuring effective immune surveillance and communication with other immune partners.<|Keyword|>HLA-B, HLA-A, ribosomal genes, antigen processing

<|Context|>NKp44-negative group 3 innate lymphoid cells in the lymph node (normal) show expression of multiple immediate early genes such as FOS and JUN.<|Question|>Could the expression of FOS and JUN in NKp44-negative group 3 innate lymphoid cells signal rapid immune responses in the lymph node environment?<|Chain of Thoughts|>The [NKp44-negative group 3 innate lymphoid cell, lymph node, normal, gene expression] includes FOS, JUN, DUSP1, which are immediate early genes that typically respond quickly to stimuli.<|Answer|>Yes. The prominent presence of FOS and JUN, seen among the top expressed genes, typically denotes cells capable of swift transcriptional responses to external signals. These immediate early genes can trigger downstream cascades linked to cell proliferation, cytokine production, or inflammatory responses. Thus, in a lymph node context, their expression would equip these NKp44-negative ILC3s with prompt reaction capabilities, presumably for rapid immune cell coordination and activation.<|Keyword|>yes, FOS, JUN, immediate early genes

<|Context|>Mesodermal cells in the lymph node (normal) present VIM and SPARC among other genes.<|Question|>How might SPARC and VIM expression indicate a role for mesodermal cells in structural integrity within the lymph node?<|Chain of Thoughts|>Examining [mesodermal cell, lymph node, normal, gene expression], SPARC and VIM are notable. SPARC is involved in extracellular matrix remodeling; VIM is a classic intermediate filament protein.<|Answer|>SPARC (a protein that modulates collagen and matrix interactions) and VIM (a major cytoskeletal component) point to mesodermal cells contributing substantially to the extracellular matrix and mechanical architecture of the lymph node. Their expression supports cell adhesion, migration, and structural maintenance, which are crucial for preserving tissue organization and facilitating immune cell trafficking in that microenvironment.<|Keyword|>SPARC, VIM, structural integrity

<|Context|>Memory B cells in the vermiform appendix (normal) display markers like CD74 and HLA-DRA, alongside multiple ribosomal genes.<|Question|>Which evidence in their gene expression suggests that these memory B cells are ready for antigen presentation and antibody production?<|Chain of Thoughts|>The [memory B cell, vermiform appendix, normal, gene expression] includes CD74 and HLA-DRA, essential for antigen presentation, and robust ribosomal genes, supporting high translational capacity.<|Answer|>The presence of CD74 and HLA-DRA within memory B cells indicates a strong capacity for antigen presentation through class II MHC pathways. Meanwhile, a large number of ribosomal genes, such as RPL and RPS family members, underpins a robust protein synthesis framework. Combined, these features suggest that memory B cells are primed to present antigens and rapidly produce antibodies if re-exposed to their cognate antigens in the appendix.<|Keyword|>CD74, HLA-DRA, antigen presentation

<|Context|>T-helper 17 cells in the small intestine (normal) feature IL7R, KLRB1, and other markers typical of effector immune responses.<|Question|>What does the presence of IL7R and KLRB1 imply about the Th17 cell’s functionality in the intestinal environment?<|Chain of Thoughts|>In [T-helper 17 cell, small intestine, normal, gene expression], IL7R is crucial for T cell development and survival; KLRB1 encodes CD161, often linked to mucosal T helper subsets.<|Answer|>IL7R expression underscores the cell’s reliance on IL-7 signaling for survival and expansion, a key aspect of T cell homeostasis. KLRB1 (CD161), frequently associated with mucosal T cells, hints at specialized interactions within the gut environment. Together, they highlight these Th17 cells as potent mediators of mucosal immunity, capable of responding robustly to local microbial stimuli and helping maintain epithelial integrity and protective immunity.<|Keyword|>IL7R, KLRB1, Th17 functionality

<|Context|>Pericytes in the vermiform appendix (normal) express an array of smooth muscle-related genes, including ACTA2 and MYH11.<|Question|>Does the co-expression of ACTA2 and MYH11 suggest a specialized contractile function in pericytes surrounding blood vessels in the appendix?<|Chain of Thoughts|>Assessing [pericyte, vermiform appendix, normal, gene expression] shows ACTA2 (smooth muscle actin) and MYH11 (smooth muscle myosin) typically linked to contractility and regulation of vascular tone.<|Answer|>Yes. The joint presence of ACTA2 and MYH11 strongly indicates that these pericytes can exert contractile forces on nearby vessels, helping manage blood flow and vessel stability in the appendix. Both genes are hallmarks of smooth muscle–type contractility, highlighting the specialized supportive and regulatory role pericytes serve in modulating local perfusion and maintaining vascular integrity.<|Keyword|>yes, ACTA2, MYH11, contractile function

<|Context|>Colon epithelial cells in the lymph node (normal) have high expression of genes like RPL41, EEF1A1, and RPS3A.<|Question|>What does this abundance of ribosomal and elongation factor genes reveal about epithelial cells that might have migrated or been sampled in the lymph node?<|Chain of Thoughts|>From [colon epithelial cell, lymph node, normal, gene expression], many RPL/RPS genes and EEF1A1. These indicate active protein synthesis, possibly reflecting a highly metabolic state or rapid turnover.<|Answer|>The notable enrichment of multiple RPL and RPS genes, along with EEF1A1, points to robust protein synthesis processes, indicating these epithelial cells are highly metabolically active. The presence of colon-derived epithelial cells in the lymph node context suggests they could be sampled or drained via lymphatic routes, maintaining their vigorous translational capacity even outside the typical colon environment. This may reflect tissue surveillance or immune sampling events.<|Keyword|>ribosomal genes, EEF1A1, high metabolic activity

<|Context|>A natural killer cell in the small intestine with Crohn disease exhibits CCL5 and KLRB1 expression.<|Question|>How do CCL5 and KLRB1 co-expression suggest the NK cell’s role in inflammatory chemotaxis or tissue retention during Crohn disease?<|Chain of Thoughts|>Looking at [natural killer cell, small intestine, Crohn disease, gene expression], we see CCL5, known for attracting immune cells, and KLRB1, which can mark tissue-homing or mucosal subsets.<|Answer|>The NK cell’s production of CCL5 can drive recruitment of other immune cells, enhancing the inflammatory milieu in Crohn disease. KLRB1, often found on tissue-homing lymphocytes, may aid retention within the gut mucosa, potentially prolonging local immune activation. Together, these genes imply that NK cells not only directly engage target cells but also orchestrate broader inflammatory processes in the diseased intestine.<|Keyword|>CCL5, KLRB1, chemotaxis, tissue retention

<|Context|>A CD34-positive, CD56-positive, CD117-positive common innate lymphoid precursor in the large intestine (normal) shows IL7R expression.<|Question|>What does the presence of IL7R signal about the developmental potential of these precursor cells in gut immune contexts?<|Chain of Thoughts|>In [CD34-positive, CD56-positive, CD117-positive common innate lymphoid precursor, large intestine, normal, gene expression], IL7R is essential for early lymphoid development, hinting at potential to differentiate into multiple ILC subsets.<|Answer|>IL7R’s expression is strongly associated with driving lymphoid lineage commitment, indicating that these common innate lymphoid precursors can proliferate and mature into various ILC subsets. By expressing IL7R in the large intestine, they are likely primed to respond to local growth factors, fostering differentiation pathways that support tissue-specific immune surveillance and regulatory roles crucial for gut homeostasis.<|Keyword|>IL7R, common innate lymphoid precursor, developmental potential

<|Context|>A reticular cell in the large intestine (normal) has high RPS27, RPL10, and TMSB4X expression.<|Question|>How might these gene products highlight the role of reticular cells in immune cell support within the large intestine?<|Chain of Thoughts|>The [reticular cell, large intestine, normal, gene expression] includes abundant ribosomal genes plus TMSB4X, which can impact actin dynamics and possibly facilitate scaffolding for immune cells.<|Answer|>Robust ribosomal gene expression (for example, RPS27, RPL10) indicates strong translational activity, potentially necessary for producing extracellular matrix and signaling molecules. TMSB4X, associated with actin remodeling, may help shape a supportive meshwork. Altogether, this profile suggests reticular cells provide structural and trophic support, aiding immune cell movement and retention in the large intestine’s lymphoid architecture.<|Keyword|>RPS27, RPL10, TMSB4X, immune cell support

<|Context|>A motor neuron in the small intestine (normal) expresses neuron-specific cytoskeletal genes like TUBA1A and STMN1.<|Question|>Could these genes indicate an active remodeling state in enteric neurons as they integrate signals in the small intestine?<|Chain of Thoughts|>The [motor neuron, small intestine, normal, gene expression] has TUBA1A, a tubulin alpha, and STMN1, a microtubule regulator that suggests dynamic cytoskeletal changes.<|Answer|>Yes. STMN1 is a key regulator of microtubule turnover, and TUBA1A encodes alpha-tubulin, crucial for forming neuronal microtubules. Their co-expression indicates that these motor neurons maintain a dynamically remodeling cytoskeleton, which is essential for axon outgrowth, synaptic plasticity, and potentially rapid adaptation to enteric signals and motility demands in the small intestine.<|Keyword|>yes, TUBA1A, STMN1, active remodeling

<|Context|>Group 3 innate lymphoid cells in the large intestine (normal) express DUSP1 and TSC22D3.<|Question|>How could DUSP1 and TSC22D3 contribute to regulating inflammatory responses among these ILC3s?<|Chain of Thoughts|>The [group 3 innate lymphoid cell, large intestine, normal, gene expression] includes DUSP1 (a phosphatase regulating MAPK pathways) and TSC22D3 (a glucocorticoid-induced leucine zipper gene), both can dampen inflammation.<|Answer|>DUSP1 dephosphorylates components of the MAPK pathway, curbing prolonged inflammatory signaling, while TSC22D3 (often known as GILZ) mediates anti-inflammatory effects under glucocorticoid influence. Their co-expression implies these ILC3s possess intrinsic mechanisms to fine-tune or terminate inflammatory cascades, helping maintain tissue homeostasis in the large intestine.<|Keyword|>DUSP1, TSC22D3, anti-inflammatory regulation

<|Context|>A T cell in the small intestine (normal) shows B2M and HLA-A expression.<|Question|>Does the presence of B2M alongside HLA-A indicate a capacity for presenting peptides to CD8+ T cells in the gut?<|Chain of Thoughts|>B2M is required for MHC class I assembly; HLA-A is a class I molecule. Looking at [T cell, small intestine, normal, gene expression], both are present, so it supports class I presentation.<|Answer|>Yes. HLA-A class I molecules must associate with B2M to effectively present endogenous peptides, primarily to CD8+ T cells. Their co-expression indicates an active involvement in antigen presentation, aiding in cytotoxic T cell engagement within the small intestine’s immune environment.<|Keyword|>yes, B2M, HLA-A, class I presentation

<|Context|>Memory B cells in the vermiform appendix (normal) show high expression of CD74 and multiple RPL genes.<|Question|>Could CD74 and robust ribosomal gene expression signify that these memory B cells are prepared for both antigen presentation and rapid immunoglobulin production?<|Chain of Thoughts|>[Memory B cell, vermiform appendix, normal, gene expression] includes CD74, a key MHC class II chaperone, plus numerous RPL genes enabling quick protein synthesis, which suggests readiness for antibody secretion.<|Answer|>Yes. CD74 helps load antigenic peptides onto MHC class II for T-helper cell engagement, and the abundance of ribosomal genes highlights a high-capacity protein synthesis machinery. This dual readiness means memory B cells can present antigens to helper T cells and then swiftly translate immunoglobulin mRNAs upon activation, ensuring rapid and potent secondary immune responses in the appendix.<|Keyword|>yes, CD74, ribosomal genes, immunoglobulin production

<|Context|>Pericytes in the vermiform appendix (normal) express TPM2, CALD1, and MYL9.<|Question|>How might these genes reflect the contractile machinery of pericytes influencing blood vessel regulation?<|Chain of Thoughts|>In [pericyte, vermiform appendix, normal, gene expression], TPM2 (tropomyosin), CALD1 (caldesmon), MYL9 (myosin light chain) are integral to smooth muscle contraction, indicating pericytes help regulate vascular tone.<|Answer|>TPM2, CALD1, and MYL9 are core components of the smooth muscle–like apparatus. TPM2 stabilizes actin filaments, CALD1 modulates actin–myosin interactions, and MYL9 helps drive the contractile force. Their co-expression in pericytes suggests an integrated contractile phenotype that facilitates blood flow regulation, vessel stability, and potential support of local vascular remodeling within the appendix.<|Keyword|>TPM2, CALD1, MYL9, contractile machinery

<|Context|>Colon epithelial cells in the lymph node (normal) express FOS, a gene related to immediate early responses.<|Question|>What could FOS expression imply about epithelial cells’ reactivity or signaling potential when they appear in lymph node contexts?<|Chain of Thoughts|>Examining [colon epithelial cell, lymph node, normal, gene expression], FOS is a rapid-response transcription factor, typically linked to cellular activation. This suggests a readiness to respond to stimuli.<|Answer|>FOS is well-known to be quickly upregulated in cells experiencing external stimuli like cytokine signals or stress. For epithelial cells residing or sampled in the lymph node, FOS expression likely indicates they can mount swift transcriptional changes, possibly interacting with immune cells or adapting to new microenvironmental cues. This highlights a dynamic role beyond just structural coverage, extending to active immune or signaling participation.<|Keyword|>FOS, immediate early response, signaling potential

<|Context|>A natural killer cell in the large intestine (normal) expresses NKG7, typically associated with cytotoxic function.<|Question|>Could NKG7 expression alone indicate robust cytotoxic capabilities in this intestinal NK cell type?<|Chain of Thoughts|>The [natural killer cell, large intestine, normal, gene expression] includes NKG7, known for roles in lytic granule function, which might reflect strong cytotoxic potential.<|Answer|>Yes. NKG7 is frequently involved in the formation and exocytosis of cytolytic granules, hallmark features of NK cell killing activity. Its presence in this normal large intestine NK cell suggests it is poised for direct cytotoxic actions against target cells, underscoring a key defense mechanism within the gut’s immune surveillance.<|Keyword|>yes, NKG7, cytotoxic function

<|Context|>NKp44-negative group 3 innate lymphoid cells in the lymph node (normal) show high TMSB4X and IL7R.<|Question|>How do TMSB4X and IL7R together underscore both structural and developmental roles of these ILC3s in the lymph node?<|Chain of Thoughts|>From [NKp44-negative group 3 innate lymphoid cell, lymph node, normal, gene expression], TMSB4X is linked to actin dynamics; IL7R is crucial for ILC development, indicating both morphological and maturation aspects.<|Answer|>TMSB4X, known to modulate actin reorganization, could enhance the cell’s migratory or shape-adjusting capabilities. IL7R is integral for ILC survival and differentiation. Their co-expression suggests these ILC3s can adapt structurally while maintaining the potential for further development or expansion in the lymph node. This dual feature may support rapid recruitment, positioning, and local immune functions.<|Keyword|>TMSB4X, IL7R, structural remodeling, development

<|Context|>A mesodermal cell in the lymph node (normal) expresses both VIM and LGALS1.<|Question|>What might the presence of LGALS1, a galectin, indicate about mesodermal cells interacting with immune cells in the lymph node?<|Chain of Thoughts|>The [mesodermal cell, lymph node, normal, gene expression] includes LGALS1, recognized for modulating immune cell behavior. It can bind glycoproteins on lymphocytes, potentially influencing cell migration or cytokine secretion.<|Answer|>LGALS1 (galectin-1) can bind to specific glycosylation patterns on immune cells, modulating their proliferation and apoptosis. In conjunction with VIM, representing a dynamic cytoskeleton, these mesodermal cells may provide a microenvironment that shapes how immune cells traffic and respond. This suggests an immunoregulatory capacity that complements their structural function in the lymph node.<|Keyword|>LGALS1, immunoregulation, mesodermal cell

<|Context|>A memory B cell in the vermiform appendix (normal) shows strong HLA-DRA expression alongside costimulatory molecules.<|Question|>Does this indicate that memory B cells might function as potent antigen-presenting cells to T cells in the appendix?<|Chain of Thoughts|>From [memory B cell, vermiform appendix, normal, gene expression], HLA-DRA is crucial for MHC class II presentation. Memory B cells can present antigen to CD4+ T cells, especially if costimulatory genes are expressed.<|Answer|>Yes. By strongly expressing HLA-DRA, memory B cells in the appendix assemble MHC class II complexes for peptide display. Coupled with costimulatory molecules, they can effectively engage helper T cells and catalyze robust adaptive immune responses. This capacity underscores the integral role of memory B cells not just in antibody production but also in bridging innate and adaptive immunity through efficient antigen presentation.<|Keyword|>yes, HLA-DRA, antigen presentation, memory B cell

<|Context|>T-helper 17 cells in the small intestine (normal) exhibit KLRB1 and LTB expression.<|Question|>How might LTB complement KLRB1 in shaping the local immune microenvironment for Th17 cells?<|Chain of Thoughts|>Looking at [T-helper 17 cell, small intestine, normal, gene expression], LTB (lymphotoxin beta) can drive local lymphoid structure formation, while KLRB1 marks mucosal T helper cells.<|Answer|>LTB can promote local stromal cell activation and formation of lymphoid-like structures, reinforcing protective immune niches within the intestine. KLRB1 serves as a mucosal homing or function-related marker, and together, these genes imply Th17 cells orchestrate tissue remodeling and immune cell recruitment in gut-associated lymphoid areas, fostering a supportive immune environment.<|Keyword|>LTB, KLRB1, local immune environment

<|Context|>A pericyte in the vermiform appendix (normal) expresses MT-ND4, MT-ND1, and other mitochondrial genes.<|Question|>Do high mitochondrial gene levels suggest an active metabolic state in appendix pericytes for vascular support?<|Chain of Thoughts|>[Pericyte, vermiform appendix, normal, gene expression] shows multiple mitochondrial transcripts (MT-ND4, MT-ND1). Pericytes with high oxidative phosphorylation capacity could maintain contractile functions and regulate blood flow effectively.<|Answer|>Yes. Elevated expression of mitochondrial respiratory chain genes like MT-ND4 and MT-ND1 in pericytes implies high energy production via oxidative phosphorylation. This robust ATP availability supports contractile activity, signaling processes, and overall maintenance of microvascular tone in the appendix, where local blood supply regulation is vital for efficient immune and epithelial function.<|Keyword|>yes, MT-ND4, MT-ND1, high metabolic state

<|Context|>Colon epithelial cells in the lymph node (normal) express CKB (creatine kinase B) and COX7C (a cytochrome c oxidase subunit).<|Question|>How do these mitochondrial and metabolic genes hint at the energetic demands of epithelial cells located in a lymph node environment?<|Chain of Thoughts|>The [colon epithelial cell, lymph node, normal, gene expression] includes CKB for energy buffering and COX7C for the electron transport chain, suggesting high ATP demands for these cells.<|Answer|>With CKB helping store and transfer phosphate groups for ATP regeneration and COX7C contributing to electron transport in mitochondria, these epithelial cells appear to maintain high energy flux. Even though they reside in or have migrated to a lymph node setting, they likely need robust metabolic capacity to sustain barrier or immunomodulatory functions, indicating an adaptable but energy-intensive role.<|Keyword|>CKB, COX7C, high energy demand

<|Context|>A natural killer cell in the small intestine (Crohn disease) expresses CCL5 (RANTES) and GNLY (granulysin).<|Question|>Does the combination of CCL5 and GNLY suggest a more aggressive inflammatory response by NK cells in Crohn disease tissue?<|Chain of Thoughts|>The [natural killer cell, small intestine, Crohn disease, gene expression] includes chemokine CCL5 and cytotoxic GNLY, which together may intensify local immune cell recruitment and tissue damage.<|Answer|>Yes. CCL5 recruits a broad range of immune cells, amplifying inflammatory processes, while GNLY contributes potent antimicrobial and cytotoxic actions. Their combined expression in Crohn disease NK cells could substantially escalate tissue injury and inflammation, reflecting a more pathogenic NK cell phenotype that intensifies the mucosal immune assault and perpetuates gut inflammation.<|Keyword|>yes, CCL5, GNLY, aggressive inflammation

<|Context|>A CD34-positive, CD56-positive, CD117-positive common innate lymphoid precursor in the large intestine (normal) expresses LTB alongside IL7R.<|Question|>How might LTB interact with IL7R signaling to guide the differentiation of these innate lymphoid precursors in the gut?<|Chain of Thoughts|>From [CD34-positive, CD56-positive, CD117-positive, large intestine, normal, gene expression], LTB can drive local stromal activation; IL7R fosters ILC lineage commitment. Combined, they might shape tissue-specific ILC differentiation.<|Answer|>LTB may stimulate stromal and endothelial cells to provide microenvironmental cues, while IL7R signaling ensures the precursor cell proliferates and matures into functional ILC subsets. This synergy can guide the precursor cell’s developmental trajectory in the intestine, potentially aiding in establishing the appropriate balance of innate lymphoid populations crucial for homeostatic or immune-protective functions.<|Keyword|>LTB, IL7R, precursor differentiation

<|Context|>A reticular cell in the large intestine (normal) expresses DCN (decorin) and VIM.<|Question|>Why do DCN and VIM suggest a role for reticular cells in extracellular matrix organization and scaffolding in the gut?<|Chain of Thoughts|>[Reticular cell, large intestine, normal, gene expression] includes DCN, a proteoglycan influencing collagen fibrils, and VIM, an intermediate filament. This indicates structural maintenance roles.<|Answer|>DCN (decorin) regulates collagen fibrillogenesis, helping to organize and stabilize extracellular matrix structures, while VIM (vimentin) supports the cell’s internal framework for distributing mechanical stresses. Together, these genes highlight reticular cells’ essential function in sculpting and maintaining a supportive environment, ensuring the intestinal tissue’s architecture remains conducive to immune cell interactions and nutrient handling.<|Keyword|>DCN, VIM, extracellular matrix organization

<|Context|>Motor neurons in the small intestine (normal) express TUBA1A and BTF3.<|Question|>Could BTF3, a general transcription factor, work alongside cytoskeletal genes to regulate neuronal function in the enteric system?<|Chain of Thoughts|>Looking at [motor neuron, small intestine, normal, gene expression], TUBA1A is structural, while BTF3 helps regulate transcription. This indicates integrated control of both morphological and transcriptional programs.<|Answer|>Yes. BTF3 can modulate core promoter activity, thus influencing the expression of numerous neuronal genes, while TUBA1A stabilizes microtubule networks essential for axonal transport and neuron shape. Together, they support the enteric neuron’s function, balancing the need for precise cytoskeletal organization with the ongoing transcriptional demands required for neurotransmission and neuronal maintenance in the small intestine.<|Keyword|>yes, BTF3, TUBA1A, neuronal function

<|Context|>Group 3 innate lymphoid cells in the large intestine (normal) express KLF6 and TNFAIP3.<|Question|>How do KLF6 and TNFAIP3 highlight a potential regulatory checkpoint for inflammation within ILC3 populations?<|Chain of Thoughts|>In [group 3 innate lymphoid cell, large intestine, normal, gene expression], KLF6 is a transcription factor and TNFAIP3 is a negative regulator of NF-κB signaling, possibly limiting over-inflammation.<|Answer|>KLF6 can shape transcriptional responses to stress or immune signals, while TNFAIP3 (A20) is key for terminating NF-κB–driven inflammatory pathways. Together, these genes imply a built-in restraint mechanism in ILC3s, preventing excessive cytokine release or destructive inflammation. This suggests that ILC3s are capable of self-limiting responses to preserve tissue health and prevent inflammatory overdrive.<|Keyword|>KLF6, TNFAIP3, anti-inflammatory checkpoint

<|Context|>A mesodermal cell in the lymph node (normal) expresses SPARC and TAGLN.<|Question|>Do SPARC and TAGLN co-expression suggest any fibroblast-like remodeling capabilities in the lymph node stroma?<|Chain of Thoughts|>From [mesodermal cell, lymph node, normal, gene expression], SPARC is involved in ECM remodeling, and TAGLN (transgelin) is linked to actin cytoskeleton, indicating a contractile or remodeling fibroblast-like phenotype.<|Answer|>SPARC modifies collagen and other extracellular components, while TAGLN regulates actin cross-linking and smooth muscle–like traits. Their combined expression points to a fibroblast-like mesodermal cell able to reshape the stromal matrix and potentially alter tissue architecture. This capacity helps the lymph node adapt its structural scaffold, facilitating immune cell trafficking and optimal function.<|Keyword|>SPARC, TAGLN, fibroblast-like remodeling

<|Context|>Memory B cells in the vermiform appendix (normal) show expression of TXNIP.<|Question|>How might TXNIP, known for its role in redox regulation, influence memory B cell responsiveness in the appendix?<|Chain of Thoughts|>Assessing [memory B cell, vermiform appendix, normal, gene expression], TXNIP can modulate oxidative stress responses and cellular metabolism, potentially affecting B cell activation or survival.<|Answer|>TXNIP binds thioredoxin, limiting antioxidant activity and modulating redox-sensitive signaling. In memory B cells, tight control of oxidative stress may optimize their reactivation capacity and longevity. By regulating cellular metabolic states, TXNIP ensures memory B cells remain poised to proliferate and produce antibodies swiftly upon antigen re-encounter in the appendix microenvironment.<|Keyword|>TXNIP, redox regulation, memory B cell responsiveness

<|Context|>T-helper 17 cells in the small intestine (normal) express SH3BGRL3 and CD3D.<|Question|>Does this combination of SH3BGRL3 and CD3D point to specialized signaling adaptations in Th17 cells for gut environments?<|Chain of Thoughts|>The [T-helper 17 cell, small intestine, normal, gene expression] shows CD3D as part of the T cell receptor complex, while SH3BGRL3 might modulate intracellular signaling cascades. Together, they might reflect specialized TCR activation pathways.<|Answer|>SH3BGRL3 has been implicated in signal transduction processes, possibly interacting with cytoskeletal or adaptor proteins. Coupled with CD3D, essential for T cell receptor assembly and activation, this gene pair could equip Th17 cells with finely tuned intracellular pathways. Such adaptations facilitate rapid and nuanced responses to luminal antigens, microbial flora, or inflammatory cues in the small intestine.<|Keyword|>SH3BGRL3, CD3D, specialized TCR signaling

<|Context|>A pericyte in the vermiform appendix (normal) displays high MYL6 and CALD1 expression.<|Question|>How do MYL6 and CALD1 underscore dynamic control of blood vessel caliber by pericytes in gut-associated lymphoid tissues?<|Chain of Thoughts|>[Pericyte, vermiform appendix, normal, gene expression] includes MYL6, a myosin light chain subunit, and CALD1 for actin-myosin interactions. This suggests smooth muscle–like regulation of vascular caliber.<|Answer|>MYL6 contributes to the contractile apparatus that generates tension on vessel walls, and CALD1 modulates the interaction of actin and myosin. Their high expression highlights these pericytes as active regulators of microvascular diameter, enabling precise blood flow modulation. This is critical in gut-associated lymphoid tissues where immune cell trafficking and localized blood supply must be finely tuned.<|Keyword|>MYL6, CALD1, vascular caliber control

<|Context|>Motor neurons in the small intestine (normal) express MARCKSL1 and MAP1B.<|Question|>What do MARCKSL1 and MAP1B indicate about the synaptic plasticity or axonal transport capacity of enteric motor neurons?<|Chain of Thoughts|>From [motor neuron, small intestine, normal, gene expression], MARCKSL1 is linked to cytoskeletal organization, MAP1B is a microtubule-associated protein, both important for synaptic function and neurite outgrowth.<|Answer|>MARCKSL1 helps coordinate actin dynamics at membranes, potentially affecting synapse formation and plasticity, while MAP1B stabilizes microtubules essential for axonal transport and structural integrity. Their co-expression suggests these enteric motor neurons are well-equipped for adaptive synaptic changes and efficient delivery of molecular cargo along axons, underpinning the finely tuned motility and sensory reflexes of the gut.<|Keyword|>MARCKSL1, MAP1B, synaptic plasticity, axonal transport





"""

run the following cell to convert the output into a hf dataset

In [62]:
qa_dict_list = []
for QA_pair in output.split("<|Context|>"):
    if not QA_pair:
        continue
    try:
        context = QA_pair.split("<|Question|>")[0].strip()
        question = QA_pair.split("<|Question|>")[1].split("<|Chain of Thoughts|>")[0].strip()
        chain_of_thoughts = QA_pair.split("<|Chain of Thoughts|>")[1].split("<|Answer|>")[0].strip()
        answer = QA_pair.split("<|Answer|>")[1].split("<|Keyword|>")[0].strip()
        label = QA_pair.split("<|Keyword|>")[1].strip()
    except:
        print("Error in the following QA pair:")
        print(QA_pair)
        continue

    entry = {
        "Context": context,
        "Summary_Dataset": dataset_in_prompt,
        "Question": question, 
        "Chain of Thoughts": chain_of_thoughts,
        "Answer": answer,
        "Keyword": label,
        "full_QA_pair": "<|Question|>"+QA_pair.strip(), 
        "Dataset_Name": dataset_name,
        "Publication_URL": cur_url,
        "Dataset_Index": dataset_index,
        "Used_Rows": sampled_indices,
    }
    qa_dict_list.append(entry)

qa_dict = {key: [d[key] for d in qa_dict_list] for key in qa_dict_list[0].keys()} # list of dicts to dict of lists

new_hf_dataset = Dataset.from_dict(qa_dict)
print("Example:")
new_hf_dataset[0]

Error in the following QA pair:





Example:


{'Context': 'Natural killer cells in the large intestine (normal) and small intestine (Crohn disease) show differences in certain immune-related genes.',
 'Summary_Dataset': 'Cell Type: natural killer cell, Tissue: large intestine, Disease: normal, Gene Expression: MALAT1 MT-CO1 B2M TMSB4X EEF1A1 MT-CO2 RPS27 MT-CO3 RPL41 RPL10 RPLP1 RPL13 ACTB RPS18 RPS27A RPS15A RPS19 RPL34 RPS12 MT-CYB PTMA RPS3 RPS3A MT-ATP6 RPL32 RPL39 RPS14 MT-ND4 RPS23 RPL30 RPS4X MT-ND3 RPL28 RPL11 RPL21 RPL3 RPL18A RPS28 RPS2 RPL19 RPS29 H3-3B RPLP2 TPT1 RPS6 RPS15 RPS8 RPL13A NKG7 RPL23A RPL35A RPS24 RPL15 SRGN RPL26 RPL37 HLA-B RPS13 EIF1 FAU RPS7 RPL29 PFN1 RPL18 RPL8 RPLP0 RPL12 RPL6 RPL7A TMSB10 ZFP36 RPS25 CCL4 MT-ND2 RPL37A RPL9 RPL14 RPL36 RPL10A RPS16 BTG1 HSP90AA1 HLA-A RACK1 FTH1 FTL RPS9 RPS21 RPS5 MT-ND1 HLA-C RPL31 RPSA RPL35 FOS MT-ND4L RPL27A ACTG1 RPL5 CCL5 KLRB1 RPL24 RPL7 TSC22D3 UBA52 GAPDH TYROBP NFKBIA RPL27 CD7 UBC IFITM2 HSP90AB1 CFL1 GZMA RPL22 IFITM1 NACA RPL4 RPS26 KLRD1 MT-ND5 CMC1 

run the following cell every time to save updates \
sometimes save_to_disk will fail because it doesn't automatically overwrites, just delete the old file adn run it again

In [63]:
hf_dataset = concatenate_datasets([hf_dataset, new_hf_dataset])
hf_dataset


  block_group = [InMemoryTable(cls._concat_blocks(list(block_group), axis=axis))]
  table = cls._concat_blocks(blocks, axis=0)


Dataset({
    features: ['Context', 'Summary_Dataset', 'Question', 'Chain of Thoughts', 'Answer', 'Keyword', 'full_QA_pair', 'Dataset_Name', 'Publication_URL', 'Dataset_Index', 'Used_Rows'],
    num_rows: 409
})

In [64]:
hf_dataset.save_to_disk("dataset515-536_1-24-25_hf_dataset")
print("current hf_dataset: ")
hf_dataset

Saving the dataset (1/1 shards): 100%|██████████| 409/409 [00:00<00:00, 16369.30 examples/s]

current hf_dataset: 





Dataset({
    features: ['Context', 'Summary_Dataset', 'Question', 'Chain of Thoughts', 'Answer', 'Keyword', 'full_QA_pair', 'Dataset_Name', 'Publication_URL', 'Dataset_Index', 'Used_Rows'],
    num_rows: 409
})