In [1]:
import torch
import numpy as np
import json
import os
import re
import pandas as pd
import shutil

from datasets import Dataset, load_from_disk, concatenate_datasets, DatasetDict

seed = 42
torch.manual_seed(seed)
np.random.seed(seed)

  from .autonotebook import tqdm as notebook_tqdm


helper functions and constants

In [2]:
def dataset_for_prompt(df):
    # number of rows to sample
    num_rows = min(len(df), np.random.randint(5, 25))
    df = df.sample(num_rows)
    #number of genes to sample
    num_genes = 5000 // num_rows

    use_d = True
    if len(df['diseases'].unique()) == 1:
        use_d = False

    s = ""
    sampled_indices = []
    for index, row in df.iterrows():
        cur_num_genes = min(num_genes+np.random.randint(-20, 20), 1000)
        cell_sentence = ' '.join(row[f'gene_{i}'] for i in range(1, cur_num_genes))
        if use_d:
            s += f"Cell Type: {row['cell_type']}, Tissue: {row['tissue']}, Disease: {row['diseases']}, Gene Expression: {cell_sentence}\n"
        else:
            s += f"Cell Type: {row['cell_type']}, Tissue: {row['tissue']}, Gene Expression: {cell_sentence}\n"
        sampled_indices.append(index)
        
    return s, sampled_indices

prompt_prefix = """This is a scientific manuscript(attached), and an example of the single-cell RNA sequencing dataset it is associated with. The example dataset contains representative cells from each type and the corresponding cell sentence(ranked gene names by expression). 
I'm a researcher training LLMs to unserstand and be able to analyze and reason with gene expression data as cell sentences. You need to create example context-question-answer-reasoning pairs for Large Language Models to learn analyzing and reasoning gene expression datasets when given pieces of data converted into cell sentences. I would like questions about the given cell sentence in terms of differentially expressed genes, cell type, tissue origen, disease, or other relevant biological information. The questions must be answerable by looking at the given cell sentences alone and should be open-ended.  
First read the given manuscript, consider the biological context of this study, the logic progression and conclusion made in the manuscript. Then look at the given cell sentences(or a part of them), think about what question the researchers observed, analyzed or asked about the gene expression. When answering the quesiton, discuss genes that are most relevent. 
The manuscript is only for your reference to create the questions and answers, don't mention it directly. Do not include any gene name in the context or question, but the answer should have them. Avoid questions about experiment design or procedures, or about general facts. Vary the questions as much as possible to cover a diverse range of topics.
Besides the questions and answers, each pair should include the following. The Context is a brief general background of the study. Do not summarize or describe the cell sentence or gene expression because I want my model to get trained to make those observations. Chain of Thoughts is detailed reasoning and analysis in no less than 50 words. Refer to a cell sentences as gene expression of ... cell. The Answer should be no less than 20 words. The Keywords is the most essential parts from the answer, like important gene names, or biological information. These keywords will be used during training to validate my model's response and must be in the Answer as well.
Give 30 question answer pairs as one list, don't give any other word. Strictly format like this: 
<|Context|>the context<|Question|>the content of the question<|Chain of Thoughts|>the intermediate reasonings<|Answer|>the content of the answer<|Keyword|>the answer keywords
<|Context|>the context<|Question|>the content of the question<|Chain of Thoughts|>the intermediate reasonings<|Answer|>the content of the answer<|Keyword|>the answer keywords
...
"""

loads all datasets and meta data \
assumes summary datasets are in a directory as csv files, assuming the format processed by Harry \
all files I'm looking at can be found in my C2S-RL dev github repo branch

In [3]:
# TODO: change to your own file path
datasets_directory = "/users/zlyu12/Desktop/c2s-RL/Dec19_dataset" # all summary datasets
meta_data_path = "/users/zlyu12/Desktop/c2s-RL/Create_Dataset/meta_data.json" # dataset name, filename, url etc.
hf_dataset_output_path = "/users/zlyu12/Desktop/c2s-RL/Create_Dataset/temp_hf_dataset_new" # temporary output directory

datasets_files_paths = [os.path.join(datasets_directory, f) for f in os.listdir(datasets_directory) if f.endswith('.csv')]
dataset_numbers = np.sort([int(re.search(r'\((\d+)\)', f).group(1)) for f in datasets_files_paths])

meta_data = json.load(open(meta_data_path))
dataset_numbers_iterator = iter(dataset_numbers[5:])

try:
    hf_dataset = load_from_disk(hf_dataset_output_path)
except:
    hf_dataset = Dataset.from_dict({})

print(hf_dataset)

Dataset({
    features: [],
    num_rows: 0
})


iterates through all datasets in your directory \
this prepares the prompt \
copy paste, add the publication text at the end and run our favorite LLM \
run only once for a row, we rely on some variables defined here in later cells

In [14]:
dataset_index = str(next(dataset_numbers_iterator))
dataset_name = [k for k,v in meta_data.items() if f'({dataset_index})' in v.get('filename', '')]
if len(dataset_name) != 1:
    print("dataset index: ", dataset_index)
    print(dataset_name)
    print(f"dataset \"{dataset_name}\" not in meta_data!")
else:
    dataset_name = dataset_name[0]
    cur_url = meta_data[dataset_name]['url']
    dataset_file_path = [path for path in datasets_files_paths if f"({dataset_index})" in path][0]
    dataset_df = pd.read_csv(dataset_file_path)
    dataset_in_prompt, sampled_indices = dataset_for_prompt(dataset_df)
    print("Dataset Index: ", dataset_index)
    print("\nDataset Name: ", dataset_name)
    print("\nPublication URL: ", cur_url)
    print("\nPrompt:\n", prompt_prefix + "Dataset: \n" + dataset_in_prompt + "Manuscript:\n")


Dataset Index:  6

Dataset Name:  UMAP of Columnar cells

Publication URL:  https://cellxgene.cziscience.com/collections/a18474f4-ff1e-4864-af69-270b956cee5b

Prompt:
 This is a scientific manuscript(attached), and an example of the single-cell RNA sequencing dataset it is associated with. The example dataset contains representative cells from each type and the corresponding cell sentence(ranked gene names by expression). 
I'm a researcher training LLMs to unserstand and be able to analyze and reason with gene expression data as cell sentences. You need to create example context-question-answer-reasoning pairs for Large Language Models to learn analyzing and reasoning gene expression datasets when given pieces of data converted into cell sentences. I would like questions about the given cell sentence in terms of differentially expressed genes, cell type, tissue origen, disease, or other relevant biological information. The questions must be answerable by looking at the given cell sente

Just copy paste the output into the next cell, shouldn't need to do additional processing

In [15]:
output = """
<|Context|>This study investigates cellular heterogeneity in the gastrointestinal tract using single‐cell RNA sequencing to reveal subtle changes in gene expression among various epithelial cell types.<|Question|>How does the molecular profile suggest a response to inflammatory stimuli in a cell from the stomach body affected by gastritis?<|Chain of Thoughts|>By evaluating the expression levels of mitochondrial and ribosomal genes, one can deduce that an inflammatory state induces increased energy demand and protein synthesis. The upregulation of specific markers is indicative of cellular stress and an activated inflammatory response. This pattern can be observed by comparing the expression intensities across genes known for their role in metabolism and inflammation.<|Answer|>The gene expression profile shows significantly elevated levels of MT-CO2, RPL41, and RPL34, which are linked to increased metabolic activity and an inflammatory response typical of gastritis in stomach cells.<|Keyword|>MT-CO2, RPL41, RPL34, gastritis, inflammatory response <|Context|>This research examines gene expression across different gastrointestinal epithelial cells to understand their functional diversity.<|Question|>What features in the molecular profile indicate a stem cell phenotype even when an inflammatory background is present?<|Chain of Thoughts|>Analyzing the transcriptome, one should look for markers that denote self-renewal and undifferentiated status, alongside those that respond to environmental stress. A mixed expression pattern, where classical stem cell markers are co-expressed with stress-response genes, suggests that the cell is maintaining its stemness while adapting to inflammation. Such cells often show moderate to high expression of both metabolic and ribosomal genes, reflecting their active state.<|Answer|>The presence of high MALAT1 and RPS18 expression together with RPL41 and RPL34 indicates a stem cell that is actively self-renewing while simultaneously responding to inflammatory stimuli.<|Keyword|>MALAT1, RPS18, RPL41, RPL34, stem cell <|Context|>The dataset includes multiple epithelial cell types from distinct regions of the gastrointestinal tract.<|Question|>What molecular clues in the gene expression data reflect the normal state of a peptic cell from the cardia of the stomach?<|Chain of Thoughts|>A peptic cell typically exhibits a strong metabolic signature and balanced expression of genes involved in enzyme production. Analyzing its profile, one should notice high expression of mitochondrial genes that drive energy production, along with markers indicative of secretory function. These clues help distinguish peptic cells from those undergoing stress or transformation.<|Answer|>The profile is characterized by robust expression of MT-CO1, MT-CO2, and RPLP1, which together denote a healthy metabolic function and secretory activity that are hallmarks of normal peptic cells in the stomach cardia.<|Keyword|>MT-CO1, MT-CO2, RPLP1, peptic cell, normal <|Context|>Single-cell RNA sequencing enables detailed exploration of cellular differentiation in the stomach.<|Question|>What gene expression evidence supports a foveolar cell phenotype in the stomach body?<|Chain of Thoughts|>Foveolar cells, essential for mucin production, exhibit specific transcriptional markers. One would expect to see elevated levels of genes that promote secretory function and maintain the integrity of the gastric surface. The balanced expression of mitochondrial genes alongside secretory markers supports this specialized role, distinguishing them from other epithelial cells.<|Answer|>Elevated levels of MALAT1 and TFF1, in conjunction with moderate MT-CO1 expression, strongly indicate a foveolar cell phenotype focused on mucin secretion and barrier maintenance.<|Keyword|>MALAT1, TFF1, MT-CO1, foveolar cell <|Context|>The analysis differentiates normal from diseased cellular states within gastrointestinal tissues.<|Question|>Which molecular signals in the gene expression profile are associated with enteroendocrine function in a rectal cell?<|Chain of Thoughts|>Enteroendocrine cells are specialized for hormone secretion. Their transcriptomes typically reveal high expression of genes that regulate secretory pathways and peptide hormone production. Focusing on the key markers that modulate these processes provides insight into their active functional state, allowing differentiation from other epithelial types that lack this specialized role.<|Answer|>The cell exhibits significant expression of MALAT1, PCSK1N, and PYY, suggesting an active enteroendocrine function that is essential for hormonal regulation in rectal tissues.<|Keyword|>MALAT1, PCSK1N, PYY, enteroendocrine <|Context|>Exploration of cell type-specific gene expression assists in identifying functional states within the gastrointestinal tract.<|Question|>What aspects of the molecular profile indicate enhanced secretory activity in an intestinal goblet cell from a stomach disorder context?<|Chain of Thoughts|>Goblet cells specialize in mucus production and protection. Their gene expression profiles should reflect upregulation of secretory markers. By comparing the relative abundance of transcripts related to mucin production, one can infer the cell's response to pathological conditions, such as increased secretion to counteract mucosal damage or irritation.<|Answer|>Elevated expression of TFF3, SPINK4, and MUC2, together with MALAT1, clearly indicates enhanced secretory activity that is pivotal for goblet cell function in the context of a stomach disorder.<|Keyword|>TFF3, SPINK4, MUC2, MALAT1, goblet cell <|Context|>The study maps differentiation states of gastrointestinal cells using transcriptomic signatures.<|Question|>What gene expression evidence confirms an enterocyte identity in cells derived from the cardia?<|Chain of Thoughts|>Enterocytes are primarily involved in nutrient absorption and exhibit a distinctive gene expression profile that includes both structural and metabolic markers. Analyzing the expression levels of transcripts linked to cellular transport and metabolism, one can identify a signature that distinguishes enterocytes from other epithelial cells. Key markers in this profile include those involved in cytoskeletal organization and mitochondrial function.<|Answer|>The robust expression of MALAT1, PHGR1, and S100A6, along with moderate levels of KRT19, confirms the enterocyte identity and underscores its role in nutrient absorption and barrier function in the cardia.<|Keyword|>MALAT1, PHGR1, S100A6, KRT19, enterocyte <|Context|>This analysis leverages single-cell data to detect early molecular changes in diseased tissues.<|Question|>How can the gene expression pattern in a stem cell from the lower esophagus be used to infer features of Barrett esophagus?<|Chain of Thoughts|>Barrett esophagus is characterized by a mosaic pattern of gastric and intestinal features. In a stem cell, the presence of both metabolic and ribosomal markers can indicate a transitional state. Elevated levels of genes typically associated with inflammation and cellular stress, alongside markers of stemness, suggest a shift towards a Barrett esophageal phenotype. This hybrid expression profile is key for early detection of precancerous changes.<|Answer|>The cell displays a mosaic pattern with high expression of MALAT1, MT-CO2, and RPS18, indicating a shift toward Barrett esophagus characterized by features of both gastric and intestinal lineages.<|Keyword|>MALAT1, MT-CO2, RPS18, Barrett esophagus, mosaic <|Context|>The research aims to define distinct cellular identities within the gastrointestinal epithelium.<|Question|>Which molecular markers most clearly distinguish peptic cells from other epithelial cell types in the GI tract?<|Chain of Thoughts|>Peptic cells have a unique signature linked to digestive enzyme production and metabolic regulation. Focusing on genes that are highly expressed in these cells but not in other types allows for clear discrimination. Markers related to lipid digestion and protein synthesis are central to identifying peptic cells, providing insight into their specialized function.<|Answer|>Distinctive expression of LIPF, PGC, and PGA4, along with MALAT1, serves as a hallmark of peptic cells, effectively distinguishing them from other epithelial types in the gastrointestinal tract.<|Keyword|>LIPF, PGC, PGA4, MALAT1, peptic cell <|Context|>This investigation explores the gene expression signatures that underlie cellular specialization in metaplastic conditions.<|Question|>What gene expression features suggest a specialized function in gastric metaplastic cells?<|Chain of Thoughts|>Gastric metaplastic cells often exhibit altered expression patterns reflecting changes in secretion and metabolism. By focusing on markers that indicate shifts in acid production and mucin secretion, one can identify a signature that underpins their specialized role. The integration of mitochondrial and secretory gene data further refines the understanding of these cells’ functions in disease contexts.<|Answer|>The profile reveals significant levels of MALAT1, TFF1, and GAST, along with mitochondrial marker MT-CO2, reflecting specialized functions such as acid secretion and metabolic regulation in gastric metaplastic cells.<|Keyword|>MALAT1, TFF1, GAST, MT-CO2, gastric metaplasia <|Context|>Differential gene expression analysis allows for tracking transitions between healthy and diseased states.<|Question|>How might differential expression in stem cells signal a transition from normal to a diseased state in the GI tract?<|Chain of Thoughts|>Examining changes in gene expression levels, particularly in genes responsible for metabolism and stress response, can indicate cellular reprogramming. Elevated transcripts of mitochondrial and ribosomal genes in stem cells may reflect an increased metabolic demand or a stress response, serving as early indicators of disease progression. This nuanced shift can be crucial for early intervention strategies.<|Answer|>Elevated expressions of MALAT1, RPS18, and MT-CO2 in the stem cell population suggest a shift toward a diseased state, indicating increased metabolic demand and a stress response during transition.<|Keyword|>MALAT1, RPS18, MT-CO2, stem cell, transition <|Context|>The dataset captures cells with mixed phenotypes, revealing intermediate states in cellular differentiation.<|Question|>What evidence from the gene expression data supports an intermediate cell phenotype bridging gastric and intestinal lineages?<|Chain of Thoughts|>Cells in a transitional state exhibit a mosaic expression pattern, sharing markers from both gastric and intestinal profiles. This can be seen by the simultaneous presence of genes associated with secretory and metabolic functions typical of both lineages. Such an intermediate state is crucial for understanding the progression of metaplasia and its potential role in disease development.<|Answer|>A combination of elevated MALAT1, TFF1, and RPS18 expression highlights a mosaic pattern, which is indicative of cells in an intermediate state bridging gastric and intestinal differentiation.<|Keyword|>MALAT1, TFF1, RPS18, mosaic, intermediate phenotype <|Context|>The study employs single-cell techniques to profile distinct endocrine functions in gastrointestinal tissues.<|Question|>How does the gene expression profile of an enteroendocrine cell inform its functional role in the rectum?<|Chain of Thoughts|>Enteroendocrine cells are defined by their capacity to secrete hormones, and their transcriptomes typically show high levels of genes linked to peptide hormone production and secretion. The detection of these markers, alongside transcriptional regulators that drive endocrine function, provides insight into their role in maintaining gastrointestinal homeostasis. This profile can be contrasted with other cell types lacking such specialized functions.<|Answer|>High levels of MALAT1, PCSK1N, and PYY in the enteroendocrine cell indicate its active role in hormone secretion and signal modulation, which is essential for rectal function and homeostasis.<|Keyword|>MALAT1, PCSK1N, PYY, enteroendocrine, rectum <|Context|>This research highlights the importance of secretory mechanisms in protective cell functions within the GI tract.<|Question|>What gene expression indicators in a goblet cell suggest a primary role in mucin production?<|Chain of Thoughts|>Goblet cells are specialized for mucus secretion, which is critical for protecting the mucosal surface. Their gene expression profiles should show a clear upregulation of secretory markers that mediate mucin production and secretion. Evaluating these markers provides insight into the cell's functional state, especially under conditions of stress or injury where increased mucus production is required.<|Answer|>Enhanced expression of TFF3, SPINK4, and MUC2, together with MALAT1, reflects a strong secretory function, underpinning the goblet cell’s essential role in mucin production and mucosal protection.<|Keyword|>TFF3, SPINK4, MUC2, MALAT1, goblet cell <|Context|>The study distinguishes among various gastric epithelial cell types using their transcriptional profiles.<|Question|>How do the gene expression profiles differentiate foveolar from parietal cells in the stomach?<|Chain of Thoughts|>Foveolar cells are primarily involved in mucin secretion, while parietal cells are key for acid production. Their gene expression patterns reflect these functional differences. Foveolar cells typically exhibit high levels of secretory markers, whereas parietal cells express genes related to acid secretion and specialized metabolic functions. This comparative approach allows for clear delineation between the two cell types based on their molecular signatures.<|Answer|>Foveolar cells exhibit elevated MALAT1 and TFF1 levels, whereas parietal cells show distinct expression of GAST and MT-CO2, delineating their roles in mucin secretion versus acid production respectively.<|Keyword|>MALAT1, TFF1, GAST, MT-CO2, foveolar, parietal <|Context|>Mapping gene expression signatures facilitates the identification of nutrient-absorbing cells.<|Question|>What insights does the gene expression data provide regarding the identity of an enterocyte in the cardia?<|Chain of Thoughts|>Enterocytes are specialized for nutrient absorption and thus exhibit a distinct molecular signature characterized by both metabolic and structural markers. The analysis focuses on transcripts that play roles in nutrient transport, cytoskeletal integrity, and metabolic regulation. By comparing these markers, one can confidently attribute an enterocyte identity to cells with this composite expression pattern.<|Answer|>The enterocyte from the cardia displays marked expression of MALAT1, PHGR1, and S100A6, which are essential for nutrient absorption and maintaining cellular integrity in the gastrointestinal barrier.<|Keyword|>MALAT1, PHGR1, S100A6, enterocyte, cardia <|Context|>Single-cell analysis allows for early detection of pathological changes in stem cell populations.<|Question|>How can differential gene expression in stem cells be used to infer the onset of metaplastic transformation?<|Chain of Thoughts|>An increase in the expression of mitochondrial and ribosomal genes often signals cellular stress and a potential shift in differentiation. In stem cells, such alterations can serve as early markers for metaplastic transformation. This transition is marked by a reprogramming of gene expression, where markers of normal function begin to coexist with those associated with abnormal differentiation, providing a window into the early events of disease progression.<|Answer|>Upregulated expressions of MALAT1, MT-CO2, and RPS18 in stem cells serve as early indicators of metaplastic transformation, reflecting a shift towards an abnormal differentiation program under chronic stress.<|Keyword|>MALAT1, MT-CO2, RPS18, metaplasia, transformation <|Context|>The dataset reveals hybrid cell states that exhibit features of multiple lineages.<|Question|>What gene expression features distinguish a Barrett esophagus cell from a normal gastric cell?<|Chain of Thoughts|>Barrett esophagus is marked by a mosaic expression pattern that integrates elements of both gastric and intestinal lineages. In such cells, one would expect to find elevated levels of markers typically associated with metabolic stress and altered differentiation. The presence of these dual characteristics can be used to delineate Barrett esophagus from normal gastric cells, which lack this hybrid signature.<|Answer|>The Barrett esophagus cell displays a hybrid profile with elevated levels of MALAT1, MT-CO2, RPS18, and AGR2, reflecting a blend of gastric and intestinal features that set it apart from normal gastric cells.<|Keyword|>MALAT1, MT-CO2, RPS18, AGR2, Barrett esophagus <|Context|>This investigation examines the secretory and digestive functions of specialized gastrointestinal cells.<|Question|>How does the gene expression in a peptic cell illustrate its specialized digestive function?<|Chain of Thoughts|>Peptic cells are uniquely equipped for enzyme production and secretion, which is critical for digestion. Their gene expression profiles reveal high levels of enzymes and proteins that facilitate these processes. By analyzing key markers known to be involved in lipid and protein digestion, one can confirm the functional specialization of these cells, differentiating them from other epithelial types that may have less pronounced secretory roles.<|Answer|>Peptic cells exhibit high expression of LIPF, PGC, and PGA4 along with MALAT1, which together underscore their specialized role in digestive enzyme production and efficient metabolic regulation.<|Keyword|>LIPF, PGC, PGA4, MALAT1, peptic cell <|Context|>The analysis focuses on how gastrointestinal cells adapt to tissue-specific stressors.<|Question|>What aspects of the gene expression profile suggest a cellular adaptation to oxidative stress in gastrointestinal cells?<|Chain of Thoughts|>Oxidative stress often leads to the upregulation of genes that support increased metabolic demand and stress response. In gastrointestinal cells, elevated expression of mitochondrial and ribosomal markers indicates that the cells are adapting to such stress. This adaptive mechanism ensures that energy production and protein synthesis are maintained despite adverse conditions, which is critical for cell survival in a hostile microenvironment.<|Answer|>Increased levels of MALAT1, MT-CO2, and RPS18 in the cell indicate an adaptive response to oxidative stress, highlighting enhanced mitochondrial function and stress-responsive transcriptional regulation.<|Keyword|>MALAT1, MT-CO2, RPS18, oxidative stress, adaptation <|Context|>Single-cell transcriptomics can uncover early signs of precancerous changes in specific tissues.<|Question|>How does the gene expression pattern in a stem cell from the lower esophagus relate to precancerous changes?<|Chain of Thoughts|>A stem cell’s transcriptome reflecting a hybrid or mosaic expression pattern often signals early precancerous alterations. Elevated levels of metabolic and stress-related genes can serve as early warning signs of transformation. By focusing on these markers, one can infer that the cell is beginning to deviate from its normal phenotype, potentially adopting characteristics that predispose it to malignant transformation.<|Answer|>The stem cell exhibits elevated expression of MALAT1, MT-CO2, and RPS18, suggesting early precancerous alterations marked by shifts in metabolic activity and transcriptional reprogramming in the lower esophagus.<|Keyword|>MALAT1, MT-CO2, RPS18, precancerous, stem cell <|Context|>The study deciphers the functional roles of secretory cells in maintaining gastrointestinal integrity.<|Question|>What gene expression signature in a goblet cell indicates its strong secretory function?<|Chain of Thoughts|>Goblet cells are essential for producing mucus, and their functional state can be assessed by the abundance of specific secretory markers. An upregulation of genes involved in mucin production typically reflects an enhanced secretory function. These markers, when evaluated together, provide a clear indication of the cell’s role in protecting the mucosal barrier and facilitating tissue repair.<|Answer|>The prominent expression of TFF3, SPINK4, and MUC2, along with MALAT1, strongly indicates a robust secretory function that is critical for mucus production and mucosal protection in goblet cells.<|Keyword|>TFF3, SPINK4, MUC2, MALAT1, goblet cell <|Context|>Using single-cell transcriptomics, researchers differentiate among various gastric epithelial subtypes.<|Question|>How do the gene expression profiles differentiate between foveolar and parietal cells in the stomach?<|Chain of Thoughts|>Foveolar and parietal cells serve distinct functions in the stomach. Foveolar cells are primarily involved in mucus secretion, while parietal cells are responsible for acid production. Their gene expression patterns reflect these roles, with foveolar cells showing higher levels of secretory markers and parietal cells exhibiting markers associated with metabolic functions linked to acid secretion. This comparative analysis enables clear cellular distinction based solely on transcriptional signatures.<|Answer|>Foveolar cells are distinguished by high MALAT1 and TFF1 expression, whereas parietal cells are characterized by elevated GAST and MT-CO2 levels, highlighting their respective roles in mucin secretion and acid production.<|Keyword|>MALAT1, TFF1, GAST, MT-CO2, foveolar, parietal <|Context|>The research utilizes detailed gene expression profiles to assign cell identities within the GI tract.<|Question|>What gene expression signature in an enterocyte suggests its involvement in nutrient absorption and metabolism?<|Chain of Thoughts|>Enterocytes must exhibit markers that indicate active nutrient transport and metabolic activity. The presence of high levels of genes related to both structural integrity and metabolic processes is key. This combined signature provides evidence of the cell’s role in nutrient uptake and energy production, which are essential for maintaining the gastrointestinal barrier and overall metabolism.<|Answer|>A combination of robust MALAT1, PHGR1, and S100A6 expression, together with detectable KRT19 levels, points to an active enterocyte that is engaged in nutrient absorption and efficient metabolic regulation.<|Keyword|>MALAT1, PHGR1, S100A6, KRT19, enterocyte <|Context|>By profiling stem cells at the single-cell level, researchers can monitor readiness for differentiation.<|Question|>How does the gene expression in a stem cell illustrate its readiness for differentiation in the gastrointestinal tract?<|Chain of Thoughts|>Stem cells poised for differentiation often show upregulation of genes involved in both self-renewal and the machinery necessary for rapid protein synthesis. Elevated mitochondrial activity and ribosomal gene expression serve as indicators that the cell is primed for the energetic and biosynthetic demands of differentiation. This signature is key to understanding cellular plasticity and the potential transition toward a more differentiated state.<|Answer|>Elevated levels of MALAT1, RPS18, and MT-CO2 in the stem cell indicate it is in a poised state for differentiation, reflecting active ribosomal biogenesis and enhanced mitochondrial function needed for cellular transition.<|Keyword|>MALAT1, RPS18, MT-CO2, stem cell, differentiation <|Context|>The research characterizes distinct cellular responses in healthy versus diseased tissues using transcriptomic data.<|Question|>Which gene expression features in a peptic cell are most indicative of its healthy functional status?<|Chain of Thoughts|>Healthy peptic cells maintain a balanced expression of genes that underpin their digestive functions. Markers related to enzyme production and proper metabolic regulation should be robustly expressed. A harmonious expression pattern of specific digestive enzymes and metabolic regulators confirms that the cell is functioning normally without signs of pathological stress or degeneration.<|Answer|>High expression of LIPF, PGC, and PGA4, alongside balanced MALAT1 levels, strongly indicates a healthy peptic cell that is efficiently producing digestive enzymes and maintaining optimal metabolic regulation.<|Keyword|>LIPF, PGC, PGA4, MALAT1, healthy peptic cell <|Context|>Single-cell gene expression profiling enables the detection of metabolic shifts in diseased cells.<|Question|>What does the elevated mitochondrial gene expression in a cell imply about its metabolic state in a disease context?<|Chain of Thoughts|>Mitochondrial genes, such as those encoding components of the electron transport chain, are critical for energy production. In disease contexts, their increased expression can indicate a compensatory mechanism in response to stress or an early sign of cellular transformation. This enhanced mitochondrial activity is often correlated with elevated energy demands and can be a precursor to pathological changes in cell function.<|Answer|>The marked increase in mitochondrial genes such as MT-CO2 and MT-ND4 suggests a heightened metabolic state, reflecting an adaptive response to stress and potential early stages of disease-associated transformation.<|Keyword|>MT-CO2, MT-ND4, metabolic state, disease, transformation <|Context|>The dataset provides insights into cellular responses to tissue injury in the gastrointestinal tract.<|Question|>How does the gene expression pattern in a goblet cell reveal its response to mucosal injury?<|Chain of Thoughts|>When mucosal injury occurs, goblet cells typically respond by increasing mucus production to protect and repair the tissue. This response is marked by the upregulation of genes involved in mucin synthesis and secretion. By assessing these transcriptional changes, one can infer that the cell is actively engaged in a protective, reparative process, counteracting the damage inflicted on the mucosal barrier.<|Answer|>Enhanced expression of TFF3, SPINK4, and MUC2 in the goblet cell reflects an adaptive response to mucosal injury, promoting increased mucus production and facilitating tissue repair.<|Keyword|>TFF3, SPINK4, MUC2, goblet cell, mucosal injury <|Context|>The study investigates molecular signatures that distinguish cellular phenotypes in gastrointestinal diseases.<|Question|>What gene expression characteristics in a parietal cell are associated with gastritis in the stomach body?<|Chain of Thoughts|>Parietal cells under disease conditions, such as gastritis, often exhibit altered transcriptional profiles, including changes in mitochondrial and secretory gene expression. Elevated levels of genes associated with acid secretion and metabolic stress can indicate an inflammatory response. This shift in gene expression reflects both the cell's functional changes and its adaptation to the diseased environment, which can be critical for diagnosis and further treatment strategies.<|Answer|>The parietal cell demonstrates high expression of MT-CO2, MALAT1, and TFF1, along with notable levels of GAST, collectively suggesting an altered acid secretion profile and an inflammatory response typical of gastritis in the stomach body.<|Keyword|>MT-CO2, MALAT1, TFF1, GAST, parietal cell, gastritis
"""

run the following cell to convert the output into a hf dataset

In [16]:
qa_dict_list = []
for QA_pair in output.split("<|Context|>"):
    if not QA_pair:
        continue
    try:
        context = QA_pair.split("<|Question|>")[0].strip()
        question = QA_pair.split("<|Question|>")[1].split("<|Chain of Thoughts|>")[0].strip()
        chain_of_thoughts = QA_pair.split("<|Chain of Thoughts|>")[1].split("<|Answer|>")[0].strip()
        answer = QA_pair.split("<|Answer|>")[1].split("<|Keyword|>")[0].strip()
        label = QA_pair.split("<|Keyword|>")[1].strip()
    except:
        print("Error in the following QA pair:")
        print(QA_pair)
        continue

    entry = {
        "Context": context,
        "Summary_Dataset": dataset_in_prompt,
        "Question": question, 
        "Chain of Thoughts": chain_of_thoughts,
        "Answer": answer,
        "Keyword": label,
        "full_QA_pair": "<|Question|>"+QA_pair.strip(), 
        "Dataset_Name": dataset_name,
        "Publication_URL": cur_url,
        "Dataset_Index": dataset_index,
        "Used_Rows": sampled_indices,
    }
    qa_dict_list.append(entry)

qa_dict = {key: [d[key] for d in qa_dict_list] for key in qa_dict_list[0].keys()} # list of dicts to dict of lists

new_hf_dataset = Dataset.from_dict(qa_dict)
print("Example:")
new_hf_dataset[0]

Error in the following QA pair:


Example:


{'Context': 'This study investigates cellular heterogeneity in the gastrointestinal tract using single‐cell RNA sequencing to reveal subtle changes in gene expression among various epithelial cell types.',
 'Summary_Dataset': 'Cell Type: stem cell, Tissue: body of stomach, Disease: gastritis, Gene Expression: MT-CO2 RPL41 RPL34 RPS18 MT-CO1 RPS19 RPL39 MALAT1 RPL7 RPS27A RPS6 RPS3A EEF1A1 RPS27 MT-CO3 RPS14 RPS15A RPS2 RPL13 RPL10 RPL13A RPLP1 RPS28 MT-ND4 RPS12 RPL21 RPS29 RPL32 RPS4X RPL3 RPS3 RPL18A RPL27A RPS15 RPL12 RPL36 LYZ RPL11 RPL37A RPL26 RPS8 RPL15 RPLP2 RPS23 MT-CYB RPL28 FTH1 PTMA RPS13 RPL35A MT-ATP6 RPS9 MT-ND3 FTL TMSB4X RPL23A RPL9 TPT1 RPL19 RPL8 RPL10A RPL31 RPS16 RPL35 RPS5 RPL37 TMSB10 B2M RPL29 MT-ND2 RPS24 RPS25 RPS20 RPLP0 S100A6 RPS21 RPS7 RPL18 RPL6 RPL14 RPL7A RPL30 RPL5 RPS11 ACTB RPS10 MT-ND1 ACTG1 RACK1 RPS17 RPL23 SERF2 RPL24 FAU RPL27 TFF1 NEAT1 H3-3B RPL38 EIF1 RPSA TXN UBA52 GAST BTF3 HNRNPA1 COX4I1 SLC25A6 KRT8 GAPDH RPL22 CYSTM1 NACA RPL36A RPS26 CO

run the following cell every time to save updates \
sometimes save_to_disk will fail because it doesn't automatically overwrites, just delete the old file adn run it again

In [17]:
hf_dataset = concatenate_datasets([hf_dataset, new_hf_dataset])
hf_dataset


Dataset({
    features: ['Context', 'Summary_Dataset', 'Question', 'Chain of Thoughts', 'Answer', 'Keyword', 'full_QA_pair', 'Dataset_Name', 'Publication_URL', 'Dataset_Index', 'Used_Rows'],
    num_rows: 29
})

In [64]:
hf_dataset.save_to_disk("dataset6_3-5-25_hf_dataset")
print("current hf_dataset: ")
hf_dataset

Saving the dataset (1/1 shards): 100%|██████████| 409/409 [00:00<00:00, 16369.30 examples/s]

current hf_dataset: 





Dataset({
    features: ['Context', 'Summary_Dataset', 'Question', 'Chain of Thoughts', 'Answer', 'Keyword', 'full_QA_pair', 'Dataset_Name', 'Publication_URL', 'Dataset_Index', 'Used_Rows'],
    num_rows: 409
})