In [1]:
import torch
import numpy as np
import json
import os
import re
import pandas as pd
import shutil

from datasets import Dataset, load_from_disk, concatenate_datasets, DatasetDict

seed = 42
torch.manual_seed(seed)
np.random.seed(seed)

  from .autonotebook import tqdm as notebook_tqdm


helper functions and constants

In [2]:
def dataset_for_prompt(df):
    # number of rows to sample
    num_rows = min(len(df), np.random.randint(5, 25))
    df = df.sample(num_rows)
    #number of genes to sample
    num_genes = 2000 // num_rows

    use_d = True
    if len(df['diseases'].unique()) == 1:
        use_d = False

    s = ""
    sampled_indices = []
    for index, row in df.iterrows():
        cur_num_genes = min(num_genes+np.random.randint(-20, 20), 1000)
        cell_sentence = ' '.join(row[f'gene_{i}'] for i in range(1, cur_num_genes))
        if use_d:
            s += f"Cell Type: {row['cell_type']}, Tissue: {row['tissue']}, Disease: {row['diseases']}, Gene Expression: {cell_sentence}\n"
        else:
            s += f"Cell Type: {row['cell_type']}, Tissue: {row['tissue']}, Gene Expression: {cell_sentence}\n"
        sampled_indices.append(index)
        
    return s, sampled_indices

prompt_prefix = """This is a scientific manuscript and an example of the single-cell RNA sequencing dataset it is associated with. The example dataset contains one representative cell from each type and the corresponding cell sentence(ranked gene names by expression). You need to create question-answer pairs for Large Language Models to learn analyzing scRNA-seq datasets when given pieces of data converted into cell sentences. I would like questions and answers about the given cell sentence in terms of cells, tissues, disease, or other relevant biological information. 
The questions must be answerable by looking at the given cell sentences alone. The question-answer pairs should follow a line of reasoning mentioned in the manuscript or results arrived by the researchers. When creating questions, you should first read the given manuscript, consider what the analysis the researchers did in the manuscript and the biological context of this study. Then look at the example cell sentences(or a part of them), think about what question the researchers asked or would ask about the cell sentence, what they wanted to and can infer, study, or understand. Imitate the interpretation of data in the manuscript. Then answer the question, quote specific cell sentences and genes as evidence, and include a chain of thoughts. You can borrow reasoning and analysis, conclusions from the manuscript but don't directly use or quote the manuscript in question or answer. Avoid questions about experiment design or procedures, avoid vague questions. Try to vary the type of things the question asks and the wording, include both open ended questions and yes or no questions.
Provide Context for the question, this is only the essential background information required to answer the question. Find context from the manuscript. Provide Answer, including detailed reasonings. Refer to specific cell sentences as [type] cell gene expression. Provide Keywords of the answer being the most essential parts of the correct answer. The keywords should be yes or no(must include if it is a yes or no question), or important gene names, or biological information in question. These keywords will be used during my training to validate my model's response. 
Give 20 question answer pairs as one list, don't give any other word. Strictly format like this: 
<|Context|>the context<|Question|>the content of the question<|Answer|>the content of the answer<|Keyword|>the answer keywords
<|Context|>the context<|Question|>the content of the question<|Answer|>the content of the answer<|Keyword|>the answer keywords
...
"""

loads all datasets and meta data \
assumes summary datasets are in a directory as csv files, assuming the format processed by Harry \
all files I'm looking at can be found in my C2S-RL dev github repo branch

In [3]:
# TODO: change to your own file path
datasets_directory = "/users/zlyu12/Desktop/c2s-RL/RL_data/cell_type_result2024-12-09" # all summary datasets
title_to_datasets_path = "/users/zlyu12/Desktop/c2s-RL/RL_data/matching_datasets.csv" # dataset name to dataset filename
meta_data_path = "/users/zlyu12/Desktop/c2s-RL/Create_Dataset/meta_data.json" # dataset name, filename, url etc.
hf_dataset_output_path = "/users/zlyu12/Desktop/c2s-RL/Create_Dataset/temp_hf_dataset" # temporary output directory

title_to_datasets = pd.read_csv(title_to_datasets_path)
datasets_files_paths = [os.path.join(datasets_directory, f) for f in os.listdir(datasets_directory) if f.endswith('.csv')]
dataset_numbers = [re.search(r'\((\d+)\)', f).group(1) for f in datasets_files_paths]

meta_data = json.load(open(meta_data_path))
row_iterator = iter(title_to_datasets.iterrows())
try:
    hf_dataset = load_from_disk(hf_dataset_output_path)
except:
    hf_dataset = Dataset.from_dict({})

title_to_datasets.head()

Unnamed: 0,dataset,filename
0,Activated PBMCs - Expanded cohort CITE-seq,local(304).h5ad
1,blood and bone marrow from a healthy young donor,local(477).h5ad
2,scRNA-seq analysis of healthy human fallopian ...,local(770).h5ad
3,HypoMap – a unified single cell gene expressio...,local(52).h5ad
4,L4 IT - MTG: Seattle Alzheimer's Disease Atlas...,local(274).h5ad


iterates through all datasets in your directory \
this prepares the prompt \
copy paste, add the publication text at the end and run our favorite LLM \
run only once for a row, we rely on some variables defined here in later cells

In [8]:
next_row = next(row_iterator)

dataset_index = re.search(r'\((\d+)\)', next_row[1]['filename']).group(1)
dataset_name = next_row[1]['dataset']
if dataset_name not in meta_data:
    print(f"Row number {next_row[0]}, dataset \"{dataset_name}\" not in meta_data!")
elif dataset_index not in dataset_numbers:
    print(f"Row number {next_row[0]}, dataset \"{dataset_name}\", dataset index \"{dataset_index}\" not in datasets directory!")
else:
    cur_url = meta_data[dataset_name]['url']
    dataset_file_path = [path for path in datasets_files_paths if f"({dataset_index})" in path][0]
    dataset_df = pd.read_csv(dataset_file_path)
    dataset_in_prompt, sampled_indices = dataset_for_prompt(dataset_df)
    print("Row Number: ", next_row[0])
    print("\nDataset Name: ", dataset_name)
    print("\nPublication URL: ", cur_url)
    print("\nPrompt:\n", prompt_prefix + "Dataset: \n" + dataset_in_prompt + "Manuscript:\n")


Row Number:  1

Dataset Name:  blood and bone marrow from a healthy young donor

Publication URL:  https://cellxgene.cziscience.com/collections/93eebe82-d8c3-41bc-a906-63b5b5f24a9d

Prompt:
 This is a scientific manuscript and an example of the single-cell RNA sequencing dataset it is associated with. The example dataset contains one representative cell from each type and the corresponding cell sentence(ranked gene names by expression). You need to create question-answer pairs for Large Language Models to learn analyzing scRNA-seq datasets when given pieces of data converted into cell sentences. I would like questions and answers about the given cell sentence in terms of cells, tissues, disease, or other relevant biological information. 
The questions must be answerable by looking at the given cell sentences alone. The question-answer pairs should follow a line of reasoning mentioned in the manuscript or results arrived by the researchers. When creating questions, you should first read

Just copy paste the output into the next cell, shouldn't need to do additional processing

In [5]:
output = """
Here is a list of 20 question-answer pairs derived from the provided single-cell RNA sequencing dataset and the manuscript's reasoning:

<|Context|>Conventional dendritic cells (cDCs) are critical for antigen presentation and immune activation in the bone marrow. Gene expression profiles, such as high expression of HLA-DRB1, HLA-DRA, and CD74, suggest a strong antigen presentation function.<|Question|>Does the cell sentence of the conventional dendritic cell support its antigen-presenting function in the bone marrow?<|Answer|>Yes, the cell sentence supports the antigen-presenting function of conventional dendritic cells. High expression of HLA-DRB1, HLA-DRA, and CD74 genes indicates strong involvement in antigen presentation pathways, which are critical for dendritic cells. These genes encode components of the major histocompatibility complex (MHC) class II molecules and chaperones that facilitate antigen presentation to T cells.<|Keyword|>Yes, HLA-DRB1, HLA-DRA, CD74, antigen presentation

<|Context|>Common dendritic progenitors give rise to various dendritic cell subtypes and are found in blood, often showing a proliferation signature.<|Question|>Do the gene expressions in the common dendritic progenitor indicate its proliferative capacity?<|Answer|>Yes, the gene expression profile shows signs of proliferation. The presence of genes such as MCM2, MCM4, and PCNA, which are involved in DNA replication and cell cycle progression, indicates active proliferation in common dendritic progenitors.<|Keyword|>Yes, MCM2, MCM4, PCNA, proliferation

<|Context|>Precursor B cells in the bone marrow are critical for early B-cell lineage commitment.<|Question|>Which gene expression signatures in precursor B cells reflect their early B-cell lineage commitment?<|Answer|>The expression of IGHM, IGLL1, and CD79A reflects early B-cell lineage commitment. IGHM encodes the immunoglobulin heavy chain, while IGLL1 and CD79A are essential components for pre-B cell receptor signaling during B-cell development.<|Keyword|>IGHM, IGLL1, CD79A, B-cell lineage

<|Context|>Basophil mast progenitor cells are critical for immune responses related to allergic reactions and inflammation.<|Question|>Do the gene expressions in the basophil mast progenitor cell suggest its function in allergic responses?<|Answer|>Yes, genes like CLC and CPA3, which are highly expressed in basophil mast progenitor cells, indicate their role in allergic responses. CLC encodes Charcot-Leyden crystal protein, a marker of eosinophil and basophil activity, and CPA3 encodes a mast cell-specific protease involved in inflammation.<|Keyword|>Yes, CLC, CPA3, allergic response

<|Context|>Effector memory CD8+ T cells are found in blood and exhibit cytotoxic activity.<|Question|>Does the gene expression in effector memory CD8+ T cells indicate cytotoxic activity?<|Answer|>Yes, the expression of GZMA and NKG7 confirms cytotoxic activity. GZMA encodes Granzyme A, a key effector molecule involved in apoptosis of target cells, while NKG7 supports cytotoxic granule formation.<|Keyword|>Yes, GZMA, NKG7, cytotoxic activity

<|Context|>Late promyelocytes in the bone marrow are transitioning from proliferation to differentiation.<|Question|>What genes in the late promyelocyte cell sentence indicate its differentiation into granulocytes?<|Answer|>Genes such as S100A8 and S100A9, which encode calcium-binding proteins involved in granulocyte maturation, indicate differentiation. These proteins are markers of neutrophilic granulocytes and are associated with inflammatory responses.<|Keyword|>S100A8, S100A9, granulocyte differentiation

<|Context|>The late promyelocyte population in blood shows inflammatory gene markers.<|Question|>Which genes in the late promyelocyte population indicate its involvement in inflammation?<|Answer|>The genes S100A8, S100A9, and LYZ suggest inflammatory involvement. S100A8 and S100A9 are key inflammatory mediators, and LYZ encodes lysozyme, which contributes to antimicrobial activity.<|Keyword|>S100A8, S100A9, LYZ, inflammation

<|Context|>Erythroid progenitor cells are responsible for red blood cell production.<|Question|>What gene expressions in erythroid progenitors reflect their commitment to red blood cell production?<|Answer|>The expression of HBB and HBD, which encode beta-globin and delta-globin, indicates a commitment to red blood cell production. These genes are critical for hemoglobin formation during erythropoiesis.<|Keyword|>HBB, HBD, red blood cell production

<|Context|>Lymphoid lineage restricted progenitors give rise to lymphocytes.<|Question|>Which gene expressions in lymphoid lineage restricted progenitors indicate lymphoid cell commitment?<|Answer|>The presence of IGLL1 and SPINK2 reflects lymphoid cell commitment. IGLL1 is part of the pre-BCR signaling, and SPINK2 is expressed during early lymphoid development.<|Keyword|>IGLL1, SPINK2, lymphoid lineage

<|Context|>Classical monocytes are responsible for inflammation and phagocytosis in the bone marrow.<|Question|>What gene expression in classical monocytes suggests their inflammatory activity?<|Answer|>Genes such as S100A8, S100A9, and FCN1 suggest inflammatory activity. FCN1 encodes ficolin-1, a marker for monocyte activation during inflammation.<|Keyword|>S100A8, S100A9, FCN1, inflammation

<|Context|>Effector memory CD8+ T cells are known for their ability to respond quickly to previously encountered pathogens.<|Question|>What genes support the rapid immune response of effector memory CD8+ T cells?<|Answer|>The expression of GZMA and PRF1 supports rapid immune responses. GZMA encodes granzyme A, and PRF1 encodes perforin, which together mediate target cell apoptosis.<|Keyword|>GZMA, PRF1, immune response

<|Context|>Late erythroid progenitors transition to mature red blood cells.<|Question|>Which genes indicate the hemoglobin synthesis function of late erythroid progenitors?<|Answer|>Genes like HBB, HBA1, and HBA2, which encode beta and alpha-globin chains, indicate hemoglobin synthesis in late erythroid progenitors.<|Keyword|>HBB, HBA1, HBA2, hemoglobin synthesis

<|Context|>Basophil mast progenitor cells show immune response markers.<|Question|>Which genes reflect the immune function of basophil mast progenitors?<|Answer|>The expression of SRGN and CPA3 reflects immune function. SRGN encodes serglycin, which supports granule formation, and CPA3 encodes a protease involved in immune responses.<|Keyword|>SRGN, CPA3, immune response

<|Context|>Precursor B cells are responsible for early immune responses.<|Question|>What genes reflect the activation and differentiation of precursor B cells?<|Answer|>The expression of CD79A and VPREB1 reflects activation. These genes encode key components of the B-cell receptor complex necessary for B-cell differentiation.<|Keyword|>CD79A, VPREB1, activation

<|Context|>Effector memory CD8+ T cells exhibit signs of T-cell activation.<|Question|>What genes in the effector memory CD8+ T cell indicate T-cell activation?<|Answer|>The expression of IL32 and CD2 indicates T-cell activation. IL32 encodes an inflammatory cytokine, and CD2 supports T-cell receptor signaling.<|Keyword|>IL32, CD2, T-cell activation

<|Context|>Conventional dendritic cells present antigens efficiently in the immune response.<|Question|>What genes in conventional dendritic cells highlight their antigen-presenting role?<|Answer|>The expression of HLA-DRB1, HLA-DPA1, and HLA-DPB1 highlights their role in antigen presentation. These genes encode MHC class II molecules critical for presenting antigens to T cells.<|Keyword|>HLA-DRB1, HLA-DPA1, HLA-DPB1, antigen presentation

<|Context|>Common dendritic progenitors express genes linked to early activation.<|Question|>What gene expressions in common dendritic progenitors support early activation?<|Answer|>The presence of CD74 and HLA-DRB1 supports early activation, as these genes are involved in antigen processing and presentation.<|Keyword|>CD74, HLA-DRB1, early activation

<|Context|>Precursor B cells are preparing for antibody production.<|Question|>What genes in precursor B cells reflect preparation for antibody production?<|Answer|>Genes like IGLL1 and VPREB1 reflect preparation for antibody production. These genes encode surrogate light chains for the pre-B cell receptor complex.<|Keyword|>IGLL1, VPREB1, antibody production

<|Context|>Basophil mast progenitor cells are involved in immune signaling.<|Question|>What genes indicate immune signaling in basophil mast progenitors?<|Answer|>Genes like TYROBP and FCER1A indicate immune signaling. TYROBP supports immune receptor signaling, and FCER1A is part of the high-affinity IgE receptor.<|Keyword|>TYROBP, FCER1A, immune signaling
"""

run the following cell to convert the output into a hf dataset

In [6]:
qa_dict_list = []
for QA_pair in output.split("<|Context|>"):
    if not QA_pair:
        continue
    try:
        context = QA_pair.split("<|Question|>")[0].strip()
        question = QA_pair.split("<|Question|>")[1].split("<|Answer|>")[0].strip()
        answer = QA_pair.split("<|Answer|>")[1].split("<|Keyword|>")[0].strip()
        label = QA_pair.split("<|Keyword|>")[1].strip()
    except:
        print("Error in the following QA pair:")
        print(QA_pair)
        continue

    entry = {
        "Context": context,
        "Summary_Dataset": dataset_in_prompt,
        "Question": question, 
        "Answer": answer,
        "Keyword": label,
        "full_QA_pair": "<|Question|>"+QA_pair.strip(), 
        "Dataset_Name": dataset_name,
        "Publication_URL": cur_url,
        "Dataset_Index": dataset_index,
        "Used_Rows": sampled_indices,
    }
    qa_dict_list.append(entry)

qa_dict = {key: [d[key] for d in qa_dict_list] for key in qa_dict_list[0].keys()} # list of dicts to dict of lists

new_hf_dataset = Dataset.from_dict(qa_dict)
print("Example:")
new_hf_dataset[0]

Error in the following QA pair:

Here is a list of 20 question-answer pairs derived from the provided single-cell RNA sequencing dataset and the manuscript's reasoning:


Example:


{'Context': 'Conventional dendritic cells (cDCs) are critical for antigen presentation and immune activation in the bone marrow. Gene expression profiles, such as high expression of HLA-DRB1, HLA-DRA, and CD74, suggest a strong antigen presentation function.',
 'Summary_Dataset': 'Cell Type: classical monocyte, Tissue: blood, Disease: normal, Gene Expression: FTL FTH1 ACTB B2M TMSB4X MALAT1 MT-CO1 CD74 TYROBP MT-CO2 TMSB10 FOS RPLP1 FCER1G PFN1 S100A6 MT-CO3 PTMA S100A11 RPL10 RPS12 CTSB RPL28 CCL2 EEF1A1 RP11-84E17 LYZ HLA-B CTSD HLA-DRA MYL6 NPC2 SH3BGRL3 SERF2 S100A4 PSAP CFL1 S100A9 MT-ND4 VIM HLA-A CST3 NFKBIA FUCA1 TPT1 CXCL8 AIF1 GAPDH MT-CYB RPL19 LGALS3 MT-ND1 HLA-DRB1 LGALS1 OAZ1 RPS19 RPL13 RPS8 CD68 RPL12 GABARAP RPS28 RPL11 SAT1 RPLP2 RPS27A RPS15 ACTG1 HLA-C RPS14 CCL3 RPS24 RPL30 RPS23 MT-ATP6 RPS13 RPL32 FAU ITM2B PRDX1 RPLP0 CSTB PPIA ATP5F1E RPS15A DUSP1 RPL8 GPX1 RPS10 RPL18A GSTP1 MT-ND4L EIF1 CTSS RPS18 JUN RPS3A S100A8 KLF6 UBC GLUL MMP9 RPS7 RPL7A RPL15 RPS3 RPL1

run the following cell every time to save updates \
sometimes save_to_disk will fail because it doesn't automatically overwrites, just delete the old file adn run it again

In [7]:
hf_dataset = concatenate_datasets([hf_dataset, new_hf_dataset])
if os.path.exists(hf_dataset_output_path):
    shutil.rmtree(hf_dataset_output_path)
hf_dataset.save_to_disk(hf_dataset_output_path)
print("current hf_dataset: ")
hf_dataset


  block_group = [InMemoryTable(cls._concat_blocks(list(block_group), axis=axis))]
  table = cls._concat_blocks(blocks, axis=0)


OSError: [Errno 39] Directory not empty: '/users/zlyu12/Desktop/c2s-RL/Create_Dataset/temp_hf_dataset'