# Init 

In [1]:
import torch
import fitz
import numpy as np
import json
import openai
import os
import pandas as pd
import re

from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer
from datasets import Dataset, load_from_disk, concatenate_datasets, DatasetDict
from trl import DPOConfig, DPOTrainer

seed = 42
torch.manual_seed(seed)
if torch.cuda.is_available():
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
np.random.seed(seed)

  from .autonotebook import tqdm as notebook_tqdm


# Publication

In [2]:
title_to_data = pd.read_csv("/users/zlyu12/Desktop/c2s-RL/RL_data/matching_datasets.csv")

parsed_articles = {}
with open('/users/zlyu12/Desktop/cell2sentence/cell2sentence-dev/zyl/Q&A/data/publications/parsed_articles.json', 'r') as f:
    parsed_articles = json.load(f)

ct = 0
for title in title_to_data['dataset']:
    if title in parsed_articles:
        ct += 1
    else:
        print(title)
print(ct)

91


In [3]:
publication_path = "/users/zlyu12/Desktop/c2s-RL/RL_data/cell_type_result2024-12-09"
publication_files = [os.path.join(publication_path, f) for f in os.listdir(publication_path) if f.endswith('.csv')]

for file_path in publication_files:
    df = pd.read_csv(file_path)
    # Check for gene_1 through gene_100 columns
    expected_gene_cols = [f'gene_{i}' for i in range(1, 101)]
    missing_cols = [col for col in expected_gene_cols if col not in df.columns]
    
    if missing_cols:
        print(f"\nFile: {os.path.basename(file_path)}")
        print("Missing gene columns:", missing_cols)


In [4]:
def dataset_for_prompt(df):
    if len(df) > 15:
        df = df.sample(15)

    use_d = True
    if len(df['diseases'].unique()) == 1:
        use_d = False

    s = ""
    sampled_indices = []
    for index, row in df.iterrows():
        cell_sentence = ' '.join(row[f'gene_{i}'] for i in range(1, 101))
        if use_d:
            s += f"Cell Type: {row['cell_type']}, Tissue: {row['tissue']}, Disease: {row['diseases']}, Gene Expression: {cell_sentence}\n"
        else:
            s += f"Cell Type: {row['cell_type']}, Tissue: {row['tissue']}, Gene Expression: {cell_sentence}\n"
        sampled_indices.append(index)
    return s, sampled_indices

In [5]:
def extract_first_last_sentences(sections):
    """
    Extract all sentences from abstract and summary sections, and first sentence of each paragraph
    from other sections, removing figure references and concatenating them.
    
    Args:
        sections (dict): Dictionary with section names as keys and text content as values
    
    Returns:
        str: Concatenated sentences from all sections
    """
    def clean_figure_refs(text):
        # Remove figure references like (Figure X) or (Fig. X)
        return re.sub(r'\s*(?:\(?)[Ff]ig(?:ure)?\.?\s*\d+[A-Za-z]?(?:[--]\d+[A-Za-z]?)?(?:\)?)', '', text)
    
    def split_into_sentences(text):
        # Split text into sentences, handling common abbreviations
        text = re.sub(r'(?<=[.!?])\s+(?=[A-Z])', '\n', text)
        return [s.strip() for s in text.split('\n') if s.strip()]
    
    def is_title(text):
        # Check if text is a title (single word or short sentence without period)
        return len(text.split()) <= 5 and '.' not in text
    
    result = []
    
    for section_name, content in sections.items():
        if not content:  # Skip empty sections
            continue
            
        # Split content into paragraphs
        paragraphs = content.split('\n')
        
        for para in paragraphs:
            if not para.strip():  # Skip empty paragraphs
                continue
                
            # Clean figure references
            para = clean_figure_refs(para)
            
            # Split into sentences
            sentences = split_into_sentences(para)
            
            if not sentences:  # Skip if no sentences after cleaning
                continue
                
            # Include all sentences from Abstract or Summary sections
            if section_name.lower() in ['abstract', 'summary']:
                result.extend(sentences)
            # For other sections, include only first sentence if not a title
            else:
                if len(sentences) == 1 and is_title(sentences[0]):
                    result.append(sentences[0])
                else:
                    result.append(sentences[0])
    
    return ' '.join(result)

In [6]:
prompt_prefix = """This is a scientific manuscript and an example of the single-cell RNA sequencing dataset it is associated with. The example dataset contains one representative cell from each type and the corresponding cell sentence(ranked gene names by expression). You need to create question-answer pairs for Large Language Models to learn analyzing scRNA-seq datasets when given pieces of data converted into cell sentences. I would like questions and answers about the given cell sentence in terms of cells, tissues, disease, or other relevant biological information. 
The questions must be answerable by looking at the given cell sentences alone. The question-answer pairs should follow a line of reasoning mentioned in the manuscript or results arrived by the researchers. When creating questions, you should first read the given manuscript, consider what the analysis the researchers did in the manuscript and the biological context of this study. Then look at the example cell sentences(or a part of them), think about what question the researchers asked or would ask about the cell sentence, what they wanted to and can infer, study, or understand. Imitate the interpretation of data in the manuscript. Then answer the question, quote specific cell sentences and genes as evidence, and include a chain of thoughts. You can borrow reasoning and analysis, conclusions from the manuscript but don't directly use or quote the manuscript in question or answer. Avoid questions about experiment design or procedures, avoid vague questions. Try to vary the type of things the question asks and the wording, include both open ended questions and yes or no questions.
Provide Context for the question, this is only the essential background information required to answer the question. Find context from the manuscript. Provide Answer, including detailed reasonings. Refer to specific cell sentences as [type] cell gene expression. Provide Keywords of the answer being the most essential parts of the correct answer. The keywords should be yes or no(must include if it is a yes or no question), or important gene names, or biological information in question. These keywords will be used during my training to validate my model's response. 
Give 20 question answer pairs as one list, don't give any other word. Strictly format like this: 
<|Context|>the context<|Question|>the content of the question<|Answer|>the content of the answer<|Keyword|>the answer keywords
<|Context|>the context<|Question|>the content of the question<|Answer|>the content of the answer<|Keyword|>the answer keywords
...
"""

In [7]:
for filename in title_to_data['filename']:
    ds_number = int(re.search(r'\((\d+)\)', filename).group(1))
    flag = False
    for file_path in publication_files:
        if f"({ds_number})" in file_path:
            flag = True
    if not flag:
        print(f"No file found for dataset {ds_number}")

In [8]:
publication_path = "/users/zlyu12/Desktop/c2s-RL/RL_data/cell_type_result2024-12-09"
publication_files = [os.path.join(publication_path, f) for f in os.listdir(publication_path) if f.endswith('.csv')]
print(len(publication_files))

entire_batch = []
for file_path in publication_files:
    data_df = pd.read_csv(file_path)
    dataset_prompt, sampled_indices = dataset_for_prompt(data_df)

    ds_number = int(re.search(r'\((\d+)\)', file_path).group(1))
    dataset_name = [row["dataset"] for i, row in title_to_data.iterrows() if f"({ds_number})" in row['filename']]
    if len(dataset_name) != 1:
        print(f"Multiple papers found for dataset {ds_number}")
        print(dataset_name)
        continue
    dataset_name = dataset_name[0]

    if dataset_name not in parsed_articles:
        print(f"No parsed paper found for dataset {ds_number}")
        print(dataset_name)
        continue

    sections = parsed_articles[dataset_name]['sections']
    if 'Results' not in sections:
        print(f"No Results section found for {dataset_name}")
        continue
    
    publication_prompt = extract_first_last_sentences(sections)
    final_prompt = prompt_prefix + f"\nManuscript: {publication_prompt}\nDataset: {dataset_prompt}"
    
    entry = {
            "custom_id": "dataset_name:{}, url:{}, dataset_index:{}, used_rows:{}".format(dataset_name, parsed_articles[dataset_name]['url'], ds_number, sampled_indices), 
            "method": "POST", 
            "url": "/v1/chat/completions", 
            "body": {
                "model": "gpt-4o", 
                "messages": [{"role": "user", "content": final_prompt}]
            }
        }
    entire_batch.append(entry)
    
print(len(entire_batch))
output_file = "/users/zlyu12/Desktop/c2s-RL/Create_Dataset/2024-12-09/input.jsonl"
with open(output_file, "w") as f:
    for entry in entire_batch:
        f.write(json.dumps(entry) + "\n")


91
91


In [11]:
# Split entries into two equal parts
half_size = len(entire_batch) // 5

for i in range(5):
    part_batch = entire_batch[i*half_size:(i+1)*half_size]

    output_file1 = f"/users/zlyu12/Desktop/c2s-RL/Create_Dataset/2024-12-09/input_part{i}.jsonl"

    with open(output_file1, "w") as f:
        for entry in part_batch:
            f.write(json.dumps(entry) + "\n")

In [68]:
# Split entries into two equal parts
entire_batch = []
with open("/users/zlyu12/Desktop/c2s-RL/Create_Dataset/2024-12-09/input.jsonl", "r") as f:
    for line in f:
        entry = json.loads(line)
        entire_batch.append(entry)
print(len(entire_batch))

half_size = len(entire_batch) // 5

part_batch = entire_batch[72:76]

output_file1 = f"/users/zlyu12/Desktop/c2s-RL/Create_Dataset/2024-12-09/input_part{42}.jsonl"

with open(output_file1, "w") as f:
    for entry in part_batch:
        f.write(json.dumps(entry) + "\n")

part_batch = entire_batch[78:85]

output_file1 = f"/users/zlyu12/Desktop/c2s-RL/Create_Dataset/2024-12-09/input_part{52}.jsonl"

with open(output_file1, "w") as f:
    for entry in part_batch:
        f.write(json.dumps(entry) + "\n")

part_batch = entire_batch[85:]

output_file1 = f"/users/zlyu12/Desktop/c2s-RL/Create_Dataset/2024-12-09/input_part{62}.jsonl"

with open(output_file1, "w") as f:
    for entry in part_batch:
        f.write(json.dumps(entry) + "\n")

91


In [69]:
openai_api_key = open("/users/zlyu12/Desktop/cell2sentence/cell2sentence-dev/zyl/OpenAI_API_key.txt", "r").read()
client = openai.OpenAI(api_key=openai_api_key)

QA_file = client.files.create(
  file=open(f"/users/zlyu12/Desktop/c2s-RL/Create_Dataset/2024-12-09/input_part52.jsonl", "rb"),
  purpose="batch"
)

In [70]:
batch_job = client.batches.create(
  input_file_id=QA_file.id,
  endpoint="/v1/chat/completions",
  completion_window="24h",
  metadata={"version": "2024-12-10", "description": "Publication conditioned QA, part 5"}
)
print(batch_job.id)
print(batch_job)

batch_6758836680f08191bf9175b2392429ec
Batch(id='batch_6758836680f08191bf9175b2392429ec', completion_window='24h', created_at=1733854054, endpoint='/v1/chat/completions', input_file_id='file-D24H1TTuG6haZJxJVySWPK', object='batch', status='validating', cancelled_at=None, cancelling_at=None, completed_at=None, error_file_id=None, errors=None, expired_at=None, expires_at=1733940454, failed_at=None, finalizing_at=None, in_progress_at=None, metadata={'version': '2024-12-10', 'description': 'Publication conditioned QA, part 5'}, output_file_id=None, request_counts=BatchRequestCounts(completed=0, failed=0, total=0))


In [67]:
batch_job_id = "batch_6758836680f08191bf9175b2392429ec"
openai_api_key = open("/users/zlyu12/Desktop/cell2sentence/cell2sentence-dev/zyl/OpenAI_API_key.txt", "r").read()
client = openai.OpenAI(api_key=openai_api_key)
print(client.batches.retrieve(batch_job_id).status)

failed


In [49]:
openai_api_key = open("/users/zlyu12/Desktop/cell2sentence/cell2sentence-dev/zyl/OpenAI_API_key.txt", "r").read()
client = openai.OpenAI(api_key=openai_api_key)
part4 = []
file_path = f"/users/zlyu12/Desktop/c2s-RL/Create_Dataset/2024-12-09/input_part{4}.jsonl"
with open(file_path, "r") as f:
    for line in f:
        entry = json.loads(line)
        part4.append(entry)
part4

[{'custom_id': 'dataset_name:Non-neuronal cells of the human fovea and peripheral retina, url:https://doi.org/10.1038/s41598-020-66092-9, dataset_index:700, used_rows:[0, 1, 2, 3, 4, 5, 6, 7]',
  'method': 'POST',
  'url': '/v1/chat/completions',
  'body': {'model': 'gpt-4o',
   'messages': [{'role': 'user',
     'content': "This is a scientific manuscript and an example of the single-cell RNA sequencing dataset it is associated with. The example dataset contains one representative cell from each type and the corresponding cell sentence(ranked gene names by expression). You need to create question-answer pairs for Large Language Models to learn analyzing scRNA-seq datasets when given pieces of data converted into cell sentences. I would like questions and answers about the given cell sentence in terms of cells, tissues, disease, or other relevant biological information. \nThe questions must be answerable by looking at the given cell sentences alone. The question-answer pairs should fol

In [52]:
part4_completions = []
for i, entry in enumerate(part4):
    if i < 6:  # Skip first 6 entries
        continue
    custom_id = entry['custom_id']
    completion = client.chat.completions.create(
    model=entry['body']['model'],
    messages=entry['body']['messages']
    )
    part4_completions.append((custom_id, completion))
part4_completions

[('dataset_name:Single-cell transcriptomics of human T cells reveals tissue and activation signatures in health and disease, url:https://doi.org/10.1038/s41467-019-12464-3, dataset_index:726, used_rows:[18, 6, 15, 4, 16, 11, 17, 13, 0, 10, 7, 14, 5, 20, 1]',
  ChatCompletion(id='chatcmpl-AcyiNodr7yMGGjuFX04dizb9KC1Uk', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content="<|Context|>Human T cells exhibit distinct transcriptional profiles across different tissues, influenced by factors like tissue site which affects their persistence and function.<|Question|>In the immature alpha-beta T cell from the bone marrow, which gene is most highly expressed and what does this suggest in terms of cellular function or significance?<|Answer|>In the immature alpha-beta T cell from the bone marrow, RPS27 is the most highly expressed gene, suggesting active protein synthesis and ribosomal activity, as RPS27 is a ribosomal protein involved in ribosome asse

In [53]:
len(part4_completions)

11

In [54]:
with open("/users/zlyu12/Desktop/c2s-RL/Create_Dataset/2024-12-09/output_completions.jsonl", "a") as f:
    for custom_id, completion in part4_completions:
        f.write(json.dumps({"custom_id": custom_id, "completion": completion.choices[0].message.content}) + "\n")

In [64]:
# retrieve the batch job
output_file_id = client.batches.retrieve("batch_675881ee61148191bd779f9edcebdcdf").output_file_id

# get the output
batch_output_file = client.files.content(output_file_id)
print(batch_output_file.text)

# Dump batch_output_file as a JSON file
with open("/users/zlyu12/Desktop/c2s-RL/Create_Dataset/2024-12-09/output.jsonl", "a") as f:
    f.write(batch_output_file.text)

{"id": "batch_req_6758826eda7c8190bea946dc3e35f21c", "custom_id": "dataset_name:Non-neuronal cells of the human fovea and peripheral retina, url:https://doi.org/10.1038/s41598-020-66092-9, dataset_index:700, used_rows:[0, 1, 2, 3, 4, 5, 6, 7]", "response": {"status_code": 200, "request_id": "ae83df9fb76fa8101123dadf312a0a5b", "body": {"id": "chatcmpl-AcyqAk0ybx2Rp7rYJS3RDMIdpD7Ud", "object": "chat.completion", "created": 1733853698, "model": "gpt-4o-2024-08-06", "choices": [{"index": 0, "message": {"role": "assistant", "content": "<|Context|>The dataset includes gene expressions from Mueller cells located in the foveal and peripheral regions of the human retina, used to compare differences in gene expression between these regions.<|Question|>Which key genes are highly expressed in the Mueller cells of the fovea that might play a significant role in antioxidant response?<|Answer|>In the Mueller cells of the fovea, the gene GPX3 (Glutathione Peroxidase 3) is highly expressed. GPX3 is kno

In [71]:
output_path = "/users/zlyu12/Desktop/c2s-RL/Create_Dataset/2024-12-09/output.jsonl"
output_jsonl = []
with open(output_path, "r") as f:
    for line in f:
        output_jsonl.append(json.loads(line))
print(len(output_jsonl))
print(output_jsonl[0]['response']['body']['choices'][0]['message']['content'])

76
<|Context|>Proximal tubules are important for glucose metabolism and are affected by diabetic kidney disease, which results in changes in gene expression patterns related to inflammation and metabolic processes.<|Question|>How does the gene expression profile of the epithelial cell of proximal tubule reflect characteristics of diabetic kidney disease?<|Answer|>The epithelial cell of proximal tubule in the diabetic kidney shows distinct gene expression patterns such as high expression of genes like NEAT1, PDE4D, PTH2R, ACSM2A, and notable expressions like PKHD1, ZBTB20, and FHIT that may be associated with the dysregulation of metabolites processing and inflammation. Noting the expression of CUBN and LRP2 also hints towards disrupted reabsorption processes.<|Keyword|>NEAT1, PDE4D, PTH2R, ACSM2A, PKHD1, ZBTB20, FHIT, CUBN, LRP2

<|Context|>Endothelial cells play a role in maintaining vascular homeostasis, and in the context of kidney diseases such as diabetes mellitus their gene expre

In [75]:
def to_hf_dataset(responses):
    # Create a new HuggingFace Dataset
    qa_dict_list = []
    for response in responses:
        dataset_name = response['custom_id'].split(", url:")[0].split("dataset_name:")[1]
        url = response['custom_id'].split(", url:")[1].split(", dataset_index:")[0]
        dataset_index = response['custom_id'].split(", dataset_index:")[1].split(", used_rows:")[0]
        used_rows = response['custom_id'].split(", used_rows:")[1]

        dataset_directory_path = "/users/zlyu12/Desktop/c2s-RL/RL_data/cell_type_result2024-12-09"
        dataset_file_path = os.path.join(dataset_directory_path, f"cell_type_genes({dataset_index}).csv")
        data_df = pd.read_csv(dataset_file_path)
        cell_sentence_context = ""
        use_d = True
        if len(data_df['diseases'].unique()) == 1:
            use_d = False
        used_rows_list = eval(used_rows)  # Convert string representation of list to actual list
        for index, row in data_df.iloc[used_rows_list].iterrows():
            cell_sentence = ' '.join(row[f'gene_{i}'] for i in range(1, 101))
            if use_d:
                cell_sentence_context += f"Cell Type: {row['cell_type']}, Tissue: {row['tissue']}, Disease: {row['diseases']}, Gene Expression: {cell_sentence}\n"
            else:
                cell_sentence_context += f"Cell Type: {row['cell_type']}, Tissue: {row['tissue']}, Gene Expression: {cell_sentence}\n"

        all_QA_pairs = response['response']['body']['choices'][0]['message']['content']
        for QA_pair in all_QA_pairs.split("<|Context|>"):
            if not QA_pair:
                continue
            try:
                context = QA_pair.split("<|Question|>")[0].strip()
                question = QA_pair.split("<|Question|>")[1].split("<|Answer|>")[0].strip()
                answer = QA_pair.split("<|Answer|>")[1].split("<|Keyword|>")[0].strip()
                label = QA_pair.split("<|Keyword|>")[1].strip()
            except:
                print(QA_pair)
                continue

            entry = {
                "Context": context,
                "Summary_Dataset": cell_sentence_context,
                "Question": question, 
                "Answer": answer,
                "Keyword": label,
                "full_QA_pair": "<|Question|>"+QA_pair.strip(), 
                "Dataset_Name": dataset_name,
                "Publication_URL": url,
                "Dataset_Index": dataset_index,
                "Used_Rows": used_rows,
            }
            qa_dict_list.append(entry)
    
    qa_dict = {key: [d[key] for d in qa_dict_list] for key in qa_dict_list[0].keys()} # list of dicts to dict of lists
    
    hf_dataset = Dataset.from_dict(qa_dict)

    return hf_dataset

all_hf_dataset = to_hf_dataset(output_jsonl)

Genes involved in cellular adhesion and structure could reflect neurodegenerative progression, considering brain tissue integrity.</|Question|>Are there genes in mature microglial cell expression that suggest roles in cellular adhesion relevant to Alzheimer’s disease?<|Answer|>Yes, the gene CELF2 in mature microglial cell expression suggests roles in cellular adhesion. CELF2 influences RNA binding affecting cell junctions and integrity, impacting Alzheimer’s disease progression through tissue structural maintenance.<|Keyword|>Yes, CELF2, cellular adhesion
Distinct patterns in host defense-related genes could indicate response to pathological states.<|Question|>Are there any host defense-related genes expressed in microglia related to Alzheimer’s disease?<|Answer>Yes, genes like CST3 and ITM2B present in Alzheimer’s microglia are related to host defense, reflecting potential neuroprotective responses.<|Keyword|>yes, CST3, ITM2B


Cells within the retina can exhibit differential expressi

In [78]:
all_processed_datasets = np.unique(all_hf_dataset['Dataset_Index'])
len(all_processed_datasets)

76

In [80]:
output_path = "/users/zlyu12/Desktop/c2s-RL/Create_Dataset/2024-12-09/output_completions.jsonl"
output_jsonl = []
with open(output_path, "r") as f:
    for line in f:
        output_jsonl.append(json.loads(line))

all_processed_datasets = np.unique(all_hf_dataset['Dataset_Index'])

qa_dict_list = []
for response in output_jsonl:
    dataset_name = response['custom_id'].split(", url:")[0].split("dataset_name:")[1]
    url = response['custom_id'].split(", url:")[1].split(", dataset_index:")[0]
    dataset_index = response['custom_id'].split(", dataset_index:")[1].split(", used_rows:")[0]
    if dataset_index not in all_processed_datasets:
        print("processed: " + dataset_index)
        continue
    used_rows = response['custom_id'].split(", used_rows:")[1]

    dataset_directory_path = "/users/zlyu12/Desktop/c2s-RL/RL_data/cell_type_result2024-12-09"
    dataset_file_path = os.path.join(dataset_directory_path, f"cell_type_genes({dataset_index}).csv")
    data_df = pd.read_csv(dataset_file_path)
    cell_sentence_context = ""
    use_d = True
    if len(data_df['diseases'].unique()) == 1:
        use_d = False
    used_rows_list = eval(used_rows)  # Convert string representation of list to actual list
    for index, row in data_df.iloc[used_rows_list].iterrows():
        cell_sentence = ' '.join(row[f'gene_{i}'] for i in range(1, 101))
        if use_d:
            cell_sentence_context += f"Cell Type: {row['cell_type']}, Tissue: {row['tissue']}, Disease: {row['diseases']}, Gene Expression: {cell_sentence}\n"
        else:
            cell_sentence_context += f"Cell Type: {row['cell_type']}, Tissue: {row['tissue']}, Gene Expression: {cell_sentence}\n"

    all_QA_pairs = response['completion']
    for QA_pair in all_QA_pairs.split("<|Context|>"):
        if not QA_pair:
            continue
        try:
            context = QA_pair.split("<|Question|>")[0].strip()
            question = QA_pair.split("<|Question|>")[1].split("<|Answer|>")[0].strip()
            answer = QA_pair.split("<|Answer|>")[1].split("<|Keyword|>")[0].strip()
            label = QA_pair.split("<|Keyword|>")[1].strip()
        except:
            print(QA_pair)
            continue

        entry = {
            "Context": context,
            "Summary_Dataset": cell_sentence_context,
            "Question": question, 
            "Answer": answer,
            "Keyword": label,
            "full_QA_pair": "<|Question|>"+QA_pair.strip(), 
            "Dataset_Name": dataset_name,
            "Publication_URL": url,
            "Dataset_Index": dataset_index,
            "Used_Rows": used_rows,
        }
    qa_dict_list.append(entry)

qa_dict = {key: [d[key] for d in qa_dict_list] for key in qa_dict_list[0].keys()} # list of dicts to dict of lists

from_completions_hf_dataset = Dataset.from_dict(qa_dict)
from_completions_hf_dataset


processed: 777
processed: 778
processed: 706
processed: 726
processed: 728
processed: 738
processed: 739
processed: 762
processed: 763
processed: 768
processed: 769
processed: 770
processed: 771
processed: 772


Dataset({
    features: ['Context', 'Summary_Dataset', 'Question', 'Answer', 'Keyword', 'full_QA_pair', 'Dataset_Name', 'Publication_URL', 'Dataset_Index', 'Used_Rows'],
    num_rows: 4
})

In [82]:
# Combine the datasets
combined_dataset = concatenate_datasets([all_hf_dataset, from_completions_hf_dataset])
print(f"Combined dataset has {len(combined_dataset)} rows")
print(np.unique(combined_dataset["Dataset_Index"]))
print(len(np.unique(combined_dataset["Dataset_Index"])))
combined_dataset.save_to_disk("/users/zlyu12/Desktop/c2s-RL/Create_Dataset/2024-12-09/combined_half_hf_dataset")

  block_group = [InMemoryTable(cls._concat_blocks(list(block_group), axis=axis))]
  table = cls._concat_blocks(blocks, axis=0)


Combined dataset has 1405 rows
['191' '192' '198' '199' '23' '267' '268' '269' '272' '273' '274' '275'
 '276' '277' '280' '281' '282' '284' '285' '286' '287' '289' '290' '303'
 '304' '305' '462' '463' '464' '465' '466' '467' '468' '471' '473' '474'
 '475' '476' '477' '478' '479' '480' '481' '482' '486' '487' '50' '51'
 '513' '52' '546' '603' '604' '605' '606' '607' '608' '609' '610' '611'
 '612' '613' '614' '632' '633' '634' '635' '636' '637' '638' '698' '699'
 '700' '701' '702' '703']
76


Saving the dataset (1/1 shards): 100%|██████████| 1405/1405 [00:00<00:00, 37330.53 examples/s]


In [84]:
# Split entries into two equal parts
entire_batch = []
with open("/users/zlyu12/Desktop/c2s-RL/Create_Dataset/2024-12-09/input.jsonl", "r") as f:
    for line in f:
        entry = json.loads(line)
        entire_batch.append(entry)
print(len(entire_batch))

used_datasets = np.unique(combined_dataset["Dataset_Index"])
remaining_batch = [entry for entry in entire_batch if entry["custom_id"].split(", dataset_index:")[1].split(", used_rows:")[0] not in used_datasets]
print(f"Number of remaining entries: {len(remaining_batch)}")

91
Number of remaining entries: 15


In [85]:
# Save remaining entries to new jsonl file
with open("/users/zlyu12/Desktop/c2s-RL/Create_Dataset/2024-12-09/input_additional.jsonl", "w") as f:
    for entry in remaining_batch:
        json.dump(entry, f)
        f.write("\n")


In [86]:
with open("/users/zlyu12/Desktop/c2s-RL/Create_Dataset/2024-12-09/input_additional2.jsonl", "w") as f:
    for entry in remaining_batch[2:]:
        json.dump(entry, f)
        f.write("\n")

In [87]:
openai_api_key = open("/users/zlyu12/Desktop/cell2sentence/cell2sentence-dev/zyl/OpenAI_API_key.txt", "r").read()
client = openai.OpenAI(api_key=openai_api_key)

QA_file = client.files.create(
  file=open(f"/users/zlyu12/Desktop/c2s-RL/Create_Dataset/2024-12-09/input_additional2.jsonl", "rb"),
  purpose="batch"
)

In [88]:
batch_job = client.batches.create(
  input_file_id=QA_file.id,
  endpoint="/v1/chat/completions",
  completion_window="24h",
  metadata={"version": "2024-12-10", "description": "Publication conditioned QA, part additional"}
)
print(batch_job.id)
print(batch_job)

batch_67588ab7f0108191948eceb994637a1c
Batch(id='batch_67588ab7f0108191948eceb994637a1c', completion_window='24h', created_at=1733855928, endpoint='/v1/chat/completions', input_file_id='file-3hefqN1PxzaaDWE9ZtLmvA', object='batch', status='validating', cancelled_at=None, cancelling_at=None, completed_at=None, error_file_id=None, errors=None, expired_at=None, expires_at=1733942328, failed_at=None, finalizing_at=None, in_progress_at=None, metadata={'version': '2024-12-10', 'description': 'Publication conditioned QA, part additional'}, output_file_id=None, request_counts=BatchRequestCounts(completed=0, failed=0, total=0))


In [93]:
batch_job_id = "batch_67588ab7f0108191948eceb994637a1c"
openai_api_key = open("/users/zlyu12/Desktop/cell2sentence/cell2sentence-dev/zyl/OpenAI_API_key.txt", "r").read()
client = openai.OpenAI(api_key=openai_api_key)
print(client.batches.retrieve(batch_job_id).status)

completed


In [94]:
# retrieve the batch job
output_file_id = client.batches.retrieve("batch_67588ab7f0108191948eceb994637a1c").output_file_id

# get the output
batch_output_file = client.files.content(output_file_id)
print(batch_output_file.text)

# Dump batch_output_file as a JSON file
with open("/users/zlyu12/Desktop/c2s-RL/Create_Dataset/2024-12-09/output_additional.jsonl", "a") as f:
    f.write(batch_output_file.text)

output_path = "/users/zlyu12/Desktop/c2s-RL/Create_Dataset/2024-12-09/output_additional.jsonl"
output_jsonl = []
with open(output_path, "r") as f:
    for line in f:
        output_jsonl.append(json.loads(line))
print(len(output_jsonl))

{"id": "batch_req_67588ba6300c8190941b5b57a86119df", "custom_id": "dataset_name:Single-cell transcriptomics of human T cells reveals tissue and activation signatures in health and disease, url:https://doi.org/10.1038/s41467-019-12464-3, dataset_index:726, used_rows:[18, 6, 15, 4, 16, 11, 17, 13, 0, 10, 7, 14, 5, 20, 1]", "response": {"status_code": 200, "request_id": "2b846684a76ab812c76989aa6fadd78c", "body": {"id": "chatcmpl-AczQAJbxteIqc3Z9VOOHEDqL8sqpo", "object": "chat.completion", "created": 1733855930, "model": "gpt-4o-2024-08-06", "choices": [{"index": 0, "message": {"role": "assistant", "content": "<|Context|>Researchers investigated T cells in various tissues to understand their roles and functional responses, revealing tissue-specific signatures and activation states<|Question|>Which genes in the \"immature alpha-beta T cell\" from the lower lobe of the left lung suggest its active role in immune response?<|Answer|>The presence of genes such as TMSB4X and B2M indicates this 

In [95]:
additional_hf_dataset = to_hf_dataset(output_jsonl)
additional_hf_dataset

Dataset({
    features: ['Context', 'Summary_Dataset', 'Question', 'Answer', 'Keyword', 'full_QA_pair', 'Dataset_Name', 'Publication_URL', 'Dataset_Index', 'Used_Rows'],
    num_rows: 238
})

In [96]:
combined_dataset = concatenate_datasets([combined_dataset, additional_hf_dataset])
combined_dataset

  block_group = [InMemoryTable(cls._concat_blocks(list(block_group), axis=axis))]
  table = cls._concat_blocks(blocks, axis=0)


Dataset({
    features: ['Context', 'Summary_Dataset', 'Question', 'Answer', 'Keyword', 'full_QA_pair', 'Dataset_Name', 'Publication_URL', 'Dataset_Index', 'Used_Rows'],
    num_rows: 1643
})

In [99]:
combined_dataset.save_to_disk("/users/zlyu12/Desktop/c2s-RL/Create_Dataset/2024-12-09/combined_hf_dataset")

Saving the dataset (1/1 shards): 100%|██████████| 1643/1643 [00:00<00:00, 61796.54 examples/s]


In [102]:
additional_responses = [
    """<|Context|>Single-cell RNA sequencing analysis of lymphatic endothelial cells from cervical lymph nodes identifies distinct cell types based on their gene expression profiles, enabling understanding of their roles in immune response and vascular organization.<|Question|>What is the primary function of endothelial cells in lymphatic vessels, as indicated by MALAT1 and other highly expressed genes in their cell sentence?<|Answer|>Endothelial cells in lymphatic vessels primarily regulate vascular structure and immune responses. Genes like MALAT1, TMSB4X, and CLDN5 suggest roles in maintaining structural integrity and endothelial barrier function. High expression of immune-related genes like HLA-B and CXCL2 indicates involvement in antigen presentation and leukocyte recruitment.<|Keyword|>MALAT1, TMSB4X, CLDN5, HLA-B, CXCL2, structural integrity, antigen presentation, immune response<|Context|>Gene expression profiles in endothelial cells are highly indicative of their specific tissue and immunological functions.<|Question|>Does the expression of MALAT1 in the cell sentence suggest a role in immune regulation?<|Answer|>Yes, MALAT1 expression is associated with immune regulation through its involvement in maintaining endothelial cell barrier function and regulating gene networks related to immune responses, supported by co-expression of genes like HLA-A and HLA-B.<|Keyword|>Yes, MALAT1, immune regulation, HLA-A, HLA-B<|Context|>Gene expression in lymphatic endothelial cells can indicate specialization for tissue-specific functions like antigen presentation or structural support.<|Question|>Which genes in the endothelial cell sentence are key to its role in structural maintenance of the lymphatic vessel?<|Answer|>Genes such as VIM, CLDN5, and TMSB4X are key to structural maintenance, with VIM supporting cytoskeletal organization, CLDN5 ensuring tight junction integrity, and TMSB4X regulating actin dynamics.<|Keyword|>VIM, CLDN5, TMSB4X, structural maintenance<|Context|>Distinct lymphatic endothelial cell populations exhibit unique gene expression patterns reflecting their immune and vascular roles.<|Question|>What role does HLA-B play in lymphatic endothelial cells, based on its expression in the given cell sentence?<|Answer|>HLA-B facilitates antigen presentation to immune cells, suggesting that lymphatic endothelial cells contribute to adaptive immunity by presenting antigens to T cells and promoting immune surveillance within lymph nodes.<|Keyword|>HLA-B, antigen presentation, adaptive immunity<|Context|>High expression of immune-related genes in endothelial cells suggests active roles in immune surveillance and response.<|Question|>Can the expression of CXCL2 in the endothelial cell sentence indicate a function in immune cell recruitment?<|Answer|>Yes, CXCL2 expression indicates a role in recruiting neutrophils and other immune cells to the lymphatic vessels, facilitating an immune response during inflammation or infection.<|Keyword|>Yes, CXCL2, immune cell recruitment, neutrophils<|Context|>Lymphatic endothelial cells express a range of ribosomal and metabolic genes essential for their function.<|Question|>How do RPL10 and RPS6 contribute to the functionality of lymphatic endothelial cells?<|Answer|>RPL10 and RPS6 are components of ribosomes, essential for protein synthesis. Their expression ensures the production of proteins required for endothelial cell maintenance, repair, and functional responses.<|Keyword|>RPL10, RPS6, protein synthesis<|Context|>Gene expression in lymphatic endothelial cells reflects their adaptability to inflammatory stimuli and immune challenges.<|Question|>Does the expression of JUNB suggest a response to inflammation in endothelial cells?<|Answer|>Yes, JUNB expression is linked to inflammatory responses, as it regulates genes involved in cytokine production and immune cell adhesion, enabling endothelial cells to respond to and mediate inflammation.<|Keyword|>Yes, JUNB, inflammation, cytokine production<|Context|>Markers of tight junctions and endothelial integrity are crucial for maintaining lymphatic vessel function.<|Question|>What role does CLDN5 play in the lymphatic endothelial cell sentence?<|Answer|>CLDN5 is critical for maintaining tight junctions, ensuring selective permeability and structural stability of lymphatic endothelial barriers, which is vital for lymphatic vessel function.<|Keyword|>CLDN5, tight junctions, endothelial barrier<|Context|>Expression of specific ribosomal proteins in lymphatic endothelial cells is indicative of their active metabolic and structural roles.<|Question|>Why is RPL3 important in the lymphatic endothelial cell gene expression profile?<|Answer|>RPL3 is vital for ribosomal function, facilitating the synthesis of proteins necessary for cellular repair, immune interactions, and maintaining endothelial structure under varying physiological conditions.<|Keyword|>RPL3, ribosomal function, protein synthesis<|Context|>Lymphatic endothelial cells contribute to vascular remodeling and immune response during inflammation.<|Question|>Do S100A6 and NFKBIA suggest roles in inflammation and vascular remodeling?<|Answer|>Yes, S100A6 and NFKBIA expression indicate roles in inflammation and vascular remodeling. S100A6 regulates cell proliferation and response to stress, while NFKBIA modulates inflammatory signaling pathways.<|Keyword|>Yes, S100A6, NFKBIA, inflammation, vascular remodeling<|Context|>Certain genes in lymphatic endothelial cells are associated with stress response and maintaining cell homeostasis.<|Question|>What is the function of HSPB1 in the endothelial cell sentence?<|Answer|>HSPB1 functions as a molecular chaperone, protecting cells from stress by stabilizing protein structures and assisting in recovery from cellular damage, critical for endothelial cell homeostasis.<|Keyword|>HSPB1, stress response, homeostasis<|Context|>Genes regulating cytoskeletal organization are pivotal for the migration and shape of lymphatic endothelial cells.<|Question|>How does ACTB contribute to lymphatic endothelial cell functions?<|Answer|>ACTB encodes β-actin, essential for maintaining cytoskeletal integrity, enabling cell migration, and adapting cellular shape to support lymphatic flow and immune cell transmigration.<|Keyword|>ACTB, cytoskeletal integrity, cell migration<|Context|>Ribosomal genes support the high metabolic demands of endothelial cells in the lymphatic system.<|Question|>What is the significance of high RPS27A expression in lymphatic endothelial cells?<|Answer|>RPS27A, encoding a ribosomal protein, is crucial for efficient protein synthesis, supporting the production of molecules necessary for endothelial cell repair, signaling, and immune functions.<|Keyword|>RPS27A, ribosomal protein, protein synthesis<|Context|>Immune regulatory genes in endothelial cells are indicative of their role in modulating local immune responses.<|Question|>How does HLA-C contribute to the immune functions of endothelial cells?<|Answer|>HLA-C plays a role in antigen presentation, helping modulate immune responses by interacting with T cells and natural killer (NK) cells to regulate immune activity in lymphatic nodes.<|Keyword|>HLA-C, antigen presentation, immune regulation<|Context|>Certain genes in lymphatic endothelial cells are involved in oxidative stress responses.<|Question|>What is the role of FTH1 in lymphatic endothelial cells?<|Answer|>FTH1 encodes ferritin heavy chain, involved in iron storage and oxidative stress response, helping protect endothelial cells from damage due to reactive oxygen species (ROS).<|Keyword|>FTH1, oxidative stress, iron storage<|Context|>Genes such as GAPDH are ubiquitous but also reflect metabolic activity in endothelial cells.<|Question|>What does the expression of GAPDH indicate in lymphatic endothelial cells?<|Answer|>GAPDH expression indicates high glycolytic activity, reflecting the metabolic demands of lymphatic endothelial cells to sustain energy production for their functions.<|Keyword|>GAPDH, glycolysis, metabolic activity<|Context|>Stress-related genes in lymphatic endothelial cells provide resilience under physiological stressors.<|Question|>How does NEAT1 expression impact lymphatic endothelial cell functions?<|Answer|>NEAT1 contributes to cellular resilience by participating in the formation of nuclear paraspeckles, which regulate gene expression during stress and maintain endothelial function.<|Keyword|>NEAT1, nuclear paraspeckles, stress resilience<|Context|>Genes expressed in endothelial cells are associated with interactions with immune cells and extracellular matrix.<|Question|>What is the importance of IGFBP7 in lymphatic endothelial cells?<|Answer|>IGFBP7 regulates interactions with the extracellular matrix and modulates responses to growth factors, supporting vascular integrity and endothelial-immune cell communication.<|Keyword|>IGFBP7, extracellular matrix, vascular integrity""",
    """Based on the manuscript and dataset provided, I will create 20 question-answer pairs that align with the manuscript's analysis and biological context, focusing on the interpretation of cell sentences in terms of cells, tissues, and diseases. Here is the structured output:

<|Context|>The endothelial cell of the lymphatic vessel is involved in immune surveillance and lymphatic fluid transport. Key genes include MALAT1 and B2M, relevant to normal and inflammatory states.<|Question|>What indicates the endothelial cell of the lymphatic vessel's immune involvement in lymphadenitis?<|Answer|>The upregulation of immune-related genes such as B2M and IFITM3 in the lymphadenitis state suggests increased immune activity. These genes are part of pathways responding to antigen presentation and inflammatory signaling.<|Keyword|>Yes, B2M, IFITM3, antigen presentation, inflammation

<|Context|>The expression of MALAT1 is conserved across normal and disease states in endothelial cells, with variations potentially indicating transcriptional activity related to cellular stress or inflammation.<|Question|>Does MALAT1 expression suggest stress or inflammation in lymphadenitis?<|Answer|>Yes, MALAT1 is expressed in both normal and lymphadenitis states, but the relative consistency suggests its role may not be directly linked to inflammation but rather to transcriptional stability.<|Keyword|>Yes, MALAT1, transcriptional stability

<|Context|>Peripheral lymph node endothelial cells are known to express genes involved in ribosomal and metabolic processes.<|Question|>Which gene expression suggests enhanced protein synthesis in lymphadenitis?<|Answer|>Genes such as RPS3, RPS19, and RPL13 are highly expressed, indicating active ribosomal biogenesis and increased protein synthesis.<|Keyword|>Yes, RPS3, RPS19, RPL13, protein synthesis

<|Context|>The lymphatic endothelial cells participate in antigen transport and immune cell recruitment in inflammatory conditions.<|Question|>What evidence in gene expression supports immune cell recruitment in lymphadenitis?<|Answer|>The upregulation of CCL20 and IFITM3 supports immune cell recruitment. These genes are associated with chemotactic signals and cellular defense responses.<|Keyword|>Yes, CCL20, IFITM3, immune recruitment

<|Context|>Normal lymphatic endothelial cells exhibit consistent expression of structural genes like ACTB and ACTG1.<|Question|>Do structural proteins play a role in lymphadenitis-related changes in endothelial cells?<|Answer|>Yes, consistent expression of ACTB and ACTG1 suggests their role in maintaining cell integrity during inflammation.<|Keyword|>Yes, ACTB, ACTG1, cell integrity

<|Context|>Genes related to oxidative stress responses, such as FTH1 and B2M, are expressed in lymphatic endothelial cells.<|Question|>Is there evidence of oxidative stress response in lymphadenitis endothelial cells?<|Answer|>Yes, upregulation of FTH1 indicates increased activity in managing oxidative stress during inflammation.<|Keyword|>Yes, FTH1, oxidative stress

<|Context|>Peripheral lymph nodes in lymphadenitis involve endothelial adaptations for immune interactions.<|Question|>Which genes are indicative of enhanced immune interactions in lymphadenitis endothelial cells?<|Answer|>Genes like IFITM3 and H2-K1 are upregulated, highlighting enhanced immune signaling and antigen presentation roles.<|Keyword|>Yes, IFITM3, H2-K1, immune signaling

<|Context|>Normal endothelial cells maintain high expression of ribosomal proteins, ensuring efficient protein synthesis.<|Question|>Does gene expression indicate altered protein synthesis in lymphadenitis endothelial cells?<|Answer|>Yes, elevated expression of RPL13 and RPS19 suggests enhanced protein synthesis during inflammation.<|Keyword|>Yes, RPL13, RPS19, protein synthesis

<|Context|>The inflammatory state of lymphadenitis involves increased antigen processing activities.<|Question|>What gene expression changes indicate increased antigen processing in lymphadenitis?<|Answer|>H2-D1 and H2-K1 expression are upregulated, indicating enhanced antigen processing and presentation.<|Keyword|>Yes, H2-D1, H2-K1, antigen processing

<|Context|>Cellular responses in lymphadenitis include heightened stress response markers.<|Question|>What evidence suggests cellular stress responses in lymphadenitis endothelial cells?<|Answer|>Upregulation of genes like TPT1 and FTH1 suggests cellular mechanisms managing stress and iron metabolism.<|Keyword|>Yes, TPT1, FTH1, stress response

<|Context|>Peripheral lymph nodes function in immune surveillance through chemokine-mediated recruitment.<|Question|>Which genes suggest chemokine activity in endothelial cells during lymphadenitis?<|Answer|>CCL20 and CXCL10 expression are indicative of chemokine-mediated immune cell recruitment.<|Keyword|>Yes, CCL20, CXCL10, chemokine activity

<|Context|>Ribosomal protein genes are highly expressed in lymphatic endothelial cells, maintaining their biosynthetic needs.<|Question|>Is there evidence of ribosomal activity enhancement in normal vs. lymphadenitis conditions?<|Answer|>Yes, consistent expression of RPS3 and RPL13 across conditions suggests stable or enhanced ribosomal activity.<|Keyword|>Yes, RPS3, RPL13, ribosomal activity

<|Context|>Inflammatory lymphadenitis alters the expression of genes related to immune signaling pathways.<|Question|>What genes reflect altered immune signaling in lymphadenitis endothelial cells?<|Answer|>Genes like IFITM3 and CCL20 are upregulated, reflecting immune signaling pathway activation.<|Keyword|>Yes, IFITM3, CCL20, immune signaling

<|Context|>Genes involved in cytoskeletal maintenance are crucial for endothelial cell function under stress.<|Question|>Does ACTB expression in lymphadenitis endothelial cells suggest cytoskeletal adaptation?<|Answer|>Yes, the stable expression of ACTB ensures cytoskeletal integrity under inflammatory stress.<|Keyword|>Yes, ACTB, cytoskeletal adaptation

<|Context|>Lymphatic endothelial cells respond to inflammation with transcriptional changes in immune response genes.<|Question|>Are transcriptional changes in IFITM3 significant in lymphadenitis?<|Answer|>Yes, increased IFITM3 expression highlights its role in antiviral defense and immune modulation.<|Keyword|>Yes, IFITM3, immune modulation

<|Context|>Gene expression in normal endothelial cells reflects homeostasis and structural maintenance.<|Question|>Is there evidence of structural maintenance in lymphadenitis endothelial cells?<|Answer|>Yes, consistent expression of ACTB and ACTG1 reflects preserved structural integrity.<|Keyword|>Yes, ACTB, ACTG1, structural maintenance

<|Context|>Ribosomal gene expression in lymphatic endothelial cells supports protein production needs.<|Question|>Which ribosomal genes are critical in lymphadenitis protein production?<|Answer|>Genes such as RPS19 and RPL13 are critical for maintaining protein synthesis in inflammation.<|Keyword|>Yes, RPS19, RPL13, protein synthesis

<|Context|>Inflammatory lymphadenitis involves alterations in iron metabolism-related genes.<|Question|>What gene expression changes suggest iron metabolism adaptation in lymphadenitis?<|Answer|>FTH1 is upregulated, indicating increased activity in iron storage and homeostasis during inflammation.<|Keyword|>Yes, FTH1, iron metabolism

<|Context|>Antigen presentation genes are crucial in the immune response of lymphatic endothelial cells.<|Question|>Are antigen presentation processes active in lymphadenitis endothelial cells?<|Answer|>Yes, increased expression of H2-D1 and H2-K1 suggests active antigen presentation.<|Keyword|>Yes, H2-D1, H2-K1, antigen presentation

<|Context|>Immune-related genes in endothelial cells indicate inflammatory adaptation.<|Question|>Does CCL20 expression suggest immune adaptation in lymphadenitis?<|Answer|>Yes, upregulation of CCL20 supports its role in immune cell recruitment and inflammation.<|Keyword|>Yes, CCL20, immune adaptation"""

]

In [103]:
remaining_batch[:2]
qa_dict_list = []
for entry, response in zip(remaining_batch[:2], additional_responses):
    dataset_name = entry['custom_id'].split(", url:")[0].split("dataset_name:")[1]
    url = entry['custom_id'].split(", url:")[1].split(", dataset_index:")[0]
    dataset_index = entry['custom_id'].split(", dataset_index:")[1].split(", used_rows:")[0]
    used_rows = entry['custom_id'].split(", used_rows:")[1]

    dataset_directory_path = "/users/zlyu12/Desktop/c2s-RL/RL_data/cell_type_result2024-12-09"
    dataset_file_path = os.path.join(dataset_directory_path, f"cell_type_genes({dataset_index}).csv")
    data_df = pd.read_csv(dataset_file_path)
    cell_sentence_context = ""
    use_d = True
    if len(data_df['diseases'].unique()) == 1:
        use_d = False
    used_rows_list = eval(used_rows)  # Convert string representation of list to actual list
    for index, row in data_df.iloc[used_rows_list].iterrows():
        cell_sentence = ' '.join(row[f'gene_{i}'] for i in range(1, 101))
        if use_d:
            cell_sentence_context += f"Cell Type: {row['cell_type']}, Tissue: {row['tissue']}, Disease: {row['diseases']}, Gene Expression: {cell_sentence}\n"
        else:
            cell_sentence_context += f"Cell Type: {row['cell_type']}, Tissue: {row['tissue']}, Gene Expression: {cell_sentence}\n"

    all_QA_pairs = response
    for QA_pair in all_QA_pairs.split("<|Context|>"):
        if not QA_pair:
            continue
        try:
            context = QA_pair.split("<|Question|>")[0].strip()
            question = QA_pair.split("<|Question|>")[1].split("<|Answer|>")[0].strip()
            answer = QA_pair.split("<|Answer|>")[1].split("<|Keyword|>")[0].strip()
            label = QA_pair.split("<|Keyword|>")[1].strip()
        except:
            print(QA_pair)
            continue

        entry = {
            "Context": context,
            "Summary_Dataset": cell_sentence_context,
            "Question": question, 
            "Answer": answer,
            "Keyword": label,
            "full_QA_pair": "<|Question|>"+QA_pair.strip(), 
            "Dataset_Name": dataset_name,
            "Publication_URL": url,
            "Dataset_Index": dataset_index,
            "Used_Rows": used_rows,
        }
        qa_dict_list.append(entry)
    
qa_dict = {key: [d[key] for d in qa_dict_list] for key in qa_dict_list[0].keys()} # list of dicts to dict of lists
    
two_hf_dataset = Dataset.from_dict(qa_dict)

Based on the manuscript and dataset provided, I will create 20 question-answer pairs that align with the manuscript's analysis and biological context, focusing on the interpretation of cell sentences in terms of cells, tissues, and diseases. Here is the structured output:




In [105]:
combined_dataset = concatenate_datasets([combined_dataset, two_hf_dataset])
combined_dataset

  block_group = [InMemoryTable(cls._concat_blocks(list(block_group), axis=axis))]
  table = cls._concat_blocks(blocks, axis=0)


Dataset({
    features: ['Context', 'Summary_Dataset', 'Question', 'Answer', 'Keyword', 'full_QA_pair', 'Dataset_Name', 'Publication_URL', 'Dataset_Index', 'Used_Rows'],
    num_rows: 1681
})

In [107]:
combined_dataset.save_to_disk("/users/zlyu12/Desktop/c2s-RL/Create_Dataset/2024-12-09/combined_hf_dataset")

Saving the dataset (1/1 shards): 100%|██████████| 1681/1681 [00:00<00:00, 40397.09 examples/s]


In [111]:
publication_urls = np.unique(combined_dataset["Publication_URL"])

test_urls = np.random.choice(publication_urls, size=4, replace=False)
publication_urls = np.array([url for url in publication_urls if url not in test_urls])

train_sft_urls = np.random.choice(publication_urls, size=12, replace=False)
publication_urls = np.array([url for url in publication_urls if url not in train_sft_urls])

train_dpo_urls = np.random.choice(publication_urls, size=12, replace=False)
publication_urls = np.array([url for url in publication_urls if url not in train_dpo_urls])

val_sft_urls = np.random.choice(publication_urls, size=2, replace=False)
publication_urls = np.array([url for url in publication_urls if url not in val_sft_urls])

val_dpo_urls = np.random.choice(publication_urls, size=2, replace=False)
publication_urls = np.array([url for url in publication_urls if url not in val_dpo_urls])

# Create splits based on publication URLs
test_dataset = combined_dataset.filter(lambda x: x["Publication_URL"] in test_urls)
train_sft_dataset = combined_dataset.filter(lambda x: x["Publication_URL"] in train_sft_urls)
train_dpo_dataset = combined_dataset.filter(lambda x: x["Publication_URL"] in train_dpo_urls) 
val_sft_dataset = combined_dataset.filter(lambda x: x["Publication_URL"] in val_sft_urls)
val_dpo_dataset = combined_dataset.filter(lambda x: x["Publication_URL"] in val_dpo_urls)

# Print sizes of each split
print(f"Test dataset size: {len(test_dataset)}")
print(f"Train SFT dataset size: {len(train_sft_dataset)}")
print(f"Train DPO dataset size: {len(train_dpo_dataset)}")
print(f"Val SFT dataset size: {len(val_sft_dataset)}")
print(f"Val DPO dataset size: {len(val_dpo_dataset)}")

dataset_dict = DatasetDict({
    'test': test_dataset,
    'train_sft': train_sft_dataset, 
    'train_dpo': train_dpo_dataset,
    'val_sft': val_sft_dataset,
    'val_dpo': val_dpo_dataset
})

# Save the dataset dictionary to disk
dataset_dict.save_to_disk("/users/zlyu12/Desktop/c2s-RL/Create_Dataset/2024-12-09_hf_dataset")


Filter: 100%|██████████| 1681/1681 [00:00<00:00, 53492.85 examples/s]
Filter: 100%|██████████| 1681/1681 [00:00<00:00, 51723.03 examples/s]
Filter: 100%|██████████| 1681/1681 [00:00<00:00, 53254.47 examples/s]
Filter: 100%|██████████| 1681/1681 [00:00<00:00, 55658.73 examples/s]
Filter: 100%|██████████| 1681/1681 [00:00<00:00, 56153.88 examples/s]


Test dataset size: 302
Train SFT dataset size: 897
Train DPO dataset size: 372
Val SFT dataset size: 74
Val DPO dataset size: 36


Saving the dataset (1/1 shards): 100%|██████████| 302/302 [00:00<00:00, 9308.48 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 897/897 [00:00<00:00, 20221.93 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 372/372 [00:00<00:00, 17014.50 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 74/74 [00:00<00:00, 3134.76 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 36/36 [00:00<00:00, 3624.81 examples/s]


In [2]:
combined_dataset = load_from_disk("/users/zlyu12/Desktop/c2s-RL/Create_Dataset/2024-12-09/combined_hf_dataset")
combined_dataset

Dataset({
    features: ['Context', 'Summary_Dataset', 'Question', 'Answer', 'Keyword', 'full_QA_pair', 'Dataset_Name', 'Publication_URL', 'Dataset_Index', 'Used_Rows'],
    num_rows: 1681
})

In [6]:
# Set random seed for reproducibility
np.random.seed(123)

publication_urls = np.unique(combined_dataset["Publication_URL"])

test_urls = np.random.choice(publication_urls, size=2, replace=False)
publication_urls = np.array([url for url in publication_urls if url not in test_urls])

train_sft_urls = np.random.choice(publication_urls, size=14, replace=False)
publication_urls = np.array([url for url in publication_urls if url not in train_sft_urls])

train_dpo_urls = np.random.choice(publication_urls, size=10, replace=False)
publication_urls = np.array([url for url in publication_urls if url not in train_dpo_urls])

val_sft_urls = np.random.choice(publication_urls, size=4, replace=False)
publication_urls = np.array([url for url in publication_urls if url not in val_sft_urls])

val_dpo_urls = np.random.choice(publication_urls, size=2, replace=False)
publication_urls = np.array([url for url in publication_urls if url not in val_dpo_urls])

# Create splits based on publication URLs
test_dataset = combined_dataset.filter(lambda x: x["Publication_URL"] in test_urls)
train_sft_dataset = combined_dataset.filter(lambda x: x["Publication_URL"] in train_sft_urls)
train_dpo_dataset = combined_dataset.filter(lambda x: x["Publication_URL"] in train_dpo_urls) 
val_sft_dataset = combined_dataset.filter(lambda x: x["Publication_URL"] in val_sft_urls)
val_dpo_dataset = combined_dataset.filter(lambda x: x["Publication_URL"] in val_dpo_urls)

# Print sizes of each split
print(f"Test dataset size: {len(test_dataset)}")
print(f"Train SFT dataset size: {len(train_sft_dataset)}")
print(f"Train DPO dataset size: {len(train_dpo_dataset)}")
print(f"Val SFT dataset size: {len(val_sft_dataset)}")
print(f"Val DPO dataset size: {len(val_dpo_dataset)}")


Filter: 100%|██████████| 1681/1681 [00:00<00:00, 43636.59 examples/s]
Filter: 100%|██████████| 1681/1681 [00:00<00:00, 43435.24 examples/s]
Filter: 100%|██████████| 1681/1681 [00:00<00:00, 42757.75 examples/s]
Filter: 100%|██████████| 1681/1681 [00:00<00:00, 43721.55 examples/s]
Filter: 100%|██████████| 1681/1681 [00:00<00:00, 39564.91 examples/s]

Test dataset size: 77
Train SFT dataset size: 907
Train DPO dataset size: 548
Val SFT dataset size: 111
Val DPO dataset size: 38





In [7]:
dataset_dict = DatasetDict({
    'test': test_dataset,
    'train_sft': train_sft_dataset, 
    'train_dpo': train_dpo_dataset,
    'val_sft': val_sft_dataset,
    'val_dpo': val_dpo_dataset
})

# Save the dataset dictionary to disk
dataset_dict.save_to_disk("/users/zlyu12/Desktop/c2s-RL/Create_Dataset/2024-12-09_hf_dataset_seed123")


Saving the dataset (1/1 shards): 100%|██████████| 77/77 [00:00<00:00, 4613.40 examples/s]


Saving the dataset (1/1 shards): 100%|██████████| 907/907 [00:00<00:00, 12564.52 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 548/548 [00:00<00:00, 10677.54 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 111/111 [00:00<00:00, 3798.82 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 38/38 [00:00<00:00, 3061.42 examples/s]
