In [12]:
import pandas as pd
import tiktoken

# Load the gene markers and biological processes files
gene_markers = pd.read_csv("Cluster16_20.csv")  # Gene markers
biological_processes = pd.read_csv("cell_data_cluster16.csv")  # Biological processes


In [13]:
import pandas as pd
import requests
import xml.etree.ElementTree as ET
from sentence_transformers import SentenceTransformer
import numpy as np
import faiss
import pickle
import time
import logging
from lxml import etree

# Configure logging
logging.basicConfig(level=logging.INFO)

# Constants and configuration
SEARCH_URL = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
FETCH_URL = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
DATE_RANGE = "2022/01/01[PDAT] : 2024/12/31[PDAT]"
SEARCH_TERM = f"peripheral blood mononuclear cell AND ({DATE_RANGE})"
BATCH_SIZE = 250
UNWANTED_PHRASES = [
    "Not applicable.",
    "The authors declare that they have no competing interests.",
]
MODEL_NAME = "all-MiniLM-L6-v2"
FAISS_INDEX_FILE = "article_embeddings.index"
TITLES_FILE = "titles.pkl"
MAX_CHUNK_SIZE = 500

# Initialize SentenceTransformer model
model = SentenceTransformer(MODEL_NAME)

# Helper function to split text into smaller chunks
def chunk_text(text, max_size):
    chunks = []
    words = text.split()
    while len(words) > max_size:
        chunk = " ".join(words[:max_size])
        chunks.append(chunk)
        words = words[max_size:]
    chunks.append(" ".join(words))
    return chunks

# Function to validate and debug XML
def validate_and_debug_xml(combined_articles):
    logging.info(f"Validating combined XML... Size: {len(combined_articles)} characters")
    articles = combined_articles.split('<?xml')  # Split content by XML declaration
    logging.info(f"Found {len(articles)} potential XML documents.")

    for i, article in enumerate(articles):
        if not article.strip():
            continue
        article = f"<?xml{article}"
        try:
            ET.fromstring(article)
        except ET.ParseError as e:
            logging.error(f"Error parsing article {i}: {e}")
            with open(f"problematic_article_{i}.xml", "w", encoding="utf-8") as f:
                f.write(article)

    wrapped_content = f"<root>{combined_articles}</root>"
    try:
        return etree.fromstring(wrapped_content, parser=etree.XMLParser(recover=True))
    except etree.XMLSyntaxError as e:
        logging.error(f"Failed to parse XML content: {e}")
        return None

# Step 1: Search for PMIDs
logging.info("Searching for PMIDs...")
search_params = {
    "db": "pmc",
    "term": SEARCH_TERM,
    "retmax": 1000,
    "retmode": "json",
}
response = requests.get(SEARCH_URL, params=search_params)
response.raise_for_status()
pmids = response.json().get("esearchresult", {}).get("idlist", [])

if not pmids:
    logging.error("No PMIDs found! Exiting.")
    exit()

logging.info(f"Found {len(pmids)} PMIDs.")

# Step 2: Fetch articles
logging.info("Fetching articles in batches...")
all_articles = []
for i in range(0, len(pmids), BATCH_SIZE):
    batch_pmids = pmids[i:i + BATCH_SIZE]
    fetch_params = {
        "db": "pmc",
        "id": ",".join(batch_pmids),
        "retmode": "xml",
        "rettype": "full",
    }
    response = requests.get(FETCH_URL, params=fetch_params)
    response.raise_for_status()
    all_articles.append(response.text)
    time.sleep(0.5)

combined_articles = "".join(all_articles)

# Step 3: Validate XML
logging.info("Validating and parsing XML...")
root = validate_and_debug_xml(combined_articles)
if root is None:
    logging.error("XML validation failed. Exiting.")
    exit()

# Step 4: Parse articles
logging.info("Parsing articles...")
articles_dict = {}
for i, article in enumerate(root.findall(".//article")):
    try:
        title_elem = article.find(".//article-title")
        title = title_elem.text.strip() if title_elem is not None and title_elem.text else f"Article_{i}"

        paragraphs = [
            p.text.strip() for p in article.findall(".//p")
            if p.text and p.text.strip() not in UNWANTED_PHRASES
        ]

        chunked_paragraphs = []
        for paragraph in paragraphs:
            chunked_paragraphs.extend(chunk_text(paragraph, MAX_CHUNK_SIZE))

        if title and chunked_paragraphs:
            articles_dict[title] = chunked_paragraphs
    except Exception as e:
        logging.error(f"Error parsing article {i}: {e}")

logging.info(f"Extracted {len(articles_dict)} articles.")

# Step 5: Save articles
output_file = "dictionary_filtered_articles.txt"
with open(output_file, "w", encoding="utf-8") as file:
    for title, paragraphs in articles_dict.items():
        file.write(f"Title: {title}\n")
        file.write("Paragraphs:\n")
        for paragraph in paragraphs:
            file.write(f"{paragraph}\n")
        file.write("=" * 50 + "\n")

logging.info(f"Saved filtered articles to {output_file}.")

# Step 6: Generate embeddings
logging.info("Generating embeddings...")
embeddings = {
    title: model.encode(f"{title}. " + " ".join(paragraphs))
    for title, paragraphs in articles_dict.items()
}

titles = list(embeddings.keys())
embedding_matrix = np.array(list(embeddings.values()))
normalized_matrix = embedding_matrix / np.linalg.norm(embedding_matrix, axis=1, keepdims=True)

# Step 7: Save FAISS index
logging.info("Saving FAISS index...")
dimension = normalized_matrix.shape[1]
index = faiss.IndexFlatIP(dimension)
index.add(normalized_matrix)

faiss.write_index(index, FAISS_INDEX_FILE)
with open(TITLES_FILE, "wb") as f:
    pickle.dump(titles, f)

logging.info(f"Saved FAISS index to {FAISS_INDEX_FILE} and titles to {TITLES_FILE}.")


INFO:sentence_transformers.SentenceTransformer:Use pytorch device_name: mps
INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: all-MiniLM-L6-v2
INFO:root:Searching for PMIDs...
INFO:root:Found 1000 PMIDs.
INFO:root:Fetching articles in batches...
INFO:root:Validating and parsing XML...
INFO:root:Validating combined XML... Size: 151665969 characters
INFO:root:Found 5 potential XML documents.
INFO:root:Parsing articles...
INFO:root:Extracted 995 articles.
INFO:root:Saved filtered articles to dictionary_filtered_articles.txt.
INFO:root:Generating embeddings...
Batches: 100%|██████████| 1/1 [00:00<00:00,  1.35it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 26.07it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 24.56it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 33.17it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 17.07it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 22.48it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 40.81it/s]
Batches: 100%|████

In [14]:
from openai import OpenAI
from sentence_transformers import SentenceTransformer
import numpy as np
import faiss
import pickle
import tiktoken
import logging

# Set the API key and model name for OpenAI
client = OpenAI(
    api_key="YourAPIkey",
)

# Load the FAISS index and article titles
FAISS_INDEX_FILE = "article_embeddings.index"
TITLES_FILE = "titles.pkl"

# Load the FAISS index and titles from file
index = faiss.read_index(FAISS_INDEX_FILE)
with open(TITLES_FILE, "rb") as f:
    titles = pickle.load(f)

# Initialize SentenceTransformer model
model = SentenceTransformer("all-MiniLM-L6-v2")

# Function to query FAISS index
def query_articles(query_text, k=4):
    logging.info(f"Querying FAISS index with: {query_text}")
    query_embedding = model.encode(query_text)
    query_embedding = query_embedding / np.linalg.norm(query_embedding)  # Normalize
    distances, indices = index.search(np.array([query_embedding]), k)

    results = []
    for idx, dist in zip(indices[0], distances[0]):
        if dist > 0.40:  # Filter by similarity threshold
            title = titles[idx]
            content = articles_dict.get(title, ["No content available"])  # Retrieve content
            
            # Log content for debugging
            if content == ["No content available"]:
                logging.warning(f"No content available for title: {title}")
            else:
                logging.info(f"Retrieved content for title: {title}, content length: {len(content)}")

            results.append((title, dist, content))
    
    return results

# Construct the prompt for GPT-4
prompt = "Identify the cell type of human peripheral blood mononuclear cells using the following genes, biological processes, and PMC journal articles:\n\n"

# Iterate over clusters to construct the prompt
for cluster in gene_markers.columns:
    # Get the top genes for the cluster
    genes = gene_markers[cluster].dropna().tolist()
    # Convert genes to strings (if they are integers)
    genes = [str(gene) for gene in genes]
    
    # Get the biological processes for the cluster
    if cluster in biological_processes.columns:
        bio_processes = biological_processes[cluster].dropna().tolist()
        bio_process_text = ", ".join(bio_processes)
    else:
        bio_process_text = "No biological processes available"

    # Add cluster information to the prompt
    prompt += f"Cluster: {cluster}\n"
    prompt += f"Top Genes: {', '.join(genes)}\n"
    prompt += f"Biological Processes: {bio_process_text}\n\n"

# Get relevant articles based on the user's query
example_query = "immune cell"  # Adjust based on the query
retrieved_articles = query_articles(example_query)

prompt += "\nAdditional Context from PubMed Articles:\n"
for title, score, content in retrieved_articles:
    prompt += f"Title: {title}, Score: {score}\n"
    prompt += f"Content:\n"
    for paragraph in content:
        prompt += f"{paragraph}\n"
    prompt += "\n"  # Add a blank line between articles

# Tokenizer for GPT-4
tokenizer = tiktoken.encoding_for_model("gpt-4o")
token_count = len(tokenizer.encode(prompt))
print(f"Prompt uses {token_count} tokens.")

# Query GPT using the OpenAI API (chat-based method)
response = client.chat.completions.create(
    model="gpt-4o",
    messages=[{"role": "user", "content": prompt}],
    max_tokens=16000,  # Adjust based on the expected response length
    temperature=0.3,  # Adjust for creativity
    top_p=0.9
)


INFO:sentence_transformers.SentenceTransformer:Use pytorch device_name: mps
INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: all-MiniLM-L6-v2
INFO:root:Querying FAISS index with: immune cell
Batches: 100%|██████████| 1/1 [00:00<00:00,  7.05it/s]
INFO:root:Retrieved content for title: Cancer Immunotherapy Using AIRE Conditioning of the Tumor Epitopeome, content length: 67
INFO:root:Retrieved content for title: T lymphocyte-dependent IL-10 down-regulates a cytokine storm driven by, content length: 64
INFO:root:Retrieved content for title: Insights into Reproductive Immunology and Placental Pathology, content length: 19
INFO:root:Retrieved content for title: Mouse innate resistance to, content length: 35


Prompt uses 8618 tokens.


INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


In [15]:
print(response.choices[0].message.content.strip())

Based on the provided information, the cluster of genes and the context from the PubMed articles, the cell type of the human peripheral blood mononuclear cells (PBMCs) that is most likely being described is hematopoietic stem and progenitor cells (HSPCs), particularly those related to early hematopoietic lineage commitment.

Here's the reasoning:

1. **Gene Expression**: The top genes listed, such as CD34, GATA2, and LYL1, are commonly associated with hematopoietic stem and progenitor cells. CD34 is a well-known marker for hematopoietic stem cells and progenitors. GATA2 is a transcription factor critical for hematopoiesis, and LYL1 is involved in early hematopoietic lineage commitment.

2. **Cluster Marker**: CDK6 is involved in cell cycle regulation and is expressed in various cell types, including hematopoietic progenitors. Its presence suggests a role in cell proliferation, which is consistent with progenitor cells.

3. **Lack of Specific Biological Processes**: The absence of speci