In [3]:
import pandas as pd
import tiktoken

# Load the gene markers and biological processes files, specifying index column
gene_markers = pd.read_csv("cluster_gene_info_10.csv")  # Gene markers
biological_processes = pd.read_csv("GO_cluster_10.csv")  # Biological processes

# Initialize the prompt
prompt = "You are a bioinformatics expert. Analyze the following cluster information:\n\n"

# Iterate over clusters to construct the prompt
for cluster in gene_markers.columns:
    # Get the top genes for the cluster
    genes = gene_markers[cluster].dropna().tolist()
    
    # Convert genes to strings (if they are integers)
    genes = [str(gene) for gene in genes]
    
    # Get the biological processes for the cluster
    if cluster in biological_processes.columns:
        bio_processes = biological_processes[cluster].dropna().tolist()
        bio_process_text = ", ".join(bio_processes)
    else:
        bio_process_text = "No biological processes available"

    # Add cluster information to the prompt
    prompt += f"Cluster: {cluster}\n"
    prompt += f"Top Genes: {', '.join(genes)}\n"
    prompt += f"Biological Processes: {bio_process_text}\n\n"

# Print the prompt
print(prompt)






You are a bioinformatics expert. Analyze the following cluster information:

Cluster: Cluster_0
Top Genes: AW112010, Plac8, Ly6a, S100a11, S100a10, Ccl5, Actg1, S100a6, Glrx, Lgals1, Ctla2a, Cxcr6, Id2, Actb, Pglyrp1, Nkg7, Anxa2, Ccr5, Gzmk, Cxcr3, Ahnak
Biological Processes: GO:0002376~immune system process, GO:0045087~innate immune response, GO:0006954~inflammatory response, GO:0009615~response to virus, GO:0010628~positive regulation of gene expression, GO:0009617~response to bacterium, GO:0043065~positive regulation of apoptotic process, GO:0031640~killing of cells of another organism, GO:0042832~defense response to protozoan, GO:0045071~negative regulation of viral genome replication, GO:0045893~positive regulation of DNA-templated transcription, GO:0060326~cell chemotaxis, GO:0007204~positive regulation of cytosolic calcium ion concentration, GO:0008284~positive regulation of cell population proliferation, GO:0071346~cellular response to type II interferon, GO:1905686~positive r

In [2]:
print(gene_markers)

   Cluster_0
0   AW112010
1      Plac8
2       Ly6a
3    S100a11
4    S100a10
5       Ccl5
6      Actg1
7     S100a6
8       Glrx
9     Lgals1
10    Ctla2a
11     Cxcr6
12       Id2
13      Actb
14   Pglyrp1
15      Nkg7
16     Anxa2
17      Ccr5
18      Gzmk
19     Cxcr3
20     Ahnak


In [3]:
# Tokenizer for a specific model (e.g., "gpt-4")
tokenizer = tiktoken.encoding_for_model("gpt-4")

# Count tokens in the prompt
token_count = len(tokenizer.encode(prompt))
print(f"Prompt uses {token_count} tokens.")

Prompt uses 482 tokens.


In [4]:
print(gene_markers)

   Cluster_0
0   AW112010
1      Plac8
2       Ly6a
3    S100a11
4    S100a10
5       Ccl5
6      Actg1
7     S100a6
8       Glrx
9     Lgals1
10    Ctla2a
11     Cxcr6
12       Id2
13      Actb
14   Pglyrp1
15      Nkg7
16     Anxa2
17      Ccr5
18      Gzmk
19     Cxcr3
20     Ahnak


In [5]:
print(biological_processes)

                                            Cluster_0
0                    GO:0002376~immune system process
1                   GO:0045087~innate immune response
2                    GO:0006954~inflammatory response
3                        GO:0009615~response to virus
4   GO:0010628~positive regulation of gene expression
5                    GO:0009617~response to bacterium
6   GO:0043065~positive regulation of apoptotic pr...
7     GO:0031640~killing of cells of another organism
8            GO:0042832~defense response to protozoan
9   GO:0045071~negative regulation of viral genome...
10  GO:0045893~positive regulation of DNA-template...
11                         GO:0060326~cell chemotaxis
12  GO:0007204~positive regulation of cytosolic ca...
13  GO:0008284~positive regulation of cell populat...
14  GO:0071346~cellular response to type II interf...
15  GO:1905686~positive regulation of plasma membr...
16  GO:0071222~cellular response to lipopolysaccha...
17                          

In [6]:
print(gene_markers.columns)
print(biological_processes.columns)

Index(['Cluster_0'], dtype='object')
Index(['Cluster_0'], dtype='object')


In [7]:
from openai import OpenAI

# Instantiate the OpenAI client with the API key
client = OpenAI(api_key="YourAPIKey")

# Create a completion using the Davinci model
completion = client.completions.create(model="davinci-002", prompt="Hello world")

# Create a chat completion using the GPT-3.5 Turbo model
chat_completion = client.chat.completions.create(model="gpt-3.5-turbo", messages=[{"role": "user", "content": "Hello world"}])

print(completion)


Completion(id='cmpl-AYFl7dDGn58nfFnTXzwBBxBpG3cfO', choices=[CompletionChoice(finish_reason='length', index=0, logprobs=None, text='! Guys, im having a little trouble here.today i changed to a wayland')], created=1732727093, model='davinci-002', object='text_completion', system_fingerprint=None, usage=CompletionUsage(completion_tokens=16, prompt_tokens=2, total_tokens=18, completion_tokens_details=None, prompt_tokens_details=None))


In [6]:
from openai import OpenAI
## Set the API key and model name
client = OpenAI(
    api_key = "YourAPIKey",
)
# Define the prompt for the GPT model
# Initialize the prompt
prompt = "You are a genious bioinformatics expert. Analyze the following cluster information, biological processes and infer a cell type:\n\n"

# Iterate over clusters to construct the prompt
for cluster in gene_markers.columns:
    # Get the top genes for the cluster
    genes = gene_markers[cluster].dropna().tolist()
    
    # Convert genes to strings (if they are integers)
    genes = [str(gene) for gene in genes]
    
    # Get the biological processes for the cluster
    if cluster in biological_processes.columns:
        bio_processes = biological_processes[cluster].dropna().tolist()
        bio_process_text = ", ".join(bio_processes)
    else:
        bio_process_text = "No biological processes available"

    # Add cluster information to the prompt
    prompt += f"Cluster: {cluster}\n"
    prompt += f"Top Genes: {', '.join(genes)}\n"
    prompt += f"Biological Processes: {bio_process_text}\n\n"
 
# Tokenizer for a specific model (e.g., "gpt-4")
tokenizer = tiktoken.encoding_for_model("gpt-4o")
 
# Count tokens in the prompt
token_count = len(tokenizer.encode(prompt))
print(f"Prompt uses {token_count} tokens.")
 
# Query GPT using the new OpenAI API (chat-based method)
response = client.chat.completions.create(
    model="gpt-4o",  # Use "gpt-4" if you have access, or "gpt-3.5" for earlier models
    messages=[{"role": "user", "content": prompt}],
    max_tokens=2000,  # Adjust based on the expected response length
    temperature=0.3,  # Adjust for creativity
    top_p=0.9
)
 


Prompt uses 490 tokens.


In [7]:
print(response.choices[0].message.content.strip())

Based on the provided cluster information and biological processes, we can infer the likely cell type for Cluster_0.

### Top Genes:
- **AW112010, Plac8, Ly6a, S100a11, S100a10, Ccl5, Actg1, S100a6, Glrx, Lgals1, Ctla2a, Cxcr6, Id2, Actb, Pglyrp1, Nkg7, Anxa2, Ccr5, Gzmk, Cxcr3, Ahnak**

### Biological Processes:
- **Immune system process, innate immune response, inflammatory response, response to virus, response to bacterium, killing of cells of another organism, defense response to protozoan, negative regulation of viral genome replication, cell chemotaxis, cellular response to type II interferon, cellular response to lipopolysaccharide, inflammatory response to antigenic stimulus, defense response to virus, granzyme-mediated programmed cell death signaling pathway, apoptotic process**

### Analysis:
1. **Immune System and Inflammatory Response:** The presence of processes related to the immune system, innate immune response, and inflammatory response suggests that this cluster is re