<a href="https://colab.research.google.com/github/edudati/llmGeneontology/blob/main/UniProt.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [23]:
import requests
import pandas as pd
import json
import time
from google.colab import files


# Gene list (example)
genes = ['FXN', 'ATM', 'INVALIDGENE']
organism = '9606'  # Homo sapiens

# Function to get UniProt ID
def get_uniprot_id(gene):
    url = f"https://rest.uniprot.org/uniprotkb/search?query=gene:{gene}+AND+organism_id:{organism}+AND+reviewed:true&format=json&size=1"
    response = requests.get(url)
    time.sleep(0.2)
    if response.status_code == 200:
        data = response.json()
        if data.get("results"):
            return data["results"][0]["primaryAccession"]
        else:
            return "Not found"
    else:
        return "Error"

# Get all genes
results = {gene: get_uniprot_id(gene) for gene in genes}

# Convert to DataFrame
df = pd.DataFrame(list(results.items()), columns=["gene_name", "uniProt_id"])

# Save to Excel
df.to_excel("output-01_uniprot_ids.xlsx", index=False)

files.download("output-01_uniprot_ids.xlsx")

df.head()

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Unnamed: 0,gene_name,uniProt_id
0,FXN,Q16595
1,ATM,Q13315
2,INVALIDGENE,Not found


Um únido ProtId pode ter vários GO names porque o mesmo gene pode ter múltiplas funções moleculares, participar de vários processos biológicos e/ou estar presente em vários processos biológicos. As definições dos GO Ids é que serão usadas para retirarmos as palavras chave e para compararmos a performance.

In [24]:

# Função para buscar GO IDs para um UniProt ID
def get_go_ids(uniprot_id):
    if uniprot_id in ["Not found", "Error"]:
        return "Not found"

    url = f"https://www.ebi.ac.uk/QuickGO/services/annotation/search?geneProductId=UniProtKB:{uniprot_id}&limit=100"
    response = requests.get(url)
    time.sleep(0.2)
    if response.status_code == 200:
        data = response.json()
        go_ids = set()
        for result in data.get("results", []):
            go_id = result.get("goId")
            if go_id:
                go_ids.add(go_id)
        return list(go_ids)
    else:
        return "Error"

# Aplicar no DataFrame
df["go_ids"] = df["uniProt_id"].apply(get_go_ids)

# Save to Excel
df.to_excel("output-02_go_ids.xlsx", index=False)

files.download("output-02_go_ids.xlsx")


# Exibir exemplo
df.head()


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Unnamed: 0,gene_name,uniProt_id,go_ids
0,FXN,Q16595,"[GO:0046716, GO:0016540, GO:0090201, GO:000575..."
1,ATM,Q13315,"[GO:0008340, GO:0007420, GO:0003677, GO:000016..."
2,INVALIDGENE,Not found,Not found


In [25]:
def get_go_definition(go_id):
    url = f"https://www.ebi.ac.uk/QuickGO/services/ontology/go/terms/{go_id}"
    response = requests.get(url)
    time.sleep(0.2)
    if response.status_code == 200:
        data = response.json()
        results = data.get("results", [])
        if results:
            name = results[0].get("name", "")
            definition = results[0].get("definition", {}).get("text", "")
            return {"goId": go_id, "name": name, "definition": definition}
    return {"goId": go_id, "name": "", "definition": ""}



def get_all_definitions(go_ids):
    if go_ids == "Not found" or not isinstance(go_ids, list):
        return "Not found"
    return [get_go_definition(go_id) for go_id in go_ids]


df["go_terms"] = df["go_ids"].apply(get_all_definitions)

rows = []

# Transformar a estrutura expandida
for _, row in df.iterrows():
    gene = row["gene_name"]
    uniprot = row["uniProt_id"]
    terms = row["go_terms"]

    if terms == "Not found":
        rows.append({
            "gene_name": gene,
            "uniProt_id": uniprot,
            "goId": "Not found",
            "name": "",
            "definition": ""
        })
    else:
        for term in terms:
            rows.append({
                "gene_name": gene,
                "uniProt_id": uniprot,
                "goId": term["goId"],
                "name": term["name"],
                "definition": term["definition"]
            })

# Criar novo DataFrame expandido
expanded_df = pd.DataFrame(rows)

# Exportar para Excel
expanded_df.to_excel("output-03_go_terms_expanded.xlsx", index=False)

files.download("output-03_go_terms_expanded.xlsx")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [26]:
!pip install nltk
import nltk
nltk.download('punkt')
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
import string

tokenizer = RegexpTokenizer(r'\w+')

stop_words = set(stopwords.words('english'))

def extract_keywords(text):
    if not isinstance(text, str):
        return []
    words = tokenizer.tokenize(text.lower())
    keywords = [
        word for word in words
        if word.isalpha() and word not in stop_words
    ]
    return list(set(keywords))

expanded_df["keywords"] = expanded_df["definition"].apply(extract_keywords)

# Exportar para Excel
expanded_df.to_excel("output-04_expanded_with_keywords.xlsx", index=False)

files.download("output-04_expanded_with_keywords.xlsx")





[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Formato condensado por gene

In [27]:
# Agrupar por gene e consolidar os dados
grouped = expanded_df.groupby(["gene_name", "uniProt_id"])

summary_rows = []

for (gene, uniprot), group in grouped:
    go_ids = list(group["goId"].dropna().unique())

    # Somar todas as palavras-chave e remover duplicatas
    all_keywords = set()
    for kw_list in group["keywords"]:
        if isinstance(kw_list, list):
            all_keywords.update(kw_list)

    summary_rows.append({
        "gene_name": gene,
        "uniProt_id": uniprot,
        "go_ids": go_ids,
        "keywords": list(all_keywords)
    })

# Criar DataFrame final
summary_df = pd.DataFrame(summary_rows)

# Exportar para Excel
summary_df.to_excel("output-05_gene_summary.xlsx", index=False)

# Baixar no Colab
files.download("output-05_gene_summary.xlsx")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>