<a href="https://colab.research.google.com/github/edudati/LLMsGeneOntology/blob/main/functionDescriptionUniProt.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [22]:
# --- Parte 1: Imports e configurações ---

import requests
import pandas as pd
import time

# Instalar e configurar NLTK
!pip install nltk
import nltk
nltk.download('punkt')
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer

# Shared tokenizer: every match of the r'\w+' pattern becomes one token.
tokenizer = RegexpTokenizer(r'\w+')
# NLTK's English stop-word list, held as a set for fast membership tests.
stop_words = set(stopwords.words('english'))





[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [23]:
# --- Parte 2: Funções ---

# 2.1 Pegar o Uniprot Id a partir do nome do gene
def get_uniprot_id(gene):
    """Look up the UniProtKB accession of a gene via the UniProt search API.

    The search is restricted to reviewed entries with ``organism_id:9606``
    and only the first hit is requested (``size=1``).

    Args:
        gene: Gene symbol to search for (e.g. ``"FXN"``).

    Returns:
        The ``primaryAccession`` of the first result, or the sentinel string
        ``"Not found"`` when there is no hit, the server does not answer with
        HTTP 200, the request itself fails, or the body is not valid JSON.
    """
    # Let requests build the query string so that symbols containing spaces
    # or reserved characters are percent-encoded instead of corrupting the URL.
    params = {
        "query": f"gene:{gene} AND organism_id:9606 AND reviewed:true",
        "format": "json",
        "size": 1,
    }
    try:
        # Without a timeout a single stalled connection blocks the whole batch.
        response = requests.get(
            "https://rest.uniprot.org/uniprotkb/search",
            params=params,
            timeout=30,
        )
    except requests.RequestException:
        return "Not found"
    finally:
        # Short pause after every call so the public API is not hammered.
        time.sleep(0.2)

    if response.status_code != 200:
        return "Not found"
    try:
        results = response.json().get("results")
    except ValueError:
        # Body was not JSON (e.g. an HTML error page served with status 200).
        return "Not found"
    if results:
        return results[0].get("primaryAccession", "Not found")
    return "Not found"


In [24]:
# --- Parte 2: Funções ---

# 2.2 Pegar a função do gene descrita no Uniprot
def get_function_description(uniprot_id):
    """Fetch the free-text FUNCTION annotation of a UniProtKB entry.

    Args:
        uniprot_id: UniProtKB accession, or one of the sentinel strings
            ``"Not found"`` / ``"Error"`` produced by the other lookups.

    Returns:
        The ``value`` strings of the first FUNCTION comment that carries any
        text, joined with single spaces. Returns ``"Not found"`` when the
        accession is a sentinel, the request fails or is not answered with
        HTTP 200, the body is not valid JSON, or no FUNCTION comment has text.
    """
    # Sentinels are not accessions; skip the network round trip for them.
    if uniprot_id in ("Not found", "Error"):
        return "Not found"

    url = f"https://rest.uniprot.org/uniprotkb/{uniprot_id}.json"
    try:
        # Without a timeout a single stalled connection blocks the whole batch.
        response = requests.get(url, timeout=30)
    except requests.RequestException:
        return "Not found"
    finally:
        # Short pause after every call so the public API is not hammered.
        time.sleep(0.2)

    if response.status_code != 200:
        return "Not found"
    try:
        comments = response.json().get("comments", [])
    except ValueError:
        return "Not found"

    for comment in comments:
        if comment.get("commentType") != "FUNCTION":
            continue
        description = " ".join(
            text["value"] for text in comment.get("texts", []) if "value" in text
        )
        # An empty FUNCTION comment is no description at all: keep looking
        # rather than reporting a blank string as a successful result.
        if description:
            return description
    return "Not found"

In [25]:
# --- Parte 2: Funções ---

# 2.3 Pegar os GO Ids a partir do Uniprot Id
def get_go_ids(uniprot_id):
    """Collect the distinct GO ids annotated to a UniProtKB accession.

    Queries the QuickGO annotation search service 100 annotations at a time
    and follows the ``pageInfo`` block of each response until the last page,
    so proteins with more than 100 annotations are not silently truncated.

    Args:
        uniprot_id: UniProtKB accession, or one of the sentinel strings
            ``"Not found"`` / ``"Error"``.

    Returns:
        A sorted list of unique GO ids (possibly empty) on success,
        ``"Not found"`` when ``uniprot_id`` is a sentinel, or ``"Error"`` when
        any page cannot be fetched or decoded. Callers distinguish success
        from failure with ``isinstance(result, list)``.
    """
    if uniprot_id in ("Not found", "Error"):
        return "Not found"

    url = "https://www.ebi.ac.uk/QuickGO/services/annotation/search"
    go_ids = set()
    page = 1
    while True:
        params = {
            "geneProductId": f"UniProtKB:{uniprot_id}",
            "limit": 100,
            "page": page,
        }
        try:
            # Without a timeout a single stalled connection blocks the batch.
            response = requests.get(url, params=params, timeout=30)
        except requests.RequestException:
            return "Error"
        finally:
            # Short pause after every call so the public API is not hammered.
            time.sleep(0.2)

        # A failure on any page yields "Error" rather than a partial list
        # that would look complete to the caller.
        if response.status_code != 200:
            return "Error"
        try:
            data = response.json()
        except ValueError:
            return "Error"

        for result in data.get("results", []):
            go_id = result.get("goId")
            if go_id:
                go_ids.add(go_id)

        # Stop when this was the last page, or when the response carries no
        # usable page count (then behave like a single-page query).
        total_pages = (data.get("pageInfo") or {}).get("total")
        if not isinstance(total_pages, int) or page >= total_pages:
            break
        page += 1

    # Sorted so repeated runs export the ids in the same order.
    return sorted(go_ids)


In [26]:
# --- Parte 2: Funções ---

# 2.4 Pegar as definições para os GO Ids
def get_go_definition(go_id):
    """Fetch name, definition text and aspect of a GO term from QuickGO.

    Args:
        go_id: GO term identifier (e.g. ``"GO:0005739"``).

    Returns:
        A dict with the keys ``goId``, ``name``, ``definition`` and
        ``aspect``. ``goId`` always echoes the argument; the other three are
        empty strings when the term cannot be retrieved (request failure,
        non-200 status, invalid JSON, no results) or when the field is absent.
    """
    blank = {"goId": go_id, "name": "", "definition": "", "aspect": ""}

    url = f"https://www.ebi.ac.uk/QuickGO/services/ontology/go/terms/{go_id}"
    try:
        # Without a timeout a single stalled connection blocks the whole batch.
        response = requests.get(url, timeout=30)
    except requests.RequestException:
        return blank
    finally:
        # Short pause after every call so the public API is not hammered.
        time.sleep(0.2)

    if response.status_code != 200:
        return blank
    try:
        results = response.json().get("results", [])
    except ValueError:
        return blank
    if not results:
        return blank

    term = results[0]
    # "definition" may be present but null; `or {}` covers that case, which a
    # plain .get("definition", {}) default does not.
    definition = term.get("definition") or {}
    return {
        "goId": go_id,
        "name": term.get("name", ""),
        "definition": definition.get("text", ""),
        "aspect": term.get("aspect", ""),
    }



In [27]:
# --- Parte 2: Funções ---

# 2.5 Extract keywords from a text
def extract_keywords(text):
    """Return the distinct content words of ``text`` in alphabetical order.

    The text is lower-cased and split with the module-level ``tokenizer``;
    tokens that are not purely alphabetic or that appear in the module-level
    ``stop_words`` set are discarded.

    Args:
        text: Text to analyse. Any non-string value yields an empty list.

    Returns:
        A sorted list of unique keywords. Sorting makes the result identical
        across kernel restarts, which plain set iteration order is not.
    """
    if not isinstance(text, str):
        return []
    words = tokenizer.tokenize(text.lower())
    return sorted({word for word in words if word.isalpha() and word not in stop_words})


In [28]:
# --- Parte 3: Execução ---

genes = ['FXN', 'ATM', 'INVALIDGENE']  # Replace with the real gene list

# Rows for spreadsheet 1: one summary row per gene
summary_rows = []

# Rows for spreadsheet 2: one row per (gene, GO term) pair
go_rows = []

# GO term details already fetched, keyed by GO id, so a term shared by
# several genes costs one QuickGO request instead of one per gene.
go_definition_cache = {}

for gene in genes:
    uniprot_id = get_uniprot_id(gene)
    function_desc = get_function_description(uniprot_id)

    # Extract keywords. "Not found" is a sentinel, not a description:
    # tokenising it would report the bogus keyword "found" for missing genes.
    if function_desc == "Not found":
        function_keywords = []
    else:
        # sorted() keeps the exported column stable between runs.
        function_keywords = sorted(extract_keywords(function_desc))

    go_ids = get_go_ids(uniprot_id)

    # Add to spreadsheet 1
    summary_rows.append({
        "gene_name": gene,
        "uniProt_id": uniprot_id,
        "function_description": function_desc,
        "function_keywords": ", ".join(function_keywords),
        "go_ids_list": go_ids if isinstance(go_ids, list) else []
    })

    # Add to spreadsheet 2
    if isinstance(go_ids, list):
        for go_id in go_ids:
            go = go_definition_cache.get(go_id)
            if go is None:
                go = get_go_definition(go_id)
                # Only remember lookups that returned a term name, so a
                # failed lookup is retried for the next gene.
                if go["name"]:
                    go_definition_cache[go_id] = go
            go_rows.append({
                "gene_name": gene,
                "uniProt_id": uniprot_id,
                "goId": go["goId"],
                "goAspect": go["aspect"],
                "goName": go["name"],
                "goDefinition": go["definition"]
            })
    else:
        # get_go_ids returned a sentinel string: keep one placeholder row so
        # the gene still appears in the expanded sheet.
        go_rows.append({
            "gene_name": gene,
            "uniProt_id": uniprot_id,
            "goId": "Not found",
            "goAspect": "",
            "goName": "",
            "goDefinition": ""
        })


In [29]:
# --- Parte 4: Exportar ---

summary_df = pd.DataFrame(summary_rows)
go_df = pd.DataFrame(go_rows)

# Each table paired with the workbook it is written to.
excel_exports = [
    (summary_df, "output-01_summary_by_gene.xlsx"),
    (go_df, "output-02_go_terms_expanded.xlsx"),
]

# Write both workbooks first ...
for frame, workbook_name in excel_exports:
    frame.to_excel(workbook_name, index=False)

# ... then hand them to the browser through the Colab download helper.
from google.colab import files
for _, workbook_name in excel_exports:
    files.download(workbook_name)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [34]:
# --- Parte 5: Análise linguística ---

# Instalar bibliotecas necessárias
!pip install nltk spacy
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger_eng')


import spacy
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.util import ngrams
from nltk import pos_tag
from nltk.tokenize import RegexpTokenizer

# Load the spaCy model (used below for named entities and for POS tags)
nlp = spacy.load("en_core_web_sm")

# Shared helpers used by the processing functions and the loop below
lemmatizer = WordNetLemmatizer()
tokenizer = RegexpTokenizer(r'\w+')
stop_words = set(stopwords.words('english'))

# Simular df_func se necessário
# df_func = pd.read_excel("output-01_Function-Description-with-Keywords.xlsx")

# Funções de processamento
def lemmatise_text(text):
    """Return the distinct lemmas of the content words in ``text``.

    The text is lower-cased and tokenised; purely alphabetic tokens that are
    not stop words are lemmatised with the module-level ``lemmatizer``.
    Non-string input yields an empty list. The order of the result is
    unspecified.
    """
    if isinstance(text, str):
        unique_lemmas = set()
        for token in tokenizer.tokenize(text.lower()):
            if token.isalpha() and token not in stop_words:
                unique_lemmas.add(lemmatizer.lemmatize(token))
        return list(unique_lemmas)
    return []

def get_pos_tags(text):
    """Tokenise ``text`` (case preserved) and return ``pos_tag`` of the tokens.

    Non-string input yields an empty list.
    """
    return pos_tag(tokenizer.tokenize(text)) if isinstance(text, str) else []

def get_bigrams(text):
    """Return the space-joined bigrams of the content words in ``text``.

    The text is lower-cased and tokenised, tokens that are not purely
    alphabetic or that are stop words are dropped, and consecutive pairs of
    the remaining tokens are joined with a space, in order of appearance
    (duplicates kept). Non-string input yields an empty list.
    """
    if isinstance(text, str):
        content_tokens = []
        for token in tokenizer.tokenize(text.lower()):
            if token.isalpha() and token not in stop_words:
                content_tokens.append(token)
        return [" ".join(pair) for pair in ngrams(content_tokens, 2)]
    return []

def get_named_entities(text):
    """Return the distinct entity texts that the ``nlp`` pipeline finds in ``text``.

    Non-string input yields an empty list. The order of the result is
    unspecified.
    """
    if isinstance(text, str):
        return list({entity.text for entity in nlp(text).ents})
    return []

def extract_pos(text):
    """Return ``(verbs, adjectives)`` found in ``text`` by the NLTK tagger.

    The text is lower-cased and tokenised before tagging. Words whose tag
    starts with "VB" count as verbs, those whose tag starts with "JJ" as
    adjectives. Each list holds distinct words in unspecified order;
    non-string input yields two empty lists.
    """
    if not isinstance(text, str):
        return [], []

    verb_set = set()
    adjective_set = set()
    for word, tag in pos_tag(tokenizer.tokenize(text.lower())):
        if tag.startswith("VB"):
            verb_set.add(word)
        if tag.startswith("JJ"):
            adjective_set.add(word)
    return list(verb_set), list(adjective_set)

def extract_pos_spacy(text):
    """Return ``(verbs, adjectives)`` found in ``text`` by the ``nlp`` pipeline.

    Tokens whose ``pos_`` is "VERB" count as verbs, those whose ``pos_`` is
    "ADJ" as adjectives; the original token text is kept (no lower-casing).
    Each list holds distinct words in unspecified order; non-string input
    yields two empty lists.
    """
    if not isinstance(text, str):
        return [], []

    verb_set = set()
    adjective_set = set()
    for token in nlp(text):
        if token.pos_ == "VERB":
            verb_set.add(token.text)
        elif token.pos_ == "ADJ":
            adjective_set.add(token.text)
    return list(verb_set), list(adjective_set)


# Apply to the DataFrame
# df_func was never assigned anywhere in this notebook, so this cell raised a
# NameError on a fresh kernel. Bind it to the per-gene summary table built in
# Part 4; replace this assignment to analyse a table loaded from disk instead.
df_func = summary_df

linguistic_rows = []

for _, row in df_func.iterrows():
    desc = row["function_description"]
    # "Not found" is the lookup sentinel, not prose: analysing it would
    # report "found" as a keyword for every gene without a description.
    text = desc if isinstance(desc, str) and desc != "Not found" else None

    tokens = tokenizer.tokenize(text.lower()) if text is not None else []
    content_words = [w for w in tokens if w.isalpha() and w not in stop_words]
    # The helper returns two empty lists when it is given None.
    verbs, adjs = extract_pos_spacy(text)

    # Every set-derived column is sorted so the exported workbook is
    # identical from one run to the next.
    linguistic_rows.append({
        "gene_name": row["gene_name"],
        "uniProt_id": row["uniProt_id"],
        "function_description": desc,
        "keywords": sorted(set(content_words)),
        "lemmatised_words": sorted(set(lemmatizer.lemmatize(w) for w in content_words)),
        "pos_tags": sorted(set(pos_tag(tokens))) if tokens else [],
        "verbs": sorted(verbs),
        "adjectives": sorted(adjs),
        "bigrams": sorted(set(" ".join(bg) for bg in ngrams(content_words, 2))),
        "named_entities": sorted(get_named_entities(text))
    })


linguistic_df = pd.DataFrame(linguistic_rows)

# Write the analysis to an Excel workbook, then offer it as a Colab download.
linguistic_workbook = "output-03_function_linguistic_analysis.xlsx"
linguistic_df.to_excel(linguistic_workbook, index=False)

from google.colab import files
files.download(linguistic_workbook)




[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>