<a href="https://colab.research.google.com/github/dersonn5/catalogos-casa-mascher/blob/main/Untitled0.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# ==========================================
# INSTALAÇÃO
# ==========================================
try:
    import vertexai
    from google.colab import userdata
    print("✅ Bibliotecas já instaladas")
except:
    print("📦 Instalando bibliotecas...")
    !pip install --upgrade --quiet google-cloud-aiplatform biopython
    print("✅ Instalação concluída! Execute esta célula novamente.")
    import sys
    sys.exit(0)

# ==========================================
# IMPORTS
# ==========================================
import os
import json
import time
from google.colab import userdata
from google.oauth2 import service_account
from google.cloud import aiplatform
from google.cloud.aiplatform_v1 import IndexServiceClient
from google.cloud.aiplatform_v1.types import (
    UpsertDatapointsRequest,
    IndexDatapoint
)
import vertexai
from vertexai.language_models import TextEmbeddingModel
from Bio import Entrez

# ==========================================
# CONFIGURATION (corrected IDs)
# ==========================================
# Project and region passed to aiplatform.init / vertexai.init and used to
# build the index resource path and the regional API endpoint.
PROJECT_ID = "fitpro-55ec3"
REGION = "us-central1"
# ID of the Vector Search index that receives the upserted datapoints.
INDEX_ID = "568027498118381568"  # ✨ corrected ID!
# ID of the index endpoint; the query cells pass this same value as
# index_endpoint_name when searching.
INDEX_ENDPOINT_ID = "3752116415134433280"
# Assigned to Entrez.email before each PubMed request.
MY_EMAIL = "anderson4five@gmail.com"

# PubMed search terms, processed in order. The first word of each query is
# also stored as the datapoint's "topic" restrict value during ingestion.
SEARCH_QUERIES = [
    "strength training periodization systematic review",
    "hypertrophy training principles",
    "concurrent training interference endurance strength",
    "exercise for weight loss guidelines",
    "sports nutrition guidelines protein intake",
    "creatine supplementation muscle growth",
    "HIIT vs MICT cardiorespiratory fitness",
    "resistance training for beginners guidelines",
    "recovery strategies for athletes review",
    "macronutrient distribution for athletes",
]
# Sent as `retmax` on every PubMed search.
MAX_RESULTS_PER_QUERY = 10

# ==========================================
# AUTHENTICATION
# ==========================================
print("🔑 Autenticando...")

# The service-account key is read through google.colab's userdata under the
# name GCP_CREDENTIALS, parsed as JSON and turned into a credentials object.
credentials_json = userdata.get("GCP_CREDENTIALS")
credentials_info = json.loads(credentials_json)
credentials = service_account.Credentials.from_service_account_info(
    credentials_info,
    scopes=["https://www.googleapis.com/auth/cloud-platform"],
)

# Both SDK entry points get the same project, region and credentials
# (aiplatform first, then vertexai).
for _sdk in (aiplatform, vertexai):
    _sdk.init(project=PROJECT_ID, location=REGION, credentials=credentials)

print(f"✅ Autenticado no projeto: {PROJECT_ID}\n")

# ==========================================
# FUNÇÕES PUBMED
# ==========================================
def search_pubmed(query: str, email: str, max_results: int):
    """Search PubMed for *query* and return the matching article IDs.

    Args:
        query: Search term sent to Entrez ``esearch``.
        email: Contact address assigned to ``Entrez.email`` before the call.
        max_results: Upper bound on returned IDs (sent as ``retmax``).

    Returns:
        The ``"IdList"`` entry of the parsed esearch response.

    Raises:
        Whatever ``Entrez.esearch`` / ``Entrez.read`` raise; nothing is
        caught here.
    """
    Entrez.email = email
    handle = Entrez.esearch(
        db="pubmed",
        sort="relevance",
        retmax=str(max_results),
        retmode="xml",
        term=query,
    )
    try:
        results = Entrez.read(handle)
    finally:
        # Close the handle even when parsing fails, so it is not leaked.
        handle.close()
    return results["IdList"]

def fetch_abstracts(id_list, email):
    """Fetch title and abstract for each PubMed ID in *id_list*.

    Args:
        id_list: PubMed IDs as strings; joined with commas into one
            ``efetch`` request. An empty/None value short-circuits.
        email: Contact address assigned to ``Entrez.email`` before the call.

    Returns:
        A list of dicts with keys ``"title"``, ``"abstract"`` and ``"pmid"``.
        Records whose title or abstract is empty are left out. An empty list
        is returned when *id_list* is empty.

    Raises:
        Whatever ``Entrez.efetch`` / ``Entrez.read`` raise; nothing is
        caught here.
    """
    if not id_list:
        return []

    Entrez.email = email
    handle = Entrez.efetch(db="pubmed", retmode="xml", id=",".join(id_list))
    try:
        records = Entrez.read(handle)
    finally:
        # Close the handle even when parsing fails, so it is not leaked.
        handle.close()

    articles = []
    for record in records.get("PubmedArticle", []):
        article = record.get("MedlineCitation", {}).get("Article", {})
        title = article.get("ArticleTitle", "No title available")
        # AbstractText may hold several parts; join them one per line.
        abstract_data = article.get("Abstract", {}).get("AbstractText", [])
        abstract = "\n".join(str(part) for part in abstract_data)

        # Skip records without usable text.
        if title and abstract:
            pmid = str(record["MedlineCitation"]["PMID"])
            articles.append({
                "title": title,
                "abstract": abstract,
                "pmid": pmid,
            })

    return articles

# ==========================================
# INITIALISATION
# ==========================================
print("🤖 Inicializando modelo de embedding...")
embedding_model = TextEmbeddingModel.from_pretrained("text-embedding-004")
print("✅ Modelo carregado!")

print("🔗 Conectando ao Vector Search Index...")

# Index service client bound to the regional API endpoint.
_api_endpoint = f"{REGION}-aiplatform.googleapis.com"
index_client = IndexServiceClient(
    credentials=credentials,
    client_options={"api_endpoint": _api_endpoint},
)

# Fully-qualified resource path of the target index.
index_path = index_client.index_path(
    project=PROJECT_ID, location=REGION, index=INDEX_ID
)

print(f"✅ Conectado!")
print(f"📍 Index Path: {index_path}\n")

# ==========================================
# INGESTION
# ==========================================
# For every search query: look up PubMed IDs, download the abstracts, embed
# "Title + Abstract" and build one IndexDatapoint per article.
import traceback

print("="*60)
print("🚀 INICIANDO INGESTÃO DE ARTIGOS DO PUBMED")
print("="*60 + "\n")

all_datapoints = []
total_articles = 0
# PMIDs already converted to a datapoint. Different queries can return the
# same article; without this guard it would be embedded again and appended
# a second time under the same datapoint_id.
seen_pmids = set()

for idx, query in enumerate(SEARCH_QUERIES, 1):
    print(f"[{idx}/{len(SEARCH_QUERIES)}] 🔍 '{query}'...")

    try:
        ids = search_pubmed(query, MY_EMAIL, MAX_RESULTS_PER_QUERY)
        print(f"   📄 {len(ids)} IDs encontrados")

        if not ids:
            continue

        articles = fetch_abstracts(ids, MY_EMAIL)
        print(f"   ✅ {len(articles)} resumos baixados")

        prepared = 0
        for article in articles:
            pmid = article['pmid']
            if pmid in seen_pmids:
                # Already prepared by an earlier query; that query's
                # "topic" restrict is the one that is kept.
                continue

            text = f"Title: {article['title']}\n\nAbstract: {article['abstract']}"

            # One embedding request per article.
            vector = embedding_model.get_embeddings([text])[0].values

            datapoint = IndexDatapoint(
                datapoint_id=pmid,
                feature_vector=vector,
                restricts=[
                    IndexDatapoint.Restriction(
                        namespace="source",
                        allow_list=["pubmed"]
                    ),
                    # Topic = first word of the search query.
                    IndexDatapoint.Restriction(
                        namespace="topic",
                        allow_list=[query.split()[0]]
                    )
                ]
            )

            all_datapoints.append(datapoint)
            seen_pmids.add(pmid)
            # Counted per datapoint so the total stays consistent with
            # all_datapoints even if a later article in this query fails.
            total_articles += 1
            prepared += 1

        print(f"   💾 {prepared} datapoints preparados\n")
        # Short pause between queries.
        time.sleep(1)

    except Exception as e:
        # Best effort: report the failure and move on to the next query.
        print(f"   ❌ ERRO: {e}\n")
        traceback.print_exc()

# ==========================================
# UPLOAD TO VECTOR SEARCH
# ==========================================
# Sends the prepared datapoints to the index in fixed-size batches and
# prints a summary; a failed batch is reported and the loop continues.
print("=" * 60)
print(f"📤 ENVIANDO {len(all_datapoints)} DATAPOINTS PARA O VECTOR SEARCH")
print("=" * 60 + "\n")

if not all_datapoints:
    print("⚠️ Nenhum artigo processado!")
else:
    BATCH_SIZE = 100
    success_count = 0
    # Ceiling division: number of batches needed for all datapoints.
    total_batches = -(-len(all_datapoints) // BATCH_SIZE)

    batch_starts = range(0, len(all_datapoints), BATCH_SIZE)
    for batch_num, i in enumerate(batch_starts, start=1):
        batch = all_datapoints[i:i + BATCH_SIZE]

        print(f"📦 Batch {batch_num}/{total_batches} ({len(batch)} datapoints)...")

        try:
            request = UpsertDatapointsRequest(index=index_path, datapoints=batch)
            response = index_client.upsert_datapoints(request=request)

            print(f"   ✅ Batch enviado com sucesso!")
            success_count += len(batch)
            # Pause after each successful batch.
            time.sleep(2)

        except Exception as e:
            print(f"   ❌ ERRO: {e}")
            import traceback
            traceback.print_exc()
            print()

    print("\n" + "=" * 60)
    print("🎉 PROCESSO DE INGESTÃO CONCLUÍDO!")
    print("=" * 60)
    print(f"✅ Total de artigos processados: {total_articles}")
    print(f"✅ Total de datapoints enviados: {success_count}/{len(all_datapoints)}")
    print(f"\n🧠 O 'Cérebro' RAG foi alimentado com conhecimento científico!")
    print(f"📚 Fonte: PubMed (National Library of Medicine)")
    print(f"\n🎯 Próximo passo: Testar uma consulta RAG para validar os dados")

✅ Bibliotecas já instaladas
🔑 Autenticando...
✅ Autenticado no projeto: fitpro-55ec3

🤖 Inicializando modelo de embedding...
✅ Modelo carregado!
🔗 Conectando ao Vector Search Index...
✅ Conectado!
📍 Index Path: projects/fitpro-55ec3/locations/us-central1/indexes/568027498118381568

🚀 INICIANDO INGESTÃO DE ARTIGOS DO PUBMED

[1/10] 🔍 'strength training periodization systematic review'...




   📄 10 IDs encontrados
   ✅ 10 resumos baixados
   💾 10 datapoints preparados

[2/10] 🔍 'hypertrophy training principles'...
   📄 10 IDs encontrados
   ✅ 10 resumos baixados
   💾 10 datapoints preparados

[3/10] 🔍 'concurrent training interference endurance strength'...
   📄 10 IDs encontrados
   ✅ 10 resumos baixados
   💾 10 datapoints preparados

[4/10] 🔍 'exercise for weight loss guidelines'...
   📄 10 IDs encontrados
   ✅ 10 resumos baixados
   💾 10 datapoints preparados

[5/10] 🔍 'sports nutrition guidelines protein intake'...
   📄 10 IDs encontrados
   ✅ 10 resumos baixados
   💾 10 datapoints preparados

[6/10] 🔍 'creatine supplementation muscle growth'...
   📄 10 IDs encontrados
   ✅ 10 resumos baixados
   💾 10 datapoints preparados

[7/10] 🔍 'HIIT vs MICT cardiorespiratory fitness'...
   📄 10 IDs encontrados
   ✅ 10 resumos baixados
   💾 10 datapoints preparados

[8/10] 🔍 'resistance training for beginners guidelines'...
   📄 2 IDs encontrados
   ✅ 2 resumos baixados
   💾 2 da

In [3]:
# ==========================================
# TESTE DE CONSULTA RAG
# ==========================================
from google.colab import userdata
from google.oauth2 import service_account
from google.cloud import aiplatform
import vertexai
from vertexai.language_models import TextEmbeddingModel
import json

# Authentication: rebuild the service-account credentials from the
# GCP_CREDENTIALS value so this cell does not rely on earlier cells.
credentials_json = userdata.get("GCP_CREDENTIALS")
credentials_info = json.loads(credentials_json)
credentials = service_account.Credentials.from_service_account_info(
    credentials_info,
    scopes=["https://www.googleapis.com/auth/cloud-platform"],
)

# Same project/region/credentials for both SDKs (aiplatform, then vertexai).
for _sdk in (aiplatform, vertexai):
    _sdk.init(
        project="fitpro-55ec3",
        location="us-central1",
        credentials=credentials,
    )

print("TESTE DE CONSULTA RAG")
print("="*60 + "\n")

# ==========================================
# 1. EMBED THE QUERY
# ==========================================
query = "What are the best principles for muscle hypertrophy training?"
print(f"Query: {query}\n")

embedding_model = TextEmbeddingModel.from_pretrained("text-embedding-004")
query_embedding = embedding_model.get_embeddings([query])[0].values

print(f"Embedding gerado ({len(query_embedding)} dimensoes)\n")

# ==========================================
# 2. SEARCH THE VECTOR INDEX
# ==========================================
print("Buscando artigos relevantes no Vector Search...\n")

index_endpoint = aiplatform.MatchingEngineIndexEndpoint(
    index_endpoint_name="3752116415134433280"
)

# Ask for the 5 nearest neighbours of the single query vector.
response = index_endpoint.find_neighbors(
    deployed_index_id="fitpro_kb_v1_1761170260585",
    queries=[query_embedding],
    num_neighbors=5
)

print(f"Busca concluida!\n")
print("="*60)
print("RESULTADOS (Top 5 Artigos Mais Relevantes)")
print("="*60 + "\n")

# The response is indexed per query, so an outer list that is non-empty can
# still hold an empty neighbour list. Test the neighbours themselves;
# otherwise the success banner would print with zero results.
neighbors = response[0] if response else []

if neighbors:
    for i, neighbor in enumerate(neighbors, 1):
        print(f"Resultado #{i}")
        print(f"   PubMed ID: {neighbor.id}")
        print(f"   Score de Similaridade: {neighbor.distance:.4f}")
        print(f"   Link: https://pubmed.ncbi.nlm.nih.gov/{neighbor.id}/")
        print()

    print("="*60)
    print("SUCESSO! O RAG esta funcionando perfeitamente!")
    print("="*60)
    print("\nValidacoes concluidas:")
    print("   - Embeddings sendo gerados corretamente")
    print("   - Vector Search retornando resultados relevantes")
    print("   - Scores de similaridade calculados")
    print("   - Pipeline RAG operacional")

else:
    print("Nenhum resultado encontrado.")
    print("Possivel causa: Dados ainda sendo indexados (aguarde 2-5 minutos)")

# ==========================================
# 3. INDEX STATISTICS
# ==========================================
_rule = "=" * 60
print("\n" + _rule)
print("ESTATISTICAS DO VECTOR SEARCH")
print(_rule)

index = aiplatform.MatchingEngineIndex(index_name="568027498118381568")

# Only the display name is read from the index object; the status, count and
# endpoint lines below are fixed text, not values queried from the service.
_report_lines = (
    f"\nIndex: {index.display_name}",
    "Status: Operacional",
    "Datapoints enviados: 92",
    "Endpoint: fitpro-rag-endpoint",
    "Public Endpoint: 1768872620.us-central1-335193868714.vdb.vertexai.goog",
)
print("\n".join(_report_lines))

TESTE DE CONSULTA RAG

Query: What are the best principles for muscle hypertrophy training?

Embedding gerado (768 dimensoes)

Buscando artigos relevantes no Vector Search...

Busca concluida!

RESULTADOS (Top 5 Artigos Mais Relevantes)

Resultado #1
   PubMed ID: 27752983
   Score de Similaridade: 0.6920
   Link: https://pubmed.ncbi.nlm.nih.gov/27752983/

Resultado #2
   PubMed ID: 24714538
   Score de Similaridade: 0.6558
   Link: https://pubmed.ncbi.nlm.nih.gov/24714538/

Resultado #3
   PubMed ID: 16180944
   Score de Similaridade: 0.6531
   Link: https://pubmed.ncbi.nlm.nih.gov/16180944/

Resultado #4
   PubMed ID: 35044672
   Score de Similaridade: 0.6450
   Link: https://pubmed.ncbi.nlm.nih.gov/35044672/

Resultado #5
   PubMed ID: 36199287
   Score de Similaridade: 0.6341
   Link: https://pubmed.ncbi.nlm.nih.gov/36199287/

SUCESSO! O RAG esta funcionando perfeitamente!

Validacoes concluidas:
   - Embeddings sendo gerados corretamente
   - Vector Search retornando resultados re

In [4]:
# ==========================================
# SUPER-AGENTE RAG + GEMINI
# ==========================================
from google.colab import userdata
from google.oauth2 import service_account
from google.cloud import aiplatform
import vertexai
from vertexai.language_models import TextEmbeddingModel
from vertexai.generative_models import GenerativeModel
import json

# Authentication: the GCP_CREDENTIALS value is parsed as JSON and used to
# build service-account credentials scoped to cloud-platform.
credentials_json = userdata.get("GCP_CREDENTIALS")
credentials_info = json.loads(credentials_json)
credentials = service_account.Credentials.from_service_account_info(
    credentials_info,
    scopes=["https://www.googleapis.com/auth/cloud-platform"],
)

# Initialise aiplatform first, then vertexai, with identical settings.
_init_settings = {
    "project": "fitpro-55ec3",
    "location": "us-central1",
    "credentials": credentials,
}
aiplatform.init(**_init_settings)
vertexai.init(**_init_settings)

print("SUPER-AGENTE RAG + GEMINI")
print("=" * 60 + "\n")

# ==========================================
# SIMULATED CLIENT INTAKE (ANAMNESIS)
# ==========================================
# Fixed sample profile; it is printed here and read again by the later
# steps of this cell.
anamnese = dict(
    nome="João Silva",
    idade=28,
    sexo="Masculino",
    objetivo="Hipertrofia muscular",
    nivel="Intermediario",
    frequencia_semanal=4,
    tempo_disponivel="60 minutos por sessao",
    restricoes="Nenhuma lesao atual",
    experiencia="2 anos de treino regular",
)

print("ANAMNESE DO ALUNO:")
print(json.dumps(anamnese, indent=2, ensure_ascii=False))
print()

# ==========================================
# STEP 1: RETRIEVE SCIENTIFIC KNOWLEDGE
# ==========================================
print("=" * 60)
print("ETAPA 1: BUSCA NO VECTOR SEARCH")
print("=" * 60 + "\n")

# The search text is built from the client's training level.
query = f"muscle hypertrophy training program for {anamnese['nivel']} level"
print(f"Query: {query}\n")

# Embed the query text.
embedding_model = TextEmbeddingModel.from_pretrained("text-embedding-004")
query_embedding = embedding_model.get_embeddings([query])[0].values

# Look up the 3 nearest articles on the deployed index.
index_endpoint = aiplatform.MatchingEngineIndexEndpoint(
    index_endpoint_name="3752116415134433280"
)
response = index_endpoint.find_neighbors(
    deployed_index_id="fitpro_kb_v1_1761170260585",
    queries=[query_embedding],
    num_neighbors=3,
)

print("Artigos encontrados:")
scientific_context = []

# Neighbours of the single query, or nothing when the response is empty.
_hits = response[0] if response else ()
for i, neighbor in enumerate(_hits, start=1):
    print(f"  {i}. PMID: {neighbor.id} (Score: {neighbor.distance:.3f})")
    # Only the ID and the distance are kept for the prompt.
    scientific_context.append({"pmid": neighbor.id, "score": neighbor.distance})
print()

# ==========================================
# STEP 2: GENERATE THE PLAN WITH GEMINI
# ==========================================
print("="*60)
print("ETAPA 2: GERACAO DO PLANO COM GEMINI")
print("="*60 + "\n")

# Build the article list from whatever the search returned. Indexing
# scientific_context[0..2] directly raises IndexError when fewer than three
# neighbours come back.
# NOTE: only PMIDs and scores are inserted; no article text is part of the
# prompt.
if scientific_context:
    artigos_txt = "\n".join(
        f"- PMID: {item['pmid']} (Relevância: {item['score']:.2f})"
        for item in scientific_context
    )
else:
    artigos_txt = "- Nenhum artigo foi recuperado do Vector Search."

# Assemble the RAG prompt. Doubled braces are literal braces in the f-string.
# The number of training days follows the anamnese instead of a fixed "4".
prompt = f"""Você é um Personal Trainer especialista com acesso a artigos científicos.

ANAMNESE DO ALUNO:
{json.dumps(anamnese, indent=2, ensure_ascii=False)}

CONTEXTO CIENTÍFICO:
Baseie seu plano nos princípios encontrados nestes artigos do PubMed:
{artigos_txt}

TAREFA:
Crie um plano de treino semanal ({anamnese['frequencia_semanal']} dias) focado em hipertrofia muscular, seguindo princípios científicos.

FORMATO (JSON):
{{
  "plano": {{
    "objetivo": "string",
    "duracao_semanas": number,
    "dias": [
      {{
        "dia": "string",
        "foco": "string",
        "exercicios": [
          {{
            "nome": "string",
            "series": number,
            "repeticoes": "string",
            "descanso": "string",
            "observacoes": "string"
          }}
        ]
      }}
    ],
    "principios_cientificos": ["string"],
    "progressao": "string"
  }}
}}

Retorne APENAS o JSON, sem texto adicional.
"""

print("Enviando para Gemini...\n")

# Gemini model used for generation.
model = GenerativeModel("gemini-2.0-flash-exp")

# Sampling settings passed along with the prompt.
generation_config = dict(
    temperature=0.7,
    top_p=0.95,
    top_k=40,
    max_output_tokens=8192,
)

response = model.generate_content(prompt, generation_config=generation_config)

print("=" * 60)
print("PLANO DE TREINO GERADO")
print("=" * 60 + "\n")

# Strip a Markdown code fence if the model wrapped its answer in one:
# prefer a ```json fence, otherwise fall back to a plain ``` fence.
plano_text = response.text
for _fence in ("```json", "```"):
    if _fence in plano_text:
        plano_text = plano_text.split(_fence)[1].split("```")[0].strip()
        break

try:
    plano = json.loads(plano_text)
except json.JSONDecodeError as e:
    # Not valid JSON: show the raw text and the parser error.
    print("Resposta do Gemini (texto):")
    print(plano_text)
    print(f"\nErro ao parsear JSON: {e}")
else:
    print(json.dumps(plano, indent=2, ensure_ascii=False))

    print("\n" + "=" * 60)
    print("SUCESSO TOTAL!")
    print("=" * 60)
    print("\nPIPELINE RAG COMPLETO:")
    print("  1. Anamnese recebida")
    print("  2. Vector Search consultado")
    print("  3. Artigos cientificos recuperados")
    print("  4. Gemini gerou plano baseado em ciencia")
    print("  5. Plano pronto para aprovacao do PT")

SUPER-AGENTE RAG + GEMINI

ANAMNESE DO ALUNO:
{
  "nome": "João Silva",
  "idade": 28,
  "sexo": "Masculino",
  "objetivo": "Hipertrofia muscular",
  "nivel": "Intermediario",
  "frequencia_semanal": 4,
  "tempo_disponivel": "60 minutos por sessao",
  "restricoes": "Nenhuma lesao atual",
  "experiencia": "2 anos de treino regular"
}

ETAPA 1: BUSCA NO VECTOR SEARCH

Query: muscle hypertrophy training program for Intermediario level

Artigos encontrados:
  1. PMID: 24714538 (Score: 0.603)
  2. PMID: 35044672 (Score: 0.592)
  3. PMID: 36199287 (Score: 0.587)

ETAPA 2: GERACAO DO PLANO COM GEMINI

Enviando para Gemini...





PLANO DE TREINO GERADO

{
  "plano": {
    "objetivo": "Hipertrofia muscular",
    "duracao_semanas": 4,
    "dias": [
      {
        "dia": "Segunda-feira",
        "foco": "Peito e Tríceps",
        "exercicios": [
          {
            "nome": "Supino Reto com Barra",
            "series": 3,
            "repeticoes": "8-12",
            "descanso": "60-90 segundos",
            "observacoes": "Priorizar a técnica correta. Progressão de carga a cada semana se possível."
          },
          {
            "nome": "Supino Inclinado com Halteres",
            "series": 3,
            "repeticoes": "10-15",
            "descanso": "60-90 segundos",
            "observacoes": "Amplitude completa do movimento."
          },
          {
            "nome": "Crucifixo Reto com Halteres",
            "series": 3,
            "repeticoes": "12-15",
            "descanso": "60 segundos",
            "observacoes": "Foco na contração muscular."
          },
          {
            "nome": 