## Análise de Code Smells nas releases do Mastra


In [1]:
# --------------------------------------------
# 1. Instalação das dependências
# --------------------------------------------
!pip install -q transformers accelerate huggingface_hub gitpython tree-sitter


[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/635.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m634.9/635.4 kB[0m [31m23.1 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m635.4/635.4 kB[0m [31m16.1 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
# --------------------------------------------
# 2. Imports
# --------------------------------------------
import os
import re
import json
import gc
import torch
from git import Repo
from transformers import AutoTokenizer, AutoModelForCausalLM
from huggingface_hub import login
from google.colab import userdata
from datetime import datetime

In [3]:
# --------------------------------------------
# 3. Autenticação no Hugging Face
# --------------------------------------------
# O token do Hugging Face (HF_TOKEN) deve estar previamente
# cadastrado nas Secrets do Google Colab:
# Menu lateral → Secrets (🔑) → Add new secret
# Name: HF_TOKEN
# Value: <seu_token_do_hugging_face>
#
# O token NÃO deve ser hardcoded no notebook.


login(token=userdata.get('HF_TOKEN'))

In [4]:
# --------------------------------------------
# 4. Clonando o repositório Mastra
# --------------------------------------------
REPO_URL = "https://github.com/mastra-ai/mastra.git"
BASE_DIR = "/content/mastra"


if not os.path.exists(BASE_DIR):
  Repo.clone_from(REPO_URL, BASE_DIR)


repo = Repo(BASE_DIR)

In [5]:
# --------------------------------------------
# 5. Selecionando releases (tags)
# --------------------------------------------
# Últimas 3 releases
TAGS = sorted(repo.tags, key=lambda t: t.commit.committed_datetime, reverse=True)[:3]

In [6]:
# --------------------------------------------
# 6. Modelos selecionados
# --------------------------------------------
MODELS = {
    "qwen_small": "Qwen/Qwen2.5-Coder-3B-Instruct",
    "qwen_medium": "Qwen/Qwen2.5-Coder-7B-Instruct",
    "starcoder": "bigcode/starcoder2-7b"
}

In [7]:
# --------------------------------------------
# 7. Função para carregamento de modelo
# --------------------------------------------


def load_model(model_name):
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        device_map="auto",
        torch_dtype=torch.float16
    )

    model.eval()
    return tokenizer, model

In [8]:
# --------------------------------------------
# 8. Prompt de análise de Code Smells
# --------------------------------------------

PROMPT_TEMPLATE = """
Você é um especialista em engenharia de software e qualidade de código.

Analise o código-fonte abaixo e identifique possíveis CODE SMELLS,
considerando EXCLUSIVAMENTE a taxonomia de code smells apresentada no portal
Refactoring Guru (https://refactoring.guru/refactoring/smells).

Utilize SOMENTE os code smells pertencentes às seguintes categorias do
Refactoring Guru:
- Bloaters
- Object-Orientation Abusers
- Change Preventers
- Dispensables
- Couplers

NÃO invente novos code smells, NÃO utilize classificações fora dessa lista
e NÃO utilize sinônimos diferentes para os nomes oficiais.

A resposta DEVE ser apresentada EXCLUSIVAMENTE em formato JSON válido,
sem texto adicional, comentários ou explicações fora do JSON.

Utilize EXATAMENTE a seguinte estrutura:

{
  "code_smells": [
    {
      "name": "Nome exato do code smell conforme Refactoring Guru",
      "category": "Uma das categorias: Bloaters | Object-Orientation Abusers | Change Preventers | Dispensables | Couplers",
      "snippet": "Trecho de código relevante ou descrição precisa da localização",
      "justification": "Justificativa técnica detalhada",
      "impact": "Impacto potencial na manutenibilidade, legibilidade, desempenho e testabilidade",
      "refactoring": "Sugestão de refatoração alinhada ao Refactoring Guru"
    }
  ]
}

Caso nenhum code smell da lista seja identificado, retorne:

{
  "code_smells": []
}

Código:
"""

VALID_SMELLS = {
    # Bloaters
    "Long Method",
    "Large Class",
    "Primitive Obsession",
    "Long Parameter List",
    "Data Clumps",

    # Object-Orientation Abusers
    "Switch Statements",
    "Temporary Field",
    "Refused Bequest",
    "Alternative Classes with Different Interfaces",

    # Change Preventers
    "Divergent Change",
    "Shotgun Surgery",
    "Parallel Inheritance Hierarchies",

    # Dispensables
    "Lazy Class",
    "Speculative Generality",
    "Dead Code",
    "Duplicated Code",
    "Data Class",

    # Couplers
    "Feature Envy",
    "Inappropriate Intimacy",
    "Message Chains",
    "Middle Man"
}



In [9]:
# --------------------------------------------
# 9. Coleta de arquivos de código
# --------------------------------------------

ALLOWED_PACKAGE_DIRS = [
    "packages/core",
    "packages/memory",
    "packages/rag",
    "packages/agent-builder",
    "packages/server",
    "packages/auth",
    "packages/deployer",
    "packages/cli",
]

EXCLUDED_PATTERNS = [".test.", ".spec."]
MAX_CHARS = 3000
FILES_PER_BATCH = 5

def collect_source_files(base_dir):
    files_collected = []
    for pkg in ALLOWED_PACKAGE_DIRS:
        src_dir = os.path.join(base_dir, pkg, "src")
        if not os.path.exists(src_dir):
            continue
        for root, _, files in os.walk(src_dir):
            for file in files:
                full = os.path.join(root, file)
                if (
                    full.endswith((".ts", ".js")) and
                    not any(p in full for p in EXCLUDED_PATTERNS)
                ):
                    files_collected.append(full)
    return files_collected

def read_code_safely(path):
    with open(path, "r", encoding="utf-8", errors="ignore") as f:
        return f.read()[:MAX_CHARS]


In [22]:
# --------------------------------------------
# 10. Análise por modelo
# --------------------------------------------

def extract_last_valid_json(text):
    json_blocks = re.findall(
        r"```json\s*(\{.*?\})\s*```",
        text,
        re.DOTALL
    )

    for block in reversed(json_blocks):
        try:
            return json.loads(block)
        except json.JSONDecodeError:
            continue

    return {
        "code_smells": [],
        "extraction_error": True
    }


def analyze_with_model(tokenizer, model, code_snippet):
    inputs = tokenizer(
        PROMPT_TEMPLATE + code_snippet,
        return_tensors="pt",
        truncation=True,
        max_length=4096
    ).to(model.device)

    outputs = model.generate(
        **inputs,
        max_new_tokens=600,
        do_sample=False
    )

    full_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

    return extract_last_valid_json(full_text)



In [23]:
# --------------------------------------------
# 11. Inicialização global
# --------------------------------------------

PARTIAL_PATH = "/content/resultados_parciais.json"
FINAL_PATH = "/content/analise_code_smells_mastra.json"

if os.path.exists(PARTIAL_PATH):
    with open(PARTIAL_PATH, "r", encoding="utf-8") as f:
        results = json.load(f)
else:
    results = {}


In [24]:
# --------------------------------------------
# 12. Função de análise por modelo
# --------------------------------------------

def run_model_pipeline(
    model_key,
    models,
    tags,
    repo,
    base_dir,
    results,
    partial_path,
    files_per_batch,
    max_files=None  #parâmetro opcional
):
    if model_key not in models:
        raise ValueError(f"Modelo '{model_key}' não encontrado em MODELS.")

    model_name = models[model_key]

    print(f"\nExecutando modelo: {model_key}")
    tokenizer, model = load_model(model_name)

    for tag in tags:
        repo.git.checkout(tag)
        tag_name = str(tag)
        results.setdefault(tag_name, {})

        source_files = collect_source_files(base_dir)

        # Limite de arquivos para teste
        if max_files is not None:
            source_files = source_files[:max_files]

        for i in range(0, len(source_files), files_per_batch):
            batch = source_files[i:i + files_per_batch]

            for file_path in batch:
                results[tag_name].setdefault(file_path, {})

                # Evita reprocessamento
                if model_key in results[tag_name][file_path]:
                    continue

                print(f"[{model_key}] {tag_name} → {file_path}")

                try:
                    code = read_code_safely(file_path)
                    analysis = analyze_with_model(tokenizer, model, code)
                    results[tag_name][file_path][model_key] = analysis

                except Exception as e:
                    print(f"Erro: {e}")

                finally:
                    torch.cuda.empty_cache()
                    gc.collect()

            # Checkpoint incremental
            with open(partial_path, "w", encoding="utf-8") as f:
                json.dump(results, f, indent=2, ensure_ascii=False)

    # Limpeza explícita
    del model
    torch.cuda.empty_cache()
    gc.collect()

    print(f"Modelo {model_key} finalizado.")

In [25]:
# --------------------------------------------
# 13. Pipeline Qwen 3B
# --------------------------------------------

run_model_pipeline(
    model_key="qwen_small",
    models=MODELS,
    tags=TAGS,
    repo=repo,
    base_dir=BASE_DIR,
    results=results,
    partial_path=PARTIAL_PATH,
    files_per_batch=FILES_PER_BATCH,
    max_files=5
)



Executando modelo: qwen_small


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

[qwen_small] @mastra/client-js@0.17.1 → /content/mastra/packages/core/src/zod-to-json.ts
[qwen_small] @mastra/client-js@0.17.1 → /content/mastra/packages/core/src/utils.ts
[qwen_small] @mastra/client-js@0.17.1 → /content/mastra/packages/core/src/index.ts
[qwen_small] @mastra/client-js@0.17.1 → /content/mastra/packages/core/src/base.ts
[qwen_small] @mastra/core@0.24.8 → /content/mastra/packages/core/src/zod-to-json.ts
[qwen_small] @mastra/core@0.24.8 → /content/mastra/packages/core/src/utils.ts
[qwen_small] @mastra/core@0.24.8 → /content/mastra/packages/core/src/index.ts
[qwen_small] @mastra/core@0.24.8 → /content/mastra/packages/core/src/base.ts
[qwen_small] @mastra/dane@0.1.25 → /content/mastra/packages/core/src/zod-to-json.ts
[qwen_small] @mastra/dane@0.1.25 → /content/mastra/packages/core/src/utils.ts
[qwen_small] @mastra/dane@0.1.25 → /content/mastra/packages/core/src/index.ts
[qwen_small] @mastra/dane@0.1.25 → /content/mastra/packages/core/src/base.ts
Modelo qwen_small finalizado.

In [None]:
# --------------------------------------------
# 14. Pipeline Qwen 7B
# --------------------------------------------

run_model_pipeline(
    model_key="qwen_medium",
    models=MODELS,
    tags=TAGS,
    repo=repo,
    base_dir=BASE_DIR,
    results=results,
    partial_path=PARTIAL_PATH,
    files_per_batch=FILES_PER_BATCH,
    max_files=5
)


Executando modelo: qwen_medium


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/663 [00:00<?, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/4.93G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.33G [00:00<?, ?B/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.88G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.09G [00:00<?, ?B/s]

In [None]:
# --------------------------------------------
# 15. Pipeline StarCoder2 7B
# --------------------------------------------

run_model_pipeline(
    model_key="starcoder",
    models=MODELS,
    tags=TAGS,
    repo=repo,
    base_dir=BASE_DIR,
    results=results,
    partial_path=PARTIAL_PATH,
    files_per_batch=FILES_PER_BATCH,
    max_files=5
)


In [None]:
# --------------------------------------------
# 16. Consolidação final dos resultados
# --------------------------------------------
final_output = {
    "metadata": {
        "project": "mastra-ai/mastra",
        "analysis_date": datetime.now().isoformat(),
        "models": list(MODELS.keys()),
        "releases": [str(t) for t in TAGS],
        "environment": "Google Colab (Free Tier)",
        "context_limit_chars": MAX_CHARS
    },
    "results": results
}

with open(FINAL_PATH, "w", encoding="utf-8") as f:
    json.dump(final_output, f, indent=2, ensure_ascii=False)

print("✅ Análise concluída com sucesso.")
print(f"📁 Arquivo final: {FINAL_PATH}")


In [None]:
# --------------------------------------------
# 17. Pós-processamento
# --------------------------------------------
# Objetivos desta etapa:
# 1. Consolidar os code smells identificados pelos diferentes modelos
# 2. Identificar concordância entre modelos (maior confiabilidade)
# 3. Organizar os resultados para análise comparativa
# 4. Gerar estruturas auxiliares para tabelas e discussão dos resultados