In [None]:
!pip install -U transformers accelerate sentencepiece


In [1]:
# Ask the user to upload the source files to analyse.
# The result is kept in `uploaded`; its keys are iterated later as the
# names of the files to open and send to the model.
from google.colab import files
uploaded = files.upload()


Saving core-0.1.1.py to core-0.1.1.py
Saving core-0.2.2.py to core-0.2.2.py
Saving core-0.2.4.py to core-0.2.4.py


In [None]:
import os
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# Hugging Face checkpoint to load, plus a short display name for it.
MODEL_ID = "microsoft/Phi-3-mini-4k-instruct"
MODEL_NAME = "Phi-3"

# System prompt sent with every analysis request (intentionally in Portuguese).
# The accented characters had been corrupted into mojibake (UTF-8 bytes decoded
# as cp1252, e.g. "SÃªnior"); they are restored so the model receives clean text.
SYSTEM_PROMPT = """
Atue como um Engenheiro de Software Sênior especialista em Python e Engenharia de Software.
Analise o código procurando por Code Smells conforme o Refactoring Guru,
com foco em: Long Method, Magic Numbers e Cognitive Complexity.

Para cada code smell identificado, forneça:
- Localização
- Evidência
- Justificativa técnica
- Sugestão de refatoração
"""

# Use the GPU when one is attached; otherwise fall back to CPU instead of
# failing on the first `.to(device)` call. Half precision is only selected
# on GPU; the CPU path uses float32.
device = "cuda" if torch.cuda.is_available() else "cpu"
dtype = torch.float16 if device == "cuda" else torch.float32


def carregar_modelo():
    """Load the tokenizer and causal language model identified by ``MODEL_ID``.

    The tokenizer's pad token is set to its EOS token. The model is loaded
    with ``device_map="auto"``, the module-level ``dtype`` and
    ``low_cpu_mem_usage=True``, and is switched to evaluation mode.

    Returns:
        tuple: ``(tokenizer, model)``.
    """
    tok = AutoTokenizer.from_pretrained(MODEL_ID)
    tok.pad_token = tok.eos_token

    load_kwargs = {
        "device_map": "auto",
        "dtype": dtype,
        "low_cpu_mem_usage": True,
    }
    llm = AutoModelForCausalLM.from_pretrained(MODEL_ID, **load_kwargs)
    # eval() returns the module itself, so calling it separately is equivalent
    # to chaining it onto from_pretrained().
    llm.eval()

    return tok, llm


def analisar_codigo(codigo, tokenizer, model, max_chars=2500, max_new_tokens=300):
    """Ask the model for a code-smell analysis of ``codigo``.

    Args:
        codigo: Source code to analyse, as a string. Only the first
            ``max_chars`` characters are sent to the model.
        tokenizer: Tokenizer returned by ``carregar_modelo``.
        model: Model returned by ``carregar_modelo``.
        max_chars: Maximum number of characters of ``codigo`` kept in the
            prompt. Defaults to 2500 (the previously hard-coded limit).
        max_new_tokens: Maximum number of tokens to generate. Defaults to 300
            (the previously hard-coded limit); raise it if answers come back
            cut off mid-sentence.

    Returns:
        str: The generated answer only (prompt tokens removed), decoded with
        special tokens skipped.
    """
    codigo = codigo[:max_chars]

    messages = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": f"Analise o código:\n\n{codigo}"},
    ]

    # return_dict=True makes the tokenizer hand back both input_ids and the
    # matching attention_mask. Deriving the mask from `input_ids != pad_token_id`
    # is wrong here: pad is aliased to EOS, so any real prompt token with that
    # id would be masked out, and a single unpadded sequence needs no masking.
    # Inputs follow the model's own device because device_map="auto" decides
    # where the weights live.
    inputs = tokenizer.apply_chat_template(
        messages,
        tokenize=True,
        add_generation_prompt=True,
        return_tensors="pt",
        return_dict=True,
    ).to(model.device)

    prompt_len = inputs["input_ids"].shape[1]

    with torch.no_grad():
        output = model.generate(
            input_ids=inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            max_new_tokens=max_new_tokens,
            do_sample=False,  # greedy decoding
            pad_token_id=tokenizer.eos_token_id,
        )

    # generate() returns prompt + continuation; keep only the continuation.
    resposta = tokenizer.decode(
        output[0][prompt_len:],
        skip_special_tokens=True,
    )

    return resposta


In [3]:
# Load the tokenizer and model once, then analyse every uploaded file.
tokenizer, model = carregar_modelo()

# Maps each uploaded file name to the analysis text returned by the model.
resultados = {}

for nome_arquivo in uploaded:
    # The folder emoji had been corrupted into mojibake; restored here.
    print(f"\n📂 Analisando: {nome_arquivo}")

    with open(nome_arquivo, "r", encoding="utf-8") as f:
        codigo = f.read()

    resultado = analisar_codigo(codigo, tokenizer, model)
    resultados[nome_arquivo] = resultado

    print("\n--- RESULTADO ---\n")
    print(resultado)


tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

added_tokens.json:   0%|          | 0.00/306 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/599 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/967 [00:00<?, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/2.67G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/181 [00:00<?, ?B/s]


📂 Analisando: core-0.1.1.py

--- RESULTADO ---

Localization:
- Long Method:
  - Evidence: The `download_models` method contains multiple responsibilities, including checking for existing assets, downloading assets, and handling custom paths.
  - Justification: The method is doing too much, which makes it hard to understand and maintain.
  - Suggestion: Break down the method into smaller, more focused methods, such as `check_existing_assets`, `download_assets`, and `handle_custom_path`.

- Magic Numbers:
  - Evidence: The method uses hardcoded strings like "huggingface", "local", and "custom" to represent different sources for downloading models.
  - Justification: These strings are not self-explanatory and can be confusing for someone reading the code.
  - Suggestion: Use constants or enums to represent these sources, making the code more readable and maintainable.

- Cognitive Complexity:
  - Evidence: The method has a high cognitive complexity due to its multiple responsibilitie

In [4]:
from google.colab import files

# Write each analysis to "resultado_<name>.txt" and trigger a download of it.
for nome, texto in resultados.items():
    # Strip only a trailing ".py". str.replace('.py', '') would also delete
    # ".py" occurring in the middle of a file name.
    base = nome[:-len(".py")] if nome.endswith(".py") else nome
    saida = f"resultado_{base}.txt"

    with open(saida, "w", encoding="utf-8") as f:
        f.write(texto)

    files.download(saida)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>