<a href="https://colab.research.google.com/github/edinoliver/Hogwarts/blob/main/Harry.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Hogwards - The school for learning

In [22]:
# =========================
# MINI-DUOLINGO A1‚ÄìB1 (Colab)
# Palavras: Oxford 3000 (A1‚ÄìB1)
# Frases: ManyThings/Tatoeba EN-PT, filtradas para usar SOMENTE palavras Oxford A1‚ÄìB1
# =========================

!pip -q install PyPDF2
import os, subprocess, zipfile, re, unicodedata, random
import pandas as pd
from PyPDF2 import PdfReader
from difflib import SequenceMatcher

# -------------------------
# 0) Fun√ß√µes utilit√°rias
# -------------------------
def normalize_text(s: str) -> str:
    s = str(s).strip().lower()
    s = ''.join(c for c in unicodedata.normalize('NFD', s) if unicodedata.category(c) != 'Mn')  # remove acentos
    s = re.sub(r"[^\w\s']", " ", s)  # mant√©m ap√≥strofo
    s = re.sub(r"\s+", " ", s).strip()
    return s

def similarity(a: str, b: str) -> float:
    return SequenceMatcher(None, normalize_text(a), normalize_text(b)).ratio()

def is_correct(user_answer: str, accepted_answers, threshold=0.86):
    best_ans, best_sc = "", 0.0
    for ans in accepted_answers:
        sc = similarity(user_answer, ans)
        if sc > best_sc:
            best_ans, best_sc = ans, sc
    return (best_sc >= threshold), best_ans, best_sc

def tokenize_en(s: str):
    # tokens alfab√©ticos + ap√≥strofo interno (don't)
    s = normalize_text(s)
    return re.findall(r"[a-z]+(?:'[a-z]+)?", s)

# -------------------------
# 1) Baixar Oxford 3000 por CEFR (com fallback)
# Fonte oficial do PDF (Oxford Learner's Dictionaries) [1](https://langeek.co/en/vocab/level-based)
# Alternativa .co.uk [5](https://www.esl-lounge.com/student/word-bank.php)
# Espelho GitHub [6](https://englishintake.com/learn-english/vocabulary/)
# -------------------------
targets = [
  "https://www.oxfordlearnersdictionaries.com/external/pdf/wordlists/oxford-3000-5000/The_Oxford_3000_by_CEFR_level.pdf",   # [1](https://langeek.co/en/vocab/level-based)
  "https://www.oxfordlearnersdictionaries.co.uk/us/external/pdf/wordlists/oxford-3000-5000/The_Oxford_3000_by_CEFR_level.pdf", # [5](https://www.esl-lounge.com/student/word-bank.php)
  "https://raw.githubusercontent.com/XA2005/CEFR-World-List/main/The_Oxford_3000_by_CEFR_level.pdf", # [6](https://englishintake.com/learn-english/vocabulary/)
]
ua = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120 Safari/537.36"
pdf_path = "oxford_3000_cefr.pdf"

def wget(url, out):
    cmd = ["wget", "-O", out, url, "--user-agent", ua, "--tries=10", "--waitretry=3", "--timeout=20", "--retry-connrefused"]
    r = subprocess.run(cmd, capture_output=True, text=True)
    return r.returncode, r.stderr

if os.path.exists(pdf_path):
    os.remove(pdf_path)

ok = False
for url in targets:
    code, err = wget(url, pdf_path)
    if os.path.exists(pdf_path) and os.path.getsize(pdf_path) > 50_000:
        ok = True
        break

if not ok:
    raise RuntimeError("N√£o consegui baixar o PDF. Fa√ßa upload manual com o nome 'oxford_3000_cefr.pdf' na aba Files e rode de novo.")

print("‚úÖ PDF Oxford 3000 baixado:", pdf_path, f"({os.path.getsize(pdf_path)/1024:.1f} KB)")

# -------------------------
# 2) Extrair palavras A1‚ÄìB1 do Oxford 3000 por n√≠vel
# (O PDF √© 'Oxford 3000 by CEFR level') [1](https://langeek.co/en/vocab/level-based)
# -------------------------
LEVELS = ["A1", "A2", "B1", "B2"]

def extract_pages_text(pdf_path: str):
    reader = PdfReader(pdf_path)
    return [(p.extract_text() or "") for p in reader.pages]

def parse_oxford_words_by_level(pages_text):
    text = "\n".join(pages_text).replace("\xa0", " ")
    text = re.sub(r"[ \t]+", " ", text)

    found = []
    for lv in LEVELS:
        m = re.search(rf"\b{lv}\b", text)
        if m:
            found.append((lv, m.start()))
    found.sort(key=lambda x: x[1])
    if len(found) < 2:
        return {}

    blocks = {}
    for i, (lv, start) in enumerate(found):
        end = found[i+1][1] if i+1 < len(found) else len(text)
        blocks[lv] = text[start:end]

    pos_pattern = r"\b(adj|adv|aux|conj|det|modal|n|num|prep|pron|v)\.?\b"

    def extract_items(block: str):
        b = re.sub(pos_pattern, " ", block, flags=re.I)
        b = re.sub(r"\b\d+\b", " ", b)
        b = re.sub(r"\s+", " ", b).strip()

        # tokens simples; depois filtramos com regras
        tokens = re.findall(r"[A-Za-z][A-Za-z'\-]*", b)
        out, seen = [], set()
        for t in tokens:
            w = t.lower()
            if len(w) >= 2 and w not in seen:
                seen.add(w)
                out.append(w)
        return out

    return {lv: extract_items(bl) for lv, bl in blocks.items()}

pages = extract_pages_text(pdf_path)
words_by_level = parse_oxford_words_by_level(pages)

A1_words = words_by_level.get("A1", [])
A2_words = words_by_level.get("A2", [])
B1_words = words_by_level.get("B1", [])

# Banco final (A1‚ÄìB1)
bank_A1_B1 = list(dict.fromkeys(A1_words + A2_words + B1_words))

# Conjunto para filtragem de frases: apenas palavras "a-z" e com ap√≥strofo
oxford_set = set([w for w in bank_A1_B1 if re.fullmatch(r"[a-z]+(?:'[a-z]+)?", w)])

print("‚úÖ Banco Oxford A1‚ÄìB1 pronto:", len(oxford_set), "palavras (aprox.)")

# -------------------------
# 3) Baixar e preparar frases EN-PT (ManyThings/Tatoeba)
# ManyThings/Anki fornece pares tab-delimited (derivados do Tatoeba) [2](https://github.com/XA2005/CEFR-World-List/blob/main/The_Oxford_3000_by_CEFR_level.pdf)
# Tatoeba tamb√©m disponibiliza downloads oficiais [3](https://www.oxfordlearnersdictionaries.com/external/pdf/wordlists/oxford-3000-5000/The_Oxford_3000_by_CEFR_level.pdf)
# -------------------------
zip_name = "por-eng.zip"
if not os.path.exists(zip_name):
    !wget -O por-eng.zip "https://www.manythings.org/anki/por-eng.zip"
print("‚úÖ ZIP de frases baixado:", zip_name)

# extrair
extract_dir = "tatoeba_por_eng"
if not os.path.exists(extract_dir):
    os.makedirs(extract_dir, exist_ok=True)
    with zipfile.ZipFile(zip_name, 'r') as z:
        z.extractall(extract_dir)

txt_files = [f for f in os.listdir(extract_dir) if f.endswith(".txt")]
if not txt_files:
    raise RuntimeError("N√£o encontrei arquivo .txt dentro do zip por-eng.zip")
pairs_path = os.path.join(extract_dir, txt_files[0])

# carregar dataset (formato t√≠pico: id \t en \t pt)
df = pd.read_csv(pairs_path, sep="\t", header=None, names=["id", "en", "pt"], quoting=3)
df = df.dropna(subset=["en", "pt"])
print("‚úÖ Pares carregados:", len(df))

# -------------------------
# 4) Filtrar frases para A1‚ÄìB1:
# Regras:
#  - ingl√™s e pt com 3..12 palavras (ajust√°vel)
#  - ingl√™s cont√©m SOMENTE palavras do Oxford A1‚ÄìB1 (tokeniza√ß√£o simples)
#  - remove frases com caracteres estranhos
# -------------------------
MIN_WORDS = 3
MAX_WORDS = 12

def word_count_simple(s):
    return len(re.findall(r"[A-Za-z√Ä-√ø']+", str(s)))

def looks_ok_en(s):
    s = str(s)
    # bloqueia s√≠mbolos ‚Äúestranhos‚Äù que aparecem em legendas/c√≥digos
    if re.search(r"[_#@/\\\[\]{}<>]", s):
        return False
    return True

def only_oxford_words(en_sentence):
    toks = tokenize_en(en_sentence)
    if not toks:
        return False
    return all(t in oxford_set for t in toks)

# limpeza leve
df["en"] = df["en"].astype(str).str.strip()
df["pt"] = df["pt"].astype(str).str.strip()

filtered = df[
    df["en"].apply(looks_ok_en) &
    df["en"].apply(word_count_simple).between(MIN_WORDS, MAX_WORDS) &
    df["pt"].apply(word_count_simple).between(MIN_WORDS, MAX_WORDS) &
    df["en"].apply(only_oxford_words)
].copy()

filtered = filtered.drop_duplicates(subset=["en", "pt"])
print("‚úÖ Frases filtradas (somente Oxford A1‚ÄìB1):", len(filtered))

if len(filtered) < 50:
    print("‚ö†Ô∏è Ficou pouca frase. Voc√™ pode relaxar o filtro aumentando MAX_WORDS ou diminuindo exig√™ncia.")
else:
    print("Exemplos:")
    print(filtered.sample(5, random_state=42)[["en","pt"]].to_string(index=False))

# -------------------------
# 5) TESTE: 10 palavras + 10 frases rand√¥micas filtradas
# Palavras: autoavalia√ß√£o (Oxford n√£o traz tradu√ß√£o PT) [1](https://langeek.co/en/vocab/level-based)
# Frases: corre√ß√£o autom√°tica com similaridade (EN->PT do corpus) [2](https://github.com/XA2005/CEFR-World-List/blob/main/The_Oxford_3000_by_CEFR_level.pdf)
# -------------------------
def run_test(word_bank, sentences_df, n_words=10, n_sentences=10, seed=None, threshold=0.86):
    if seed is not None:
        random.seed(seed)

    chosen_words = random.sample(list(word_bank), k=min(n_words, len(word_bank)))
    chosen_rows = sentences_df.sample(n=min(n_sentences, len(sentences_df)), random_state=seed)
    chosen_sentences = list(zip(chosen_rows["en"].tolist(), chosen_rows["pt"].tolist()))

    total = len(chosen_words) + len(chosen_sentences)
    correct = 0

    print("\nüü¶ TESTE A1‚ÄìB1 ‚Äî 10 palavras + 10 frases (RAND√îMICAS e filtradas)")
    print("‚Ä¢ Palavras: Oxford 3000 A1‚ÄìB1 (por CEFR).")  # [1](https://langeek.co/en/vocab/level-based)
    print("‚Ä¢ Frases: ManyThings/Tatoeba EN‚ÄìPT filtradas para usar apenas vocabul√°rio Oxford A1‚ÄìB1.")  # [2](https://github.com/XA2005/CEFR-World-List/blob/main/The_Oxford_3000_by_CEFR_level.pdf)
    print("‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ\n")

    # Palavras (autoavalia√ß√£o)
    print("üü© Parte 1 ‚Äî Palavras (autoavalia√ß√£o)\n")
    for i, w in enumerate(chosen_words, 1):
        _ = input(f"{i:02d}) Traduza a palavra: '{w}' ‚Üí ")
        s = input("   Voc√™ considera que acertou? (s/n) ‚Üí ").strip().lower()
        if s.startswith("s"):
            correct += 1
            print("   ‚úÖ Marcado como correto.\n")
        else:
            print("   ‚ùå Marcado como incorreto.\n")

    # Frases (corre√ß√£o autom√°tica)
    print("\nüü® Parte 2 ‚Äî Frases (rand√¥micas, apenas vocabul√°rio A1‚ÄìB1)\n")
    base = len(chosen_words)
    for j, (en, pt) in enumerate(chosen_sentences, 1):
        user = input(f"{base + j:02d}) Traduza: \"{en}\" ‚Üí ")
        ok, best, sc = is_correct(user, [pt], threshold=threshold)
        if ok:
            correct += 1
            print(f"   ‚úÖ Correto! (similaridade {sc:.2f})\n")
        else:
            print(f"   ‚ùå Gabarito: {pt} (sua similaridade: {sc:.2f})\n")

    grade = (correct / total) * 10 if total else 0.0
    print("‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ")
    print(f"üìå Acertos: {correct}/{total}")
    print(f"üèÅ Nota final: {grade:.1f} / 10")
    print("‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ")
    print("\nüîñ Atribui√ß√£o: Senten√ßas derivadas do Tatoeba (licen√ßa CC BY).")  # [3](https://www.oxfordlearnersdictionaries.com/external/pdf/wordlists/oxford-3000-5000/The_Oxford_3000_by_CEFR_level.pdf)[4](https://anyflip.com/fnsg/tvbr/basic)


‚úÖ PDF Oxford 3000 baixado: oxford_3000_cefr.pdf (113.7 KB)
‚úÖ Banco Oxford A1‚ÄìB1 pronto: 2980 palavras (aprox.)
--2026-02-24 11:03:45--  https://www.manythings.org/anki/por-eng.zip
Resolving www.manythings.org (www.manythings.org)... 173.254.30.110
Connecting to www.manythings.org (www.manythings.org)|173.254.30.110|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 6633028 (6.3M) [application/zip]
Saving to: ‚Äòpor-eng.zip‚Äô


2026-02-24 11:03:47 (9.13 MB/s) - ‚Äòpor-eng.zip‚Äô saved [6633028/6633028]

‚úÖ ZIP de frases baixado: por-eng.zip
‚úÖ Pares carregados: 0
‚úÖ Frases filtradas (somente Oxford A1‚ÄìB1): 0
‚ö†Ô∏è Ficou pouca frase. Voc√™ pode relaxar o filtro aumentando MAX_WORDS ou diminuindo exig√™ncia.


In [23]:
# Executar o teste
run_test(oxford_set, filtered, n_words=10, n_sentences=10, threshold=0.86, seed=None)


üü¶ TESTE A1‚ÄìB1 ‚Äî 10 palavras + 10 frases (RAND√îMICAS e filtradas)
‚Ä¢ Palavras: Oxford 3000 A1‚ÄìB1 (por CEFR).
‚Ä¢ Frases: ManyThings/Tatoeba EN‚ÄìPT filtradas para usar apenas vocabul√°rio Oxford A1‚ÄìB1.
‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ

üü© Parte 1 ‚Äî Palavras (autoavalia√ß√£o)

01) Traduza a palavra: 'ski' ‚Üí Esqui
   Voc√™ considera que acertou? (s/n) ‚Üí s
   ‚úÖ Marcado como correto.

02) Traduza a palavra: 'scary' ‚Üí Assustado
   Voc√™ considera que acertou? (s/n) ‚Üí s
   ‚úÖ Marcado como correto.



KeyboardInterrupt: Interrupted by user