# SPR 2026 - TF-IDF + Classical ML

Baseline com TF-IDF e modelos clássicos de ML.

**Modelos:** Logistic Regression, SVM, LightGBM, XGBoost

**Formato:** Code Competition (Kaggle) / Google Colab

In [1]:
# ============================================================
# SETUP - Ambiente e Dados
# ============================================================
import os
import sys

# Verificar Colab PRIMEIRO (mais confiável)
IS_COLAB = 'google.colab' in sys.modules
IS_KAGGLE = os.path.exists('/kaggle/input') and not IS_COLAB

print(f"Ambiente: {'Kaggle' if IS_KAGGLE else 'Colab' if IS_COLAB else 'Local'}")

if IS_COLAB:
    from google.colab import drive
    drive.mount('/content/drive')
    
    DRIVE_BASE = '/content/drive/MyDrive/SPR_2026_outputs'
    DATA_DIR = f'{DRIVE_BASE}/data'
    OUTPUT_DIR = DRIVE_BASE
    
    # Verificar se dados existem no Drive
    if not os.path.exists(f'{DATA_DIR}/train.csv'):
        print("⚠️ Dados não encontrados no Drive!")
        print("Execute primeiro o notebook 00_download_data.ipynb")
        raise FileNotFoundError(f"Arquivo não encontrado: {DATA_DIR}/train.csv")
elif IS_KAGGLE:
    DATA_DIR = '/kaggle/input/spr-2026-mammography-report-classification'
    OUTPUT_DIR = '/kaggle/working'
else:
    DATA_DIR = '../data'
    OUTPUT_DIR = '../submissions'
    os.makedirs(OUTPUT_DIR, exist_ok=True)

print(f"DATA_DIR: {DATA_DIR}")
print(f"OUTPUT_DIR: {OUTPUT_DIR}")

Ambiente: Kaggle
DATA_DIR: /kaggle/input/spr-2026-mammography-report-classification
OUTPUT_DIR: /kaggle/working


In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score
import lightgbm as lgb
import xgboost as xgb
import warnings
warnings.filterwarnings('ignore')

SEED = 42
N_FOLDS = 5
np.random.seed(SEED)

## 1. Carregar Dados

In [3]:
# Treino
train = pd.read_csv(os.path.join(DATA_DIR, 'train.csv'))
print(f"Train: {train.shape}")

# Teste (Code Competition pattern)
test_path = os.path.join(DATA_DIR, 'test.csv')
if os.path.exists(test_path):
    test = pd.read_csv(test_path)
    print(f"Test: {test.shape}")
    assert set(['ID', 'report']).issubset(test.columns)
else:
    test = None
    print("test.csv não disponível - será carregado no runtime Kaggle")

Train: (18272, 3)
Test: (4, 2)


## 2. TF-IDF Vectorization

In [4]:
tfidf = TfidfVectorizer(
    max_features=15000,
    ngram_range=(1, 2),
    min_df=2,
    max_df=0.95,
    sublinear_tf=True,
)

X = tfidf.fit_transform(train['report'])
y = train['target'].values
print(f"TF-IDF shape: {X.shape}")

TF-IDF shape: (18272, 8683)


## 3. Comparar Modelos

In [5]:
models = {
    'LogisticRegression': LogisticRegression(C=1.0, max_iter=1000, class_weight='balanced', random_state=SEED),
    'LinearSVC': LinearSVC(C=1.0, max_iter=1000, class_weight='balanced', random_state=SEED),
    'LightGBM': lgb.LGBMClassifier(n_estimators=200, max_depth=10, class_weight='balanced', random_state=SEED, verbose=-1),
    'XGBoost': xgb.XGBClassifier(n_estimators=200, max_depth=10, random_state=SEED, eval_metric='mlogloss'),
}

results = {}
skf = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=SEED)

for name, model in models.items():
    print(f"Training {name}...")
    scores = cross_val_score(model, X, y, cv=skf, scoring='f1_macro', n_jobs=-1)
    results[name] = scores
    print(f"  F1-Macro: {scores.mean():.4f} (+/- {scores.std()*2:.4f})")

Training LogisticRegression...
  F1-Macro: 0.6828 (+/- 0.0502)
Training LinearSVC...
  F1-Macro: 0.7166 (+/- 0.0596)
Training LightGBM...




  F1-Macro: 0.6839 (+/- 0.0293)
Training XGBoost...
  F1-Macro: 0.6654 (+/- 0.0169)


In [6]:
# Melhor modelo
results_df = pd.DataFrame({
    'Model': list(results.keys()),
    'F1_Mean': [s.mean() for s in results.values()],
    'F1_Std': [s.std() for s in results.values()],
}).sort_values('F1_Mean', ascending=False)

print("\nResultados:")
print(results_df.to_string(index=False))

best_name = results_df.iloc[0]['Model']
print(f"\nMelhor modelo: {best_name}")


Resultados:
             Model  F1_Mean   F1_Std
         LinearSVC 0.716618 0.029813
          LightGBM 0.683918 0.014636
LogisticRegression 0.682833 0.025091
           XGBoost 0.665446 0.008446

Melhor modelo: LinearSVC


## 4. Treinar Modelo Final e Gerar Submissão

In [7]:
# Treinar em todos os dados
best_model = models[best_name]
best_model.fit(X, y)
print(f"Modelo {best_name} treinado em {X.shape[0]} amostras")

Modelo LinearSVC treinado em 18272 amostras


In [8]:
# ============================================================
# Geração de Submissão
# ============================================================
# Carregar test
test = pd.read_csv(os.path.join(DATA_DIR, 'test.csv'))

# Fazer predições
X_test = tfidf.transform(test['report'])
predictions = best_model.predict(X_test)

# Criar submission
submission = pd.DataFrame({'ID': test['ID'], 'target': predictions})

# SEMPRE salvar submission.csv no diretório atual (exigido pelo Kaggle)
submission.to_csv('submission.csv', index=False)
print("✅ submission.csv salvo no diretório atual")

# Também salvar no OUTPUT_DIR para persistência (Colab/Local)
if not IS_KAGGLE:
    submission_path = os.path.join(OUTPUT_DIR, 'submission_tfidf.csv')
    submission.to_csv(submission_path, index=False)
    print(f"✅ Cópia salva em: {submission_path}")

print(f"\nDistribuição das predições:")
print(submission['target'].value_counts().sort_index())

✅ submission.csv salvo no diretório atual

Distribuição das predições:
target
2    3
6    1
Name: count, dtype: int64


In [9]:
# Download no Colab (opcional)
if IS_COLAB and os.path.exists('submission.csv'):
    from google.colab import files
    files.download('submission.csv')