# SPR 2026 - TF-IDF + Classical ML Baseline
## Notebook para Google Colab

Baseline clássico usando TF-IDF com modelos de ML tradicionais.

**Modelos testados:**
- Logistic Regression
- SVM (LinearSVC)
- Random Forest
- LightGBM
- XGBoost

In [None]:
# Mount Google Drive at /content/drive. Requires the Colab runtime
# (the google.colab package).
# NOTE(review): nothing visible in this notebook reads from /content/drive;
# confirm the mount is still needed.
from google.colab import drive
drive.mount('/content/drive')

## 1. Setup e Download dos Dados

In [None]:
# Install dependencies: the Kaggle CLI plus the two gradient-boosting
# libraries imported further down (lightgbm, xgboost). -q quiets pip's output.
!pip install kaggle lightgbm xgboost -q

In [None]:
# Configure Kaggle API credentials
import json
import os


def write_kaggle_credentials(username, key, kaggle_dir=None):
    """Write ``kaggle.json`` holding *username* and *key* and return its path.

    The file is serialised with ``json.dump`` so quotes or shell
    metacharacters inside the credentials cannot corrupt it, and it is
    created with owner-only permissions (0o600).

    Args:
        username: Kaggle account name.
        key: Kaggle API key.
        kaggle_dir: Directory that receives the file; defaults to
            ``~/.kaggle``. Created if it does not exist.

    Returns:
        The path of the written ``kaggle.json`` file.
    """
    if kaggle_dir is None:
        kaggle_dir = os.path.join(os.path.expanduser('~'), '.kaggle')
    os.makedirs(kaggle_dir, exist_ok=True)
    config_path = os.path.join(kaggle_dir, 'kaggle.json')
    # Create the file already restricted to the owner so the key is never
    # readable by others, not even briefly.
    fd = os.open(config_path, os.O_WRONLY | os.O_CREAT | os.O_TRUNC, 0o600)
    with os.fdopen(fd, 'w', encoding='utf-8') as config_file:
        json.dump({'username': username, 'key': key}, config_file)
    # A pre-existing file keeps its old mode on open(); enforce 600 explicitly.
    os.chmod(config_path, 0o600)
    return config_path


# Option 1: read the credentials from Colab secrets
try:
    from google.colab import userdata
    os.environ['KAGGLE_USERNAME'] = userdata.get('KAGGLE_USERNAME')
    os.environ['KAGGLE_KEY'] = userdata.get('KAGGLE_KEY')
    print("Kaggle credentials loaded from Colab secrets")
except Exception as exc:
    # Option 2: configure manually (uncomment and fill in)
    # os.environ['KAGGLE_USERNAME'] = 'seu_username'
    # os.environ['KAGGLE_KEY'] = 'sua_key'
    print("Configure suas credenciais Kaggle manualmente")
    # Show why option 1 failed (not running on Colab, secret missing, ...).
    print(f"  ({type(exc).__name__}: {exc})")

# Write ~/.kaggle/kaggle.json only when both values are present, so an
# existing valid file is never overwritten with empty credentials.
if os.environ.get('KAGGLE_USERNAME') and os.environ.get('KAGGLE_KEY'):
    write_kaggle_credentials(
        os.environ['KAGGLE_USERNAME'], os.environ['KAGGLE_KEY']
    )
else:
    print("Credenciais Kaggle ausentes - kaggle.json não foi escrito")

In [None]:
# Download the data
# Fetch the competition archive with the Kaggle CLI.
!kaggle competitions download -c spr-2026-mammography-report-classification
# Extract into data/, overwriting files that already exist (-o).
!unzip -o spr-2026-mammography-report-classification.zip -d data/
# List what was extracted.
!ls -la data/

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, classification_report
from sklearn.pipeline import Pipeline
import lightgbm as lgb
import xgboost as xgb
import warnings
# Suppress every Python warning for the rest of the session.
warnings.filterwarnings('ignore')

# Settings
# SEED is passed as random_state to the models and the CV splitter below;
# N_FOLDS is the number of cross-validation folds.
SEED = 42
N_FOLDS = 5

# Seed NumPy's global random generator.
np.random.seed(SEED)

## 2. Carregar Dados

In [None]:
# Training split: report its size and the per-class counts of `target`.
train_df = pd.read_csv('data/train.csv')
print(f"Train shape: {train_df.shape}")
print(f"\nTarget distribution:")
print(train_df['target'].value_counts().sort_index())

# Test split is optional: `test_df` stays None when the file is absent so
# later cells can skip the submission step.
test_path = 'data/test.csv'
test_df = pd.read_csv(test_path) if os.path.exists(test_path) else None
if test_df is None:
    print("\nTest file not available")
else:
    print(f"\nTest shape: {test_df.shape}")

## 3. TF-IDF Vectorization

In [None]:
# Alternative TF-IDF setups (word unigrams, word 1-2-grams, character
# 2-5-grams). They are only declared here; this cell fits `tfidf` below.
tfidf_configs = dict(
    basic=TfidfVectorizer(ngram_range=(1, 1), max_features=10000),
    ngram=TfidfVectorizer(ngram_range=(1, 2), max_features=15000),
    char=TfidfVectorizer(analyzer='char_wb', ngram_range=(2, 5), max_features=10000),
)

# Default vectoriser: word 1-2-grams capped at 15000 features, with
# document-frequency cut-offs (min_df / max_df) and sublinear tf scaling.
tfidf = TfidfVectorizer(
    sublinear_tf=True,
    max_df=0.95,
    min_df=2,
    ngram_range=(1, 2),
    max_features=15000,
)

# Labels, then the document-term matrix fitted on every training report.
y = train_df['target'].values
X = tfidf.fit_transform(train_df['report'])

print(f"TF-IDF matrix shape: {X.shape}")

## 4. Testar Diferentes Modelos

In [None]:
# Define the candidate models. The dict holds the bare estimators so later
# cells can look one up by name and fit it on the precomputed matrix `X`.
models = {
    'LogisticRegression': LogisticRegression(
        C=1.0, max_iter=1000, class_weight='balanced', random_state=SEED
    ),
    'LinearSVC': LinearSVC(
        C=1.0, max_iter=1000, class_weight='balanced', random_state=SEED
    ),
    'RandomForest': RandomForestClassifier(
        n_estimators=200, max_depth=20, class_weight='balanced', random_state=SEED, n_jobs=-1
    ),
    'LightGBM': lgb.LGBMClassifier(
        n_estimators=200, max_depth=10, class_weight='balanced', random_state=SEED, verbose=-1
    ),
    'XGBoost': xgb.XGBClassifier(
        n_estimators=200, max_depth=10, random_state=SEED, use_label_encoder=False, eval_metric='mlogloss'
    ),
}

# Cross-validate each model
results = {}
skf = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=SEED)

# Local import keeps this cell self-contained.
from sklearn.base import clone

for name, model in models.items():
    print(f"\nTraining {name}...")
    # `X` was vectorised with a TF-IDF fitted on the whole training set, so
    # scoring on it would let every validation fold influence the vocabulary
    # and IDF weights. Running an unfitted copy of the vectoriser inside a
    # Pipeline refits it on the training part of each fold only. This costs
    # one extra TF-IDF fit per fold and model.
    cv_pipeline = Pipeline([
        ('tfidf', clone(tfidf)),
        ('clf', model),
    ])
    scores = cross_val_score(
        cv_pipeline, train_df['report'], y, cv=skf, scoring='f1_macro', n_jobs=-1
    )
    results[name] = scores
    print(f"{name}: F1-Macro = {scores.mean():.4f} (+/- {scores.std()*2:.4f})")

In [None]:
# Results summary: banner first, then one row per model with the mean and
# standard deviation of its per-fold F1-macro scores, best mean on top.
print("\n" + "=" * 50, "RESUMO DOS RESULTADOS", "=" * 50, sep="\n")

results_df = pd.DataFrame(
    [
        (model_name, fold_scores.mean(), fold_scores.std())
        for model_name, fold_scores in results.items()
    ],
    columns=['Model', 'F1-Macro Mean', 'F1-Macro Std'],
).sort_values('F1-Macro Mean', ascending=False)

print(results_df.to_string(index=False))

# After the descending sort the first row is the winner.
best_model_name = results_df['Model'].iloc[0]
print(f"\nMelhor modelo: {best_model_name}")

## 5. Treinar Modelo Final e Gerar Submissão

In [None]:
# Train the best model on all the data: look up the estimator chosen by
# cross-validation in `models` and fit it on the full TF-IDF matrix X.
best_model = models[best_model_name]
best_model.fit(X, y)

print(f"Modelo {best_model_name} treinado em {X.shape[0]} amostras")

In [None]:
# Build the submission file, or explain why it cannot be built.
if test_df is None:
    print("Test file não disponível - execute no ambiente Kaggle")
else:
    # Reuse the vectoriser fitted on the training reports, then predict.
    X_test = tfidf.transform(test_df['report'])
    predictions = best_model.predict(X_test)

    submission = pd.DataFrame({'ID': test_df['ID'], 'target': predictions})
    submission.to_csv('submission_tfidf.csv', index=False)

    print("Submissão salva: submission_tfidf.csv")
    print(f"Shape: {submission.shape}")
    print(f"\nDistribuição das predições:")
    print(submission['target'].value_counts().sort_index())

In [None]:
# Download the submission (Colab)
submission_path = 'submission_tfidf.csv'
if not os.path.exists(submission_path):
    # The submission cell writes nothing when the test set is unavailable,
    # so do not claim that a file exists.
    print(f"Submissão não encontrada: {submission_path}")
else:
    try:
        from google.colab import files
        files.download(submission_path)
    except Exception:
        # Best effort: outside Colab, or if the browser download fails, just
        # point to the file on disk. `except Exception` (rather than a bare
        # `except:`) lets KeyboardInterrupt/SystemExit propagate.
        print("Submissão disponível em: submission_tfidf.csv")