# SPR 2026 - TF-IDF + LightGBM

Modelo TF-IDF com LightGBM para classificação BI-RADS.

**Nota:** Otimizado para Kaggle (offline, sem internet).

In [None]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import cross_val_score, StratifiedKFold
import lightgbm as lgb
import warnings
warnings.filterwarnings('ignore')

# Single seed for the whole notebook: it seeds NumPy's global RNG here and is
# passed again as random_state to the CV splitter and the LightGBM model.
SEED = 42
np.random.seed(SEED)

In [None]:
# Load the competition data (read-only Kaggle input directory).
DATA_DIR = '/kaggle/input/spr-2026-mammography-report-classification'

# Both splits live side by side as <split>.csv inside DATA_DIR.
train, test = (
    pd.read_csv(f'{DATA_DIR}/{split}.csv') for split in ('train', 'test')
)

# Quick sanity check on the (rows, columns) of each split.
print(f'Train: {train.shape}', f'Test: {test.shape}', sep='\n')

In [None]:
# TF-IDF features
# Word uni- to tri-grams, vocabulary capped at 20,000 terms. Terms found in
# fewer than 2 documents or in more than 95% of them are discarded, and raw
# term counts are dampened logarithmically (sublinear_tf).
tfidf = TfidfVectorizer(
    max_features=20000,
    ngram_range=(1, 3),
    min_df=2,
    max_df=0.95,
    sublinear_tf=True
)

# read_csv loads empty cells as NaN, and TfidfVectorizer raises a ValueError
# on a NaN document. Map missing reports to the empty string so a single blank
# row in either file cannot abort the run.
# The vocabulary and IDF weights are learned from the training reports only;
# the test reports are projected onto that same vocabulary.
X_train = tfidf.fit_transform(train['report'].fillna(''))
X_test = tfidf.transform(test['report'].fillna(''))
y_train = train['target'].values

print(f'TF-IDF shape: {X_train.shape}')

In [None]:
# LightGBM multiclass classifier
model = lgb.LGBMClassifier(
    # boosting schedule
    n_estimators=300,
    learning_rate=0.05,
    # tree shape
    num_leaves=64,
    max_depth=12,
    # weight classes inversely to their frequency
    class_weight='balanced',
    # reproducibility, all available threads, silence LightGBM logging
    random_state=SEED,
    n_jobs=-1,
    verbose=-1,
)

# Validation: stratified, shuffled 5-fold CV scored with macro-averaged F1.
# NOTE(review): X_train was vectorised on the full training set before the
# folds are split, so these scores may be slightly optimistic.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)
scores = cross_val_score(
    estimator=model,
    X=X_train,
    y=y_train,
    scoring='f1_macro',
    cv=skf,
)

# Report the fold mean with a +/- two-standard-deviation band.
print(f'CV F1-Macro: {scores.mean():.4f} (+/- {2 * scores.std():.4f})')

In [None]:
# Final fit: train the classifier on the full training matrix. This fitted
# instance is the one used to predict the test set.
model.fit(X_train, y_train)
print('Modelo treinado!')

In [None]:
# Predict a class label for every test report.
predictions = model.predict(X_test)

# Submission file: the test IDs paired with their predicted target,
# written without the pandas index column.
submission = test[['ID']].assign(target=predictions)
submission.to_csv('submission.csv', index=False)

print('submission.csv criado!')
# Predicted class distribution, ordered by class label, as a sanity check.
print(submission['target'].value_counts().sort_index())