# SPR 2026 - TF-IDF Baseline

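Baseline de TF-IDF + regressão logística para a competição SPR 2026 (classificação de laudos de mamografia). O notebook treina o modelo e gera o arquivo `submission.csv` para submissão no Kaggle.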

In [1]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
import warnings
# Notebook-wide configuration constants.
DATA_DIR = '/kaggle/input/spr-2026-mammography-report-classification'  # folder holding train.csv / test.csv
SEED = 42  # used as random_state for the classifier

# Suppress every warning category for the rest of the session so that library
# notices do not clutter the cell outputs.
# NOTE(review): a blanket filter also hides solver convergence warnings —
# consider narrowing it to specific categories.
warnings.filterwarnings(action='ignore')

In [2]:
# Load the train and test splits from the competition input folder.
train, test = (
    pd.read_csv(f'{DATA_DIR}/{split}.csv') for split in ('train', 'test')
)

# Show (rows, columns) of each split as a quick sanity check.
print(f'Train: {train.shape}', f'Test: {test.shape}', sep='\n')

Train: (18272, 3)
Test: (4, 2)


In [3]:
# TF-IDF features
# Word unigrams and bigrams, with the vocabulary capped at 15000 terms.
# Terms seen in fewer than 2 documents or in more than 95% of documents are
# dropped, and raw term counts are dampened to 1 + log(tf).
tfidf = TfidfVectorizer(
    max_features=15000,
    ngram_range=(1, 2),
    min_df=2,
    max_df=0.95,
    sublinear_tf=True,
)

# pandas reads an empty report cell as NaN, and TfidfVectorizer raises on a
# NaN document. Replace missing reports with an empty string so that a blank
# report cannot crash the run; an empty document becomes an all-zero row.
train_reports = train['report'].fillna('')
test_reports = test['report'].fillna('')

# The vocabulary and IDF weights are learned from the training reports only;
# the test reports are projected onto that same vocabulary.
X_train = tfidf.fit_transform(train_reports)
X_test = tfidf.transform(test_reports)
y_train = train['target'].values

print(f'TF-IDF Train: {X_train.shape}')
print(f'TF-IDF Test: {X_test.shape}')

TF-IDF Train: (18272, 8683)
TF-IDF Test: (4, 8683)


In [4]:
# Train the model
# Hyper-parameters are collected in one mapping so they are easy to scan and tune.
logreg_params = dict(
    C=1.0,                    # inverse regularisation strength
    max_iter=1000,            # iteration cap for the solver
    class_weight='balanced',  # reweight classes inversely to their frequency
    random_state=SEED,
)

model = LogisticRegression(**logreg_params)
model.fit(X_train, y_train)
print('Modelo treinado!')

Modelo treinado!


In [5]:
# Predict one class label per test report.
predictions = model.predict(X_test)

# Build the submission table: the test IDs followed by the predicted label.
submission = test[['ID']].assign(target=predictions)

# Write the file without the DataFrame index column.
submission.to_csv('submission.csv', index=False)
print('Submissão salva!')

# Show how many rows were assigned to each predicted class, ordered by label.
target_counts = submission['target'].value_counts().sort_index()
print(target_counts)

Submissão salva!
target
2    2
3    1
6    1
Name: count, dtype: int64


In [6]:
# Check the written file: list it (permissions, size, timestamp), then print
# its first lines to confirm the header and row format.
!ls -la submission.csv
!head submission.csv

-rw-r--r-- 1 root root 39 Feb 18 16:13 submission.csv
ID,target
Acc0,6
Acc2,2
Acc4,2
Acc10,3
