# SPR 2026 - Exploratory Data Analysis

Análise exploratória dos dados de mamografia para classificação BI-RADS.

**Compatível com:** Google Colab / Kaggle Notebooks

In [2]:
# ============================================================
# SETUP - Ambiente, Download de Dados e Google Drive
# ============================================================
import os
import sys

IS_KAGGLE = os.path.exists('/kaggle/input')
IS_COLAB = 'google.colab' in sys.modules

print(f"Ambiente: {'Kaggle' if IS_KAGGLE else 'Colab' if IS_COLAB else 'Local'}")

if IS_KAGGLE:
    DATA_DIR = '/kaggle/input/spr-2026-mammography-report-classification'
    OUTPUT_DIR = '/kaggle/working'
elif IS_COLAB:
    # Montar Google Drive para salvar outputs
    from google.colab import drive
    drive.mount('/content/drive')
    
    DRIVE_OUTPUT = '/content/drive/MyDrive/SPR_2026_outputs'
    os.makedirs(DRIVE_OUTPUT, exist_ok=True)
    OUTPUT_DIR = DRIVE_OUTPUT
    
    # Download dados Kaggle
    !pip install kaggle -q
    from google.colab import userdata
    os.environ['KAGGLE_USERNAME'] = userdata.get('KAGGLE_USERNAME')
    os.environ['KAGGLE_KEY'] = userdata.get('KAGGLE_KEY')
    !mkdir -p ~/.kaggle
    !echo '{"username":"'$KAGGLE_USERNAME'","key":"'$KAGGLE_KEY'"}' > ~/.kaggle/kaggle.json
    !chmod 600 ~/.kaggle/kaggle.json
    !kaggle competitions download -c spr-2026-mammography-report-classification -q
    !mkdir -p data && unzip -o -q spr-2026-mammography-report-classification.zip -d data/
    DATA_DIR = 'data'
else:
    DATA_DIR = '../data'
    OUTPUT_DIR = '../submissions'
    os.makedirs(OUTPUT_DIR, exist_ok=True)

print(f"DATA_DIR: {DATA_DIR}")
print(f"OUTPUT_DIR: {OUTPUT_DIR}")

Ambiente: Kaggle
DATA_DIR: /kaggle/input/spr-2026-mammography-report-classification
OUTPUT_DIR: /kaggle/working


In [4]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
import re

plt.style.use('seaborn-v0_8-whitegrid')
pd.set_option('display.max_colwidth', 200)

## 1. Carregar Dados

In [6]:
train = pd.read_csv(os.path.join(DATA_DIR, 'train.csv'))
print(f"Train shape: {train.shape}")
print(f"Colunas: {train.columns.tolist()}")

# Test (Code Competition - pode não existir em dev)
test_path = os.path.join(DATA_DIR, 'test.csv')
if os.path.exists(test_path):
    test = pd.read_csv(test_path)
    print(f"Test shape: {test.shape}")
else:
    test = None
    print("test.csv não disponível - existe apenas no runtime de avaliação Kaggle")

FileNotFoundError: [Errno 2] No such file or directory: '/kaggle/input/spr-2026-mammography-report-classification/train.csv'

In [None]:
train.info()
train.head()

## 2. Distribuição do Target (BI-RADS)

In [None]:
BIRADS_LABELS = {
    0: 'Incompleto', 1: 'Negativo', 2: 'Benigno', 3: 'Provavelmente Benigno',
    4: 'Suspeito', 5: 'Altamente Sugestivo', 6: 'Malignidade Comprovada'
}

target_counts = train['target'].value_counts().sort_index()
print("Distribuição do Target:")
for idx, count in target_counts.items():
    print(f"  {idx} ({BIRADS_LABELS[idx]}): {count} ({count/len(train)*100:.1f}%)")

In [None]:
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
colors = sns.color_palette('husl', 7)

axes[0].bar(target_counts.index, target_counts.values, color=colors)
axes[0].set_xlabel('BI-RADS Category')
axes[0].set_ylabel('Count')
axes[0].set_title('Distribuição das Classes BI-RADS')

axes[1].pie(target_counts.values, labels=[f'{i}' for i in target_counts.index], 
            autopct='%1.1f%%', colors=colors)
axes[1].set_title('Proporção das Classes')

plt.tight_layout()
plt.show()

## 3. Análise dos Textos

In [None]:
train['text_length'] = train['report'].apply(len)
train['word_count'] = train['report'].apply(lambda x: len(x.split()))

print("Estatísticas de Comprimento:")
print(train[['text_length', 'word_count']].describe())

In [None]:
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
train.boxplot(column='text_length', by='target', ax=axes[0])
axes[0].set_title('Caracteres por Classe')
train.boxplot(column='word_count', by='target', ax=axes[1])
axes[1].set_title('Palavras por Classe')
plt.suptitle('')
plt.tight_layout()
plt.show()

In [None]:
print("Exemplos de relatórios por classe:")
for target in sorted(train['target'].unique()):
    sample = train[train['target'] == target].iloc[0]
    print(f"\nBI-RADS {target} ({BIRADS_LABELS[target]}):")
    print("-"*40)
    print(sample['report'][:400] + "..." if len(sample['report']) > 400 else sample['report'])

In [None]:
print("="*60)
print("RESUMO")
print("="*60)
print(f"Amostras: {len(train)} | Classes: {train['target'].nunique()}")
print(f"Palavras média: {train['word_count'].mean():.0f} | Max chars: {train['text_length'].max()}")
print(f"Imbalance ratio: {target_counts.max() / target_counts.min():.1f}x")