# Analise de Labeling Errors - BI-RADS != 2

**Objetivo:** Identificar possíveis erros de rotulagem nos dados de treino.

BI-RADS 2 (achados benignos) é a classe mais frequente e, provavelmente, a mais consistente.
As classes menos frequentes (0, 1, 3, 4, 5, 6) podem conter mais ruído de rotulagem.

---
**Tecnicas:**
1. Confident Learning (cleanlab)
2. Analise de embeddings + outliers
3. Cross-validation loss analysis
4. Textos curtos/ambiguos

---

In [None]:
# Setup
import os
import numpy as np
import pandas as pd
from collections import Counter
import warnings
warnings.filterwarnings('ignore')

# Dataset location: use the Kaggle input mount when it exists on this machine,
# otherwise fall back to the relative local data folder.
_KAGGLE_DATA_DIR = '/kaggle/input/spr-2026-mammography-report-classification'
DATA_DIR = _KAGGLE_DATA_DIR if os.path.exists(_KAGGLE_DATA_DIR) else '../data'

# Single seed shared by the notebook; also seeds NumPy's global generator.
SEED = 42
np.random.seed(SEED)

In [None]:
# Load the training CSV from DATA_DIR, then report its shape and the number
# of samples per label (ordered by label value).
train_path = f'{DATA_DIR}/train.csv'
train_df = pd.read_csv(train_path)
print(f'Train shape: {train_df.shape}')
print('\nDistribuicao de classes:')
class_counts = train_df['label'].value_counts().sort_index()
print(class_counts)

## 1. Analise Basica - Textos Curtos/Ambiguos

Textos muito curtos ou genericos podem indicar rotulagem incorreta.

In [None]:
# Add two simple per-row length features derived from the 'text' column:
# character count and whitespace-separated token count.
text_col = train_df['text']
train_df['text_len'] = text_col.str.len()
train_df['word_count'] = text_col.str.split().str.len()

# Descriptive statistics of the character length, grouped by label.
print('Estatisticas de comprimento por classe:')
len_stats = train_df.groupby('label')['text_len'].describe()
print(len_stats.round(1))

In [None]:
# Select rows whose text is shorter than 50 characters; these are treated as
# suspicious and inspected by hand.
is_short = train_df['text_len'] < 50
short_texts = train_df.loc[is_short].copy()
n_short = len(short_texts)
print(f'Textos com < 50 caracteres: {n_short}')
print('\nDistribuicao:')
print(short_texts['label'].value_counts().sort_index())

# Show up to ten examples, truncating each text to 80 characters.
if n_short:
    print('\nExemplos:')
    preview = short_texts.head(10)
    for label, text in zip(preview['label'], preview['text']):
        print(f"  [{label}] {text[:80]}")

In [None]:
# Keep only the rows whose label is different from 2 and show how they are
# distributed across the remaining labels.
not_class_2 = train_df['label'].ne(2)
non_birads2 = train_df.loc[not_class_2].copy()
print(f'Amostras com BI-RADS != 2: {len(non_birads2)}')
print('\nDistribuicao:')
print(non_birads2['label'].value_counts().sort_index())

## 2. Confident Learning com Cleanlab

O Cleanlab usa as probabilidades de classificação para detectar possíveis erros de rotulagem.

In [None]:
# Instalar cleanlab se necessario
try:
    import cleanlab
except ImportError:
    !pip install -q cleanlab
    import cleanlab

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_predict

print(f'Cleanlab version: {cleanlab.__version__}')

In [None]:
# Turn the raw texts into a TF-IDF matrix over unigrams and bigrams, capped at
# 10000 features and ignoring terms that appear in fewer than 2 documents.
print('Vetorizando textos...')
tfidf_params = {
    'max_features': 10000,
    'ngram_range': (1, 2),
    'min_df': 2,
}
vectorizer = TfidfVectorizer(**tfidf_params)
X = vectorizer.fit_transform(train_df['text'])

# Target vector taken from the 'label' column.
y = train_df['label'].values

print(f'X shape: {X.shape}')

In [None]:
# Out-of-fold class probabilities: each sample is scored by a model fitted on
# the other folds of a 5-fold cross-validation.
print('Calculando probabilidades (cross-validation)...')
clf = LogisticRegression(
    class_weight='balanced',
    max_iter=1000,
    random_state=SEED,
)

pred_probs = cross_val_predict(estimator=clf, X=X, y=y, cv=5, method='predict_proba')

print(f'Probabilidades shape: {pred_probs.shape}')

In [None]:
# Ask cleanlab for the positions of likely label errors, ranked by
# self-confidence, and report how many were flagged.
from cleanlab.filter import find_label_issues

issue_indices = find_label_issues(
    labels=y,
    pred_probs=pred_probs,
    return_indices_ranked_by='self_confidence',
)

n_issues = len(issue_indices)
issue_pct = 100 * n_issues / len(y)
print(f'Possiveis erros de rotulagem: {n_issues}')
print(f'Percentual: {issue_pct:.2f}%')

In [None]:
# Build a frame with the flagged rows (in the order returned by cleanlab) and
# attach the full probability vector, the arg-max column and its probability.
flagged_probs = pred_probs[issue_indices]
issues_df = train_df.iloc[issue_indices].copy()
issues_df['pred_probs'] = list(flagged_probs)
issues_df['pred_label'] = flagged_probs.argmax(axis=1)
issues_df['confidence'] = flagged_probs.max(axis=1)

# Restrict the flagged rows to labels other than 2.
label_is_not_2 = issues_df['label'] != 2
issues_non2 = issues_df.loc[label_is_not_2].copy()
print(f'Issues em BI-RADS != 2: {len(issues_non2)}')
print('\nDistribuicao por classe:')
print(issues_non2['label'].value_counts().sort_index())

In [None]:
# Print the first 20 flagged rows with label != 2: id, current label,
# predicted label with its probability, and the first 150 characters of text.
print('Top 20 possiveis erros de rotulagem (BI-RADS != 2):')
print('=' * 80)

top_issues = issues_non2.head(20)
for rank, (_, row) in enumerate(top_issues.iterrows(), start=1):
    print(f"\n[{rank}] ID: {row['id']}")
    print(f"    Label atual: {row['label']} | Predicao: {row['pred_label']} | Conf: {row['confidence']:.3f}")
    print(f"    Texto: {row['text'][:150]}...")

## 3. Analise de Confusao entre Classes

Verificar quais pares de classes têm mais confusão.

In [None]:
# Confusion matrix of the cross-validated predictions over all samples.
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

# cross_val_predict(method='predict_proba') orders its columns by the sorted
# unique labels, so the arg-max position is mapped back through that array
# instead of being assumed to equal the label value.
class_labels = np.unique(y)
y_pred = class_labels[pred_probs.argmax(axis=1)]

# Passing labels= pins the row/column order to the same array used for the
# tick labels, rather than relying on a hard-coded range(7).
cm = confusion_matrix(y, y_pred, labels=class_labels)

plt.figure(figsize=(10, 8))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=class_labels, yticklabels=class_labels)
plt.xlabel('Predicao')
plt.ylabel('Label Real')
plt.title('Matriz de Confusao (Cross-Validation)')
plt.tight_layout()
plt.show()

In [None]:
# Among the flagged samples, collect (given label, arg-max prediction) pairs
# where the two differ, keeping the order of issue_indices, and list the ten
# most frequent pairs.
flagged_true = y[issue_indices]
flagged_pred = pred_probs[issue_indices].argmax(axis=1)
disagrees = flagged_true != flagged_pred
confusion_pairs = list(zip(flagged_true[disagrees], flagged_pred[disagrees]))

print('Top 10 pares de confusao (real -> predicao):')
pair_counts = Counter(confusion_pairs)
for (real, pred), count in pair_counts.most_common(10):
    print(f'  {real} -> {pred}: {count} casos')

## 4. Salvar Issues para Revisao Manual

Exportar os possiveis erros para revisao.

In [None]:
# Export the flagged rows with label != 2 for manual review, ordered from the
# highest to the lowest predicted-class probability.
output_cols = ['id', 'text', 'label', 'pred_label', 'confidence', 'text_len']
issues_to_save = issues_non2[output_cols].sort_values('confidence', ascending=False)

# Write into the Kaggle working directory when it exists; otherwise write to
# the current directory.
output_path = (
    '/kaggle/working/labeling_issues_birads_non2.csv'
    if os.path.exists('/kaggle/working')
    else 'labeling_issues_birads_non2.csv'
)

issues_to_save.to_csv(output_path, index=False)
print(f'Salvo em: {output_path}')
print(f'Total de issues para revisao: {len(issues_to_save)}')

## 5. Resumo

Analise dos resultados e proximos passos.

In [None]:
# Final report: overall counts, then for every label present among the
# flagged rows, how many of that label's samples were flagged.
n_total = len(train_df)
n_flagged = len(issue_indices)
banner = '=' * 60

print(banner)
print('RESUMO DA ANALISE DE LABELING ERRORS')
print(banner)
print(f'\nTotal de amostras: {n_total}')
print(f'Amostras BI-RADS != 2: {len(non_birads2)}')
print(f'\nPossiveis erros detectados: {n_flagged} ({100*n_flagged/n_total:.1f}%)')
print(f'Erros em BI-RADS != 2: {len(issues_non2)}')
print('\nDistribuicao de erros por classe:')

# Count once per label instead of filtering the frames inside the loop.
flagged_per_class = issues_df['label'].value_counts().sort_index()
total_per_class = train_df['label'].value_counts()
for label, count in flagged_per_class.items():
    total = total_per_class[label]
    print(f'  BI-RADS {label}: {count}/{total} ({100*count/total:.1f}%)')