# TCGA-BRCA Data Loading

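This notebook uses `TCGADataLoader` to fetch the TCGA-BRCA RNA-seq file manifest and clinical data from the GDC API, loads PAM50 subtype annotations, and then cleans, merges, and saves the resulting tables under `../data`.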

In [None]:
import sys
from pathlib import Path
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Make the project's `src` package importable: it is expected to sit next to
# the directory the kernel was started in (the parent of the working directory).
src_dir = Path().resolve().parent / "src"
sys.path.insert(0, str(src_dir))

from data_loader import TCGADataLoader

## Initialize Data Loader

In [None]:
# Initialize loader
# The later loading cells reuse this single `loader` instance.
# NOTE(review): "../data" is a relative path, so it presumably resolves against
# the kernel's working directory — confirm how TCGADataLoader uses `data_dir`.
loader = TCGADataLoader(data_dir="../data")

## Load File Manifest

In [None]:
# Ask the loader for the RNA-seq expression file manifest, report how many
# entries came back, and preview the first rows as the cell output.
manifest = loader.get_file_manifest()
n_manifest_files = len(manifest)
print(f"Found {n_manifest_files} RNA-seq files")
manifest.head()

## Load Clinical Data

In [None]:
# Ask the loader for the clinical table, report the number of rows (one per
# case, per the message below), and preview the first rows as the cell output.
clinical = loader.get_clinical_data()
n_clinical_cases = len(clinical)
print(f"Found {n_clinical_cases} cases")
clinical.head()

## Load PAM50 Subtypes

In [None]:
# Get PAM50 subtype annotations (with auto-download if not found).
# If the first attempt yields an empty frame, fall back to an explicit
# download + load. The fallback is best-effort: any failure is reported and
# `pam50` stays empty so the downstream `pam50.empty` checks still work.
pam50 = loader.get_pam50_subtypes(auto_download=True)

if pam50.empty:
    print("PAM50 subtypes not found. Trying manual download...")
    try:
        file_path = loader.download_pam50_from_url()
        pam50 = loader.load_pam50_from_file(str(file_path))
        print(f"Successfully loaded {len(pam50)} PAM50 annotations")
    except Exception as e:
        # Deliberately broad: the notebook should keep running without PAM50.
        print(f"Failed to download PAM50: {e}")
else:
    print(f"Found {len(pam50)} PAM50 annotations")

# Show the distribution whichever path produced the data.
if not pam50.empty and 'pam50_subtype' in pam50.columns:
    print("\nPAM50 Subtype Distribution:")
    print(pam50['pam50_subtype'].value_counts())

# Jupyter only renders the last *top-level* expression of a cell; a bare
# `pam50.head()` nested inside an `if` body is silently discarded.
pam50.head()

## Data Summary

In [None]:
# Print a compact overview of everything loaded so far: row counts for each
# dataset, followed by the clinical column names.
banner = "=" * 60
print(banner, "DATA SUMMARY", banner, sep="\n")

print(f"\nFile Manifest: {len(manifest)} files")
print(f"Clinical Data: {len(clinical)} cases")
if not pam50.empty:
    print(f"PAM50 Annotations: {len(pam50)} cases")

print("\nClinical Data Columns:")
for column_name in clinical.columns:
    print(f"  - {column_name}")

## Data Cleaning & Preparation

In [None]:
# Check data types and basic info for each dataset.
# DataFrame.info() writes its report to stdout and returns None, so it is
# called directly; wrapping it in print() would append a stray "None" line
# after every report.
separator = "\n" + "=" * 60

print("Clinical Data Info:")
clinical.info()
print(separator)
print("\nManifest Info:")
manifest.info()
print(separator)
if not pam50.empty:
    print("\nPAM50 Info:")
    pam50.info()

In [None]:
# Report null counts per column for each dataset. Only columns that contain
# at least one null are listed, ordered from most to fewest missing values.
section_break = "\n" + "=" * 60

print("Missing Values in Clinical Data:")
clinical_nulls = clinical.isnull().sum()
missing_clinical = clinical_nulls[clinical_nulls > 0].sort_values(ascending=False)
if missing_clinical.empty:
    print("No missing values found!")
else:
    print(missing_clinical)
    print(f"\nTotal missing values: {missing_clinical.sum()}")
    # Share of all cells (rows x columns) in the clinical table that are null.
    print(f"Percentage missing: {(missing_clinical.sum() / clinical.size) * 100:.2f}%")

print(section_break)
print("Missing Values in Manifest:")
manifest_nulls = manifest.isnull().sum()
missing_manifest = manifest_nulls[manifest_nulls > 0].sort_values(ascending=False)
if missing_manifest.empty:
    print("No missing values found!")
else:
    print(missing_manifest)

# The separator is printed even when there is no PAM50 table to report on.
print(section_break)
if not pam50.empty:
    print("Missing Values in PAM50:")
    pam50_nulls = pam50.isnull().sum()
    missing_pam50 = pam50_nulls[pam50_nulls > 0].sort_values(ascending=False)
    if missing_pam50.empty:
        print("No missing values found!")
    else:
        print(missing_pam50)

In [None]:
# Clean clinical data.
# Works on a copy so the raw `clinical` frame stays untouched and this cell
# can be re-run safely. Every step is skipped when its column is absent.
clinical_clean = clinical.copy()

# Convert numeric columns; values that cannot be parsed become NaN.
numeric_cols = ['age_at_index', 'days_to_death', 'days_to_birth']
for col in numeric_cols:
    if col in clinical_clean.columns:
        clinical_clean[col] = pd.to_numeric(clinical_clean[col], errors='coerce')

# Clean up tumor stage: strip a leading 'stage ' prefix (any letter case).
# The pattern is anchored so only a prefix is removed, never text that
# happens to appear later in the value.
if 'tumor_stage' in clinical_clean.columns:
    clinical_clean['tumor_stage'] = clinical_clean['tumor_stage'].str.replace(
        r'^\s*stage\s+', '', case=False, regex=True
    )

# Standardize gender values (trim whitespace, Title Case).
if 'gender' in clinical_clean.columns:
    clinical_clean['gender'] = clinical_clean['gender'].str.strip().str.title()

# Create age groups.
# right=False makes the bins left-closed — [0, 40), [40, 50), ..., [70, inf) —
# so the labels are truthful: age 40 lands in '40-50' rather than '<40'.
# The open-ended last edge keeps very old ages in '70+' instead of NaN.
if 'age_at_index' in clinical_clean.columns:
    clinical_clean['age_group'] = pd.cut(
        clinical_clean['age_at_index'],
        bins=[0, 40, 50, 60, 70, float('inf')],
        labels=['<40', '40-50', '50-60', '60-70', '70+'],
        right=False,
    )

print("Clinical data cleaned!")
print(f"Original shape: {clinical.shape}")
print(f"Cleaned shape: {clinical_clean.shape}")
print("\nFirst few rows:")
clinical_clean.head()

## Clean PAM50 Subtypes

In [None]:
# Clean PAM50 subtypes if available.
# Produces `pam50_clean`:
#   - an empty frame when no PAM50 data was loaded,
#   - an unmodified copy when the 'pam50_subtype' column is missing,
#   - otherwise a copy whose subtype labels are normalised to the five
#     canonical names, with every other row (Unknown / NA / unrecognised) dropped.

# Common PAM50 subtypes: LumA, LumB, Basal, Her2, Normal
valid_subtypes = ['LumA', 'LumB', 'Basal', 'Her2', 'Normal']
# Known spelling variations and the canonical name each maps to.
subtype_mapping = {
    'Luminal A': 'LumA',
    'Luminal B': 'LumB',
    'Basal-like': 'Basal',
    'HER2-enriched': 'Her2',
    'HER2': 'Her2',
    'Normal-like': 'Normal'
}
# Lower-cased lookup so matching is case-insensitive ('basal', 'LUMA', ...);
# a case-sensitive mapping would silently drop such rows in the filter below.
canonical_subtypes = {name.lower(): name for name in valid_subtypes}
canonical_subtypes.update(
    {alias.lower(): name for alias, name in subtype_mapping.items()}
)

if pam50.empty:
    print("No PAM50 data available")
    pam50_clean = pd.DataFrame()
elif 'pam50_subtype' not in pam50.columns:
    print("No 'pam50_subtype' column found in PAM50 data")
    pam50_clean = pam50.copy()
else:
    pam50_clean = pam50.copy()

    # Trim whitespace, then translate to the canonical label; anything that is
    # not a recognised subtype becomes NaN.
    pam50_clean['pam50_subtype'] = (
        pam50_clean['pam50_subtype']
        .str.strip()
        .str.lower()
        .map(canonical_subtypes)
    )

    # Filter to only valid subtypes (drop Unknown/NA)
    pam50_clean = pam50_clean[pam50_clean['pam50_subtype'].notna()].copy()

    print("PAM50 subtypes cleaned!")
    print(f"Original count: {len(pam50)}")
    print(f"Cleaned count: {len(pam50_clean)}")
    print("\nSubtype distribution:")
    print(pam50_clean['pam50_subtype'].value_counts())

# Preview as the last top-level expression so Jupyter actually renders it
# (an expression nested inside an `if` body produces no cell output).
pam50_clean.head()

## Merge Clinical Data with PAM50 Subtypes

In [None]:
# Merge clinical data with PAM50 subtypes (left join on case_id).
# Always defines `clinical_with_subtypes`: the merged frame when the merge is
# possible, otherwise a plain copy of `clinical_clean`.
merge_columns = ['case_id', 'pam50_subtype']
can_merge = (
    not pam50_clean.empty
    and set(merge_columns).issubset(pam50_clean.columns)
    and 'case_id' in clinical_clean.columns
)

if can_merge:
    pam50_for_merge = pam50_clean[merge_columns]

    # A left join repeats a clinical row once per matching PAM50 row, so
    # duplicate case_ids would inflate the case counts reported below.
    n_duplicate_cases = int(pam50_for_merge['case_id'].duplicated().sum())
    if n_duplicate_cases:
        print(f"Warning: dropping {n_duplicate_cases} duplicate case_id rows from PAM50 (keeping first)")
        pam50_for_merge = pam50_for_merge.drop_duplicates(subset='case_id', keep='first')

    # validate= makes pandas raise if the right-hand keys are not unique.
    clinical_with_subtypes = clinical_clean.merge(
        pam50_for_merge,
        on='case_id',
        how='left',
        validate='many_to_one'
    )

    print(f"Merged dataset shape: {clinical_with_subtypes.shape}")
    print(f"\nCases with PAM50 subtypes: {clinical_with_subtypes['pam50_subtype'].notna().sum()}")
    print(f"Cases without PAM50 subtypes: {clinical_with_subtypes['pam50_subtype'].isna().sum()}")
    print("\nSubtype distribution:")
    print(clinical_with_subtypes['pam50_subtype'].value_counts(dropna=False))
else:
    if pam50_clean.empty:
        print("Cannot merge: PAM50 data not available")
    else:
        # Reached e.g. when the PAM50 table has no 'pam50_subtype' column.
        print("Cannot merge: 'case_id' / 'pam50_subtype' columns not available")
    clinical_with_subtypes = clinical_clean.copy()

# Preview as the last top-level expression so Jupyter actually renders it.
clinical_with_subtypes.head()

## Explore Cleaned Data

In [None]:
# Summarise the cleaned clinical table: value counts (NaN included) for each
# categorical column that is present, then describe() for float64/int64 columns.
if not clinical_with_subtypes.empty:
    print("Summary Statistics:")
    print("="*60)

    # (column, heading, max rows to show); None means show every category.
    categorical_reports = [
        ('gender', "\nGender Distribution:", None),
        ('vital_status', "\nVital Status Distribution:", None),
        ('tumor_stage', "\nTumor Stage Distribution:", 10),
        ('age_group', "\nAge Group Distribution:", None),
    ]
    for column, heading, max_rows in categorical_reports:
        if column not in clinical_with_subtypes.columns:
            continue
        print(heading)
        counts = clinical_with_subtypes[column].value_counts(dropna=False)
        print(counts if max_rows is None else counts.head(max_rows))

    # Numeric variables
    print("\nNumeric Statistics:")
    numeric_cols_clean = clinical_with_subtypes.select_dtypes(include=['float64', 'int64']).columns
    if len(numeric_cols_clean) > 0:
        print(clinical_with_subtypes[numeric_cols_clean].describe())

## Save Cleaned Data

In [None]:
# Save cleaned datasets as tab-separated files under ../data.
data_dir = Path("../data")
# to_csv does not create missing directories; make sure the target exists so
# saving does not fail on a fresh checkout.
data_dir.mkdir(parents=True, exist_ok=True)

# Save cleaned clinical data
clinical_clean_path = data_dir / "clinical_data_clean.tsv"
clinical_clean.to_csv(clinical_clean_path, sep="\t", index=False)
print(f"Saved cleaned clinical data to: {clinical_clean_path}")

# The subtype-dependent outputs are only written when PAM50 data exists.
if not pam50_clean.empty:
    clinical_subtypes_path = data_dir / "clinical_data_with_subtypes.tsv"
    clinical_with_subtypes.to_csv(clinical_subtypes_path, sep="\t", index=False)
    print(f"Saved clinical data with subtypes to: {clinical_subtypes_path}")

    # Save cleaned PAM50
    pam50_clean_path = data_dir / "pam50_subtypes_clean.tsv"
    pam50_clean.to_csv(pam50_clean_path, sep="\t", index=False)
    print(f"Saved cleaned PAM50 subtypes to: {pam50_clean_path}")

print("\n" + "="*60)
print("Data cleaning complete!")
print("="*60)

## Next Steps

1. ✅ **Data loaded**: Manifest, Clinical, and PAM50 subtypes
2. ✅ **Data cleaned**: Missing values reported, numeric types coerced, categorical values standardized
3. ✅ **Data merged**: Clinical data with PAM50 subtypes
4. ⏭️ **Next**: Download and process expression data
   - Download expression files using `loader.download_file(file_id, file_name)`
   - Parse expression data (TSV format)
   - Merge expression data with clinical annotations
   - Filter for samples with PAM50 subtypes
   - Proceed to preprocessing and ML pipeline

## Clean Clinical Data