In [None]:
import pandas as pd
import numpy as np
from pathlib import Path
import gc

# Locate the per-sample TSV count files

# NOTE(review): machine-specific absolute path — point this at your own
# download directory before running the notebook elsewhere.
data_dir = "/Users/zoey/Documents/2026spring/"

# rglob() yields paths in filesystem order, which differs between machines and
# runs. Sorting makes the sample (column) order, and the choice of "first
# file" used downstream, reproducible.
tsv_files = sorted(Path(data_dir).rglob("*.tsv"))

print(f"Found {len(tsv_files)} TSV files")

def process_file(file_path: Path) -> pd.DataFrame:
    """Load one TSV file and reduce it to a single count per gene name.

    The file is parsed as tab-separated text with '#' as the comment
    character. Rows whose ``gene_name`` is missing or begins with ``N_`` are
    dropped, the ``unstranded`` values of rows sharing a gene name are summed,
    and the resulting count column is labelled with the name of the file's
    parent directory.

    Args:
        file_path: Path to the TSV file to read.

    Returns:
        A DataFrame with two columns: ``gene_name`` and a column named after
        ``file_path.parent.name`` holding the summed ``unstranded`` values.
    """
    table = pd.read_csv(file_path, sep='\t', comment='#')

    # The enclosing folder name becomes the sample's column label.
    sample_label = file_path.parent.name

    gene_names = table['gene_name']
    has_n_prefix = gene_names.astype(str).str.startswith('N_')
    wanted_rows = gene_names.notna() & ~has_n_prefix

    # Collapse repeated gene names by adding their counts together.
    per_gene = (
        table.loc[wanted_rows, ['gene_name', 'unstranded']]
        .groupby('gene_name', as_index=False)['unstranded']
        .sum()
    )

    return per_gene.rename(columns={'unstranded': sample_label})

# Process all files
#
# Each file is reduced to one gene-indexed Series and all Series are joined in
# a single pd.concat call. Merging inside the loop (one DataFrame.merge per
# file) rebuilds an ever-wider frame on every iteration, which gets slower as
# the number of samples grows.

# Thresholds for the low-count filter below.
MIN_READS = 10     # a sample "expresses" a gene if it has at least this many reads
MIN_SAMPLES = 10   # a gene is kept if at least this many samples express it

if not tsv_files:
    raise FileNotFoundError(f"No TSV files found under {data_dir}")

print("\nProcessing first file...")
first_counts = process_file(tsv_files[0]).set_index('gene_name').iloc[:, 0]
print(f"First file: {len(first_counts)} unique genes")

sample_columns = [first_counts]

print("\nProcessing remaining files...")
for i, file_path in enumerate(tsv_files[1:], start=2):
    if i % 50 == 0:
        print(f"Processing file {i}/{len(tsv_files)}")
        gc.collect()

    # process_file returns [gene_name, <sample label>]; keep the count column
    # as a Series indexed by gene so concat can align on gene names.
    sample_columns.append(process_file(file_path).set_index('gene_name').iloc[:, 0])

# Align every sample on gene name, then restrict to the genes (and order) of
# the first file, which matches the earlier left-join-on-first-file behavior.
expression_matrix = (
    pd.concat(sample_columns, axis=1)
    .reindex(first_counts.index)
    .reset_index()
)
del sample_columns, first_counts

print(f"\nRaw counts matrix: {len(expression_matrix)} genes x {expression_matrix.shape[1]-1} samples")
print(f"Matrix size: {expression_matrix.memory_usage(deep=True).sum() / 1024**2:.2f} MB")


# Prepare count matrix
print("\n=== PREPARING MATRIX ===")

# Set gene_name as index
counts_matrix = expression_matrix.set_index('gene_name')

# Free memory
del expression_matrix
gc.collect()

# A gene absent from a sample's file shows up as NaN after alignment, and NaN
# cannot be cast to int. Treat it as zero reads and report how many were filled.
n_missing = int(counts_matrix.isna().sum().sum())
if n_missing:
    print(f"⚠ {n_missing} missing gene/sample values treated as 0 counts")

# Convert to numeric and round to integers
counts_matrix = counts_matrix.fillna(0).astype(float).round().astype(int)

print(f"Count matrix: {counts_matrix.shape[0]} genes x {counts_matrix.shape[1]} samples")
print(f"Matrix size: {counts_matrix.memory_usage(deep=True).sum() / 1024**2:.2f} MB")

# Filter low-count genes

print("\n=== FILTERING LOW-COUNT GENES ===")

# Keep genes with at least MIN_READS reads in at least MIN_SAMPLES samples
keep = (counts_matrix >= MIN_READS).sum(axis=1) >= MIN_SAMPLES
counts_filtered = counts_matrix[keep]

print(f"After filtering: {len(counts_filtered)} genes kept out of {len(counts_matrix)}")

# Free memory
del counts_matrix
gc.collect()

Found 884 TSV files

Processing first file...
First file: 59427 unique genes

Processing remaining files...
Processing file 50/884
Processing file 100/884
Processing file 150/884
Processing file 200/884
Processing file 250/884
Processing file 300/884
Processing file 350/884
Processing file 400/884
Processing file 450/884
Processing file 500/884
Processing file 550/884
Processing file 600/884
Processing file 650/884
Processing file 700/884
Processing file 750/884
Processing file 800/884
Processing file 850/884

Raw counts matrix: 59427 genes x 884 samples
Matrix size: 404.47 MB

=== PREPARING MATRIX ===
Count matrix: 59427 genes x 884 samples
Matrix size: 404.47 MB

=== FILTERING LOW-COUNT GENES ===
After filtering: 36538 genes kept out of 59427


0

In [None]:
# ============================================================
# STEP 4: Log2(CPM + 1) normalization, ID mapping, and export
#
# NOTE: this is NOT a DESeq2 variance-stabilizing transform. The variable name
# `vst_counts` and the output file name are kept unchanged so that existing
# references to them keep working.
# ============================================================

OUTPUT_CSV = "tcga-brca-vst-normalized-counts.csv"

print("\n=== LOG2(CPM+1) NORMALIZATION ===")


# Log2(CPM + 1) normalization
# Library size = total reads per sample (column sums)
lib_sizes = counts_filtered.sum(axis=0)

# Calculate CPM (Counts Per Million)
cpm = counts_filtered.div(lib_sizes / 1e6, axis=1)

# Log2 transform, then transpose so samples are rows
vst_counts = np.log2(cpm + 1).T

print(f"Log2(CPM+1) normalized: {vst_counts.shape[0]} samples x {vst_counts.shape[1]} genes")

# Free the intermediate only. `counts_filtered` is deliberately kept: deleting
# this cell's own input makes the cell fail with
# "NameError: name 'counts_filtered' is not defined" the second time it is run.
del cpm
gc.collect()

# Map UUIDs to TCGA Patient Barcodes

print("\n=== MAPPING UUIDs TO PATIENT IDs ===")

# The manifest is not used by the mapping below, so a missing manifest must not
# abort the cell. `manifest` / `uuid_map` are still defined for later use.
try:
    manifest = pd.read_csv("gdc_manifest.txt", sep="\t")
    uuid_map = dict(zip(manifest['id'], manifest['filename']))
except FileNotFoundError:
    manifest = None
    uuid_map = {}
    print("⚠ gdc_manifest.txt not found; continuing without it.")

# Load clinical/sample sheet data if available
mapping_successful = False

try:
    sample_sheet = pd.read_csv("gdc_sample_sheet.tsv", sep="\t")

    # Create UUID to TCGA barcode mapping
    # NOTE(review): 'Sample ID' is used as the "patient" identifier here —
    # confirm this is the intended granularity for downstream joins.
    uuid_to_barcode = dict(zip(sample_sheet['File ID'], sample_sheet['Sample ID']))

    # Work everything out BEFORE touching vst_counts, so an error part-way
    # through cannot leave the index half-updated.
    sample_uuids = vst_counts.index.tolist()
    unmapped = [uuid for uuid in sample_uuids if uuid not in uuid_to_barcode]
    patient_ids = [uuid_to_barcode.get(uuid, uuid) for uuid in sample_uuids]
    mapped_count = len(sample_uuids) - len(unmapped)

    if mapped_count > 0:
        vst_counts.index = patient_ids
        vst_counts.index.name = 'patient_id'
        mapping_successful = True
        print(f"✓ Mapped {mapped_count}/{len(sample_uuids)} UUIDs to patient barcodes")
    else:
        # A sample sheet that matches nothing is not a successful mapping.
        vst_counts.index.name = 'sample_uuid'
        print("⚠ Sample sheet loaded but none of the UUIDs matched. Using UUIDs as identifiers.")

    # Show unmapped samples if any
    if mapping_successful and unmapped:
        print(f"⚠ Warning: {len(unmapped)} samples could not be mapped")

    # Several files can share one barcode; flag it because row labels are then
    # no longer unique.
    if mapping_successful and vst_counts.index.has_duplicates:
        n_dup = int(vst_counts.index.duplicated().sum())
        print(f"⚠ Warning: {n_dup} rows share an identifier with an earlier row")

except FileNotFoundError:
    print("⚠ Sample sheet not found. Using UUIDs as identifiers.")
    print("To get patient IDs, download the sample sheet from GDC portal.")
    vst_counts.index.name = 'sample_uuid'

except Exception as e:
    # Best-effort step: report the problem and fall back to UUID labels.
    print(f"⚠ Error during mapping: {e}")
    print("Using UUIDs as identifiers.")
    vst_counts.index.name = 'sample_uuid'


print("\n=== SAVING OUTPUT ===")

# Save to CSV (samples as rows, genes as columns)
vst_counts.to_csv(OUTPUT_CSV)

print(f"✓ Log2(CPM+1)-normalized counts saved: {OUTPUT_CSV}")


print("\n=== FINAL OUTPUT ===")
print(f"Dimensions: {vst_counts.shape[0]} samples x {vst_counts.shape[1]} genes")

# Updated message based on whether mapping succeeded
if mapping_successful:
    print("\nFirst 5 patient IDs:")
else:
    print("\nFirst 5 sample IDs (UUIDs):")
print(vst_counts.index[:5].tolist())

print("\nFirst 5 gene names:")
print(vst_counts.columns[:5].tolist())

print("\nData preview:")
print(vst_counts.iloc[:5, :6])

print("\nValue range:")
normalized_values = vst_counts.to_numpy()
print(f"Min: {normalized_values.min():.3f}")
print(f"Max: {normalized_values.max():.3f}")
print(f"Mean: {normalized_values.mean():.3f}")

print("\n=== PROCESSING COMPLETE ===")
if mapping_successful:
    print("✓ Sample names mapped to TCGA patient barcodes")
else:
    print("⚠ Sample names are file UUIDs from folder names")
    print("Download gdc_sample_sheet.tsv from GDC portal to map to patient IDs")

print("\nTo load in Python:")
print(f'df = pd.read_csv("{OUTPUT_CSV}", index_col=0)')


=== VST NORMALIZATION ===
PyDESeq2 not installed. Using alternative normalization...


NameError: name 'counts_filtered' is not defined