In [1]:
import warnings
from pathlib import Path

import pandas as pd

warnings.filterwarnings('ignore')

print("Libraries imported successfully!")

Libraries imported successfully!


## Configuration

Define the cohorts and data directory.

In [2]:
# Define cohorts
cohorts = {
    'BRCA': 'Breast Cancer',
    'LUAD': 'Lung Adenocarcinoma',
    'LUSC': 'Lung Squamous Cell Carcinoma',
    'SKCM': 'Melanoma',
    'MESO': 'Mesothelioma',
    'COAD': 'Colon Cancer',
    'LAML': 'Acute Myeloid Leukemia'
}

# Data directory - use absolute path to ensure it works regardless of working directory
data_dir = Path('/workspace/data/GDCdata')

print(f"Data directory: {data_dir}")
print(f"Data directory exists: {data_dir.exists()}")
print(f"Number of cohorts: {len(cohorts)}")

# List available files
if data_dir.exists():
    files = sorted([f.name for f in data_dir.glob('TCGA-*.tsv')])
    print(f"\nAvailable data files: {len(files)}")
    for f in files[:5]:
        print(f"  - {f}")

Data directory: /workspace/data/GDCdata
Data directory exists: True
Number of cohorts: 7

Available data files: 14
  - TCGA-BRCA.clinical.tsv
  - TCGA-BRCA.star_fpkm-uq.tsv
  - TCGA-COAD.clinical.tsv
  - TCGA-COAD.star_fpkm-uq.tsv
  - TCGA-LAML.clinical.tsv


## Helper Functions

In [3]:
def load_rnaseq_data(cohort_name, data_dir):
    """
    Load RNA-Seq FPKM-UQ data for a cohort.
    Transposes the data so samples are rows and genes are columns.
    """
    file_path = data_dir / f"TCGA-{cohort_name}.star_fpkm-uq.tsv"

    if not file_path.exists():
        print(f"Warning: {file_path} not found!")
        return None

    print(f"Loading RNA-Seq data from {file_path.name}...")

    # Read the data (first column is Ensembl_ID, first row is sample names)
    df = pd.read_csv(file_path, sep='\t', index_col=0)

    # Transpose: samples as rows, genes as columns
    df_t = df.T

    # Reset index to make sample IDs a column
    df_t.reset_index(inplace=True)
    df_t.rename(columns={'index': 'sample'}, inplace=True)

    print(f"  Shape: {df_t.shape[0]} samples x {df_t.shape[1]-1} genes")

    return df_t


def load_clinical_data(cohort_name, data_dir):
    """
    Load clinical/phenotype data for a cohort.
    """
    file_path = data_dir / f"TCGA-{cohort_name}.clinical.tsv"

    if not file_path.exists():
        print(f"Warning: {file_path} not found!")
        return None

    print(f"Loading clinical data from {file_path.name}...")

    # Read clinical data
    df = pd.read_csv(file_path, sep='\t')

    print(f"  Shape: {df.shape[0]} samples x {df.shape[1]} clinical features")

    return df


def merge_cohort_data(cohort_name, data_dir):
    """
    Merge RNA-Seq and clinical data for a cohort on sample ID.
    """
    print(f"\n{'='*60}")
    print(f"Processing {cohort_name} ({cohorts[cohort_name]})")
    print(f"{'='*60}")

    # Load data
    rnaseq_df = load_rnaseq_data(cohort_name, data_dir)
    clinical_df = load_clinical_data(cohort_name, data_dir)

    if rnaseq_df is None or clinical_df is None:
        print(f"Skipping {cohort_name} due to missing data.")
        return None

    # Merge on sample ID
    print("\nMerging data on 'sample' column...")
    merged_df = pd.merge(
        clinical_df,
        rnaseq_df,
        on='sample',
        how='inner'  # Only keep samples present in both datasets
    )

    print(
        f"  Merged shape: {merged_df.shape[0]} samples x {merged_df.shape[1]} total features")
    print(f"  Clinical features: {clinical_df.shape[1]}")
    print(f"  Gene expression features: {rnaseq_df.shape[1]-1}")

    # Display sample statistics
    print("\nSample overlap:")
    print(f"  Clinical samples: {clinical_df.shape[0]}")
    print(f"  RNA-Seq samples: {rnaseq_df.shape[0]}")
    print(f"  Merged samples: {merged_df.shape[0]}")

    return merged_df


print("Helper functions defined!")

Helper functions defined!


## Process All Cohorts

Load and merge data for each cohort.

In [4]:
# Dictionary to store merged dataframes
merged_cohorts = {}

# Process each cohort
for cohort_name in cohorts.keys():
    merged_df = merge_cohort_data(cohort_name, data_dir)

    if merged_df is not None:
        merged_cohorts[cohort_name] = merged_df

print(f"\n{'='*60}")
print(f"Summary: Successfully merged {len(merged_cohorts)} cohorts")
print(f"{'='*60}")
for cohort_name in merged_cohorts.keys():
    print(f"  {cohort_name}: {merged_cohorts[cohort_name].shape[0]} samples")


Processing BRCA (Breast Cancer)
Loading RNA-Seq data from TCGA-BRCA.star_fpkm-uq.tsv...
  Shape: 1226 samples x 60660 genes
Loading clinical data from TCGA-BRCA.clinical.tsv...
  Shape: 1255 samples x 85 clinical features

Merging data on 'sample' column...
  Merged shape: 1226 samples x 60745 total features
  Clinical features: 85
  Gene expression features: 60660

Sample overlap:
  Clinical samples: 1255
  RNA-Seq samples: 1226
  Merged samples: 1226

Processing LUAD (Lung Adenocarcinoma)
Loading RNA-Seq data from TCGA-LUAD.star_fpkm-uq.tsv...
  Shape: 589 samples x 60660 genes
Loading clinical data from TCGA-LUAD.clinical.tsv...
  Shape: 721 samples x 89 clinical features

Merging data on 'sample' column...
  Merged shape: 589 samples x 60749 total features
  Clinical features: 89
  Gene expression features: 60660

Sample overlap:
  Clinical samples: 721
  RNA-Seq samples: 589
  Merged samples: 589

Processing LUSC (Lung Squamous Cell Carcinoma)
Loading RNA-Seq data from TCGA-LUSC.

## Explore Individual Cohorts

### Example: BRCA (Breast Cancer)

In [5]:
# Access a specific cohort
if 'BRCA' in merged_cohorts:
    brca_df = merged_cohorts['BRCA']

    print(f"BRCA Cohort Shape: {brca_df.shape}")
    print("\nFirst few columns:")
    print(brca_df.columns[:20].tolist())

    print("\nFirst 3 samples:")
    display(brca_df.head(3))

    print("\nData types:")
    print(brca_df.dtypes.value_counts())
else:
    print("BRCA data not available")

BRCA Cohort Shape: (1226, 60745)

First few columns:
['sample', 'id', 'disease_type', 'case_id', 'submitter_id', 'primary_site', 'alcohol_history.exposures', 'race.demographic', 'gender.demographic', 'ethnicity.demographic', 'vital_status.demographic', 'age_at_index.demographic', 'days_to_birth.demographic', 'year_of_birth.demographic', 'year_of_death.demographic', 'primary_site.project', 'project_id.project', 'disease_type.project', 'name.project', 'name.program.project']

First 3 samples:


Unnamed: 0,sample,id,disease_type,case_id,submitter_id,primary_site,alcohol_history.exposures,race.demographic,gender.demographic,ethnicity.demographic,...,ENSG00000288661.1,ENSG00000288662.1,ENSG00000288663.1,ENSG00000288665.1,ENSG00000288667.1,ENSG00000288669.1,ENSG00000288670.1,ENSG00000288671.1,ENSG00000288674.1,ENSG00000288675.1
0,TCGA-BH-A0W3-01A,3c612e12-6de8-44fa-a095-805c45474821,Ductal and Lobular Neoplasms,3c612e12-6de8-44fa-a095-805c45474821,TCGA-BH-A0W3,Breast,Not Reported,white,female,not hispanic or latino,...,0.0,0.0,0.200253,0.0,0.0,0.0,1.950991,0.0,0.035483,0.271784
1,TCGA-AR-A24V-01A,3cb06c7a-f2a8-448b-91a8-dd201bbf2ddd,Ductal and Lobular Neoplasms,3cb06c7a-f2a8-448b-91a8-dd201bbf2ddd,TCGA-AR-A24V,Breast,Not Reported,white,female,not hispanic or latino,...,0.0,0.0,0.125254,0.0,0.0,0.0,2.812519,0.0,0.011353,0.195096
2,TCGA-E9-A1NE-01A,3d676bba-154b-4d22-ab59-d4d4da051b94,Ductal and Lobular Neoplasms,3d676bba-154b-4d22-ab59-d4d4da051b94,TCGA-E9-A1NE,Breast,Not Reported,white,female,not hispanic or latino,...,0.0,0.0,0.094371,0.0,0.0,0.0,2.20085,0.0,0.036187,0.224287



Data types:
float64    60674
object        68
bool           2
int64          1
Name: count, dtype: int64


### Example: LUAD (Lung Adenocarcinoma)

In [6]:
# Access LUAD cohort
if 'LUAD' in merged_cohorts:
    luad_df = merged_cohorts['LUAD']

    print(f"LUAD Cohort Shape: {luad_df.shape}")
    print("\nSample IDs (first 5):")
    print(luad_df['sample'].head().tolist())

    print("\nClinical features (first 20):")
    print(luad_df.columns[:20].tolist())
else:
    print("LUAD data not available")

LUAD Cohort Shape: (589, 60749)

Sample IDs (first 5):
['TCGA-55-A48X-01A', 'TCGA-NJ-A55R-01A', 'TCGA-53-A4EZ-01A', 'TCGA-44-6777-01A', 'TCGA-44-6777-11A']

Clinical features (first 20):
['sample', 'id', 'disease_type', 'case_id', 'submitter_id', 'primary_site', 'cigarettes_per_day.exposures', 'alcohol_history.exposures', 'years_smoked.exposures', 'pack_years_smoked.exposures', 'race.demographic', 'gender.demographic', 'ethnicity.demographic', 'vital_status.demographic', 'age_at_index.demographic', 'days_to_birth.demographic', 'year_of_birth.demographic', 'year_of_death.demographic', 'primary_site.project', 'project_id.project']


## Check Sample ID Format

Verify that sample IDs are in the expected TCGA format (e.g., TCGA-AA-3814-01A).

In [7]:
# Check sample ID format across all cohorts
for cohort_name, df in merged_cohorts.items():
    print(f"\n{cohort_name}:")
    print(f"  Total samples: {df.shape[0]}")
    print("  Sample ID examples:")
    print(f"    {df['sample'].iloc[0]}")
    print(f"    {df['sample'].iloc[1]}")
    print(f"    {df['sample'].iloc[2]}")

    # Check for duplicates
    duplicates = df['sample'].duplicated().sum()
    if duplicates > 0:
        print(f"  WARNING: {duplicates} duplicate sample IDs found!")
    else:
        print("  ✓ No duplicate sample IDs")


BRCA:
  Total samples: 1226
  Sample ID examples:
    TCGA-BH-A0W3-01A
    TCGA-AR-A24V-01A
    TCGA-E9-A1NE-01A
  ✓ No duplicate sample IDs

LUAD:
  Total samples: 589
  Sample ID examples:
    TCGA-55-A48X-01A
    TCGA-NJ-A55R-01A
    TCGA-53-A4EZ-01A
  ✓ No duplicate sample IDs

LUSC:
  Total samples: 552
  Sample ID examples:
    TCGA-18-3421-01A
    TCGA-37-4133-01A
    TCGA-L3-A524-01A
  ✓ No duplicate sample IDs

SKCM:
  Total samples: 473
  Sample ID examples:
    TCGA-EE-A29R-06A
    TCGA-FR-A8YC-06A
    TCGA-BF-A3DN-01A
  ✓ No duplicate sample IDs

MESO:
  Total samples: 87
  Sample ID examples:
    TCGA-3H-AB3K-01A
    TCGA-XT-AASU-01A
    TCGA-UD-AABY-01A
  ✓ No duplicate sample IDs

COAD:
  Total samples: 514
  Sample ID examples:
    TCGA-AA-3970-01A
    TCGA-AA-3844-01A
    TCGA-AA-3521-01A
  ✓ No duplicate sample IDs

LAML:
  Total samples: 151
  Sample ID examples:
    TCGA-AB-2810-03A
    TCGA-AB-2986-03A
    TCGA-AB-2847-03A
  ✓ No duplicate sample IDs


## Save Merged Data

Save merged dataframes to files for later use.

In [9]:
output_dir = Path('/workspace/data/processed')
output_dir.mkdir(parents=True, exist_ok=True)

for cohort_name, df in merged_cohorts.items():
    output_file = output_dir / f"TCGA-{cohort_name}_merged.tsv"
    print(f"Saving {cohort_name} to {output_file}...")
    df.to_csv(output_file, sep='\t', index=False)

print("All cohorts saved!")

Saving BRCA to /workspace/data/processed/TCGA-BRCA_merged.tsv...
Saving LUAD to /workspace/data/processed/TCGA-LUAD_merged.tsv...
Saving LUSC to /workspace/data/processed/TCGA-LUSC_merged.tsv...
Saving SKCM to /workspace/data/processed/TCGA-SKCM_merged.tsv...
Saving MESO to /workspace/data/processed/TCGA-MESO_merged.tsv...
Saving COAD to /workspace/data/processed/TCGA-COAD_merged.tsv...
Saving LAML to /workspace/data/processed/TCGA-LAML_merged.tsv...
All cohorts saved!
