# 🧬 Pan-Cancer Immune Biomarker Discovery (with Real Survival Data)

This notebook integrates TCGA expression, immune subtypes, and real clinical survival metadata from cBioPortal.

In [None]:
!pip install -q requests pandas seaborn matplotlib lifelines scikit-learn gdown

[31mERROR: Could not find a version that satisfies the requirement gdc-client (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for gdc-client[0m[31m
[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


## 🔌 Connect to cBioPortal API

In [21]:
import requests
import json

# Base URL for the API
base_url = 'https://www.cbioportal.org/api'

def get_cbioportal_data(endpoint, params=None):
    url = f"{base_url}/{endpoint}"
    response = requests.get(url, params=params)
    response.raise_for_status()  # Raise an exception for bad status codes
    return response.json()

## 📦 Get TCGA BRCA Sample List

In [22]:
# Get all studies
studies = get_cbioportal_data('studies')
print("Available studies:")
for study in studies:
    print(f"Study ID: {study['studyId']}, Name: {study['name']}")

# Get samples for BRCA study
study_id = 'brca_tcga'
samples = get_cbioportal_data(f'studies/{study_id}/samples')
sample_ids = [sample['sampleId'] for sample in samples]
print(f"\nLoaded {len(sample_ids)} samples")

Available studies:
Study ID: all_stjude_2015, Name: Acute Lymphoblastic Leukemia (St Jude, Nat Genet 2015)
Study ID: all_stjude_2013, Name: Hypodiploid Acute Lymphoid Leukemia (St Jude, Nat Genet 2013)
Study ID: acyc_fmi_2014, Name: Adenoid Cystic Carcinoma (FMI, Am J Surg Pathl. 2014)
Study ID: acyc_jhu_2016, Name: Adenoid Cystic Carcinoma (JHU, Cancer Prev Res 2016)
Study ID: acyc_mda_2015, Name: Adenoid Cystic Carcinoma (MDA, Clin Cancer Res 2015)
Study ID: acyc_mgh_2016, Name: Adenoid Cystic Carcinoma (MGH, Nat Gen 2016)
Study ID: acyc_sanger_2013, Name: Adenoid Cystic Carcinoma (Sanger/MDA, JCI 2013)
Study ID: all_stjude_2016, Name: Acute Lymphoblastic Leukemia (St Jude, Nat Genet 2016)
Study ID: appendiceal_msk_2022, Name: Appendiceal Cancer (MSK, J Clin Oncol 2022)
Study ID: blca_plasmacytoid_mskcc_2016, Name: Bladder Cancer (MSK, Nat Genet 2016)
Study ID: bcc_unige_2016, Name: Basal Cell Carcinoma (UNIGE, Nat Genet 2016)
Study ID: brca_broad, Name: Breast Invasive Carcinoma (Br

## 🧬 Download RNA-seq Gene Expression

In [None]:
genes = ['CD8A', 'PDCD1', 'CTLA4']
profile_id = 'brca_tcga_rna_seq_v2_mrna'

# Try the fetch endpoint
print("Trying to get molecular data...")
response = requests.post(
    f'{base_url}/molecular-data/fetch',
    json={
        'molecularProfileId': profile_id,
        'sampleIds': sample_ids,
        'geneIds': genes
    }
)
print("Response status:", response.status_code)
print("Response content:", response.text[:1000])

# Try to parse the response
molecular_data = response.json()
print("\nMolecular data type:", type(molecular_data))
print("Molecular data structure:", molecular_data[:5] if isinstance(molecular_data, list) else molecular_data)

# Create DataFrame
import pandas as pd
if isinstance(molecular_data, list):
    expr_df = pd.DataFrame(molecular_data)
    print("\nDataFrame columns:", expr_df.columns)
    if all(col in expr_df.columns for col in ['sampleId', 'geneId', 'value']):
        expr_df = expr_df.pivot(index='sampleId', columns='geneId', values='value').dropna()
        print("Final DataFrame shape:", expr_df.shape)
    else:
        print("Missing required columns. Available columns:", expr_df.columns)

Checking available molecular profiles...
Available profiles: [{'molecularAlterationType': 'PROTEIN_LEVEL', 'datatype': 'LOG2-VALUE', 'name': 'Protein expression (RPPA)', 'description': 'Protein expression measured by reverse-phase protein array', 'showProfileInAnalysisTab': False, 'patientLevel': False, 'molecularProfileId': 'brca_tcga_rppa', 'studyId': 'brca_tcga'}, {'molecularAlterationType': 'PROTEIN_LEVEL', 'datatype': 'Z-SCORE', 'name': 'Protein expression z-scores (RPPA)', 'description': 'Protein expression, measured by reverse-phase protein array, z-scores', 'showProfileInAnalysisTab': True, 'patientLevel': False, 'molecularProfileId': 'brca_tcga_rppa_Zscores', 'studyId': 'brca_tcga'}, {'molecularAlterationType': 'PROTEIN_LEVEL', 'datatype': 'CONTINUOUS', 'name': 'Protein levels (mass spectrometry by CPTAC)', 'description': 'Protein levels measured with mass spectrometry by the Clinical Proteomic Tumor Analysis Consortium (CPTAC)', 'showProfileInAnalysisTab': False, 'patientLeve

## 🧬 Merge Immune Subtypes

In [None]:
import pandas as pd
import requests
import io

# Download immune subtypes data from TCGA Pan-Cancer Atlas
url = "https://api.gdc.cancer.gov/data/1a7d7be8-675d-4e60-a105-19d4121bdebf"
headers = {
    'Content-Type': 'application/json',
}

response = requests.get(url, headers=headers)
if response.status_code == 200:
    # Read the CSV file
    subtypes = pd.read_csv(io.StringIO(response.text))
    print("Successfully downloaded immune subtypes data")
    print("Data shape:", subtypes.shape)
    print("Columns:", subtypes.columns)
    
    # Filter for immune subtype information
    if 'Immune.Subtype' in subtypes.columns:
        subtypes = subtypes[['TCGA.Participant.Barcode', 'Immune.Subtype']]
        subtypes.columns = ['SampleID', 'ImmuneSubtype']
        
        # Merge with expression data
        expr_df = expr_df.join(subtypes.set_index('SampleID'), on='sampleId')
        expr_df.dropna(subset=['ImmuneSubtype'], inplace=True)
        print("Merged data shape:", expr_df.shape)
    else:
        print("Could not find immune subtype column in the data")
else:
    print("Could not download immune subtypes data. Status code:", response.status_code)

ParserError: Error tokenizing data. C error: Expected 1 fields in line 18, saw 2


## 🩺 Fetch Real Clinical Survival Metadata

In [36]:
# Get clinical data
clinical_data = get_cbioportal_data(f'studies/{study_id}/clinical-data')
clinical_df = pd.DataFrame(clinical_data)
clinical_df = clinical_df.set_index('patientId')
clinical_df = clinical_df[['OS_MONTHS', 'OS_STATUS']].rename(columns={
    'OS_MONTHS': 'OS_Time',
    'OS_STATUS': 'OS_Status'
})

# Format status to 1/0
clinical_df['OS_Status'] = clinical_df['OS_Status'].apply(lambda x: 1 if str(x).strip().upper().startswith('DECEASED') else 0)

# Map patient ID from sample ID (remove '_tumor' or use substring)
expr_df['patientId'] = expr_df.index.str.replace('-Tumor', '', regex=False).str.slice(0, 12)
expr_df = expr_df.join(clinical_df, on='patientId')

KeyError: "None of [Index(['OS_MONTHS', 'OS_STATUS'], dtype='object')] are in the [columns]"

## 🧪 Survival Analysis by CD8A Expression

In [None]:
from lifelines import KaplanMeierFitter
import matplotlib.pyplot as plt

merged = expr_df.dropna(subset=['OS_Time', 'OS_Status'])
median_cd8 = merged['CD8A'].median()
high = merged[merged['CD8A'] > median_cd8]
low = merged[merged['CD8A'] <= median_cd8]

kmf = KaplanMeierFitter()
kmf.fit(high['OS_Time'], high['OS_Status'], label='High CD8A')
ax = kmf.plot()
kmf.fit(low['OS_Time'], low['OS_Status'], label='Low CD8A')
kmf.plot(ax=ax)
plt.title("Survival by CD8A Expression")
plt.show()