# 🧬 Pan-Cancer Immune Biomarker Discovery (with Real Survival Data)

This notebook integrates TCGA expression, immune subtypes, and real clinical survival metadata from cBioPortal.

In [None]:
!pip install -q bravado pandas seaborn matplotlib lifelines scikit-learn gdown


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


## 🔌 Connect to cBioPortal API

In [5]:
from bravado.client import SwaggerClient
cbioportal = SwaggerClient.from_url('https://www.cbioportal.org/api/v3/api-docs',
                                    config={"validate_requests":False,
                                           "validate_responses":False,
                                           "validate_swagger_spec": False})

# Add lower case attributes for easier access
for attr in dir(cbioportal):
    cbioportal.__setattr__(attr.replace(' ', '_').lower(), cbioportal.__getattr__(attr))

## 📦 Get TCGA BRCA Sample List

In [7]:
study_id = 'brca_tcga'
case_list = cbioportal.samples.getAllSamplesInStudyUsingGET(studyId=study_id).result()
sample_ids = [case['sampleId'] for case in case_list]
print(f"Loaded {len(sample_ids)} samples")

TypeError: 'NoneType' object is not iterable

## 🧬 Download RNA-seq Gene Expression

In [None]:
genes = ['CD8A', 'PDCD1', 'CTLA4']
profile_id = 'brca_tcga_rna_seq_v2_mrna'
expr_data = cbioportal.molecular_data.getMolecularDataInMolecularProfileUsingPOST(
    molecularProfileId=profile_id,
    sampleIds=sample_ids,
    entrezGeneIds=[],
    geneIds=genes
).result()

import pandas as pd
expr_df = pd.DataFrame(expr_data).pivot(index='sampleId', columns='geneId', values='value').dropna()

## 🧬 Merge Immune Subtypes

In [None]:
import gdown
url = 'https://drive.google.com/uc?id=1T9nEBYf_oAAdMi2LT3rJDnxlT2e7owHw'
gdown.download(url, 'immune_subtypes.csv', quiet=False)

subtypes = pd.read_csv('immune_subtypes.csv')
expr_df = expr_df.join(subtypes.set_index('SampleID'), on='sampleId')
expr_df.dropna(subset=['ImmuneSubtype'], inplace=True)

## 🩺 Fetch Real Clinical Survival Metadata

In [None]:
clinical_data = cbioportal.clinical_data.getAllClinicalDataInStudyUsingGET(studyId=study_id).result()
clinical_df = pd.DataFrame(clinical_data)
clinical_df = clinical_df.set_index('patientId')
clinical_df = clinical_df[['OS_MONTHS', 'OS_STATUS']].rename(columns={
    'OS_MONTHS': 'OS_Time',
    'OS_STATUS': 'OS_Status'
})

# Format status to 1/0
clinical_df['OS_Status'] = clinical_df['OS_Status'].apply(lambda x: 1 if str(x).strip().upper().startswith('DECEASED') else 0)

# Map patient ID from sample ID (remove '_tumor' or use substring)
expr_df['patientId'] = expr_df.index.str.replace('-Tumor', '', regex=False).str.slice(0, 12)
expr_df = expr_df.join(clinical_df, on='patientId')

## 🧪 Survival Analysis by CD8A Expression

In [None]:
from lifelines import KaplanMeierFitter
import matplotlib.pyplot as plt

merged = expr_df.dropna(subset=['OS_Time', 'OS_Status'])
median_cd8 = merged['CD8A'].median()
high = merged[merged['CD8A'] > median_cd8]
low = merged[merged['CD8A'] <= median_cd8]

kmf = KaplanMeierFitter()
kmf.fit(high['OS_Time'], high['OS_Status'], label='High CD8A')
ax = kmf.plot()
kmf.fit(low['OS_Time'], low['OS_Status'], label='Low CD8A')
kmf.plot(ax=ax)
plt.title("Survival by CD8A Expression")
plt.show()