In [6]:
import pandas as pd
import numpy as np

# Load mutational data
mutations_df = pd.read_csv("../ucec_tcga_pan_can_atlas_2018\\data_mutations.txt", sep="\t", comment="#", low_memory=False)

In [7]:
# Explore the dataset structure
print(f"Dataset shape: {mutations_df.shape}")
print(f"\nColumns in the dataset:")
print(mutations_df.columns.tolist())

Dataset shape: (935656, 104)

Columns in the dataset:
['Hugo_Symbol', 'Entrez_Gene_Id', 'Center', 'NCBI_Build', 'Chromosome', 'Start_Position', 'End_Position', 'Strand', 'Consequence', 'Variant_Classification', 'Variant_Type', 'Reference_Allele', 'Tumor_Seq_Allele1', 'Tumor_Seq_Allele2', 'dbSNP_RS', 'dbSNP_Val_Status', 'Tumor_Sample_Barcode', 'Matched_Norm_Sample_Barcode', 'Match_Norm_Seq_Allele1', 'Match_Norm_Seq_Allele2', 'Tumor_Validation_Allele1', 'Tumor_Validation_Allele2', 'Match_Norm_Validation_Allele1', 'Match_Norm_Validation_Allele2', 'Verification_Status', 'Validation_Status', 'Mutation_Status', 'Sequencing_Phase', 'Sequence_Source', 'Validation_Method', 'Score', 'BAM_File', 'Sequencer', 't_ref_count', 't_alt_count', 'n_ref_count', 'n_alt_count', 'HGVSc', 'HGVSp', 'HGVSp_Short', 'Transcript_ID', 'RefSeq', 'Protein_position', 'Codons', 'Hotspot', 'AA_MAF', 'AFR_MAF', 'ALLELE_NUM', 'AMR_MAF', 'ASN_MAF', 'Allele', 'Amino_acids', 'BIOTYPE', 'CANONICAL', 'CDS_position', 'CLIN_SIG'

In [8]:
# Look at the first few rows to understand the data structure
print("First 5 rows:")
mutations_df.head()

First 5 rows:


Unnamed: 0,Hugo_Symbol,Entrez_Gene_Id,Center,NCBI_Build,Chromosome,Start_Position,End_Position,Strand,Consequence,Variant_Classification,...,SYMBOL_SOURCE,TREMBL,TSL,UNIPARC,VARIANT_CLASS,all_effects,cDNA_position,n_depth,t_depth,Annotation_Status
0,PRLHR,2834.0,.,GRCh37,10,120354589,120354589,+,synonymous_variant,Silent,...,HGNC,.,.,UPI000013CA6F,SNV,"PRLHR,synonymous_variant,p.%3D,ENST00000369169...",307,41,23,SUCCESS
1,PARD3,56288.0,.,GRCh37,10,34626217,34626218,+,frameshift_variant,Frame_Shift_Ins,...,HGNC,.,.,UPI0000073A9F,insertion,"PARD3,frameshift_variant,p.Ser836LysfsTer8,ENS...",2880-2881,148,165,SUCCESS
2,WDFY4,57705.0,.,GRCh37,10,49939399,49939399,+,synonymous_variant,Silent,...,HGNC,Q6PIM1_HUMAN,.,UPI000176ADB8,SNV,"WDFY4,synonymous_variant,p.%3D,ENST00000413659...",1401,26,23,SUCCESS
3,C10orf71,118461.0,.,GRCh37,10,50532105,50532105,+,synonymous_variant,Silent,...,HGNC,.,.,UPI0000161572,SNV,"C10orf71,synonymous_variant,p.%3D,ENST00000374...",1803,60,47,SUCCESS
4,YAP1,10413.0,.,GRCh37,11,101984924,101984924,+,missense_variant,Missense_Mutation,...,HGNC,.,.,UPI00000746D8,SNV,"YAP1,missense_variant,p.Arg124Pro,ENST00000526...",759,52,28,SUCCESS


In [11]:
# Look for patient ID columns (common names)
patient_id_columns = [col for col in mutations_df.columns if any(keyword in col.lower() for keyword in ['patient', 'sample', 'tumor', 'id'])]
print(f"Potential patient ID columns: {patient_id_columns}")

# Check gene-related columns
print(f"\nGene-related columns you mentioned:")
if 'Hugo_Symbol' in mutations_df.columns:
    print(f"Hugo_Symbol - unique genes: {mutations_df['Hugo_Symbol'].nunique()}")
    print(f"Sample Hugo_Symbol values: {mutations_df['Hugo_Symbol'].head().tolist()}")
else:
    print("Hugo_Symbol column not found")

if 'Entrez_Gene_Id' in mutations_df.columns:
    print(f"Entrez_Gene_Id - unique genes: {mutations_df['Entrez_Gene_Id'].nunique()}")
    print(f"Sample Entrez_Gene_Id values: {mutations_df['Entrez_Gene_Id'].head().tolist()}")
else:
    print("Entrez_Gene_Id column not found")

Potential patient ID columns: ['Entrez_Gene_Id', 'Tumor_Seq_Allele1', 'Tumor_Seq_Allele2', 'Tumor_Sample_Barcode', 'Matched_Norm_Sample_Barcode', 'Tumor_Validation_Allele1', 'Tumor_Validation_Allele2', 'Match_Norm_Validation_Allele1', 'Match_Norm_Validation_Allele2', 'Validation_Status', 'Validation_Method', 'Transcript_ID', 'Amino_acids', 'HGNC_ID']

Gene-related columns you mentioned:
Hugo_Symbol - unique genes: 21737
Sample Hugo_Symbol values: ['PRLHR', 'PARD3', 'WDFY4', 'C10orf71', 'YAP1']
Entrez_Gene_Id - unique genes: 19799
Sample Entrez_Gene_Id values: [2834.0, 56288.0, 57705.0, 118461.0, 10413.0]


In [10]:
# Understand the data structure better
print("Understanding the data structure:")
print(f"Total rows: {len(mutations_df)}")

# If we found patient ID columns, analyze them
if patient_id_columns:
    for col in patient_id_columns[:2]:  # Look at first 2 patient ID columns
        unique_patients = mutations_df[col].nunique()
        print(f"Unique values in {col}: {unique_patients}")
        print(f"Sample {col} values: {mutations_df[col].head().tolist()}")
        
# Show data types to understand what kind of info we have
print(f"\nData types:")
print(mutations_df.dtypes)

Understanding the data structure:
Total rows: 935656
Unique values in Entrez_Gene_Id: 19799
Sample Entrez_Gene_Id values: [2834.0, 56288.0, 57705.0, 118461.0, 10413.0]
Unique values in Tumor_Seq_Allele1: 1009
Sample Tumor_Seq_Allele1 values: ['C', '-', 'G', 'C', 'G']

Data types:
Hugo_Symbol           object
Entrez_Gene_Id       float64
Center                object
NCBI_Build            object
Chromosome            object
                      ...   
all_effects           object
cDNA_position         object
n_depth                int64
t_depth                int64
Annotation_Status     object
Length: 104, dtype: object
