In [1]:
import yaml
import json
import pandas as pd
from clustergrammer import Network
from clustergrammer_widget import *

In [2]:
# Load the lookup tables used to annotate signatures, genes and samples.
# Every file is opened in a `with` block so its handle is closed right after parsing.
# NOTE(review): yaml.FullLoader is kept as in the original; if these YAML files only
# contain plain mappings and lists, yaml.safe_load would be the stricter choice.
with open('COSMICv3_labels.yml') as labels_file:
    signature_labels = yaml.load(labels_file, Loader=yaml.FullLoader)
with open('DDR_genes.yml') as ddr_file:
    DDR_genes = yaml.load(ddr_file, Loader=yaml.FullLoader)

# Invert {mechanism: [gene, ...]} into {GENE: mechanism}. Gene names are upper-cased
# so that lookups can be done case-insensitively by upper-casing the query.
genes_DDR = {}
for mechanism, gene_list in DDR_genes.items():
    for gene_name in gene_list:
        genes_DDR[gene_name.upper()] = mechanism

with open('TCGA_COAD_cases.json') as cases_file:
    samples = json.load(cases_file)

# submitter_id -> tuple of "Category: value" strings describing the case
sample_labels = {}

for s in samples:
    # Skip cases without a diagnosis record. `.get` also covers an empty or null
    # 'diagnoses' entry, which would otherwise fail on the [0] lookup below.
    if not s.get('diagnoses'):
        continue
    sample = s['submitter_id']
    # keep only the text before the first comma of the primary diagnosis
    label = s['diagnoses'][0]['primary_diagnosis'].split(",")[0]
    sample_labels[sample] = (
        'Sample: {}'.format(sample),
        'Subtype: {}'.format(label),
        'Methylation: {}'.format("Define here!"))

## Mutational Signatures

In [3]:
# Per-sample signature exposures, read from a tab-separated text file.
cosmic = pd.read_csv("COAD.txt", sep="\t")
cosmic

Unnamed: 0,sample,signature,exposure,mutations
0,TCGA-AA-3966-01A-01D-1981-10,COSMICv3-SBS15,0.223284,201
1,TCGA-AA-3966-01A-01D-1981-10,COSMICv3-SBS6,0.125008,113
2,TCGA-AA-3966-01A-01D-1981-10,COSMICv3-SBS20,0.114956,104
3,TCGA-AA-3966-01A-01D-1981-10,COSMICv3-SBS54,0.098486,89
4,TCGA-AA-3966-01A-01D-1981-10,COSMICv3-SBS21,0.059189,53
...,...,...,...,...
7318,TCGA-AA-3496-01A-21D-1835-10,COSMICv3-SBS17a,0.012159,2
7319,TCGA-AA-3496-01A-21D-1835-10,COSMICv3-SBS48,0.009352,1
7320,TCGA-AA-3496-01A-21D-1835-10,COSMICv3-SBS7d,0.008737,1
7321,TCGA-AA-3496-01A-21D-1835-10,COSMICv3-SBS49,0.005208,1


In [9]:
# Build the long-format exposure table:
#   * drop signature calls whose exposure is below MIN_EXPOSURE (5%)
#   * shorten signature names and append their label when one is known
#   * keep sample barcodes unchanged

MIN_EXPOSURE = 0.05


def _signature_display_name(raw_name):
    """Return the display name for a raw signature identifier.

    The text before the first "-" and the next three characters are removed
    (e.g. "COSMICv3-SBS15" -> "15"). When the remaining code is a key of
    ``signature_labels`` the label is appended as "code (label)"; otherwise
    the bare code is returned.
    """
    code = raw_name.split("-", 1)[1][3:]
    if code in signature_labels:
        return "{} ({})".format(code, signature_labels[code])
    return code


# Vectorized filter instead of a row-by-row loop. It is written as a negated "<"
# so that rows with a missing exposure are treated exactly as the comparison
# `exposure < MIN_EXPOSURE` treats them (they are not dropped).
kept = cosmic[~(cosmic['exposure'] < MIN_EXPOSURE)]

samples = kept['sample'].tolist()
signatures = [_signature_display_name(name) for name in kept['signature']]
exposures = kept['exposure'].tolist()

df_long = pd.DataFrame({'sample': samples, 'signature': signatures, 'exposure': exposures})
print(df_long)

                            sample                 signature  exposure
0     TCGA-AA-3966-01A-01D-1981-10                  15 (MMR)  0.223284
1     TCGA-AA-3966-01A-01D-1981-10               6 (DDR MSI)  0.125008
2     TCGA-AA-3966-01A-01D-1981-10           20 (MMR, POLD1)  0.114956
3     TCGA-AA-3966-01A-01D-1981-10  54 (Sequencing artifact)  0.098486
4     TCGA-AA-3966-01A-01D-1981-10                  21 (MMR)  0.059189
...                            ...                       ...       ...
1712  TCGA-CM-5344-01A-21D-1719-10                  15 (MMR)  0.063539
1713  TCGA-CM-5344-01A-21D-1719-10               6 (DDR MSI)  0.061794
1714  TCGA-AA-3496-01A-21D-1835-10                 1 (5mC>T)  0.516963
1715  TCGA-AA-3496-01A-21D-1835-10                        42  0.140978
1716  TCGA-AA-3496-01A-21D-1835-10  10b (Polymerase epsilon)  0.083581

[1717 rows x 3 columns]


In [11]:
# Reshape long -> wide: one row per signature, one column per sample.
exposure_matrix = df_long.pivot(
    index='signature',
    columns='sample',
    values='exposure',
)
# signature/sample combinations absent from df_long become 0.0
df = exposure_matrix.fillna(0.0)
df.columns

Index(['TCGA-3L-AA1B-01A-11D-A36X-10', 'TCGA-4N-A93T-01A-11D-A36X-10',
       'TCGA-4T-AA8H-01A-11D-A40P-10', 'TCGA-5M-AAT4-01A-11D-A40P-10',
       'TCGA-5M-AAT5-01A-21D-A40P-10', 'TCGA-5M-AAT6-01A-11D-A40P-10',
       'TCGA-5M-AATA-01A-31D-A40P-10', 'TCGA-5M-AATE-01A-11D-A40P-10',
       'TCGA-A6-2671-01A-01D-1408-10', 'TCGA-A6-2672-01B-03D-2298-08',
       ...
       'TCGA-QG-A5YV-01A-11D-A28G-10', 'TCGA-QG-A5YW-01A-11D-A28G-10',
       'TCGA-QG-A5YX-01A-11D-A28G-10', 'TCGA-QG-A5Z1-01A-11D-A28G-10',
       'TCGA-QG-A5Z2-01A-11D-A28G-10', 'TCGA-QL-A97D-01A-12D-A40P-10',
       'TCGA-RU-A8FL-01A-11D-A36X-10', 'TCGA-SS-A7HO-01A-21D-A36X-10',
       'TCGA-T9-A92H-01A-11D-A36X-10', 'TCGA-WS-AB45-01A-11D-A40P-10'],
      dtype='object', name='sample', length=399)

In [12]:
# Create a clustergrammer Network, handing it the widget class that net.widget()
# uses later, and load the signature-by-sample exposure matrix into it.
net = Network(clustergrammer_widget)
net.load_df(df)

In [13]:
# Run clustergrammer's clustering with its default settings (no arguments are passed).
net.cluster()

In [14]:
# Render the interactive heatmap; this must stay the last expression of the cell
# so that the notebook displays the returned widget.
net.widget()

clustergrammer_widget(network='{"row_nodes": [{"name": "1 (5mC>T)", "ini": 58, "clust": 56, "rank": 57, "rankv…

## Mutations in DDR genes


In [15]:
# Somatic mutation (MAF) file for TCGA COAD, per its file name; the path is
# relative to the notebook's working directory.
fname = "TCGA.COAD.mutect.03652df4-6090-4f5a-a2ff-ee28a37f9301.DR-10.0.somatic.maf"

In [16]:
# Read the tab-separated MAF, skipping its first five lines.
# low_memory=False makes pandas infer each column's dtype from the whole file
# rather than chunk by chunk, so columns do not end up with mixed types (the
# warning recorded in this cell's output appears to be that DtypeWarning).
# NOTE(review): skiprows=5 presumably skips the MAF comment header - confirm the
# header length if the input file changes.
maf = pd.read_table(fname, sep="\t", skiprows=5, low_memory=False)

  interactivity=interactivity, compiler=compiler, result=result)


In [17]:
# Show every column name of the MAF table as a plain Python list.
maf.columns.tolist()

['Hugo_Symbol',
 'Entrez_Gene_Id',
 'Center',
 'NCBI_Build',
 'Chromosome',
 'Start_Position',
 'End_Position',
 'Strand',
 'Variant_Classification',
 'Variant_Type',
 'Reference_Allele',
 'Tumor_Seq_Allele1',
 'Tumor_Seq_Allele2',
 'dbSNP_RS',
 'dbSNP_Val_Status',
 'Tumor_Sample_Barcode',
 'Matched_Norm_Sample_Barcode',
 'Match_Norm_Seq_Allele1',
 'Match_Norm_Seq_Allele2',
 'Tumor_Validation_Allele1',
 'Tumor_Validation_Allele2',
 'Match_Norm_Validation_Allele1',
 'Match_Norm_Validation_Allele2',
 'Verification_Status',
 'Validation_Status',
 'Mutation_Status',
 'Sequencing_Phase',
 'Sequence_Source',
 'Validation_Method',
 'Score',
 'BAM_File',
 'Sequencer',
 'Tumor_Sample_UUID',
 'Matched_Norm_Sample_UUID',
 'HGVSc',
 'HGVSp',
 'HGVSp_Short',
 'Transcript_ID',
 'Exon_Number',
 't_depth',
 't_ref_count',
 't_alt_count',
 'n_depth',
 'n_ref_count',
 'n_alt_count',
 'all_effects',
 'Allele',
 'Gene',
 'Feature',
 'Feature_type',
 'One_Consequence',
 'Consequence',
 'cDNA_position',
 'C

In [20]:
# Build a gene-by-sample matrix for DDR genes.
# Each cell holds t_alt_count / t_depth of the first qualifying mutation found for
# that (gene, sample) pair; pairs without a qualifying mutation are 0.0.
# MAF columns used: Hugo_Symbol, Tumor_Sample_Barcode, Variant_Classification,
# t_alt_count, t_depth.

samples = []
genes = []
mutations = []

CONSERVATIVE_MODE = True
# Variant classes kept when CONSERVATIVE_MODE is on.
# NOTE(review): insertions (Frame_Shift_Ins, In_Frame_Ins) and splice-site variants
# are not in this list - confirm that this is intended.
CONSERVATIVE_CLASSES = ('Missense_Mutation', 'In_Frame_Del', 'Frame_Shift_Del', 'Nonsense_Mutation')

gene_sample_pairs = set()

# Narrow the MAF with vectorized masks first, so the Python-level loop below only
# visits rows that can contribute. Boolean masks preserve row order, which keeps
# the "first mutation per (gene, sample) wins" rule intact.
ddr_rows = maf
if CONSERVATIVE_MODE:
    ddr_rows = ddr_rows[ddr_rows['Variant_Classification'].isin(CONSERVATIVE_CLASSES)]

# Upper-case symbols to match the keys of genes_DDR; rows with a missing symbol
# yield NaN here and are dropped by the membership mask instead of raising.
symbols = ddr_rows['Hugo_Symbol'].str.upper()
is_ddr = symbols.isin(set(genes_DDR))
ddr_rows = ddr_rows[is_ddr]
symbols = symbols[is_ddr]

for symbol, sample, alt_count, depth in zip(
        symbols,
        ddr_rows['Tumor_Sample_Barcode'],
        ddr_rows['t_alt_count'],
        ddr_rows['t_depth']):
    # Row names are tuples of "Category: value" strings.
    gene = ('Gene: {}'.format(symbol), 'DDR mechanism: {}'.format(genes_DDR[symbol]))

    # Only the first mutation seen for a (gene, sample) pair is recorded.
    if (gene, sample) in gene_sample_pairs:
        continue

    depth = int(depth)
    if depth == 0:
        # No reads at this position: the fraction is undefined, so skip the row
        # rather than fail with ZeroDivisionError.
        continue

    samples.append(sample)
    genes.append(gene)
    mutations.append(int(alt_count) / float(depth))
    gene_sample_pairs.add((gene, sample))

df_gene_long = pd.DataFrame({'sample': samples, 'gene': genes, 'mutation': mutations})
df_gene = df_gene_long.pivot(index='gene', columns='sample', values='mutation').fillna(0.0)

In [21]:
# Second clustergrammer Network, this time loaded with the gene-by-sample
# mutation matrix built in the previous cell.
net_gene = Network(clustergrammer_widget)
net_gene.load_df(df_gene)

In [22]:
# Run clustergrammer's clustering with its default settings (no arguments are passed).
net_gene.cluster()

In [23]:
# Render the interactive heatmap for the gene matrix; kept as the last expression
# of the cell so that the notebook displays the returned widget.
net_gene.widget()

clustergrammer_widget(network='{"row_nodes": [{"name": "Gene: ALKBH2", "ini": 172, "clust": 73, "rank": 20, "r…