In [3]:
import yaml
import json
import pandas as pd
from clustergrammer import Network
from clustergrammer_widget import *

## Mutational Signatures

In [4]:
# Signature exposure table for the COAD cohort, read from a tab-separated file
# (the displayed columns are sample, signature, exposure and mutations).
cosmic = pd.read_csv("COAD.txt", sep="\t")
cosmic

Unnamed: 0,sample,signature,exposure,mutations
0,TCGA-AA-3966-01A-01D-1981-10,COSMICv3-SBS15,0.223284,201
1,TCGA-AA-3966-01A-01D-1981-10,COSMICv3-SBS6,0.125008,113
2,TCGA-AA-3966-01A-01D-1981-10,COSMICv3-SBS20,0.114956,104
3,TCGA-AA-3966-01A-01D-1981-10,COSMICv3-SBS54,0.098486,89
4,TCGA-AA-3966-01A-01D-1981-10,COSMICv3-SBS21,0.059189,53
...,...,...,...,...
7318,TCGA-AA-3496-01A-21D-1835-10,COSMICv3-SBS17a,0.012159,2
7319,TCGA-AA-3496-01A-21D-1835-10,COSMICv3-SBS48,0.009352,1
7320,TCGA-AA-3496-01A-21D-1835-10,COSMICv3-SBS7d,0.008737,1
7321,TCGA-AA-3496-01A-21D-1835-10,COSMICv3-SBS49,0.005208,1


## Load annotations

TCGA sample type codes (used below to label each sample, e.g. as primary or metastatic): https://gdc.cancer.gov/resources-tcga-users/tcga-code-tables/sample-type-codes

In [14]:
# Sample-type codes that get a readable name; any other code is kept verbatim.
tumor_types = {'01': 'Primary', '06': 'Metastatic'}

# Map each patient id (first 12 characters of the barcode) to the sample type
# encoded in the first two characters of the fourth '-'-separated barcode field.
# A patient that appears on several rows keeps the type from its last row.
sample_type = {}
for barcode in cosmic['sample']:
    type_code = barcode.split('-')[3][:2]
    sample_type[barcode[:12]] = tumor_types.get(type_code, type_code)

sample_type

{'TCGA-AA-3966': 'Primary',
 'TCGA-AD-6901': 'Primary',
 'TCGA-AA-3673': 'Primary',
 'TCGA-A6-2679': 'Primary',
 'TCGA-AD-6895': 'Primary',
 'TCGA-AA-3975': 'Primary',
 'TCGA-AM-5821': 'Primary',
 'TCGA-CK-6751': 'Primary',
 'TCGA-AA-A02O': 'Primary',
 'TCGA-AA-3858': 'Primary',
 'TCGA-AY-A54L': 'Primary',
 'TCGA-AA-3941': 'Primary',
 'TCGA-NH-A50V': 'Primary',
 'TCGA-A6-A566': 'Primary',
 'TCGA-CK-4952': 'Primary',
 'TCGA-G4-6303': 'Primary',
 'TCGA-AD-6888': 'Primary',
 'TCGA-CA-6719': 'Primary',
 'TCGA-D5-6898': 'Primary',
 'TCGA-DM-A1D4': 'Primary',
 'TCGA-AA-3971': 'Primary',
 'TCGA-AA-A010': 'Primary',
 'TCGA-AA-3821': 'Primary',
 'TCGA-D5-6534': 'Primary',
 'TCGA-AD-6964': 'Primary',
 'TCGA-AA-3489': 'Primary',
 'TCGA-AA-3812': 'Primary',
 'TCGA-5M-AAT5': 'Primary',
 'TCGA-AA-A01Z': 'Primary',
 'TCGA-CM-6678': 'Primary',
 'TCGA-A6-6137': 'Primary',
 'TCGA-AZ-6605': 'Primary',
 'TCGA-SS-A7HO': 'Primary',
 'TCGA-F4-6569': 'Primary',
 'TCGA-AA-A02F': 'Primary',
 'TCGA-AA-3655': 'Pr

In [15]:
# Load the annotation files. Every file is opened in a `with` block so the
# handle is closed as soon as parsing finishes.
# NOTE(review): yaml.load with FullLoader is kept from the original; prefer
# yaml.safe_load if these files could ever come from an untrusted source.
with open('COSMICv3_labels.yml') as fh:
    signature_labels = yaml.load(fh, Loader=yaml.FullLoader)
with open('DDR_genes.yml') as fh:
    DDR_genes = yaml.load(fh, Loader=yaml.FullLoader)

# Invert {mechanism: [gene, ...]} into {GENE: mechanism}. Gene symbols are
# upper-cased; a gene listed under several mechanisms keeps the last one.
genes_DDR = {
    gene.upper(): mechanism
    for mechanism, members in DDR_genes.items()
    for gene in members
}

with open('TCGA_COAD_cases.json') as fh:
    samples = json.load(fh)

# Label tuple for every case that carries at least one diagnosis, keyed by
# the case's submitter id.
sample_labels = {}
for case in samples:
    diagnoses = case.get('diagnoses')
    # Skip cases where the key is missing *or* the list is empty; indexing
    # [0] on an empty list would raise IndexError.
    if not diagnoses:
        continue
    sample = case['submitter_id']
    # Keep only the text before the first comma of the primary diagnosis.
    label = diagnoses[0]['primary_diagnosis'].split(",")[0]
    sample_labels[sample] = (
        'Sample: {}'.format(sample),
        'Subtype: {}'.format(label),
        'Disease: {}'.format(sample_type.get(sample, "Unknown")),
        # Placeholder category until methylation annotations are available.
        'Methylation: {}'.format("Define here!"))

In [13]:
sample_labels

{'TCGA-DM-A28M': ('Sample: TCGA-DM-A28M',
  'Subtype: Adenocarcinoma',
  'Disease: Primary',
  'Methylation: Define here!'),
 'TCGA-F4-6805': ('Sample: TCGA-F4-6805',
  'Subtype: Adenocarcinoma',
  'Disease: Primary',
  'Methylation: Define here!'),
 'TCGA-AA-A022': ('Sample: TCGA-AA-A022',
  'Subtype: Adenocarcinoma',
  'Disease: Primary',
  'Methylation: Define here!'),
 'TCGA-D5-6931': ('Sample: TCGA-D5-6931',
  'Subtype: Adenocarcinoma',
  'Disease: Primary',
  'Methylation: Define here!'),
 'TCGA-A6-2678': ('Sample: TCGA-A6-2678',
  'Subtype: Adenocarcinoma',
  'Disease: Unknown',
  'Methylation: Define here!'),
 'TCGA-AA-3697': ('Sample: TCGA-AA-3697',
  'Subtype: Adenocarcinoma',
  'Disease: Primary',
  'Methylation: Define here!'),
 'TCGA-QG-A5YV': ('Sample: TCGA-QG-A5YV',
  'Subtype: Adenocarcinoma',
  'Disease: Primary',
  'Methylation: Define here!'),
 'TCGA-D5-6539': ('Sample: TCGA-D5-6539',
  'Subtype: Adenocarcinoma',
  'Disease: Primary',
  'Methylation: Define here!'),


## Create signature x sample matrix

In [22]:
# Build the long-format (sample, signature, value) table:
#   * rows whose exposure fraction is below 5% are dropped;
#   * samples and signatures are renamed to their display labels.
# A plain Python loop rather than "the pandas way", but it does the job.

samples, signatures, exposures = [], [], []

table_columns = zip(
    cosmic['sample'], cosmic['signature'], cosmic['exposure'], cosmic['mutations']
)
for barcode, signature_id, fraction, n_mutations in table_columns:
    if fraction < 0.05:
        continue

    # Patient id -> annotation tuple when one exists, otherwise the bare id.
    patient = barcode[:12]
    samples.append(sample_labels.get(patient, patient))

    # Text after the first '-' with its first three characters removed,
    # e.g. "COSMICv3-SBS15" -> "15"; append the label when one is known.
    short_name = signature_id.split("-", 1)[1][3:]
    if short_name in signature_labels:
        short_name = "{} ({})".format(short_name, signature_labels[short_name])
    signatures.append(short_name)

    # just to try: store the absolute mutation count, not the proportion
    exposures.append(n_mutations)

df_long = pd.DataFrame({'sample': samples, 'signature': signatures, 'exposure': exposures})
print(df_long)

                                                 sample  \
0     (Sample: TCGA-AA-3966, Subtype: Mucinous adeno...   
1     (Sample: TCGA-AA-3966, Subtype: Mucinous adeno...   
2     (Sample: TCGA-AA-3966, Subtype: Mucinous adeno...   
3     (Sample: TCGA-AA-3966, Subtype: Mucinous adeno...   
4     (Sample: TCGA-AA-3966, Subtype: Mucinous adeno...   
...                                                 ...   
1712  (Sample: TCGA-CM-5344, Subtype: Adenocarcinoma...   
1713  (Sample: TCGA-CM-5344, Subtype: Adenocarcinoma...   
1714  (Sample: TCGA-AA-3496, Subtype: Adenocarcinoma...   
1715  (Sample: TCGA-AA-3496, Subtype: Adenocarcinoma...   
1716  (Sample: TCGA-AA-3496, Subtype: Adenocarcinoma...   

                     signature  exposure  
0                     15 (MMR)       201  
1                  6 (DDR MSI)       113  
2              20 (MMR, POLD1)       104  
3     54 (Sequencing artifact)        89  
4                     21 (MMR)        53  
...                        ...   

In [23]:
# From long format to wide format: one row per signature, one column per
# sample; (signature, sample) pairs absent from df_long are filled with 0.0.
# NOTE(review): DataFrame.pivot needs every (signature, sample) pair to be
# unique -- confirm that cutting barcodes to 12 characters can never merge
# two samples of the same patient into one column.
df = df_long.pivot(index='signature', columns='sample', values='exposure').fillna(0.0)
df

sample,"(Sample: TCGA-3L-AA1B, Subtype: Adenocarcinoma, Disease: Primary, Methylation: Define here!)","(Sample: TCGA-4N-A93T, Subtype: Adenocarcinoma, Disease: Primary, Methylation: Define here!)","(Sample: TCGA-4T-AA8H, Subtype: Mucinous adenocarcinoma, Disease: Primary, Methylation: Define here!)","(Sample: TCGA-5M-AAT4, Subtype: Adenocarcinoma, Disease: Primary, Methylation: Define here!)","(Sample: TCGA-5M-AAT6, Subtype: Adenocarcinoma, Disease: Primary, Methylation: Define here!)","(Sample: TCGA-5M-AATE, Subtype: Adenocarcinoma, Disease: Primary, Methylation: Define here!)","(Sample: TCGA-A6-2671, Subtype: Adenocarcinoma, Disease: Primary, Methylation: Define here!)","(Sample: TCGA-A6-2672, Subtype: Adenocarcinoma, Disease: Primary, Methylation: Define here!)","(Sample: TCGA-A6-2674, Subtype: Mucinous adenocarcinoma, Disease: Primary, Methylation: Define here!)","(Sample: TCGA-A6-2675, Subtype: Adenocarcinoma, Disease: Primary, Methylation: Define here!)",...,"(Sample: TCGA-QG-A5YX, Subtype: Adenocarcinoma, Disease: Primary, Methylation: Define here!)","(Sample: TCGA-QG-A5Z1, Subtype: Adenocarcinoma, Disease: Primary, Methylation: Define here!)","(Sample: TCGA-QG-A5Z2, Subtype: Adenocarcinoma, Disease: Primary, Methylation: Define here!)","(Sample: TCGA-QL-A97D, Subtype: Adenocarcinoma, Disease: Primary, Methylation: Define here!)","(Sample: TCGA-RU-A8FL, Subtype: Adenocarcinoma, Disease: Primary, Methylation: Define here!)","(Sample: TCGA-SS-A7HO, Subtype: Adenocarcinoma, Disease: Primary, Methylation: Define here!)","(Sample: TCGA-T9-A92H, Subtype: Adenocarcinoma, Disease: Primary, Methylation: Define here!)","(Sample: TCGA-WS-AB45, Subtype: Mucinous adenocarcinoma, Disease: Primary, Methylation: Define here!)",TCGA-5M-AAT5,TCGA-5M-AATA
signature,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1 (5mC>T),84.0,56.0,89.0,80.0,877.0,75.0,40.0,66.0,77.0,73.0,...,69.0,63.0,354.0,107.0,47.0,76.0,70.0,1625.0,62.0,52.0
10a (Polymerase epsilon),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10b (Polymerase epsilon),0.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
11,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.0
12,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0
13 (AID/APOBEC),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,14.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
14 (Polymerase epsilon + MMR),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
15 (MMR),0.0,0.0,0.0,0.0,390.0,0.0,11.0,0.0,0.0,13.0,...,0.0,10.0,142.0,26.0,31.0,23.0,0.0,248.0,0.0,0.0
16,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
17a,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [24]:
# Create a clustergrammer Network and load the signature x sample matrix into
# it; the following cells cluster it and render the widget.
net = Network(clustergrammer_widget)
net.load_df(df)

In [25]:
net.cluster()

In [26]:
net.widget()

clustergrammer_widget(network='{"row_nodes": [{"name": "1 (5mC>T)", "ini": 58, "clust": 12, "rank": 57, "rankv…

## Mutations in DDR genes


In [31]:
fname = "TCGA.COAD.mutect.03652df4-6090-4f5a-a2ff-ee28a37f9301.DR-10.0.somatic.maf"

In [32]:
# Read the somatic MAF (tab-separated); the first 5 lines of the file are
# skipped before the header row.
# low_memory=False parses each column in a single pass so that pandas infers
# one dtype per column instead of emitting a mixed-type DtypeWarning.
maf = pd.read_table(fname, delimiter="\t", skiprows=5, low_memory=False)

  interactivity=interactivity, compiler=compiler, result=result)


In [33]:
list(maf.columns)

['Hugo_Symbol',
 'Entrez_Gene_Id',
 'Center',
 'NCBI_Build',
 'Chromosome',
 'Start_Position',
 'End_Position',
 'Strand',
 'Variant_Classification',
 'Variant_Type',
 'Reference_Allele',
 'Tumor_Seq_Allele1',
 'Tumor_Seq_Allele2',
 'dbSNP_RS',
 'dbSNP_Val_Status',
 'Tumor_Sample_Barcode',
 'Matched_Norm_Sample_Barcode',
 'Match_Norm_Seq_Allele1',
 'Match_Norm_Seq_Allele2',
 'Tumor_Validation_Allele1',
 'Tumor_Validation_Allele2',
 'Match_Norm_Validation_Allele1',
 'Match_Norm_Validation_Allele2',
 'Verification_Status',
 'Validation_Status',
 'Mutation_Status',
 'Sequencing_Phase',
 'Sequence_Source',
 'Validation_Method',
 'Score',
 'BAM_File',
 'Sequencer',
 'Tumor_Sample_UUID',
 'Matched_Norm_Sample_UUID',
 'HGVSc',
 'HGVSp',
 'HGVSp_Short',
 'Transcript_ID',
 'Exon_Number',
 't_depth',
 't_ref_count',
 't_alt_count',
 'n_depth',
 'n_ref_count',
 'n_alt_count',
 'all_effects',
 'Allele',
 'Gene',
 'Feature',
 'Feature_type',
 'One_Consequence',
 'Consequence',
 'cDNA_position',
 'C

In [35]:
# Build the gene x sample matrix of variant allele frequencies (VAF) for
# DDR genes. Uses `maf`, `genes_DDR` and `sample_labels` from earlier cells.

# When True, only the variant classes listed below are counted.
CONSERVATIVE_MODE = True
# NOTE(review): the list contains Frame_Shift_Del / In_Frame_Del but not the
# matching *_Ins classes -- confirm that insertions are excluded on purpose.
CONSERVATIVE_CLASSES = ('Missense_Mutation', 'In_Frame_Del', 'Frame_Shift_Del', 'Nonsense_Mutation')

# Select candidate rows with vectorised masks so the Python loop below only
# visits DDR-gene records instead of every row of the MAF.
gene_symbols = maf['Hugo_Symbol'].str.upper()
keep = gene_symbols.isin(list(genes_DDR))
if CONSERVATIVE_MODE:
    keep &= maf['Variant_Classification'].isin(CONSERVATIVE_CLASSES)

samples = []
genes = []
mutations = []

# (gene label, sample label) pairs that already have a value.
gene_sample_pairs = set()

candidates = zip(
    gene_symbols[keep],
    maf.loc[keep, 'Tumor_Sample_Barcode'],
    maf.loc[keep, 't_alt_count'],
    maf.loc[keep, 't_depth'],
)
for symbol, barcode, alt_count, depth in candidates:
    gene = ('Gene: {}'.format(symbol), 'DDR mechanism: {}'.format(genes_DDR[symbol]))

    # Patient id -> annotation tuple when one exists, otherwise the bare id.
    sample = barcode[:12]
    sample = sample_labels.get(sample, sample)

    # Only the first usable record of each (gene, sample) pair is kept, which
    # also guarantees the unique cells that pivot() requires.
    if (gene, sample) in gene_sample_pairs:
        continue

    # VAF is undefined without read counts or coverage; skip such records
    # instead of failing on int(NaN) or a division by zero.
    if pd.isna(alt_count) or pd.isna(depth) or int(depth) == 0:
        continue

    # Variant allele frequency
    value = int(alt_count) / float(int(depth))

    samples.append(sample)
    genes.append(gene)
    mutations.append(value)
    gene_sample_pairs.add((gene, sample))

df_gene_long = pd.DataFrame({'sample': samples, 'gene': genes, 'mutation': mutations})
# Wide format: one row per gene, one column per sample; pairs without a
# recorded mutation are filled with 0.0.
df_gene = df_gene_long.pivot(index='gene', columns='sample', values='mutation').fillna(0.0)

In [37]:
df_gene

sample,"(Sample: TCGA-3L-AA1B, Subtype: Adenocarcinoma, Methylation: Define here!)","(Sample: TCGA-4N-A93T, Subtype: Adenocarcinoma, Methylation: Define here!)","(Sample: TCGA-4T-AA8H, Subtype: Mucinous adenocarcinoma, Methylation: Define here!)","(Sample: TCGA-5M-AAT4, Subtype: Adenocarcinoma, Methylation: Define here!)","(Sample: TCGA-5M-AAT6, Subtype: Adenocarcinoma, Methylation: Define here!)","(Sample: TCGA-5M-AATE, Subtype: Adenocarcinoma, Methylation: Define here!)","(Sample: TCGA-A6-2672, Subtype: Adenocarcinoma, Methylation: Define here!)","(Sample: TCGA-A6-2675, Subtype: Adenocarcinoma, Methylation: Define here!)","(Sample: TCGA-A6-2677, Subtype: Adenocarcinoma, Methylation: Define here!)","(Sample: TCGA-A6-2680, Subtype: Adenocarcinoma, Methylation: Define here!)",...,"(Sample: TCGA-QG-A5YV, Subtype: Adenocarcinoma, Methylation: Define here!)","(Sample: TCGA-QG-A5YX, Subtype: Adenocarcinoma, Methylation: Define here!)","(Sample: TCGA-QG-A5Z1, Subtype: Adenocarcinoma, Methylation: Define here!)","(Sample: TCGA-QG-A5Z2, Subtype: Adenocarcinoma, Methylation: Define here!)","(Sample: TCGA-QL-A97D, Subtype: Adenocarcinoma, Methylation: Define here!)","(Sample: TCGA-RU-A8FL, Subtype: Adenocarcinoma, Methylation: Define here!)","(Sample: TCGA-SS-A7HO, Subtype: Adenocarcinoma, Methylation: Define here!)","(Sample: TCGA-T9-A92H, Subtype: Adenocarcinoma, Methylation: Define here!)","(Sample: TCGA-WS-AB45, Subtype: Mucinous adenocarcinoma, Methylation: Define here!)",TCGA-5M-AAT5
gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"(Gene: ALKBH2, DDR mechanism: Direct-reversal)",0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"(Gene: ALKBH3, DDR mechanism: Direct-reversal)",0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"(Gene: APEX1, DDR mechanism: Other-BER)",0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"(Gene: APEX2, DDR mechanism: Other-BER)",0.0,0.0,0.0,0.0,0.114943,0.0,0.254545,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"(Gene: APLF, DDR mechanism: Other-BER)",0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
"(Gene: XRCC2, DDR mechanism: HR)",0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"(Gene: XRCC3, DDR mechanism: HR)",0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"(Gene: XRCC4, DDR mechanism: NH-EJ)",0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"(Gene: XRCC5, DDR mechanism: NH-EJ)",0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [38]:
# Create a second clustergrammer Network and load the DDR gene x sample
# matrix into it; the following cells cluster it and render the widget.
net_gene = Network(clustergrammer_widget)
net_gene.load_df(df_gene)

In [39]:
net_gene.cluster()

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#ix-indexer-is-deprecated
  df = df.ix[keep_rows]
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#ix-indexer-is-deprecated
  tmp_df['mat'] = tmp_df['mat'].ix[keep_rows]


In [40]:
net_gene.widget()

clustergrammer_widget(network='{"row_nodes": [{"name": "Gene: ALKBH2", "ini": 172, "clust": 73, "rank": 20, "r…