In [1]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Load the autoreload extension and reload all imported modules before each
# cell executes, so edits to local library code are picked up without a restart.
%load_ext autoreload
%autoreload 2

# Input locations. These are machine-specific absolute paths; point them at
# your local copies before running the notebook.
# hg38 reference FASTA (not referenced by the cells visible in this section).
hg38_path = '/home/clint/buffalo/hg38/hg38.fa'
# COSMIC mutant export (v76, liver HCC per the file name); parsed in the next cell.
cosmic_mutants = '/home/clint/buffalo/cosmic/v76/CosmicMutantExport_LiverHCC.tsv'

In [2]:
from siglib.cosmic import merge_samples, parse_to_samples

# Parse the COSMIC export into samples and merge them into a mapping of samples.
merged_samples = merge_samples(parse_to_samples(cosmic_mutants))

# Keep only samples with at least 100 and fewer than 8000 mutations.
merged_filtered_samples = [
    sample
    for sample in merged_samples.values()
    if 100 <= len(sample.mutations) < 8000
]

Discovery of Target Residues
---

In [80]:
# Count (and print) every filtered sample that carries p.R249S at chr17:7674216.
# Each sample is counted at most once, however many matching mutations it has.
with_target_residue = 0

for sample in merged_filtered_samples:
    carries_target = any(
        m.AA == 'p.R249S' and m.chrom == 'chr17' and m.position[0] == 7674216
        for m in sample.mutations
    )
    if not carries_target:
        continue
    with_target_residue += 1
    print(sample, "\n")

OVERVIEW
-----------------------------------------------------
Sample: TCGA-CC-A3MB-01
Site: liver
Mutations: 313 mutations
Histology: hepatocellular_carcinoma carcinoma

IDENTIFIERS
-----------------------------------------------------
Accession:                            ENST00000317216
HGNC ID:                                         3240
Pubmed ID:                                           
Sample ID:                                    2194595
Study ID:                                         628
Tumour ID:                                    2062873

CLINICAL INFORMATION
-----------------------------------------------------
Age:                                               36
Genome Wide Screen:                                 y
Sample Source:                                     NS
Tumour Origin:                                     NS

SITE
-----------------------------------------------------
Primary:                                        liver
Subtype 1:                       

In [4]:
# Summarise how common p.R249S is across the filtered population.
total_samples = len(merged_filtered_samples)

print("Samples with p.R249S:  {}".format(with_target_residue))
print("Total Population:      {}".format(total_samples))

# `p` is the population proportion; the binomial test cell uses it as the
# success probability under the null hypothesis.
p = with_target_residue / total_samples
print('Population Proportion: {:.3%}'.format(p))

Samples with p.R249S:  16
Total Population:      314
Population Proportion: 5.096%


Statistical Test
---

$\mathrm{H}_{0}$: The null hypothesis is that the probability of observing an R249S-mutated sample in the selected cohort is equal to the probability of observing an R249S-mutated sample in the population.

$\mathrm{H}_{\mathrm{A}}$: The alternate hypothesis is that the probability of R249S mutated samples is different in the cohort than in the true population.

This is a two-sided exact test of a Bernoulli experiment. If a one-sided test is needed then we will need to use the `binom.test` routine in the `R` language. If we want to search for other potentially correlated metadata we will need to correct for multiple hypothesis testing.

In [5]:
# `scipy.stats.binom_test` was deprecated in SciPy 1.10 and removed in 1.12.
# Import it when available; otherwise provide a drop-in wrapper around its
# replacement, `binomtest`, so this cell and the later per-residue cell keep
# working with the same `binom_test(x, n, p)` call.
try:
    from scipy.stats import binom_test
except ImportError:
    from scipy.stats import binomtest

    def binom_test(x, n=None, p=0.5, alternative='two-sided'):
        """Return the exact binomial test p-value for `x` successes in `n` trials.

        Thin compatibility shim over `scipy.stats.binomtest`; only the scalar
        `x`/`n` calling form used in this notebook is supported.
        """
        return binomtest(x, n, p, alternative=alternative).pvalue

# Two-sided exact test of 7 successes in 13 trials against the population
# proportion `p` computed above.
# NOTE(review): 7 and 13 are hard-coded; 13 matches the size of the cohort list
# defined in a later cell and 7 is presumably the number of cohort samples
# carrying p.R249S -- confirm and derive both from the data.
print('p-value: {:0.3}'.format(binom_test(x=7, n=13, p=p)))

p-value: 1.16e-06


Considering all Mutated Residues $(n > 3)$
---

1. Count the occurrence of each mutated residue in the dataset.
2. Select residues that appear more than 3 times.
3. Test enrichment of residue in selected cohort.

In [96]:
from collections import Counter

# Tally every (chromosome, start position, amino-acid change) triple across all
# mutations of all filtered samples. Counts are per mutation record, so a
# residue listed twice in one sample is counted twice.
residues = Counter()

for sample in merged_filtered_samples:
    residues.update(
        (mutation.chrom, mutation.position[0], mutation.AA)
        for mutation in sample.mutations
    )

print("There are {:,} unique mutated residues".format(len(residues)))

There are 55,455 unique mutated residues


In [97]:
# Keep, in sorted key order, the residues observed more than 3 times.
selected_residues = [
    residue
    for residue in sorted(residues)
    if residues[residue] > 3
]
print("There are {:,} selected residues".format(len(selected_residues)))

There are 285 selected residues


In [90]:
# Names of the samples that make up the selected cohort; matched against
# `sample.name` when testing each residue for enrichment.
group = [
    '067T',
    'TCGA-CC-A7IL-01',
    'TCGA-CC-A7II-01',
    'TCGA-CC-A7IG-01',
    'TCGA-DD-A1EL-01',
    'TCGA-DD-A114-01',
    'CHC1704T',
    'CHC1211T',
    'CHC1754T',
    'CHC1154T',
    'CHC1717T',
    '3206A7_009_T',
    '3206A7_017_T',
]

In [91]:
# For every selected residue, test whether the fraction of cohort samples
# carrying it differs from the fraction of the whole filtered population
# carrying it (two-sided exact binomial test). `p_values` is aligned
# index-for-index with `selected_residues`.

# Index the data once: for each sample, whether it belongs to the cohort and
# the set of residues it carries. This avoids rescanning every mutation of
# every sample for every residue.
group_names = set(group)
residues_by_sample = [
    (sample.name in group_names,
     {(m.chrom, m.position[0], m.AA) for m in sample.mutations})
    for sample in merged_filtered_samples
]
population_size = len(merged_filtered_samples)

p_values = []

for residue in selected_residues:
    # Use dedicated counter names here: reusing `with_target_residue` would
    # overwrite the notebook-level p.R249S count that the summary cell prints.
    carriers = carriers_in_group = 0
    for is_cohort_member, sample_residues in residues_by_sample:
        if residue in sample_residues:
            carriers += 1                          # each sample counts once per residue
            carriers_in_group += is_cohort_member  # bool adds as 0 or 1
    # NOTE(review): n = len(group) assumes every cohort sample survived the
    # mutation-count filter applied when building merged_filtered_samples -- confirm.
    p_values.append(binom_test(carriers_in_group, len(group), carriers / population_size))

Correct p-values for Multiple Hypothesis Testing
---
Use the Benjamini/Hochberg (non-negative) procedure.

In [92]:
from statsmodels.sandbox.stats.multicomp import multipletests

# Control the false discovery rate across all per-residue tests.
alpha = 0.05

# Only the first two return values are needed: the boolean reject mask and the
# corrected p-values, both aligned with `p_values`.
reject, pvals_corrected = multipletests(p_values, alpha=alpha, method='fdr_bh')[:2]

print("The number of residues below α={} with a different proportion in our samples: {}".format(
    alpha, sum(reject)))

The number of residues below α=0.05 with a different proportion in our samples: 1


What Residues are Significantly Different?
---

In [106]:
import numpy as np

# Report each residue whose null hypothesis was rejected, with its corrected p-value.
for idx in np.flatnonzero(reject):
    chrom, position, aa_change = selected_residues[idx]
    print("Residue {} in {}:{:,} (FDR p-value: {:0.2e})".format(
        aa_change, chrom, int(position), pvals_corrected[idx]))

Residue p.R249S in chr17:7,674,216 (FDR p-value: 3.32e-04)
