In [1]:
import sys
from collections import defaultdict

from concurrent.futures import ProcessPoolExecutor as Pool
from tqdm import tqdm
import pandas as pd
import numpy as np
import pickle

import oncodcl_funct as odf
import oncodcl_funct2 as odf2
import signature as sign

In [2]:
%load_ext line_profiler

In [3]:
%load_ext autoreload
%autoreload 2

In [21]:
# Global variables
# Machine-specific absolute path of a results file.
# NOTE(review): OUTPUT_FILE is not referenced by any other cell visible here;
# presumably it is meant to be passed as `output_file` to write_results2 — confirm.
OUTPUT_FILE = '/home/carnedo/projects/oncodriveclustl/outputs/SKCM.txt'

def write_results2(results, output_file):
    """Save results to the output file as a tab-separated table.

    Rows are sorted by SCORE_OBS and then CGC, both descending. The table is
    built in memory and written once. Re-reading a freshly written file with
    ``pd.read_csv`` would run NA detection on it, turning gene symbols that
    look like missing-value markers (e.g. ``NA``, ``NULL``) into empty cells.

    The written file keeps the layout of the previous implementation: the
    first column (empty header) is the row's position in ``results`` before
    sorting, followed by SYMBOL, SCORE_OBS, SCORE_SIM and CGC. Missing values
    are written as empty strings.

    :param results: dict, dictionary of results, keys are gene's names and
        values are (score_obs, score_sim, cgc) triples
    :param output_file: path, path of the output file
    :return: None
    :raises ValueError: if a value in ``results`` does not unpack into
        exactly three items
    """
    header = ['SYMBOL', 'SCORE_OBS', 'SCORE_SIM', 'CGC']

    rows = []
    for gene_name, values in results.items():
        # Explicit unpacking keeps the "exactly three values" contract.
        score_obs, score_sim, cgc = values
        rows.append((gene_name, score_obs, score_sim, cgc))

    table = pd.DataFrame(rows, columns=header)
    table = table.sort_values(by=['SCORE_OBS', 'CGC'], ascending=[False, False])
    # The index is written on purpose so the file layout does not change.
    table.to_csv(path_or_buf=output_file, sep='\t', na_rep='')

In [4]:
# Parse regions
# odf.regions returns three objects for the regions file: `trees` (handed to
# odf.read_mutations when the mutations are read) and the lookups `regions_d`
# and `chromosomes_d`, which later cells index by gene symbol.
input_regions = '/home/carnedo/projects/oncodriveclustl/inputs/regions/02_cds.regions.gz'
trees, regions_d, chromosomes_d = odf.regions(input_regions)
sys.stderr.write('Regions parsed\n')

Regions parsed


In [5]:
# Read mutations, intersect with regions
# `trees` comes from the region-parsing cell. `mutations_d` is later indexed by
# gene symbol, and the length of each entry is used to filter genes.
input_mutations = '/home/carnedo/projects/oncodriveclustl/inputs/mutations/pancanatlas/SKCM.txt'
mutations_d = odf.read_mutations(input_mutations, trees)
sys.stderr.write('Mutations read\n')

Mutations read


In [6]:
# Calculate signatures for cancer type dataset
# The signatures are computed from the same mutations file read above; only the
# 'probabilities' entry of obj.signatures is kept.
# NOTE(review): start_at_0=True presumably declares 0-based coordinates —
# confirm against signature.Signature.
obj = sign.Signature(start_at_0=True)
obj.calculate(input_mutations)
signatures = obj.signatures['probabilities']
sys.stderr.write('Signatures calculated\n')

231785it [00:02, 95876.63it/s] 
Signatures calculated


In [28]:
# Analyze only genes with >= 2 mutations
# Each entry is a (gene, regions, chromosome, mutations) tuple; the list is
# later mapped over odf2.run_region, one tuple per call.
genes = [
    (g, regions_d[g], chromosomes_d[g], mutations_d[g])
    for g in regions_d
    if len(mutations_d[g]) >= 2
]

# CGC gene set: the first tab-separated field of every line in the CGC table
# (no line is skipped, so a header line contributes its first field too).
# The file is read inside a `with` block so the handle is closed right away.
with open('/home/carnedo/projects/oncodriveclustl/inputs/CGC/CGCMay17_cancer_types_TCGA.tsv', 'r') as cgc_fd:
    CGC_genes = {line.split('\t')[0] for line in cgc_fd}

In [None]:
# Run odf2.run_region on every selected gene using 4 worker processes and
# store (observed score, simulated score, CGC membership) per gene.
# NOTE: defaultdict() without a default_factory behaves like a plain dict
# (missing keys still raise KeyError).
results_d = defaultdict()


with Pool(max_workers=4) as executor, tqdm(total=len(genes)) as pbar:
    # executor.map yields results in the same order as `genes`.
    for gene, scores in executor.map(odf2.run_region, genes):
        pbar.update(1)
        # Flag genes that are present in the CGC gene set.
        CGC = gene in CGC_genes
        results_d[gene] = (scores['obs'], scores['sim'], CGC)
            
sys.stderr.write('Results calculated\n')

 72%|███████▏  | 11220/15561 [23:34<18:04,  4.00it/s]  

Ranges problem: operands could not be broadcast together with shapes (50,) (51,) (50,) 
Ranges problem: operands could not be broadcast together with shapes (50,) (51,) (50,) 
Ranges problem: operands could not be broadcast together with shapes (50,) (51,) (50,) 
Ranges problem: operands could not be broadcast together with shapes (50,) (51,) (50,) 
Ranges problem: operands could not be broadcast together with shapes (50,) (51,) (50,) 
Ranges problem: operands could not be broadcast together with shapes (50,) (51,) (50,) 
Ranges problem: operands could not be broadcast together with shapes (50,) (51,) (50,) 
Ranges problem: operands could not be broadcast together with shapes (50,) (51,) (50,) 
Ranges problem: operands could not be broadcast together with shapes (50,) (51,) (50,) 
Ranges problem: operands could not be broadcast together with shapes (50,) (51,) (50,) 
Ranges problem: operands could not be broadcast together with shapes (50,) (51,) (50,) 
Ranges problem: operands could n

 92%|█████████▏| 14288/15561 [30:59<03:09,  6.70it/s]  

In [49]:
%%time
# Serial run over a two-gene slice of `genes`, timed by the cell magic above.
# defaultdict() without a default_factory behaves like a plain dict.
results_short = defaultdict()

# Re-read the CGC gene set (first tab-separated field of every line). The
# `with` block closes the file handle instead of leaking it.
with open('/home/carnedo/projects/oncodriveclustl/inputs/CGC/CGCMay17_cancer_types_TCGA.tsv', 'r') as cgc_fd:
    CGC_genes = {line.split('\t')[0] for line in cgc_fd}

for gene, scores in map(odf2.run_region, genes[278:280]):
    # Flag genes that are present in the CGC gene set.
    CGC = gene in CGC_genes
    results_short[gene] = (scores['obs'], scores['sim'], CGC)


CPU times: user 24 ms, sys: 0 ns, total: 24 ms
Wall time: 26.8 ms


### Profiling

In [54]:
%lprun -f odf.clustering odf2.run_region(genes[10])

### N in trinucleotide

In [114]:
import re

# p1 matches a trinucleotide containing at least one 'N'.
p1 = re.compile('[N]{1}')
# p2 matches a trinucleotide containing two or more 'N', adjacent or not.
# The previous pattern '[N]{2}' only caught adjacent N's, so 'NAN' was treated
# as a single-N case and both N's were replaced by the same base.
p2 = re.compile('N.*N')

nucleot = {'A', 'C', 'G', 'T'}

tri = ['ACG', 'TTG', 'AAA', 'GCG', 'NAG', 'CNA', 'ANN', 'NNG', 'NNN']

for element in tri:
    print('---------------')
    print(element)
    two_N = p2.search(element)
    one_N = p1.search(element)

    if two_N is not None:
        # Two or three N
        print('Two N in trinucleotide found')
    elif one_N is None:
        # No N
        print('No N')
    else:
        # One N: substitute it with each nucleotide to get the candidates.
        new_tris = {re.sub('N', nuc, element) for nuc in nucleot}
        print(new_tris)

        # Print every ordered pair of distinct candidates. The inner loop has
        # its own variable so the outer `element` is not overwritten.
        for source in new_tris:
            for change in new_tris - {source}:
                print(source, change)
                print('·····')


---------------
ACG
No N
---------------
TTG
No N
---------------
AAA
No N
---------------
GCG
No N
---------------
NAG
{'AAG', 'TAG', 'CAG', 'GAG'}
AAG TAG
·····
AAG CAG
·····
AAG GAG
·····
TAG AAG
·····
TAG CAG
·····
TAG GAG
·····
CAG AAG
·····
CAG TAG
·····
CAG GAG
·····
GAG AAG
·····
GAG TAG
·····
GAG CAG
·····
---------------
CNA
{'CCA', 'CGA', 'CAA', 'CTA'}
CCA CGA
·····
CCA CAA
·····
CCA CTA
·····
CGA CCA
·····
CGA CAA
·····
CGA CTA
·····
CAA CCA
·····
CAA CGA
·····
CAA CTA
·····
CTA CCA
·····
CTA CGA
·····
CTA CAA
·····
---------------
ANN
Two N in trinucleotide found
---------------
NNG
Two N in trinucleotide found
---------------
NNN
Two N in trinucleotide found


### Error clusters min_r solved

In [None]:
# Inputs for a single-gene run on WDPCP, taken from the lookups built when the
# regions and mutations were parsed.
g = 'WDPCP'
symbol = g
chromosome = chromosomes_d[g]
regions = regions_d[g]
mutations = mutations_d[g]
# Number of simulate/analysis repetitions per outer iteration in the next cell.
n_simulations = 3

In [None]:
# NOTE(review): `odf3` is not imported in any cell visible in this notebook
# (only `odf` and `odf2` are); it must be in scope for this cell to run.
region_arrays = odf.get_arrays(symbol, chromosome, regions, signatures)

# Analysis of the observed mutations
# NOTE(review): observed_results is not used again in the visible cells.
observed_results = odf3.analysis(region_arrays, mutations, window_s=50, window_c=50)

# Analysis of the simulated mutations
# Repeats simulate + analysis 15000 * n_simulations times, printing progress at
# each step; presumably a stress loop to reproduce an intermittent failure —
# confirm.
for i in range(15000): 
    # Reset on every outer iteration and never filled: the append below is
    # commented out.
    simulations = []
    for _ in range(n_simulations):
        print(i)
        # Draw as many simulated mutations as were observed for this gene.
        simulated_mutations = odf3.simulate(array_positions=region_arrays['genomic'], n_mutations=len(mutations), pos_prob=region_arrays['probs'])
        print(simulated_mutations)
        simulated_results = odf3.analysis(region_arrays, simulated_mutations, window_s=50, window_c=50)
        print('analysis done')
        #simulations.append(simulated_results)

    print(symbol)

In [None]:
# Display the most recent value assigned to simulated_mutations by the
# simulation loop.
simulated_mutations

In [None]:
# Ten hard-coded positions, passed to odf3.analysis in a later cell.
# NOTE(review): the name suggests a failing ("negative") case — confirm.
testneg = [ 63815406,  63712106,  63631205,  63401851,  63712054,  63664599,
  63815406,  63631539,  63631422,  63605591]

In [None]:
# Ten hard-coded positions.
# NOTE(review): not used by any cell visible here; the name suggests a passing
# ("positive") counterpart to testneg — confirm.
testpos = [ 63631649,  63605587,  63664658, 63815350, 63486545, 63540419,
  63401818, 63631546, 63401895, 63401927,]

In [None]:
# Run the analysis on the hard-coded testneg positions, with the same window
# settings as the observed and simulated runs.
# NOTE(review): `odf3` is not imported in any cell visible in this notebook.
odf3.analysis(region_arrays, testneg, window_s=50, window_c=50)

In [None]:
# Serial run of odf3.run_region over every selected gene. This rebinds
# `results_d`, discarding the results collected by the parallel run.
# NOTE(review): `odf3` is not imported in any cell visible in this notebook.
# defaultdict() without a default_factory behaves like a plain dict.
results_d = defaultdict()
for gene, results in map(odf3.run_region, genes):
    results_d[gene] = results