In [1]:
# test calcium simulation

In [1]:
import random
from random import randint
import pandas as pd
from pandasql import sqldf
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy as sp
from scipy import stats
import math
from sklearn.mixture import GaussianMixture
from datetime import datetime
import pickle
from fisher import pvalue

%matplotlib inline

In [None]:
def simulate(refgene, cnv_data, args, causal_genes):
    df = cnv_data.drop_duplicates(subset=("chrom", "cnv_start", "cnv_end"))
    block_counts = count_cnv_by_block(df, args['block_size'])
    cnv_length = cnv_data['cnv_end'] - cnv_data['cnv_start']
    status = 1
    case_data = []
    ctrl_data = []
    debug = {'p in case': [], 'p in ctrl': [], 'niter': 0, 'time': [str(datetime.now()), None], 'args': dict(args), 
             'seed': args.seed, 'causal genes': causal_genes, 'number of genes overlap CNV': [], 
             'number of causal genes in case': [], 'number of causal genes in ctrl': [], 
             'simulated CNV length in case': [], 'simulated CNV length in ctrl': []}
    
    while(status):
        sample_len = sample_cnv_length(cnv_length, args['avg_cnv_per_individual'])
        sample_blocks = get_sample_blocks(block_counts, len(sample_len))
        samples = assign_cnv_to_sample(sample_blocks, sample_len, args['block_size'])
        samples = annotate_samples(samples, refgene)
        causal_genes_in_sample = get_causal_genes(causal_genes, samples['gene_name'].tolist())
        p = p_case(args['prevalence'], len(causal_genes_in_sample), args)
        debug['number of genes overlap CNV'].append(len(set(samples['gene_name'].tolist())))
        if random.random() < p and len(case_data) < args['n_case']:
            # sample data is a case
            case_data.append(samples)
            debug['p in case'].append(p)
            debug['number of causal genes in case'].append(len(causal_genes_in_sample))
            debug['simulated CNV length in case'].extend(sample_len)
        if random.random() > p and len(ctrl_data) < args['n_ctrl']:
            # sample data is a control
            ctrl_data.append(samples)
            debug['p in ctrl'].append(p)
            debug['number of causal genes in ctrl'].append(len(causal_genes_in_sample))
            debug['simulated CNV length in ctrl'].extend(sample_len)
        if len(case_data) == args['n_case'] and len(ctrl_data) == args['n_ctrl']:
            status = 0
        debug['niter'] += 1
    debug['time'][1] = str(datetime.now())
    return {'case': case_data, 'ctrl': ctrl_data, 'debug': debug}

In [2]:
# 
def load_pthwy_gene(filename, n_skiprow = 2):
    pthwy_gene = pd.read_table(filename, skiprows = n_skiprow, header = None, names = ["gene_name"])
    return pthwy_gene

In [3]:
# 
class Environment(dict):
    def __init__(self):
        parameters = {"block_size": 100000,
                      "avg_cnv_per_individual": 5,
                      "n_case": 250,
                      "n_ctrl": 250,
                       # set Gamma shape to be 3 instead of 5
                       # "odds_ratio_params" : None # for H_0
                      "odds_ratio_params": {"shape": 5, "scale": 1},
                      "prevalence": 0.005,
                      "n_causal_gene": 200,
                      "refgene_file": "data/refGene.txt.gz",
                      "pthwy_gene_file": "../data/calciumgeneset.txt",
                      "cnv_file": "data/ISC-r1.CNV.bed",
                      "case_dataset": "delCases",
                      "ctrl_dataset": "delControls",
                      "output": "del_sample"
                     }
        self.update(parameters)
        self.causal_genes = load_pthwy_gene(parameters["pthwy_gene_file"])["gene_name"].tolist()
        self.seed = 12

args = Environment()

In [4]:
def load_cnv_data(filename):
    '''load cnv data'''
    cnvbed = {}
    dataset = None
    for line in open(filename).readlines():
        if not line.startswith("chr"):
            dataset = line.strip().split()[1].lstrip("name=")
            cnvbed[dataset] = {}
            continue
        line = line.strip().split()
        if not line[0] in cnvbed[dataset]:
            cnvbed[dataset][line[0]] = []
        cnvbed[dataset][line[0]].append((int(line[1]),int(line[2])))

    for dataset in cnvbed.keys():
        for chrom in cnvbed[dataset]:
            cnvbed[dataset][chrom].sort()

    cnvbed_df = {}
    for dataset in cnvbed.keys():
        cnvbed_df[dataset] = {"chrom":[], "cnv_start":[], "cnv_end":[]}
        for chrom in cnvbed[dataset]:
            start, end = tuple(zip(*cnvbed[dataset][chrom])) 
            cnvbed_df[dataset]["chrom"].extend([chrom] * len(start))
            cnvbed_df[dataset]["cnv_start"] += list(start)
            cnvbed_df[dataset]["cnv_end"] += list(end)
        cnvbed_df[dataset] = pd.DataFrame.from_dict(cnvbed_df[dataset]).drop_duplicates(
                                                subset=("chrom", "cnv_start", "cnv_end"))
    return cnvbed_df
cnv_data = load_cnv_data("data/ISC-r1.CNV.bed")
cnv_data = pd.concat([cnv_data["delCases"], cnv_data["delControls"]])
cnv_data = cnv_data.drop_duplicates(subset=("chrom", "cnv_start", "cnv_end"))
cnv_length = cnv_data["cnv_end"] - cnv_data["cnv_start"]

In [5]:
def count_cnv_by_block(df, block_size):
    def count_by_blocks(chrom):
        data = df.query('chrom == "{}"'.format(chrom))['cnv_start'].tolist()
        start_pos = min(data)
        end_pos = max(data)
        counts, bins = np.histogram(data, bins = int((end_pos - start_pos) / block_size) + 1, 
                                    range = (start_pos, end_pos))
        return counts, [int(x) for x in bins]
    block_counts = []
    for chrom in set(df['chrom']):
        counts, bins = count_by_blocks(chrom)
        # most blocks contain 0 CNV start. Add 0.5 to each block, so that CNV could start within any block.
        block_counts.extend([((chrom, x), y+0.5) for x, y in zip(bins, counts)])
    return block_counts
block_counts = count_cnv_by_block(cnv_data, args['block_size'])
block_counts

[(('chr17', 6888), 1.5),
 (('chr17', 106869), 1.5),
 (('chr17', 206851), 0.5),
 (('chr17', 306833), 1.5),
 (('chr17', 406815), 0.5),
 (('chr17', 506797), 0.5),
 (('chr17', 606779), 0.5),
 (('chr17', 706761), 0.5),
 (('chr17', 806743), 0.5),
 (('chr17', 906725), 0.5),
 (('chr17', 1006707), 0.5),
 (('chr17', 1106689), 1.5),
 (('chr17', 1206671), 0.5),
 (('chr17', 1306653), 0.5),
 (('chr17', 1406635), 0.5),
 (('chr17', 1506617), 0.5),
 (('chr17', 1606599), 0.5),
 (('chr17', 1706581), 0.5),
 (('chr17', 1806563), 0.5),
 (('chr17', 1906545), 0.5),
 (('chr17', 2006527), 0.5),
 (('chr17', 2106509), 0.5),
 (('chr17', 2206491), 0.5),
 (('chr17', 2306473), 0.5),
 (('chr17', 2406455), 0.5),
 (('chr17', 2506437), 0.5),
 (('chr17', 2606419), 0.5),
 (('chr17', 2706401), 0.5),
 (('chr17', 2806383), 0.5),
 (('chr17', 2906365), 0.5),
 (('chr17', 3006347), 0.5),
 (('chr17', 3106329), 0.5),
 (('chr17', 3206311), 0.5),
 (('chr17', 3306293), 1.5),
 (('chr17', 3406275), 0.5),
 (('chr17', 3506257), 0.5),
 (('

In [6]:
status = 1
case_data = []
ctrl_data = []
debug = {'p': [], 'niter': 0, 'time': [str(datetime.now()), None], 'args': dict(args), 'seed': args.seed, 
         'causal genes': args.causal_genes, 'number of causal genes in case': [], 
         'number of causal genes in ctrl': [], 'number of genes overlap CNV': [], 
         'simulated CNV length in case': [], 'simulated CNV length in ctrl': []}

In [7]:
def sample_cnv_length(data, mean_num_cnv):
    return np.random.choice(data, np.random.poisson(mean_num_cnv))

In [8]:
def get_sample_blocks(block_counts, num_cnv):
    probability_distribution = np.array([x[1] for x in block_counts])
    sample_idx = np.random.choice(range(len(block_counts)), num_cnv, 
                                  p = probability_distribution / sum(probability_distribution))
    return sorted([block_counts[idx][0] for idx in sample_idx])

In [9]:
def assign_cnv_to_sample(sample_blocks, sample_len, block_size):
    samples = {'chrom': [], 'cnv_start': [], 'cnv_terminate': []}
    for x, y in zip(sample_blocks, sample_len):
        start_pos = randint(x[1], x[1] + block_size)
        samples['cnv_start'].append(start_pos)
        samples['cnv_terminate'].append(start_pos + int(y))
        samples['chrom'].append(x[0])
    return pd.DataFrame(samples)

In [10]:
def load_reference_gene(filename):
    '''Load reference gene database'''
    ref_gene = pd.read_table(filename, compression="gzip", sep="\t", 
                         header = None, usecols=(1,2,4,5,12), 
                         names = ["tx_name", "chrom", "tx_start", "tx_end", "gene_name"])
    return ref_gene.drop_duplicates(subset=("chrom", "tx_start", "tx_end"))
refgene = load_reference_gene("data/refGene.txt.gz")

In [11]:
def annotate_samples(samples, refgene):
    query = """
        SELECT cnv.chrom, cnv.cnv_start, cnv.cnv_terminate, gene.tx_name, gene.gene_name
        FROM samples cnv LEFT JOIN refgene gene
        WHERE cnv.chrom == gene.chrom 
        AND (
        (cnv.cnv_start >= gene.tx_start AND cnv.cnv_start <= gene.tx_end)
        OR
        (cnv.cnv_terminate >= gene.tx_start AND cnv.cnv_terminate <= gene.tx_end)
        OR
        (cnv.cnv_start <= gene.tx_start AND cnv.cnv_terminate >= gene.tx_end)
        )
        """
        # drop_duplicates(): make sure the case that CNV spread multiple txs but each gene to be counted only once
    return sqldf(query).drop_duplicates(subset=("chrom", "cnv_start", "cnv_terminate", "gene_name"))

In [12]:
def get_causal_genes(causal_genes, sample_genes):
    '''get causal genes for each simulated sample'''
    return [x for x in causal_genes if x in sample_genes]

In [13]:
def p_case(p, num_causal_genes_in_sample, sim_args):
    if num_causal_genes_in_sample == 0:
        return p
    baseline_odds = p / (1 - p)
    if sim_args["odds_ratio_params"]["shape"] is None or sim_args["odds_ratio_params"]["scale"] is None:
        odds_ratio = 1
    else:
        odds_ratio = np.prod([np.random.gamma(sim_args["odds_ratio_params"]['shape'], 
                                              sim_args["odds_ratio_params"]['scale']) 
                              for x in range(num_causal_genes_in_sample)])
    # obtain the power of fisher test by setting odds ratio to 1
    # odds_ratio = 1
    odds = baseline_odds * odds_ratio
    return odds / (1 + odds)

In [14]:
while(status):
    sample_len = sample_cnv_length(cnv_length, args['avg_cnv_per_individual'])
    sample_blocks = get_sample_blocks(block_counts, len(sample_len))
    samples = assign_cnv_to_sample(sample_blocks, sample_len, args['block_size'])
    samples = annotate_samples(samples, refgene)
    causal_genes_in_sample = get_causal_genes(args.causal_genes, samples['gene_name'].tolist())
    p = p_case(args['prevalence'], len(causal_genes_in_sample), args)
    debug['number of genes overlap CNV'].append(len( set(samples['gene_name'].tolist()) ))
    if random.random() < p and len(case_data) < args['n_case']:
        # sample data is a case
        case_data.append(samples)
        debug['p'].append(p)
        debug['number of causal genes in case'].append(len(causal_genes_in_sample))
        debug['simulated CNV length in case'].extend(sample_len)
    if random.random() > p and len(ctrl_data) < args['n_ctrl']:
        # sample data is a control
        ctrl_data.append(samples)
        debug['p'].append(p)
        debug['number of causal genes in ctrl'].append(len(causal_genes_in_sample))
        debug['simulated CNV length in ctrl'].extend(sample_len)
    if len(case_data) == args['n_case'] and len(ctrl_data) == args['n_ctrl']:
        status = 0
    debug['niter'] += 1
debug['time'][1] = str(datetime.now())

In [15]:
case_data

[   chrom  cnv_start  cnv_terminate    tx_name gene_name
 0  chr16   12298062       12418629  NM_032167     SNX29
 1   chr2   43953144       44109485  NM_015522  DYNC2LI1
 3   chr2   43953144       44109485  NM_022436     ABCG5
 4   chr2   43953144       44109485  NM_022437     ABCG8
 5   chr2   43953144       44109485  NM_172069   PLEKHH2,
     chrom  cnv_start  cnv_terminate       tx_name     gene_name
 0    chr1  109136721      109276393  NM_001010883       FAM102B
 1    chr1  109136721      109276393  NM_001102592        HENMT1
 2    chr1  109136721      109276393  NM_001144937         FNDC7
 3    chr1  109136721      109276393     NM_018061       PRPF38B
 6   chr10  125701683      126316176  NM_001146340        NKX1-2
 7   chr10  125701683      126316176  NM_001167880          LHPP
 8   chr10  125701683      126316176  NM_001270764        CHST15
 11  chr10  125701683      126316176  NM_001322971           OAT
 12  chr10  125701683      126316176     NM_014661        FAM53B
 15  ch

In [16]:
ctrl_data

[    chrom  cnv_start  cnv_terminate       tx_name    gene_name
 0    chr1   42687025       43128455  NM_001077447         PPCS
 1    chr1   42687025       43128455  NM_001080850       CCDC30
 2    chr1   42687025       43128455  NM_001198850        FOXJ3
 7    chr1   42687025       43128455     NM_006347         PPIH
 9    chr1   42687025       43128455     NM_032257      ZMYND12
 10   chr1   42687025       43128455     NM_173642       RIMKLA
 11  chr20   14833449       14948259     NM_080676      MACROD2
 12  chr20   14833449       14948259     NR_037841  MACROD2-AS1
 14   chr3    4149178        4447788  NM_001164675        SUMF1
 15   chr3    4149178        4447788  NM_001243723       SETMAR
 17   chr5  140212863      140362074     NM_014005       PCDHA9
 18   chr5  140212863      140362074     NM_018898      PCDHAC1
 19   chr5  140212863      140362074     NM_018899      PCDHAC2
 20   chr5  140212863      140362074     NM_018900       PCDHA1
 21   chr5  140212863      140362074    

In [15]:
debug["number of causal genes in case"]

[0,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 2,
 0,
 1,
 2,
 0,
 1,
 0,
 0,
 0,
 1,
 2,
 3,
 0,
 1,
 0,
 0,
 2,
 2,
 1,
 2,
 0,
 1,
 1,
 0,
 1,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 2,
 0,
 0,
 0,
 3,
 1,
 0,
 0,
 1,
 0,
 2,
 1,
 1,
 0,
 0,
 4,
 4,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 2,
 0,
 0,
 3,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 2,
 0,
 0,
 1,
 0,
 0,
 1,
 1,
 1,
 0,
 2,
 0,
 1,
 2,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 2,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 1,
 1,
 0,
 3,
 1,
 0,
 1,
 2,
 0,
 2,
 2,
 3,
 0,
 2,
 0,
 1,
 2,
 0,
 2,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 2,
 1,
 2,
 1,
 0,
 0,
 1,
 0,
 2,
 2,
 1,
 2,
 0,
 2,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 3,
 1,
 1,
 0,
 1,
 2,
 3,
 2,
 1,
 0,
 0,
 2,
 2,
 1,
 0,
 1,
 0,
 0,
 1,
 2,
 0,
 1,
 1,
 0,
 1,
 0,
 0,
 1,
 0,
 1,
 3,
 0,
 2,
 0,
 3,
 4,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 3,
 0,
 4,
 0,
 3,
 0]

In [16]:
debug["number of causal genes in ctrl"]

[0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0]

In [17]:
def save_data(data, filename):
    pickle.dump(data, open(filename, "wb"))
sample_data = {'case': case_data, 'ctrl': ctrl_data, 'debug': debug}
save_data(sample_data, 'calcium_pathway_500.data.pkl')

In [23]:
from pprint import pprint
causal_genes = load_pthwy_gene("../data/calciumgeneset.txt")["gene_name"].tolist()
print (len(causal_genes))

178
