# Python utilities

This notebook is the central collection of all Python functions and classes that are used repeatedly across the project; it is exported as the Python module `utils.py`. **This notebook should not (and cannot) be executed as is!**

## Load modules

In [1]:
import os, sys
import random
from random import randint
import pandas as pd
from pandasql import sqldf
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy as sp
from scipy import stats
import glob
import math
from sklearn.mixture import GaussianMixture
from datetime import datetime
import pickle
from fisher import pvalue

## File I/O functions

In [2]:
def load_reference_gene(filename):
    '''Read a gzip-compressed, tab-separated reference gene table.

    The file is read without a header. Only the columns at positions
    1, 2, 4, 5 and 12 are kept and named ``tx_name``, ``chrom``,
    ``tx_start``, ``tx_end`` and ``gene_name``. Rows that share the same
    (chrom, tx_start, tx_end) are reduced to their first occurrence.
    '''
    column_positions = (1, 2, 4, 5, 12)
    column_names = ["tx_name", "chrom", "tx_start", "tx_end", "gene_name"]
    transcripts = pd.read_table(filename, compression="gzip", sep="\t",
                                header=None, usecols=column_positions,
                                names=column_names)
    region_key = ("chrom", "tx_start", "tx_end")
    return transcripts.drop_duplicates(subset=region_key)

def load_pthwy_gene(filename, n_skiprow = 2):
    '''Load a pathway gene list, for example the calcium pathway genes.

    The first `n_skiprow` lines of the file are skipped. The remaining
    lines are read without a header row into a single column that is
    named "gene_name" in the returned DataFrame.
    '''
    reader_options = dict(skiprows=n_skiprow, header=None, names=["gene_name"])
    return pd.read_table(filename, **reader_options)

def load_cnv_data(filename):
    '''Load CNV intervals from a multi-track, BED-like text file.

    Lines that start with "chr" are data rows of the form
    ``chrom start end [...]``. Any other non-blank line opens a new dataset:
    the dataset name is the second whitespace-separated token of that line,
    with a leading ``name=`` prefix removed if present
    (e.g. ``track name=delCases ...`` gives ``delCases``). Blank lines are
    ignored. A repeated dataset name restarts that dataset.

    Returns
    -------
    dict
        Maps each dataset name to a DataFrame with columns ``chrom``,
        ``cnv_start`` and ``cnv_end``; intervals are sorted within each
        chromosome and duplicate intervals are dropped.

    Raises
    ------
    ValueError
        If a header line has no second token, or a data row appears before
        any header line.
    '''
    name_prefix = "name="
    cnvbed = {}
    dataset = None
    # `with` guarantees the handle is closed even if parsing fails.
    with open(filename) as handle:
        for raw_line in handle:
            fields = raw_line.split()
            if not fields:
                continue
            if not raw_line.startswith("chr"):
                if len(fields) < 2:
                    raise ValueError("Cannot read a dataset name from header line: {!r}".format(raw_line))
                dataset = fields[1]
                # Remove the literal prefix only. str.lstrip("name=") would strip any
                # leading run of the characters n/a/m/e/= and mangle names such as
                # "ampCases".
                if dataset.startswith(name_prefix):
                    dataset = dataset[len(name_prefix):]
                cnvbed[dataset] = {}
                continue
            if dataset is None:
                raise ValueError("CNV data row found before any dataset header: {!r}".format(raw_line))
            cnvbed[dataset].setdefault(fields[0], []).append((int(fields[1]), int(fields[2])))

    cnvbed_df = {}
    for dataset, by_chrom in cnvbed.items():
        columns = {"chrom": [], "cnv_start": [], "cnv_end": []}
        for chrom, intervals in by_chrom.items():
            for start, end in sorted(intervals):
                columns["chrom"].append(chrom)
                columns["cnv_start"].append(start)
                columns["cnv_end"].append(end)
        cnvbed_df[dataset] = pd.DataFrame.from_dict(columns).drop_duplicates(
                                                subset=("chrom", "cnv_start", "cnv_end"))
    return cnvbed_df

def save_data(data, filename):
    '''Pickle `data` into `filename`.

    The file is opened in binary write mode inside a `with` block so that it
    is flushed and closed as soon as the dump finishes (or fails).
    '''
    with open(filename, "wb") as handle:
        pickle.dump(data, handle)

def load_data(filename):
    '''Return the object stored in the pickle file `filename`, read via pandas.

    NOTE: unpickling can execute arbitrary code, so only load files from a
    trusted source.
    '''
    loaded = pd.read_pickle(filename)
    return loaded

## Global variables

In [3]:
class Environment(dict):
    '''Simulation settings.

    The settings themselves are stored as dict items; the causal gene list and
    the random seed are stored as the attributes `causal_genes` and `seed`.
    '''
    def __init__(self):
        super().__init__()
        self.update(block_size=100000,
                    avg_cnv_per_individual=5,
                    n_case=5,
                    n_ctrl=5,
                    # Gamma shape / scale used when drawing per-gene odds ratios.
                    # (An earlier note suggested `odds_ratio_params = None` for H_0.)
                    odds_ratio_params={'shape': 25, 'scale': 1},
                    prevalence=0.005,
                    n_causal_gene=200,
                    refgene_file='../data/refGene.txt.gz',
                    pthwy_gene_file='../data/calciumgeneset.txt',
                    cnv_file='../data/ISC-r1.CNV.bed',
                    case_dataset='delCases',
                    ctrl_dataset='delControls',
                    output='del_sample',
                    ncpu=6)
        # Causal genes are taken from the pathway gene file. The earlier alternative,
        # drawing `n_causal_gene` names at random from the reference genes, is disabled.
        pathway = load_pthwy_gene(self["pthwy_gene_file"])
        self.causal_genes = pathway["gene_name"].tolist()
        self.seed = 12

# Module-level default settings; building it reads the pathway gene file.
args = Environment()

## CNV data processing

In [4]:
def count_cnv_by_block(df, block_size):
    '''Count CNV start positions per genomic block on every chromosome.

    For each chromosome, the span between the smallest and the largest
    `cnv_start` is divided into ``int(span / block_size) + 1`` equal-width
    histogram bins, and the number of CNV starts in each bin is counted.

    Parameters
    ----------
    df : DataFrame with at least the columns ``chrom`` and ``cnv_start``.
    block_size : approximate block width (e.g. 100000).

    Returns
    -------
    list of ``((chrom, block_position), count)`` where `block_position` is the
    bin's left edge truncated to int, and `count` is the number of CNV starts
    in the bin plus 0.5. Most blocks contain no CNV start; the 0.5 pseudo-count
    lets a CNV start within any block. Chromosomes are emitted in sorted order
    so that the list, and any seeded sampling based on its indices, is
    reproducible from run to run.
    '''
    block_counts = []
    # groupby walks the frame once and yields chromosomes in sorted order;
    # iterating a set() of names would depend on string hashing.
    for chrom, starts in df.groupby('chrom', sort=True)['cnv_start']:
        data = starts.tolist()
        if not data:
            continue
        start_pos = min(data)
        end_pos = max(data)
        n_bins = int((end_pos - start_pos) / block_size) + 1
        counts, edges = np.histogram(data, bins = n_bins, range = (start_pos, end_pos))
        # zip() stops at the shorter `counts`, which drops the final right-hand edge.
        block_counts.extend(((chrom, int(edge)), count + 0.5) for edge, count in zip(edges, counts))
    return block_counts

def fit_truncated_gaussian_mix(x, k = 10):
    '''Fit a single-component Gaussian mixture to `x` mirrored around zero.

    The values in `x` are concatenated with their negations and the combined
    sample is fitted as one feature column. The caller's `x` is not modified.

    Parameters
    ----------
    x : 1-D sequence or array of numbers.
    k : currently unused (the model is always fitted with one component);
        kept so existing calls keep working.

    Returns
    -------
    The fitted GaussianMixture object.
    '''
    values = np.asarray(x, dtype=float).ravel()
    # list.extend() returns None, so the mirrored sample must be built as a new
    # array; it is reshaped to (n_samples, 1) because fit() takes 2-D input.
    mirrored = np.concatenate([values, -values]).reshape(-1, 1)
    clf = GaussianMixture(n_components=1, covariance_type='full')
    clf.fit(mirrored)
    return clf

def sample_cnv_length(data, mean_num_cnv):
    '''Draw CNV lengths for one simulated individual.

    The number of CNVs is Poisson distributed with mean `mean_num_cnv`; that
    many lengths are then sampled from `data` with numpy's default `choice`
    settings.
    '''
    n_cnv = np.random.poisson(mean_num_cnv)
    return np.random.choice(data, n_cnv)

def get_sample_blocks(block_counts, num_cnv):
    '''Sample `num_cnv` blocks from the genome-wide block list.

    `block_counts` holds ``(block_key, weight)`` pairs. Blocks are drawn with
    probability proportional to their weight, and the chosen block keys are
    returned in sorted order.
    '''
    weights = np.array([entry[1] for entry in block_counts])
    probabilities = weights / sum(weights)
    picked = np.random.choice(range(len(block_counts)), num_cnv, p = probabilities)
    chosen = [block_counts[idx][0] for idx in picked]
    chosen.sort()
    return chosen

def assign_cnv_to_sample(sample_blocks, sample_len, block_size):
    '''Place one CNV in each sampled block.

    For every ``(block, length)`` pair, a start position is drawn with
    `randint` between the block position and the block position plus
    `block_size` (both ends included). The CNV ends at start + int(length).
    Returns a DataFrame with the columns ``chrom``, ``cnv_start`` and
    ``cnv_terminate``.
    '''
    pairs = list(zip(sample_blocks, sample_len))
    starts = [randint(block[1], block[1] + block_size) for block, _ in pairs]
    ends = [start + int(length) for start, (_, length) in zip(starts, pairs)]
    chroms = [block[0] for block, _ in pairs]
    return pd.DataFrame({'chrom': chroms, 'cnv_start': starts, 'cnv_terminate': ends})

def annotate_samples(samples, gene_df):
    '''Annotate simulated CNVs with the genes they overlap.

    A CNV and a transcript are paired when they are on the same chromosome
    and (a) the CNV start lies inside the transcript, (b) the CNV end lies
    inside the transcript, or (c) the CNV covers the whole transcript.

    samples: DataFrame with columns chrom, cnv_start, cnv_terminate.
    gene_df: DataFrame with columns chrom, tx_start, tx_end, tx_name, gene_name.

    Returns a DataFrame with columns chrom, cnv_start, cnv_terminate, tx_name
    and gene_name, holding one row per (CNV, gene_name) pair. Because the
    WHERE clause compares against gene columns, CNVs that overlap no
    transcript do not appear in the result.
    '''
    # NOTE(review): sqldf() receives only the query text, so the table names
    # `samples` and `gene_df` in the SQL presumably have to match the names of
    # the local variables above -- confirm before renaming either parameter.
    query = """
        SELECT cnv.chrom, cnv.cnv_start, cnv.cnv_terminate, gene.tx_name, gene.gene_name
        FROM samples cnv LEFT JOIN gene_df gene
        WHERE cnv.chrom == gene.chrom 
        AND (
        (cnv.cnv_start >= gene.tx_start AND cnv.cnv_start <= gene.tx_end)
        OR
        (cnv.cnv_terminate >= gene.tx_start AND cnv.cnv_terminate <= gene.tx_end)
        OR
        (cnv.cnv_start <= gene.tx_start AND cnv.cnv_terminate >= gene.tx_end)
        )
        """
    # drop_duplicates(): a CNV may overlap several transcripts of the same gene,
    # but each gene must be counted only once per CNV.
    return sqldf(query).drop_duplicates(subset=("chrom", "cnv_start", "cnv_terminate", "gene_name"))

## Simulation related codes

In [5]:
def get_causal_genes(causal_genes, sample_genes):
    '''Return the causal genes that occur in a simulated sample.

    The result follows the order of `causal_genes` and keeps any repeated
    entries of that list.
    '''
    found = []
    for gene in causal_genes:
        if gene in sample_genes:
            found.append(gene)
    return found

def get_ccnv():
    '''Placeholder for retrieving causal CNVs; not implemented, always returns None.'''
    causal_cnvs = None
    return causal_cnvs
    
def p_case(p, num_causal_genes_in_sample, shape, scale):
    '''Probability that a simulated individual is a case.

    With no causal gene in the sample the baseline probability `p` is returned
    unchanged. Otherwise the baseline odds ``p / (1 - p)`` are multiplied by
    one Gamma(shape, scale) odds ratio per causal gene and converted back to a
    probability. When `shape` or `scale` is None the odds ratio is 1, i.e. no
    effect.
    '''
    if num_causal_genes_in_sample == 0:
        return p
    baseline = p / (1 - p)
    has_effect = not (shape is None or scale is None)
    multiplier = 1
    if has_effect:
        draws = [np.random.gamma(shape, scale)
                 for _ in range(num_causal_genes_in_sample)]
        multiplier = np.prod(draws)
    case_odds = baseline * multiplier
    return case_odds / (1 + case_odds)

def _record_sample(debug, label, p, n_causal, sample_len, n_genes, lock = None):
    '''Append one accepted sample's diagnostics to the `debug` lists for `label` ("case" or "ctrl").'''
    additions = (('p in {}'.format(label), [p]),
                 ('number of causal genes in {}'.format(label), [n_causal]),
                 ('simulated CNV length in {}'.format(label), list(sample_len)),
                 ('number of genes overlap CNV in {}'.format(label), [n_genes]))
    if lock is not None:
        lock.acquire()
    try:
        for key, new_items in additions:
            # Read, extend and write back each list explicitly. A Manager dict hands
            # out copies of its values, so an in-place append alone would be lost,
            # and without the lock two workers could overwrite each other's update.
            values = debug[key]
            values.extend(new_items)
            debug[key] = values
    finally:
        if lock is not None:
            lock.release()

def simulate_core(case_data, ctrl_data, debug, args, cnv_length, block_counts, refgene, N1, N2,
                  seed = None, lock = None):
    '''Simulate individuals until `N1` cases and `N2` controls have been collected.

    Each iteration draws a set of CNVs, annotates them with genes, computes the
    case probability from the number of causal genes hit, and then draws the
    disease status once: the individual is a case if the draw is below that
    probability, and a control if it is above. Individuals whose group is
    already full are discarded.

    Parameters
    ----------
    case_data, ctrl_data : list-like (plain or Manager lists) receiving the
        annotated sample tables.
    debug : dict-like with the 'p in ...', 'number of causal genes in ...',
        'simulated CNV length in ...' and 'number of genes overlap CNV in ...'
        lists for both "case" and "ctrl".
    args : settings mapping with a `causal_genes` attribute. Its
        "odds_ratio_params" entry may be None (no effect, i.e. H_0).
    cnv_length, block_counts, refgene : inputs for the sampling helpers.
    N1, N2 : number of cases / controls to collect.
    seed : optional int. When given, both numpy's and Python's global random
        generators are re-seeded at the start of the call, so that worker
        processes do not replay one another's random stream.
    lock : optional lock held while `debug` is updated.

    Returns
    -------
    0 once both quotas are filled.
    '''
    if seed is not None:
        np.random.seed(seed)
        random.seed(seed)
    odds_ratio_params = args["odds_ratio_params"] or {}
    shape = odds_ratio_params.get("shape")
    scale = odds_ratio_params.get("scale")
    cnt1 = 0
    cnt2 = 0
    while cnt1 < N1 or cnt2 < N2:
        sample_len = sample_cnv_length(cnv_length, args["avg_cnv_per_individual"])
        sample_blocks = get_sample_blocks(block_counts, len(sample_len))
        samples = assign_cnv_to_sample(sample_blocks, sample_len, args["block_size"])
        samples = annotate_samples(samples, refgene)
        sample_genes = samples['gene_name'].tolist()
        causal_genes_in_sample = get_causal_genes(args.causal_genes, sample_genes)
        p = p_case(args["prevalence"], len(causal_genes_in_sample), shape, scale)
        # One draw decides the status, so an individual can never be recorded
        # as both a case and a control.
        draw = random.random()
        if draw < p and cnt1 < N1:
            case_data.append(samples)
            _record_sample(debug, 'case', p, len(causal_genes_in_sample), sample_len,
                           len(set(sample_genes)), lock)
            cnt1 += 1
        elif draw > p and cnt2 < N2:
            ctrl_data.append(samples)
            _record_sample(debug, 'ctrl', p, len(causal_genes_in_sample), sample_len,
                           len(set(sample_genes)), lock)
            cnt2 += 1
    return 0

def simulate(refgene, cnv_data, args):
    '''Simulate `args['n_case']` cases and `args['n_ctrl']` controls in parallel.

    Block weights and the CNV length distribution are derived from the
    de-duplicated `cnv_data`. One `simulate_core` task is submitted per
    individual to a pool of `args['ncpu']` processes. Every task gets its own
    seed, drawn from a generator seeded with `args.seed`.

    Returns a dict with the keys 'case' and 'ctrl' (lists of annotated sample
    tables) and 'debug' (settings, start / end time and per-sample diagnostics).
    Exceptions raised inside workers are re-raised here.
    '''
    from multiprocessing import Pool, Manager
    np.random.seed(args.seed)
    df = cnv_data.drop_duplicates(subset=("chrom", "cnv_start", "cnv_end"))
    block_counts = count_cnv_by_block(df, args['block_size'])
    cnv_length = df['cnv_end'] - df['cnv_start']
    debug_init = {'p in case': [], 'p in ctrl': [], 'args': dict(args), 'time': [str(datetime.now()), None],
                  'causal genes': args.causal_genes, 'number of genes overlap CNV in case': [],
                  'number of genes overlap CNV in ctrl': [],
                  'number of causal genes in case': [], 'number of causal genes in ctrl': [],
                  'simulated CNV length in case': [], 'simulated CNV length in ctrl': [], 'seed': args.seed}
    N1 = [1] * args['n_case'] + [0] * args['n_ctrl']
    N2 = [0] * args['n_case'] + [1] * args['n_ctrl']
    # Forked workers inherit the parent's numpy random state; without a seed of
    # their own they would all generate the same sequence.
    task_seeds = np.random.randint(0, 2**31 - 1, size=len(N1)).tolist()
    # A single manager, shut down when the block exits.
    with Manager() as manager:
        case_data = manager.list()
        ctrl_data = manager.list()
        debug = manager.dict(debug_init)
        debug_lock = manager.Lock()
        pool = Pool(args['ncpu'])
        # elog keeps the async results so that worker exceptions can be re-raised
        elog = [pool.apply_async(simulate_core,
                                 args = (case_data, ctrl_data, debug, args, cnv_length, block_counts,
                                         refgene, x, y, task_seed, debug_lock))
                for x, y, task_seed in zip(N1, N2, task_seeds)]
        pool.close()
        pool.join()
        for item in elog:
            item.get()
        result = {'case': list(case_data), 'ctrl': list(ctrl_data), 'debug': dict(debug)}
    result['debug']['time'][1] = str(datetime.now())
    return result

def run_simulation(args, simulation_id = None):
    '''Run one simulation and pickle its result.

    Loads the reference genes and the CNV tracks named in `args`, simulates
    samples from the concatenated case and control CNV datasets, and saves the
    result to "<output>.data.pkl", or "<output>_<simulation_id>.data.pkl" when
    `simulation_id` is given. Returns the simulated data.
    '''
    ref_gene = load_reference_gene(args['refgene_file'])
    cnv_tracks = load_cnv_data(args['cnv_file'])
    observed_cnv = pd.concat([cnv_tracks[args['case_dataset']], cnv_tracks[args['ctrl_dataset']]])
    sample_data = simulate(ref_gene, observed_cnv, args)
    if simulation_id is not None:
        suffix = '_{}'.format(simulation_id)
    else:
        suffix = ''
    save_data(sample_data, '{}{}.data.pkl'.format(args['output'], suffix))
    return sample_data

## Codes for data analysis

In [None]:
def get_analysis_blocks(data):
    '''Determine from `data` table (pd.DataFrame) independent genomic blocks.
       For simulated samples use `pd.concat([pd.concat(samples['case']), pd.concat(samples['ctrl'])])`
       as input.

       `data` needs the columns chrom, cnv_start, cnv_terminate, tx_name and gene_name.
       Within each chromosome, CNV intervals that overlap, directly or through a chain of
       overlapping intervals, are merged into one block.

       Returns a list with one entry per block; each entry is the list of sorted, unique
       gene names found in that block.'''
    from scipy.sparse.csgraph import connected_components
    # Merge the overlapping CNV intervals of a single chromosome.
    def reductionFunction(df):
        # create a 2D graph of connectivity between CNV intervals:
        # graph[i, j] is True when interval j starts no later than interval i ends
        # and ends no earlier than interval i starts, i.e. the two intervals overlap
        start = df.cnv_start.values
        end = df.cnv_terminate.values
        graph = (start <= end[:, None]) & (end >= start[:, None])
        # each connected component of the overlap graph is one block
        n_components, indices = connected_components(graph)
        return df.groupby(indices).aggregate({'cnv_start': 'min',
                                              'cnv_terminate': 'max',
                                              'tx_name': lambda x: ','.join(sorted(set(x))),
                                              'gene_name': lambda x: ','.join(sorted(set(x)))})
    # gene names are joined with ',' per block above and split back into lists here
    return [x.split(',') for x in data.groupby(['chrom']).apply(reductionFunction).reset_index('chrom')['gene_name'].tolist()]

def get_gene_table(gene_df):
    '''Build the per-gene 2x2 count table used by the association tests.

    Parameters
    ----------
    gene_df : mapping with the keys "case" and "ctrl", each a list of annotated
        sample DataFrames that have the columns ``chrom`` and ``gene_name``.

    Returns
    -------
    DataFrame with one row per (chrom, gene_name) and the columns
    ``gene_name``, ``n_case_gene``, ``n_ctrl_gene`` (number of rows naming the
    gene among cases / controls, 0 when absent), ``n_case_nogene`` and
    ``n_ctrl_nogene`` (the group's total count over all genes minus the
    gene's own count).
    '''
    counts = {}
    for item in ["case", "ctrl"]:
        # NOTE(review): the SQL refers to this local variable by name ("FROM gene");
        # sqldf() gets only the query text, so presumably the variable must keep
        # this name -- confirm before renaming.
        gene = pd.concat(gene_df[item])
        query = '''
        SELECT chrom, gene_name, count(gene_name)
        FROM gene
        GROUP BY chrom, gene_name
        ORDER BY count(gene_name) DESC
        '''
        counts[item] = sqldf(query)
    gene_table = pd.merge(counts["case"], counts["ctrl"], how = "outer", on = ["chrom", "gene_name"])
    gene_table = gene_table.rename(columns={"count(gene_name)_x": "n_case_gene", "count(gene_name)_y": "n_ctrl_gene"})
    # Genes seen in only one group have a missing count after the outer merge.
    # Assign the filled columns back explicitly: `df[col].fillna(..., inplace=True)`
    # works on a temporary object and does nothing under pandas Copy-on-Write.
    count_columns = ["n_case_gene", "n_ctrl_gene"]
    gene_table[count_columns] = gene_table[count_columns].fillna(0)
    n_gene_case = gene_table["n_case_gene"].sum()
    n_gene_ctrl = gene_table["n_ctrl_gene"].sum()
    gene_table["n_case_nogene"] = n_gene_case - gene_table["n_case_gene"]
    gene_table["n_ctrl_nogene"] = n_gene_ctrl - gene_table["n_ctrl_gene"]
    gene_table = gene_table[["gene_name", "n_case_gene", "n_ctrl_gene", "n_case_nogene", "n_ctrl_nogene"]]
    return gene_table

def get_stats(gene_table, sort = 0):
    '''Per-gene Fisher exact test p-values and odds ratios.

    Parameters
    ----------
    gene_table : DataFrame with the columns ``gene_name``, ``n_case_gene``,
        ``n_ctrl_gene``, ``n_case_nogene`` and ``n_ctrl_nogene``.
    sort : 0 keeps the row order of `gene_table`. Any other value sorts the
        p-value results by decreasing -log10(p) and, independently, the odds
        ratios in decreasing order (infinite odds ratios last).

    Returns
    -------
    dict with
      - "p_value", "logp_2side", "logp_gene": two-sided p-values, their
        -log10, and the matching gene names, all three in the same order;
      - "OR_2side", "OR_gene": odds ratios and their matching gene names.
    '''
    # from website https://pypi.python.org/pypi/fisher/
    stats_table = [(pvalue(row["n_case_gene"], row["n_ctrl_gene"], row["n_case_nogene"], row["n_ctrl_nogene"]), row["gene_name"])
                   for idx, row in gene_table.iterrows()]
    oddsratio_table = [(stats.fisher_exact([[row["n_case_gene"], row["n_ctrl_gene"]], [row["n_case_nogene"], row["n_ctrl_nogene"]]])[0], row["gene_name"])
                       for idx, row in gene_table.iterrows()]
    if not sort == 0:
        stats_table = sorted(stats_table, reverse=True, key = lambda x: -np.log10(x[0].two_tail))
        oddsratio_table = sorted(oddsratio_table, reverse=True, key=lambda x: x[0] if np.isfinite(x[0]) else -x[0])
    # Extract p-values only after the optional sort, so that "p_value" lines up
    # with "logp_2side" and "logp_gene".
    p_value = [x[0].two_tail for x in stats_table]
    logp_2side = [-np.log10(x) for x in p_value]
    logp_gene = [x[1] for x in stats_table]
    OR_2side = [x[0] for x in oddsratio_table]
    OR_gene = [x[1] for x in oddsratio_table]
    stats_table = {"p_value": p_value, "logp_2side": logp_2side, "logp_gene": logp_gene, "OR_2side": OR_2side, "OR_gene": OR_gene}
    return stats_table

def get_stats_from_input(input_data, sort_data = 0):
    '''Compute the per-gene statistics for a saved simulation.

    `input_data` is the path of a file written by the simulation step
    (duplication and deletion samples are stored in separate files).
    `sort_data` is passed on as the `sort` option of `get_stats`.
    '''
    simulated = load_data(input_data)
    return get_stats(get_gene_table(simulated), sort = sort_data)

def run_stats(input_data, output_data):
    '''Compute unsorted per-gene statistics for a saved simulation and pickle them.

    Parameters
    ----------
    input_data : path of a pickled simulation result, i.e. a dict with the keys
        'case', 'ctrl' and 'debug', where ``debug['args']`` holds the
        simulation settings.
    output_data : path the statistics table is written to.

    Returns
    -------
    The statistics dict from `get_stats`, extended with a 'debug' entry that
    stores the simulation settings under 'simulation_args'.
    '''
    # Load once. `input_data` is a file name, so the settings have to be read
    # from the loaded object, not from the path string.
    sample_data = load_data(input_data)
    stats_table = get_stats(get_gene_table(sample_data), sort = 0)
    stats_table['debug'] = {'simulation_args': sample_data['debug']['args']}
    save_data(stats_table, output_data)
    return stats_table

def pkl_to_matrix(input_data, make_block = False, dtype = np.uint8):
    '''Convert a saved simulation into a sample-by-gene indicator table.

    Parameters
    ----------
    input_data : path of a pickled simulation result (keys 'case', 'ctrl',
        'debug'); the reference gene file is taken from
        ``debug['args']['refgene_file']``.
    make_block : if False, return a single table; if True, return one table per
        genomic block from `get_analysis_blocks`, each with the phenotype
        column plus the genes of that block.
    dtype : dtype of the returned table(s).

    Returns
    -------
    dict with
      - "data": DataFrame (or list of DataFrames) with one row per sample,
        cases first and then controls. The column 'phenotype' is 1 for cases
        and 0 for controls; every other column is a gene and holds 1 when the
        sample's annotated CNVs name that gene. Columns whose sum is 0 are
        dropped. Gene columns follow the order of first appearance in the
        reference gene table, so the layout is the same on every run.
      - "debug": the simulation's debug record.
    '''
    dat = load_data(input_data)
    ref = load_reference_gene(dat['debug']['args']["refgene_file"])
    # pd.unique keeps first-appearance order; a set() would give an arbitrary one.
    genes = pd.Series(pd.unique(ref['gene_name']))
    samples = dat['case'] + dat['ctrl']
    indicators = np.array([np.asarray(genes.isin(item["gene_name"]), dtype = float)
                           for item in samples]).reshape(len(samples), len(genes))
    # A plain 2-D column vector; np.matrix is deprecated.
    phenotype = np.array([1.0] * len(dat['case']) + [0.0] * len(dat['ctrl'])).reshape(-1, 1)
    df = pd.DataFrame(np.hstack((phenotype, indicators)), columns = ['phenotype'] + genes.tolist())
    # Keep only columns with at least one non-zero entry, in a single vectorised step.
    newdf = df.loc[:, df.sum(axis = 0) > 0]
    if not make_block:
        res = newdf.astype(dtype, copy = True)
    else:
        blocks = get_analysis_blocks(pd.concat([pd.concat(dat['case']), pd.concat(dat['ctrl'])]))
        res = [newdf[['phenotype'] + item].astype(dtype, copy = True) for item in blocks]
    return {"data": res, "debug": dat['debug']}


def run_dap_lite(df, fout, grid = [(1,1),(2,2),(3,3),(4,4)], prefix = None, exec_path = None):
    '''Write dap input files for `df` and run dap on them.

    Three files are written next to `prefix`:
        - <prefix>.dat: the phenotype row followed by one genotype row per
          non-phenotype column of `df`
        - <prefix>.prior: the same prior, 1 / (number of columns - 1), for each
          of those columns
        - <prefix>.grid: the effect size grid (omega^2 + phi^2 is what we care
          about; 1 1; 2 2; 3 3 and 4 4 by default, as there is only one Y)
    Columns are labelled "chr6.<100000 + column index>". The dap executable
    (`exec_path`, default 'dap/dap') is then run through the shell with its
    output redirected to `fout`. A timestamp is printed before and after.
    '''
    print (str(datetime.now()))
    import os
    if prefix is None:
        import time
        prefix = "/tmp/F" + str(time.time())
    if exec_path is None:
        exec_path = 'dap/dap'
    chrom, pos = 'chr6', 100000
    phenotype_row = ['pheno', 'trait', 'chicago'] + [str(value) for value in df['phenotype']]
    gene_columns = [(idx, name) for idx, name in enumerate(df.columns.values)
                    if not name == 'phenotype']
    labels = ['{}.{}'.format(chrom, pos + idx) for idx, _ in gene_columns]
    dat = [phenotype_row]
    dat += [['geno', label, 'chicago'] + [str(value) for value in df[name]]
            for label, (_, name) in zip(labels, gene_columns)]
    prior = [[label, str(1/(df.shape[1] - 1))] for label in labels]
    for extension, rows in (('.dat', dat), ('.prior', prior), ('.grid', grid)):
        with open(prefix + extension, 'w') as f:
            f.write('\n'.join([' '.join(map(str, row)) for row in rows]))
    os.system("{0} -d {1}.dat -g {1}.grid -t 8 -it 0.05 -prior {1}.prior > {2}".format(exec_path, prefix, fout))
    print (str(datetime.now()))


def run_dap(df, fileout, pthwy_genes, ref_genes, dap_method = "dap", grid = [(0,0.1)], multiplier = 5,  
            exec_path = 'dap', prefix = None, dry_run = False, ncpu = 6):
    '''Convert pandas dataframe to dap input and run (or print) the dap command:
        - phenotype / genotype file (<prefix>.dat)
        - prior file (<prefix>.prior)
        - grid file of effect size (<prefix>.grid): omega^2 + phi^2 is what we care about.
          It is set to (0, beta) for now, as we only have one Y

        df: one row per sample, with the 'phenotype' column first and one column per gene.
        pthwy_genes: DataFrame with a "gene_name" column of pathway genes.
        ref_genes: DataFrame with "gene_name" and "chrom" columns; each gene is labelled
          "<chrom>.<gene>" using the first matching reference row.
        multiplier: if <= 1 every gene gets the uniform prior 1 / n_genes. Otherwise pathway
          genes get multiplier / n_genes and the remaining probability mass is spread evenly
          over the other genes.
        dap_method: default to `dap`, other choices are `dap1`, `dap-g` (dap greedy)
        You can experiment with multiple dap implementations using `dap_method` and `dry_run`.
        For example, with `dry_run = True`, `ncpu = 3` and `dap_method = "dap"` the command
        that would run `dap` is printed on screen, and you can execute it in a terminal.
        Then change `dap_method = "dap1"` and run again to generate the command for `dap1`,
        and execute it in another terminal window.

        Returns [start time, time the input files were written, time the command finished];
        the last entry stays None for a dry run.
        Raises IndexError if a gene column of `df` is not present in `ref_genes`.
    '''
    import time
    times = [time.time(), None, None]
    if prefix is None:
        prefix = "/tmp/F" + str(time.time())
    exec_path = os.path.join(exec_path or '', dap_method)
    # pos is every column after the first, i.e. the gene names when 'phenotype' comes first
    pos = df.columns.values[1:].tolist()
    # the first item in "dat" is the phenotype of all samples (one value per row of df)
    dat = [['pheno', 'trait', 'chicago'] + [str(x) for x in df['phenotype']]]
    prior = []
    gene_columns = [item for item in df.columns.values if not item == "phenotype"]
    if gene_columns:
        # None of these depend on the individual gene, so compute them once.
        n_gene = len(set(pos))
        pthwy_gene_names = set(pthwy_genes["gene_name"])
        n_overlap_gene = len(pthwy_gene_names & set(pos))
        prior_causal = multiplier / n_gene
        div = (n_gene - n_overlap_gene) or 1
        # NOTE: this is negative when multiplier * n_overlap_gene exceeds n_gene.
        prior_noncausal = (1 - prior_causal * n_overlap_gene) / div
        # gene -> chromosome of the first matching reference row, built in one pass
        first_hits = ref_genes.drop_duplicates(subset = "gene_name")
        chrom_of = dict(zip(first_hits["gene_name"], first_hits["chrom"]))
    # item is a gene name
    for item in gene_columns:
        if item not in chrom_of:
            raise IndexError("gene {!r} is not present in ref_genes".format(item))
        label = '{}.{}'.format(chrom_of[item], item)
        dat.append(['geno', label, 'chicago'] + [str(x) for x in df[item]])
        if multiplier <= 1.0:
            prior_pr = str(1 / n_gene)
        else:
            prior_pr = str(prior_causal) if item in pthwy_gene_names else str(prior_noncausal)
        prior.append([label, prior_pr])
    with open(prefix + '.dat', 'w') as f:
        f.write('\n'.join([' '.join(x) for x in dat]))
    with open(prefix + '.prior', 'w') as f:
        f.write('\n'.join([' '.join(x) for x in prior]))
    with open(prefix + '.grid', 'w') as f:
        f.write('\n'.join([' '.join(map(str, x)) for x in grid]))
    times[1] = time.time()
    if dap_method == 'dap':
        cmd = "{0} -d {1}.dat -g {1}.grid -it 0.05 -prior {1}.prior -t {2} >> {3} ".\
            format(exec_path, prefix, ncpu, fileout)
    elif dap_method == 'dap-g':
        cmd =  "{0} -d {1}.dat -g {1}.grid -prior {1}.prior -t {2} >> {3} ".\
            format(exec_path, prefix, ncpu, fileout)
    else:
        cmd = "{0} -d {1}.dat -prior {1}.prior >> {2} ".\
            format(exec_path, prefix, fileout)
    if not dry_run:
        os.system(cmd)
        times[2] = time.time()
    else:
        # generate cmd, then run it under the folder of "analysis"
        print(cmd)
    return times

## Power and type I error

In [7]:
def get_power(stats_table, causal_genes, p = 0.05):
    '''Get power for one simulated dataset.

    The causal genes that appear in `stats_table["genes"]` are looked up (using
    the first entry if a gene is listed more than once) and
    power = #(p_value < p) / #(p_value) is computed over them.

    Parameters
    ----------
    stats_table : dict with the parallel lists "genes" and "p_value".
    causal_genes : list of causal gene names.
    p : significance threshold.

    Returns
    -------
    The fraction of tested causal genes with a p-value below `p`, or NaN when
    none of the causal genes was tested.
    '''
    # One dictionary pass replaces a list.index() scan per causal gene.
    first_p_value = {}
    for gene, value in zip(stats_table["genes"], stats_table["p_value"]):
        first_p_value.setdefault(gene, value)
    overlap_p_value = [first_p_value[gene] for gene in causal_genes if gene in first_p_value]
    if not overlap_p_value:
        # Power is undefined without any tested causal gene.
        return float("nan")
    n_significant = sum(1 for x in overlap_p_value if x < p)
    return n_significant / len(overlap_p_value)

def get_typeIerror(stats_table, causal_genes, p = 0.05):
    '''Get type I error for one simulated dataset.

    Every gene in `stats_table["genes"]` that is not a causal gene is looked up
    (using the first entry if a gene is listed more than once) and
    type I error = #(p_value < p) / #(p_value) is computed over them.

    Parameters
    ----------
    stats_table : dict with the parallel lists "genes" and "p_value".
    causal_genes : list of causal gene names.
    p : significance threshold.

    Returns
    -------
    The fraction of tested non-causal genes with a p-value below `p`, or NaN
    when every tested gene is causal (or nothing was tested).
    '''
    causal_lookup = set(causal_genes)
    # One dictionary pass replaces a list.index() scan per gene.
    first_p_value = {}
    for gene, value in zip(stats_table["genes"], stats_table["p_value"]):
        first_p_value.setdefault(gene, value)
    noncausal_p_value = [first_p_value[gene] for gene in stats_table["genes"]
                         if gene not in causal_lookup]
    if not noncausal_p_value:
        # The error rate is undefined without any tested non-causal gene.
        return float("nan")
    n_significant = sum(1 for x in noncausal_p_value if x < p)
    return n_significant / len(noncausal_p_value)

def test_contingency_table(gene_table, method = "Fisher", option = False): 
    if (method == "Fisher"):
        stats_table = [(pvalue(row["n_case_gene"], row["n_ctrl_gene"], row["n_case_nogene"], row["n_ctrl_nogene"]), 
                        row["gene_name"]) for idx, row in gene_table.iterrows()]
        p_value = [x[0].two_tail for x in stats_table]
        genes = [x[1] for x in stats_table]
        stats_table = {"genes": genes, "p_value": p_value}
    else:
        table = [( stats.chi2_contingency([[row["n_case_gene"], row["n_ctrl_gene"]], [row["n_case_nogene"], row["n_ctrl_nogene"]]], 
                                          correction = option), row["gene_name"] ) 
                 for idx, row in gene_table.iterrows()]
        p_value = [x[0][1] for x in table]
        gene = [x[1] for x in table]
        stats_table = {"p_value": p_value, "genes": gene}
    return stats_table

def get_power_and_typeIerror(input_data, method_option = "chi2", correction_option = False, p_option = 0.05):
    '''Compute power and type I error for one saved simulated dataset.

    The dataset at path `input_data` is loaded and turned into a per-gene count
    table, which is tested with `test_contingency_table` (Fisher or
    chi-square, chosen by `method_option`; `correction_option` is the
    chi-square correction setting). Power and type I error are then computed
    at threshold `p_option`, using the causal genes stored in the dataset's
    debug record. The returned dict has the keys "power", "typeI_error" and
    "debug" (the simulation settings).
    '''
    sample_table = load_data(input_data)
    debug_record = sample_table["debug"]
    causal_genes = debug_record["causal genes"]
    stats_table = test_contingency_table(get_gene_table(sample_table),
                                         method = method_option, option = correction_option)
    summary = {}
    summary["power"] = get_power(stats_table, causal_genes, p = p_option)
    summary["typeI_error"] = get_typeIerror(stats_table, causal_genes, p = p_option)
    summary["debug"] = sample_table["debug"]["args"]
    return summary

def run_power_typeIerror(datasets):
    '''Compute power and type I error for a list of simulated datasets.

    The result maps "dataset_<i>" (i = position in `datasets`, starting at 0)
    to the output of `get_power_and_typeIerror` for that dataset.
    '''
    res = {}
    for position, data in enumerate(datasets):
        res["dataset_{}".format(position)] = get_power_and_typeIerror(data)
    return res

## Obsolete codes
They may still be useful.

### Simulation codes
Here is the version using randomly sampled causal genes, and single process computation (use `ctrl` + `/` on PC or `cmd` + `/` on Mac to comment and uncomment)

In [8]:
# def p_case(p, num_causal_genes_in_sample, sim_args):
#     if num_causal_genes_in_sample == 0:
#         return p
#     baseline_odds = p / (1 - p)
#     if sim_args["odds_ratio_params"]["shape"] is None or sim_args["odds_ratio_params"]["scale"] is None:
#         odds_ratio = 1
#     else:
#         odds_ratio = np.prod([np.random.gamma(sim_args["odds_ratio_params"]['shape'], 
#                                               sim_args["odds_ratio_params"]['scale']) 
#                               for x in range(num_causal_genes_in_sample)])
#     # obtain the power of fisher test by setting odds ratio to 1
#     # odds_ratio = 1
#     odds = baseline_odds * odds_ratio
#     return odds / (1 + odds)

# class Environment(dict):
#     def __init__(self):
#         parameters = {'block_size': 100000,
#                       'avg_cnv_per_individual': 5,
#                       'n_case': 2000,
#                       'n_ctrl': 2000,
#                        # set Gamma shape to be 3 instead of 5
#                        # 'odds_ratio_params' : None # for H_0
#                       'odds_ratio_params': {'shape': 5, 'scale': 1},
#                       'prevalence': 0.005,
#                       'n_causal_gene': 200,
#                       'refgene_file': 'data/refGene.txt.gz',
#                       'cnv_file': 'data/ISC-r1.CNV.bed',
#                       'case_dataset': 'delCases',
#                       'ctrl_dataset': 'delControls',
#                       'output': 'del_sample' 
#                      }
#         self.update(parameters)
#         ## select causal genes randomly, instead of the first 100 in enrichment analysis
#         self.ref_gene_name = load_reference_gene(parameters["refgene_file"])["gene_name"].tolist()
#         self.causal_genes = random.sample(self.ref_gene_name, parameters['n_causal_gene'])
#         self.seed = 999


# def simulate(refgene, cnv_data, args, causal_genes):
#     df = cnv_data.drop_duplicates(subset=("chrom", "cnv_start", "cnv_end"))
#     block_counts = count_cnv_by_block(df, args['block_size'])
#     cnv_length = cnv_data['cnv_end'] - cnv_data['cnv_start']
#     status = 1
#     case_data = []
#     ctrl_data = []
#     debug = {'p': [], 'niter': 0, 'time': [str(datetime.now()), None], 'args': dict(args), 
#              'causal genes': causal_genes, 'number of causal genes': [], 'number of genes overlap CNV': [],
#              'simulated CNV length in case': [], 'simulated CNV length in ctrl': []}
    
#     while(status):
#         sample_len = sample_cnv_length(cnv_length, args['avg_cnv_per_individual'])
#         sample_blocks = get_sample_blocks(block_counts, len(sample_len))
#         samples = assign_cnv_to_sample(sample_blocks, sample_len, args['block_size'])
#         samples = annotate_samples(samples, refgene)
#         causal_genes_in_sample = get_causal_genes(causal_genes, samples['gene_name'].tolist())
#         p = p_case(args['prevalence'], len(causal_genes_in_sample), args)
#         # add the number of causal genes overlapped with simulated CNVs for each simulated sample
#         debug['number of causal genes'].append(len(causal_genes_in_sample))
#         # add the number of genes overlapped with simulated CNVs, both causal and non-causal genes
#         debug['number of genes overlap CNV'].append(len( set(samples['gene_name'].tolist()) ))
#         #debug['p'].append(p)
#         if random.random() < p and len(case_data) < args['n_case']:
#             # sample data is a case
#             case_data.append(samples)
#             debug['p'].append(p)
#             debug['simulated CNV length in case'].extend(sample_len)
#         if random.random() > p and len(ctrl_data) < args['n_ctrl']:
#             # sample data is a control
#             ctrl_data.append(samples)
#             debug['p'].append(p)
#             debug['simulated CNV length in ctrl'].extend(sample_len)
#         if len(case_data) == args['n_case'] and len(ctrl_data) == args['n_ctrl']:
#             status = 0
#         debug['niter'] += 1
#     debug['time'][1] = str(datetime.now())
#     return {'case': case_data, 'ctrl': ctrl_data, 'debug': debug}

# def save_data(data, filename):
#     pickle.dump(data, open(filename, "wb"))

# def load_data(filename):
#     return pickle.load(open(filename, "rb"))


# def run_simulation(args, simulation_id = 0):
#     np.random.seed(args.seed)
#     ref_gene = load_reference_gene(args['refgene_file'])
#     cnv_data = load_cnv_data(args['cnv_file'])
#     sample_data = simulate(ref_gene, pd.concat([cnv_data[args['case_dataset']], cnv_data[args['ctrl_dataset']]]),
#                           args, args.causal_genes)
#     save_data(sample_data, '{}_{}.data.pkl'.format(args['output'], simulation_id))
#     return sample_data

## Export all to module `utils.py`


In [9]:
!python extract_function.py --from Python_Utils.ipynb --to ../analysis/utils.py ../prototype/utils.py utils.py

Extracting from Python_Utils.ipynb
Done!
