In [87]:
import cPickle
import datetime
import glob
import gzip
import os
import random
import shutil
import subprocess
import time
import uuid

import cdpybio as cpb
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pybedtools as pbt
import scipy.stats as stats
import seaborn as sns
import statsmodels.api as sm

import ciepy as cpy
import projectpy as ppy

%matplotlib inline
%load_ext rpy2.ipython

# Seed Python's `random` module with a fixed value so anything drawn from it is repeatable.
random.seed(20150605)

The rpy2.ipython extension is already loaded. To reload it, use:
  %reload_ext rpy2.ipython


In [3]:
# Public and private output directories for this notebook, both rooted at cpy.root.
analysis_name = 'run_eqtl_analysis'
outdir = os.path.join(cpy.root, 'output', analysis_name)
private_outdir = os.path.join(cpy.root, 'private_output', analysis_name)
for directory in (outdir, private_outdir):
    ppy.makedir(directory)

In [4]:
# Gene annotation table read from the gencode_v19 directory, indexed by its first column.
# The 'chrom' column is used below to drop chrX/chrY/chrM genes.
gene_info = pd.read_table('/raid3/projects/CARDIPS/data/public/gencode_v19/gene_info.tsv',
                          index_col=0)

# gene_to_regions is looked up by gene ID and its values are joined with ','
# when the EMMAX command lines are built.
# NOTE(review): unpickling can execute arbitrary code; only load pickles
# produced by this project.
fn = os.path.join(cpy.root, 'output', 'eqtl_input', 'gene_to_regions.p')
with open(fn, 'rb') as f:  # close the handle instead of leaking it
    gene_to_regions = cPickle.load(f)

# Table read from vst_counts.tsv; its index supplies the candidate gene IDs for `todo`.
vsd = pd.read_table(os.path.join(cpy.root, 'output', 'eqtl_input', 'vst_counts.tsv'), index_col=0)

# Run EMMAX

In [188]:
# Genes still to run: everything in vsd's index that has no entry yet under
# test_results, excluding genes annotated on chrX, chrY or chrM.
results_glob = os.path.join(outdir, 'test_results', '*')
already_run = set(os.path.basename(path) for path in glob.glob(results_glob))
excluded_chroms = ['chrX', 'chrY', 'chrM']
todo = [gene_id for gene_id in set(vsd.index) - already_run
        if gene_info.ix[gene_id, 'chrom'] not in excluded_chroms]

## PBS

In [172]:
def run_emmax_pbs(gene_ids, n=10):
    """
    Write and submit one PBS job that runs scripts/run_emmax.py for up to n genes.

    gene_ids is scanned in order. A gene counts as not yet started when it has
    no directory under <outdir>/test_results; that directory is created here,
    which is what keeps later calls from picking the same gene again.

    Parameters
    ----------
    gene_ids : list
        Gene IDs to consider. Each selected ID must be a key of the
        module-level gene_to_regions.
    n : int
        Maximum number of genes to put into the job.

    Returns
    -------
    None. If no gene is left to run, no script is written and no job is
    submitted. If fewer than n genes are left, the job covers just those.
    """
    results_dir = os.path.join(outdir, 'test_results')
    scripts_dir = os.path.join(results_dir, 'pbs_scripts')

    # Bounded scan: stop at n genes or at the end of gene_ids, whichever comes
    # first (an unbounded index walk raises IndexError near the end of the list).
    genes_todo = []
    for candidate in gene_ids:
        if len(genes_todo) >= n:
            break
        if not os.path.exists(os.path.join(results_dir, candidate)):
            genes_todo.append(candidate)
    if not genes_todo:
        return

    # Timestamp tag with one-second resolution, shared by the script file name,
    # the job name and the stdout/stderr file names.
    date = datetime.datetime.now().strftime('%Y-%m-%d-%H-%M-%S')
    # Calls made within the same second would otherwise overwrite each other's
    # script, so add a numeric suffix until the file name is unused.
    tag = date
    suffix = 1
    while os.path.exists(os.path.join(scripts_dir, '{}.pbs'.format(tag))):
        tag = '{}-{}'.format(date, suffix)
        suffix += 1
    fn = os.path.join(scripts_dir, '{}.pbs'.format(tag))

    eqtl_input = os.path.join(cpy.root, 'output', 'eqtl_input')
    with open(fn, 'w') as f:
        # PBS header: queue, job name, resources, stdout/stderr locations.
        f.write('#!/bin/bash\n#PBS -q medium\n')
        f.write('#PBS -N emmax_{}\n'.format(tag))
        f.write('#PBS -l nodes=1:ppn=2\n')
        f.write('#PBS -o {}/emmax_{}.out\n'.format(scripts_dir, tag))
        f.write('#PBS -e {}/emmax_{}.err\n\n'.format(scripts_dir, tag))
        # Environment setup executed by the job before the EMMAX commands.
        f.write('source activate cardips\n')
        f.write('source /raid3/projects/CARDIPS/other_repos/'
                'cardips-data-software/environment.sh\n\n')
        for gene_id in genes_todo:
            toutdir = os.path.join(results_dir, gene_id)
            ppy.makedir(toutdir)
            # One backslash-continued command per gene; the gene ID is the first
            # token on its own line, which the bookkeeping cell relies on when it
            # collects lines starting with 'ENS'.
            c = 'python {} \\\n\t'.format(os.path.join(cpy.root, 'scripts', 'run_emmax.py'))
            c += ' \\\n\t'.join([
                    gene_id,
                    os.path.join(cpy.root, 'private_data', 'wgs', 'merged_pass_uuid_ann_ref.vcf.gz'),
                    ','.join(gene_to_regions[gene_id]),
                    os.path.join(eqtl_input, 'phe_vst_counts.tsv'),
                    os.path.join(eqtl_input, 'emmax.ind'),
                    os.path.join(cpy.root, 'output', 'kinship_matrix', 'wgs.kin'),
                    toutdir,
                    '-c {}'.format(os.path.join(eqtl_input, 'emmax.cov')),
                ])
            f.write(c + '\n\n')
    ppy.submit_job(fn)

In [None]:
# Ten rounds of five submissions (up to 20 genes per job), pausing 120 seconds
# after each round.
for i in range(10):
    for _submission in range(5):
        run_emmax_pbs(todo, n=20)
    time.sleep(120)

In [186]:
# Find PBS scripts that produced no results and whose job name is not listed in
# jnames.txt, so they can be submitted again.
results_dir = ('/raid3/projects/CARDIPS/analysis/cardips-ipsc-eqtl/output/'
               'run_eqtl_analysis/test_results')
pbs_dir = os.path.join(results_dir, 'pbs_scripts')

# The last whitespace-separated field of each line is taken as a job name.
# NOTE(review): presumably a dump of the scheduler's job list -- confirm how
# jnames.txt is produced.
with open(os.path.join(pbs_dir, 'jnames.txt')) as f:
    # Blank lines are skipped; ''.split()[-1] would raise IndexError.
    jnames = [x.split()[-1] for x in f if x.strip()]

pbs = glob.glob(os.path.join(pbs_dir, '*.pbs'))

# done maps each script path to the genes in it that already have a
# minimum_pvalues.tsv in their results directory.
done = {}
for p in pbs:
    with open(p) as f:
        # Gene IDs are the first token of the script lines that start with 'ENS'.
        lines = [x.split()[0] for x in f if x.strip().startswith('ENS')]
    done[p] = [g for g in lines
               if os.path.exists(os.path.join(results_dir, g, 'minimum_pvalues.tsv'))]

# A script is "not started" when none of its genes finished and no job named
# emmax_<script name without extension> appears in jnames.
not_done = []
for k, finished in done.items():
    n = os.path.basename(k)
    job_name = 'emmax_{}'.format(os.path.splitext(n)[0])
    if not finished and job_name not in jnames:
        not_done.append(k)
        print('{} not started'.format(n))

In [180]:
# Hand every script path collected in not_done to ppy.submit_job again.
for fn in not_done:
    ppy.submit_job(fn)

In [None]:
# NOTE(review): incomplete expression -- running this cell raises a SyntaxError.
# Presumably a deliberate barrier so "Run All" does not continue into the Local
# section below; confirm, and if so prefer an explicit, explained stop.
3 + 

## Local

In [48]:
def run_emmax(gene_id):
    """
    Run scripts/run_emmax.py for one gene on the local machine.

    Does nothing if <outdir>/test_results/<gene_id> already exists. Otherwise
    creates that directory, writes the command to <gene_id>.sh inside it and
    executes the script with bash.

    Relies on these names being defined wherever the function runs: os,
    subprocess, cpy, ppy, outdir and gene_to_regions.

    Parameters
    ----------
    gene_id : str
        Gene to test; must be a key of gene_to_regions.

    Raises
    ------
    subprocess.CalledProcessError
        If the generated script exits with a non-zero status.
    """
    # NOTE(review): hard-coded working directory; presumably run_emmax.py or its
    # inputs expect to be launched from the notebooks directory -- confirm.
    os.chdir('/raid3/projects/CARDIPS/analysis/cardips-ipsc-eqtl/notebooks')
    toutdir = os.path.join(outdir, 'test_results', gene_id)
    if os.path.exists(toutdir):
        # An existing directory marks the gene as already started.
        return
    ppy.makedir(toutdir)

    eqtl_input = os.path.join(cpy.root, 'output', 'eqtl_input')
    script_args = [
        gene_id,
        os.path.join(cpy.root, 'private_data', 'wgs', 'merged_pass_uuid_ann_ref.vcf.gz'),
        ','.join(gene_to_regions[gene_id]),
        os.path.join(eqtl_input, 'phe_vst_counts.tsv'),
        os.path.join(eqtl_input, 'emmax.ind'),
        os.path.join(cpy.root, 'output', 'kinship_matrix', 'wgs.kin'),
        toutdir,
        '-c {}'.format(os.path.join(eqtl_input, 'emmax.cov')),
    ]
    # Backslash-continued command, one argument per line.
    c = 'python {} \\\n\t'.format(os.path.join(cpy.root, 'scripts', 'run_emmax.py'))
    c += ' \\\n\t'.join(script_args)

    fn = os.path.join(toutdir, '{}.sh'.format(gene_id))
    with open(fn, 'w') as f:
        f.write(c + '\n')
    # Pass an argv list so the script path is never re-parsed by a shell.
    subprocess.check_call(['bash', fn])

In [134]:
# Stop the ipcluster running under the `cardips` profile, if there is one.
!ipcluster stop --profile=cardips

2015-07-18 08:16:45.402 [IPClusterStop] CRITICAL | Could not read pid file, cluster is probably not running.


In [99]:
# Start a 12-engine ipcluster in the background under the `cardips` profile,
# then wait 30 seconds (presumably so the engines are up before a client
# connects -- confirm the delay is sufficient).
!ipcluster start -n 12 --daemon --profile=cardips
!sleep 30

In [101]:
# Connect a client to the running cluster.
# NOTE(review): the cluster is started with --profile=cardips but Client() is
# given no profile; this only finds the cluster if the notebook itself runs
# under that profile -- confirm.
# NOTE(review): IPython.parallel was split out into the `ipyparallel` package
# in IPython 4.
from IPython.parallel import Client
parallel_client = Client()

In [102]:
# View over all engines of the client; used below to import, push and scatter.
dview = parallel_client[:]

In [103]:
# Import the modules run_emmax needs on the engines as well (see the
# "importing ... on engine(s)" output).
with dview.sync_imports():
    import os
    import subprocess
    import time
    import ciepy
    import projectpy

importing os on engine(s)
importing subprocess on engine(s)
importing time on engine(s)
importing ciepy on engine(s)
importing projectpy on engine(s)


In [104]:
# The engines imported the modules under their full names; bind the short
# aliases (cpy, ppy) that run_emmax refers to.
%px cpy = ciepy
%px ppy = projectpy

In [105]:
# Send run_emmax and the two globals it reads (gene_to_regions, outdir) to every engine.
dview.push(dict(gene_to_regions=gene_to_regions, outdir=outdir, run_emmax=run_emmax))

<AsyncResult: _push>

In [129]:
# Partition todo across the engines; each engine receives its share under the name `todo`.
dview.scatter('todo', todo);

In [130]:
# One start-up delay per engine, spaced 10 seconds apart (0, 10, 20, ...).
# Scattering hands each engine a one-element slice, read later as sleep[0].
n_engines = len(parallel_client.ids)
sleep = np.arange(n_engines) * 10
dview.scatter('sleep', sleep);

In [None]:
# On every engine: wait that engine's start-up delay, then call run_emmax for
# each gene in its share of todo.
%px time.sleep(sleep[0]) ; [run_emmax(x) for x in todo]