# Run eQTL Analysis

This notebook coordinates and executes the eQTL analysis. This notebook is
specialized for the Frazer lab cluster. Since running the entire analysis is 
time consuming, I generally run it "by hand," starting jobs for groups of
genes at different times. I've included instructions at various points below.

In [1]:
import cPickle
import datetime
import glob
import gzip
import os
import random
import re
import shutil
import subprocess
import time
import uuid

import cdpybio as cpb
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pybedtools as pbt
import scipy.stats as stats
import seaborn as sns
import statsmodels.api as sm
import vcf as pyvcf

import cardipspy as cpy
import ciepy

%matplotlib inline
%load_ext rpy2.ipython

random.seed(20150605)

In [2]:
# This notebook uses two output locations under ciepy.root: one below
# output/ and one below private_output/.
outdir = os.path.join(ciepy.root, 'output', 'run_eqtl_analysis')
private_outdir = os.path.join(ciepy.root, 'private_output', 'run_eqtl_analysis')

# cpy.makedir is called on each of the two paths.
for _dy in (outdir, private_outdir):
    cpy.makedir(_dy)

In [3]:
# Gene annotation table; the first column is used as the index.
gene_info = pd.read_table(cpy.gencode_gene_info, index_col=0)

# Pickled object that run_emmax_sge indexes by gene ID to get a list of
# region strings. The file is opened in a `with` block so the handle is
# closed as soon as the pickle has been read.
fn = os.path.join(ciepy.root, 'output', 'eqtl_input', 'gene_to_regions.p')
with open(fn, 'rb') as f:
    gene_to_regions = cPickle.load(f)

# Expression table; its index is treated as the set of gene IDs to analyze
# in the submission cells below.
exp = pd.read_table(os.path.join(ciepy.root, 'output', 'eqtl_input', 
                                 'tpm_log_filtered_phe_std_norm_peer_resid.tsv'), index_col=0)

## Run Analysis

The `run_emmax_sge` method will submit a job for a gene. I currently ask for 
16Gb of RAM per job and four cores. If you ask for fewer cores, more jobs will run
per node but all of the IO seems to slow the jobs down. Many genes probably need
less than 16Gb of RAM but some need more. The `mem_needed` method was my attempt
at estimating how much memory a job would need but it wasn't working well.
I think the memory needed scales with the number of variants (which `num_variants`
can tell you), so I could go and monitor the amount of memory used versus
the number of variants. However, I didn't have a big problem with jobs failing when 
using 16Gb of memory (it seems like ~430 genes failed and had to be run again).

In [4]:
# Sub-directories of private_outdir that hold the generated SGE submission
# scripts and the job stdout/stderr logs (see run_emmax_sge).
for _subdir in ('sge_scripts', 'logs'):
    cpy.makedir(os.path.join(private_outdir, _subdir))

In [37]:
def num_variants(vcf, gene_id, tempdir, regions, samples, bcftools_path):
    """
    Count the variant lines bcftools reports for a set of regions.

    The count is obtained by piping `bcftools view` (restricted to the given
    regions and samples, with -q 0.05:minor -m2 -M2) into `bcftools filter
    -m x`, dropping lines that contain '#', and counting the remaining lines
    with `wc -l`.

    Parameters
    ----------
    vcf : str
        Path to the VCF file passed to `bcftools view`.

    gene_id : str
        Unused; kept so existing callers keep working.

    tempdir : str
        Unused; kept so existing callers keep working.

    regions : str
        Region specification passed to `bcftools view -r`.

    samples : str
        Path to a headerless file of sample names. The names are read with
        pandas and passed, comma-joined, to `bcftools view -s`.

    bcftools_path : str
        bcftools executable to run.

    Returns
    -------
    num : int
        Number of lines counted by the pipeline.

    """
    # This doesn't include CNVs but there aren't many of those.
    samples = pd.read_table(samples, header=None, squeeze=True)
    # Named fields make it explicit which value goes where. The original
    # positional format call carried a sixth, unused argument.
    c = ('{bcftools} view {vcf} -q 0.05:minor -m2 -M2 -r {regions} -s {samples} -O u | '
         '{bcftools} filter -m x -O v | grep -v \\# | wc -l'.format(
             bcftools=bcftools_path,
             vcf=vcf,
             regions=regions,
             samples=','.join(samples.values)))
    return int(subprocess.check_output(c, shell=True).strip())

def make_variant_cov(res_files, out):
    """
    Write a covariate file containing sex plus one genotype column for the
    most significant variant in each EMMAX results file.

    For every results file, the rows with the minimum PVALUE are selected and
    the first of them is used. Its record is fetched from the CNV VCF if its
    MARKER_ID contains 'CNV', otherwise from the filtered variant VCF. The
    new column is 0 for every sample, 1 for samples the record reports as
    heterozygous and 2 for samples it reports as homozygous alternate.

    Parameters
    ----------
    res_files : list of str
        Paths to EMMAX results files readable by ciepy.read_emmax_output.

    out : str
        Path of the tab-separated covariate file to write (no header row).

    """
    vcf = os.path.join(ciepy.root, 'private_output/eqtl_input/filtered_all/0000.vcf.gz')
    cnv_vcf = os.path.join(ciepy.root, 'private_output', 'cnv_processing', 'emmax_sorted.vcf.gz')
    cov = os.path.join(ciepy.root, 'output', 'eqtl_input', 'emmax_sex_only.tsv')
    covariates = pd.read_table(cov, index_col=0, header=None, squeeze=True)
    new_covariates = pd.DataFrame({'sex':covariates})
    for fn in res_files:
        res = ciepy.read_emmax_output(fn)
        res = res[res.PVALUE == res.PVALUE.min()]
        # Row label of the first minimum-p-value row. It doubles as the name
        # of the new covariate column; column names are not written to `out`.
        top = res.index[0]
        if 'CNV' in res.ix[top, 'MARKER_ID']:
            variant_vcf = cnv_vcf
        else:
            variant_vcf = vcf
        # Close the VCF handle once the single record we need has been read,
        # rather than leaving one open handle per results file.
        with open(variant_vcf) as handle:
            vcf_reader = pyvcf.Reader(handle)
            records = vcf_reader.fetch(res.ix[top, 'CHROM'], res.ix[top, 'BEG'],
                                       res.ix[top, 'END'])
            r = next(records)
        new_covariates[top] = 0
        # Only samples present in the covariate table are updated.
        hets = set([x.sample for x in r.get_hets()]) & set(new_covariates.index)
        halts = set([x.sample for x in r.get_hom_alts()]) & set(new_covariates.index)
        new_covariates.ix[hets, top] = 1
        new_covariates.ix[halts, top] = 2
    new_covariates.to_csv(out, sep='\t', header=None)

def run_emmax_sge(gene_id, out_dy, mem=16, queue=None, res_files=None):
    """
    Write an SGE submission script that runs scripts/run_emmax.py for one
    gene and submit it with qsub.

    The script is written to private_outdir/sge_scripts and the job's
    stdout/stderr are directed to private_outdir/logs. The job, script and
    log names all contain the gene ID and a timestamp.

    Parameters
    ----------
    gene_id : str
        Gene to analyze. Must be a key of the notebook-level gene_to_regions.

    out_dy : str
        Directory in which the per-gene output directory (out_dy/gene_id) is
        created.

    mem : int or float
        Total memory for the job in Gb. It is halved when queue == 'opt' and
        then divided by the number of slots (4) to get the per-slot h_vmem
        request.

    queue : str or None
        If given, written verbatim as an SGE "-l" resource request.

    res_files : list of str or None
        EMMAX results files from earlier analyses. If provided,
        make_variant_cov writes a covariate file to
        out_dy/gene_id/gene_id.cov from them and that file is passed to
        run_emmax.py instead of the sex-only covariate file.

    """
    vcf = os.path.join(ciepy.root, 'private_output/eqtl_input/filtered_all/0000.vcf.gz')
    cnv_vcf = os.path.join(ciepy.root, 'private_output', 'cnv_processing', 'emmax_sorted.vcf.gz')
    samples = os.path.join(ciepy.root, 'output', 'eqtl_input', 'emmax_samples.tsv')
    # The first three characters of every region string are dropped
    # (presumably a 'chr' prefix -- confirm against gene_to_regions).
    regions = ','.join([x[3:] for x in gene_to_regions[gene_id]])
    
    exp = os.path.join(ciepy.root, 'output', 'eqtl_input', 
                       'tpm_log_filtered_phe_std_norm_peer_resid.tsv')
    kin = os.path.join(ciepy.root, 'output', 'eqtl_input', 'wgs.kin')
    toutdir = os.path.join(out_dy, gene_id)
    cpy.makedir(toutdir)
    cov = os.path.join(ciepy.root, 'output', 'eqtl_input', 'emmax_sex_only.tsv')
    
    # If one or more emmax results files are provided, we'll get the most significant
    # variant from each file and add that as a covariate. We'll write the new covariate
    # file in the gene's output directory. make_variant_cov handles all of the
    # results files in a single call, so it only needs to be called once.
    if res_files:
        cov = os.path.join(toutdir, '{}.cov'.format(gene_id))
        make_variant_cov(res_files, cov)

    # Timestamp with every non-digit replaced by '_', used to make the job,
    # script and log names unique.
    now = datetime.datetime.now()
    date = re.sub(r'\D', '_', str(now))
    fn = os.path.join(private_outdir, 'sge_scripts', '{}_{}.sh'.format(gene_id, date))
    log_dy = os.path.join(private_outdir, 'logs')
    with open(fn, 'w') as f:
        f.write('#!/bin/bash\n\n')
        f.write('#$ -N emmax_{}_{}\n'.format(gene_id, date))
        if queue:
            f.write('#$ -l {}\n'.format(queue))
            if queue == 'opt':
                mem = mem / 2.
        num_threads = 4
        # h_vmem is requested per slot, so the total is split across slots.
        # This is plain `/` division: the formatted value depends on whether
        # mem is an int or a float at this point.
        f.write('#$ -l h_vmem={}G\n'.format(mem / num_threads))
        f.write('#$ -pe smp {}\n'.format(num_threads))
        f.write('#$ -S /bin/bash\n')
        f.write('#$ -o {}/emmax_{}_{}.out\n'.format(log_dy, gene_id, date))
        f.write('#$ -e {}/emmax_{}_{}.err\n\n'.format(log_dy, gene_id, date))
        f.write('module load cardips/1\n')
        f.write('source activate cie\n\n')
        
        c = 'python {} \\\n\t'.format(os.path.join(ciepy.root, 'scripts', 'run_emmax.py'))
        c += ' \\\n\t'.join([
                gene_id,
                '{},{}'.format(vcf, cnv_vcf),
                regions,
                exp,
                samples,
                kin,
                toutdir,
                '-c {}'.format(cov),
            ])
        f.write(c + '\n\n')
    subprocess.check_call('qsub {}'.format(fn), shell=True)
    
def get_jobs():
    """Get info about jobs currently running."""
    # Get jobs currently waiting to start or started.
    running = !qstat -r | grep jobname
    running = [x.split()[-1] for x in running if 'emmax_' in x]
    # Get all submission scripts created.
    fns = glob.glob(os.path.join(private_outdir, 'sge_scripts', '*.sh'))
    jobnames = ['emmax_' + os.path.splitext(os.path.split(x)[1])[0] for x in fns]
    genes = [os.path.split(x)[1].split('_')[0] for x in fns]
    jobs = pd.DataFrame(np.array(fns).T, index=jobnames, columns=['path'])
    jobs = pd.DataFrame([fns, genes], columns=jobnames, index=['path', 'gene']).T
    jobs['status'] = 'finished'
    # For now, running means either the job is waiting to start or has started.
    jobs.ix[running, 'status'] = 'running'
    return jobs

def get_genes(jobs):
    """
    Get job info about genes that we are analyzing.

    Parameters
    ----------
    jobs : pandas.DataFrame
        Table with 'status' and 'gene' columns, as returned by get_jobs.

    Returns
    -------
    genes : pandas.DataFrame
        One row per ENSG* directory in private_outdir/results. 'status' is
        'complete' when the directory contains permuted_pvalues.tsv and
        'incomplete' otherwise; 'job_status' is 'running' when any running
        job belongs to the gene and 'finished' otherwise.

    """
    results_dy = os.path.join(private_outdir, 'results')
    gene_dys = glob.glob(os.path.join(results_dy, 'ENSG*'))
    genes = pd.DataFrame(index=[os.path.split(dy)[1] for dy in gene_dys])

    # A gene is complete once its permuted p-values file has been written.
    genes['status'] = 'incomplete'
    done_fns = glob.glob(os.path.join(results_dy, 'ENSG*', 'permuted_pvalues.tsv'))
    done = [fn.split('/')[-2] for fn in done_fns]
    genes.ix[done, 'status'] = 'complete'

    # Any running job for a gene marks that gene's job_status as running.
    genes['job_status'] = 'finished'
    running_genes = jobs.ix[jobs.status == 'running', 'gene']
    genes.ix[running_genes, 'job_status'] = 'running'
    return genes

In [22]:
def process_failed():
    """
    Record genes whose jobs failed and clean up what they left behind.

    Reads the notebook-level `genes` table. A gene counts as failed when its
    status is 'incomplete' and its job_status is 'finished'. For those genes
    the failure count in private_outdir/failed.tsv is created or incremented,
    the gene's directory under private_outdir/results is removed, and
    /dev/shm/<gene> is removed through pdsh. Does nothing if no gene failed.
    """
    failed_genes = genes[(genes.status == 'incomplete') &
                         (genes.job_status == 'finished')].index
    new_failed = list(failed_genes)
    if len(new_failed) == 0:
        return

    # Write failed genes to file.
    failed_fn = os.path.join(private_outdir, 'failed.tsv')
    if os.path.exists(failed_fn):
        failed = pd.read_table(failed_fn, squeeze=True, header=None, index_col=0)
        # Genes that failed before get their count bumped ...
        if len(set(failed.index) & set(new_failed)) > 0:
            failed[list(set(failed.index) & set(new_failed))] += 1
        # ... and genes failing for the first time start at one.
        t = pd.Series(1, index=list(set(new_failed) - set(failed.index)))
        failed = pd.concat([t, failed])
    else:
        failed = pd.Series(1, index=new_failed)
    failed.to_csv(failed_fn, header=None, sep='\t')

    # Shell snippet that removes a directory only if it exists.
    rm_if_exists = 'if [ -d "{0}" ]; then rm -r {0} ; fi'

    # Remove output directories.
    out_dys = [os.path.join(private_outdir, 'results', g) for g in failed_genes]
    subprocess.check_call(' ; '.join([rm_if_exists.format(dy) for dy in out_dys]),
                          shell=True)

    # Delete temp directories if they exist.
    tmp_dys = ['/dev/shm/{}'.format(g) for g in failed_genes]
    remote = ' ; '.join([rm_if_exists.format(dy) for dy in tmp_dys])
    subprocess.check_call('pdsh -g n "{}"'.format(remote), shell=True)
        
        
def submit_failed(failed_fn, out_dy=None):
    """
    Submit failed genes with more memory.

    Parameters
    ----------
    failed_fn : str
        Tab-separated file without a header: gene ID in the first column and
        the number of times the gene has failed in the second. Nothing is
        submitted if this is empty or the file does not exist.

    out_dy : str or None
        Directory passed to run_emmax_sge for the per-gene output
        directories. Defaults to private_outdir/results, which is the
        directory get_genes inspects.

    """
    if out_dy is None:
        out_dy = os.path.join(private_outdir, 'results')
    if not failed_fn or not os.path.exists(failed_fn):
        return
    jobs = get_jobs()
    genes = get_genes(jobs)
    failed = pd.read_table(failed_fn, index_col=0, 
                           header=None, squeeze=True)
    # Genes listed by get_genes still have an output directory, so only the
    # failed genes without one are resubmitted.
    todo = list(set(failed.index) - set(genes.index))
    for gene in todo:
        # 16 Gb more than the previous attempt for every recorded failure.
        mem = (failed[gene] + 1) * 16
        print(gene, mem)
        run_emmax_sge(gene, out_dy, mem=mem)
        #run_emmax_sge(gene, out_dy, mem=mem * 2, queue='opt')


def submit_jobs(todo, out_dy, failed_fn, res_dys=None, failed=True, queue=None):
    """
    Submit one EMMAX job per gene.

    Parameters
    ----------
    todo : list of str
        Gene IDs to submit.

    out_dy : str
        Output directory where the per-gene output directories will be
        stored.

    failed_fn : str
        File that keeps track of which genes' jobs failed and how many
        times. The first whitespace-separated field of every line is taken
        as a gene ID; those genes are left out so they can be resubmitted
        separately with more memory.

    res_dys : list of str or None
        Output directories of earlier analyses. For every directory that
        contains a sub-directory for the gene, <dy>/<gene>/<gene>.tsv is
        passed to run_emmax_sge as a prior results file.

    failed : bool
        Currently unused; failed genes are never resubmitted here.

    queue : str or None
        Passed through to run_emmax_sge.

    """
    cpy.makedir(out_dy)

    # Remove failed genes. I'll submit these with more memory.
    if os.path.exists(failed_fn):
        with open(failed_fn) as f:
            # Lines look like "<gene>\t<count>", so only the first field is
            # compared against the gene IDs.
            failed_genes = set(x.split()[0] for x in f if x.strip())
        todo = list(set(todo) - failed_genes)

    # Submit new jobs.
    for gene_id in todo:
        # Get prior results to use as covariate if needed.
        res_files = None
        if res_dys:
            res_files = [os.path.join(dy, gene_id, gene_id + '.tsv')
                         for dy in res_dys
                         if os.path.exists(os.path.join(dy, gene_id))]
        run_emmax_sge(gene_id, out_dy, queue=queue, res_files=res_files)

### First analysis

In [11]:
# Output directory and failure log for the first analysis.
out_dy = os.path.join(private_outdir, 'results')
failed_fn = os.path.join(private_outdir, 'failed.tsv')

# Genes in the expression table that do not yet have an output directory,
# leaving out genes on chrX, chrY and chrM.
_started = set([os.path.split(x)[1] for x in glob.glob(os.path.join(out_dy, '*'))])
_excluded_chroms = ['chrX', 'chrY', 'chrM']
todo = [x for x in list(set(exp.index) - _started)
        if gene_info.ix[x, 'chrom'] not in _excluded_chroms]

### Second analysis

### Third analysis

In [44]:
# Genes flagged in the `sig` column of the secondary-eQTL q-value table.
_qvalues_fn = os.path.join(ciepy.root, 'output', 'eqtl_processing',
                           'secondary_eqtls', 'qvalues.tsv')
qvalues = pd.read_table(_qvalues_fn, index_col=0)
sig = qvalues[qvalues['sig']]

# Output directory and failure log for the third analysis, plus the result
# directories of the first two analyses.
out_dy = os.path.join(private_outdir, 'results3')
failed_fn = os.path.join(private_outdir, 'failed3.tsv')
res_dys = [os.path.join(private_outdir, 'results'),
           os.path.join(private_outdir, 'results2')]
cpy.makedir(out_dy)

# Only genes without an output directory in results3 are left to do.
_started = set([os.path.split(x)[1] for x in glob.glob(os.path.join(out_dy, '*'))])
todo = list(set(sig.index) - _started)

In [46]:
# Submits with queue='opt'; pass queue=None instead to submit without a
# queue-specific resource request.
submit_jobs(
    todo=todo,
    out_dy=out_dy,
    failed_fn=failed_fn,
    res_dys=res_dys,
    failed=True,
    queue='opt',
)

In [None]:
# NOTE(review): this is an incomplete expression, so running the cell raises
# a SyntaxError. Presumably a deliberate barrier that keeps "Run All" from
# reaching the job-submission cells below -- confirm before removing.
3 + 

### `all` queue

This cell will submit jobs to the `all` queue. The variable `num_to_submit`
controls how many jobs to submit.

I just load balance the `all` and `opt` queues myself as the jobs finish. For instance,
if I see the `opt` queue has a lot of jobs queued and `all` doesn't, I submit some jobs to
`all`. I don't see a big difference in the speed between the two queues.

In [None]:
# Per-gene output directories of the first analysis live here.
_results_dy = os.path.join(private_outdir, 'results')

todo = list(set(exp.index) - 
            set([os.path.split(x)[1] for x in glob.glob(os.path.join(_results_dy, '*'))]))
todo = [x for x in todo if gene_info.ix[x, 'chrom'] not in ['chrX', 'chrY', 'chrM']]

# Remove failed genes. I'll wait to resubmit these with more memory.
# Failures are recorded in failed.tsv as "<gene>\t<count>", so only the first
# field of each line is used. failed.txt is still honored if it exists.
failed = []
for _name in ('failed.txt', 'failed.tsv'):
    _fn = os.path.join(private_outdir, _name)
    if os.path.exists(_fn):
        with open(_fn) as f:
            failed += [x.split()[0] for x in f if x.strip()]
todo = list(set(todo) - set(failed))

# Set num_to_submit to the number of jobs you want to submit.
num_to_submit = len(todo)
for gene_id in todo[:num_to_submit]:
    # run_emmax_sge requires the output directory as its second argument.
    run_emmax_sge(gene_id, _results_dy)

### `opt` queue

This cell will submit jobs to the `opt` queue. The variable `num_to_submit`
controls how many jobs to submit. `run_emmax_sge` will cut the memory in half
for `opt` jobs since they have less memory. If a gene fails in the `opt` queue 
due to memory it may work in the `all` queue.

In [None]:
# Per-gene output directories of the first analysis live here.
_results_dy = os.path.join(private_outdir, 'results')

todo = list(set(exp.index) - 
            set([os.path.split(x)[1] for x in glob.glob(os.path.join(_results_dy, '*'))]))
todo = [x for x in todo if gene_info.ix[x, 'chrom'] not in ['chrX', 'chrY', 'chrM']]

# Remove failed genes. I'll wait to resubmit these with more memory.
# Failures are recorded in failed.tsv as "<gene>\t<count>", so only the first
# field of each line is used. failed.txt is still honored if it exists.
failed = []
for _name in ('failed.txt', 'failed.tsv'):
    _fn = os.path.join(private_outdir, _name)
    if os.path.exists(_fn):
        with open(_fn) as f:
            failed += [x.split()[0] for x in f if x.strip()]
todo = list(set(todo) - set(failed))

# Set num_to_submit to the number of jobs you want to submit.
num_to_submit = len(todo)
for gene_id in todo[:num_to_submit]:
    # run_emmax_sge requires the output directory as its second argument.
    run_emmax_sge(gene_id, _results_dy, queue='opt')

In [274]:
# Submit failed genes with more memory.
if os.path.exists(os.path.join(private_outdir, 'failed.tsv')):
    jobs = get_jobs()
    genes = get_genes(jobs)
    failed = pd.read_table(os.path.join(private_outdir, 'failed.tsv'), index_col=0, 
                           header=None, squeeze=True)
    # Genes listed by get_genes still have an output directory, so only the
    # failed genes without one are resubmitted.
    todo = list(set(failed.index) - set(genes.index))
    for gene in todo:
        # 16 Gb more than the previous attempt for every recorded failure.
        mem = (failed[gene] + 1) * 16
        print(gene, mem)
        # run_emmax_sge requires the output directory as its second argument.
        run_emmax_sge(gene, os.path.join(private_outdir, 'results'), mem=mem)
        #run_emmax_sge(gene, os.path.join(private_outdir, 'results'), mem=mem * 2, queue='opt')

('ENSG00000123243.10', 32)


### Look for failed jobs

Some genes have errors when they are running. This can happen if I don't request enough memory 
for instance. Genes that fail won't have `permuted_pvalues.tsv` files even when their job finishes. 
I want to identify these genes and remove their output directory so they will be run in a new job. 
I think these genes often leave behind their temp directory so I have to go delete those too.

In [309]:
# Every gene in the expression table except those on chrX, chrY and chrM.
_excluded_chroms = ['chrX', 'chrY', 'chrM']
todo = [x for x in list(set(exp.index))
        if gene_info.ix[x, 'chrom'] not in _excluded_chroms]
print('{:,} total genes to do.'.format(len(todo)))

17,769 total genes to do.


The table below shows how many genes' jobs are running. The number in the bottom
left cell (incomplete, finished) indicates jobs that failed.

In [310]:
# Refresh the job and gene status tables, then cross-tabulate result status
# (rows) against job status (columns).
jobs = get_jobs()
genes = get_genes(jobs)

pd.crosstab(index=genes['status'], columns=genes['job_status'])

job_status,finished,running
status,Unnamed: 1_level_1,Unnamed: 2_level_1
complete,17768,0
incomplete,0,1


Any genes that are incomplete but whose jobs are finished had errors. I need 
to delete their temp directories and the output directories and try resubmitting.
I keep track of the failed genes so I can wait to resubmit them with more memory.
The file `failed.tsv` has the gene ID and the number of times the gene failed. I
increase the RAM in proportion to the number of times the gene fails. So if it has failed
twice, I'll give it $(2 + 1) \times 16 = 48$ Gb of RAM the next time I submit, etc.

In [273]:
# Write failed genes to file.
# A gene has failed when its results are incomplete although its job is done.
_failed_genes = genes[(genes.status == 'incomplete') &
                      (genes.job_status == 'finished')].index
new_failed = list(_failed_genes)
if len(new_failed) > 0:
    # Write failed genes to file.
    _failed_fn = os.path.join(private_outdir, 'failed.tsv')
    if os.path.exists(_failed_fn):
        failed = pd.read_table(_failed_fn, squeeze=True, header=None, index_col=0)
        # Genes that failed before get their count bumped ...
        if len(set(failed.index) & set(new_failed)) > 0:
            failed[list(set(failed.index) & set(new_failed))] += 1
        # ... and genes failing for the first time start at one.
        t = pd.Series(1, index=list(set(new_failed) - set(failed.index)))
        failed = pd.concat([t, failed])
    else:
        failed = pd.Series(1, index=new_failed)
    failed.to_csv(_failed_fn, header=None, sep='\t')

    # Shell snippet that removes a directory only if it exists.
    _rm_if_exists = 'if [ -d "{0}" ]; then rm -r {0} ; fi'

    # Remove output directories.
    dys = [os.path.join(private_outdir, 'results', g) for g in _failed_genes]
    c = ' ; '.join([_rm_if_exists.format(dy) for dy in dys])
    subprocess.check_call(c, shell=True)

    # Delete temp directories if they exist.
    dys = ['/dev/shm/{}'.format(g) for g in _failed_genes]
    s = ' ; '.join([_rm_if_exists.format(dy) for dy in dys])
    c = 'pdsh -g n "{}"'.format(s)
    subprocess.check_call(c, shell=True)

Sometimes I may accidentally submit a job for a gene that already has a job submitted.
This attempts to fix that.

In [None]:
# NOTE(review): this is an incomplete expression, so running the cell raises
# a SyntaxError. It looks like either a placeholder for the duplicate-job
# fix described above or a deliberate "Run All" barrier -- confirm.
2 + 

## Secondary etc. QTLs

I want to search for secondary, tertiary, etc. eQTLs for genes that had one significant
eQTL.

In [360]:
# Genes flagged in the `sig` column of the q-value table.
_qvalues_fn = os.path.join(ciepy.root, 'output', 'eqtl_processing', 'qvalues.tsv')
qvalues = pd.read_table(_qvalues_fn, index_col=0)
sig = qvalues[qvalues['sig']]

In [None]:
# Per-gene output directories of the secondary analysis live here.
_results2_dy = os.path.join(private_outdir, 'results2')

todo = list(set(sig.index) - 
            set([os.path.split(x)[1] for x in glob.glob(os.path.join(_results2_dy, '*'))]))

# Remove failed genes. I'll wait to resubmit these with more memory.
# Failures are recorded in failed2.tsv as "<gene>\t<count>", so only the
# first field of each line is used. failed2.txt is still honored if it exists.
failed = []
for _name in ('failed2.txt', 'failed2.tsv'):
    _fn = os.path.join(private_outdir, _name)
    if os.path.exists(_fn):
        with open(_fn) as f:
            failed += [x.split()[0] for x in f if x.strip()]
todo = list(set(todo) - set(failed))

# Set num_to_submit to the number of jobs you want to submit.
num_to_submit = len(todo) // 2
for gene_id in todo[:num_to_submit]:
    # Results of the first analysis for this gene; its most significant
    # variant is added as a covariate by run_emmax_sge.
    rfn = os.path.join(ciepy.root, 'private_output', 'run_eqtl_analysis', 'results',
                       gene_id, '{}.tsv'.format(gene_id))
    # run_emmax_sge requires the output directory as its second argument.
    run_emmax_sge(gene_id, _results2_dy, res_files=[rfn])

In [413]:
# Per-gene output directories of the secondary analysis live here.
_results2_dy = os.path.join(private_outdir, 'results2')

todo = list(set(sig.index) - 
            set([os.path.split(x)[1] for x in glob.glob(os.path.join(_results2_dy, '*'))]))

# Remove failed genes. I'll wait to resubmit these with more memory.
# Failures are recorded in failed2.tsv as "<gene>\t<count>", so only the
# first field of each line is used. failed2.txt is still honored if it exists.
failed = []
for _name in ('failed2.txt', 'failed2.tsv'):
    _fn = os.path.join(private_outdir, _name)
    if os.path.exists(_fn):
        with open(_fn) as f:
            failed += [x.split()[0] for x in f if x.strip()]
todo = list(set(todo) - set(failed))

# Set num_to_submit to the number of jobs you want to submit.
num_to_submit = len(todo)
for gene_id in todo[:num_to_submit]:
    # Results of the first analysis for this gene; its most significant
    # variant is added as a covariate by run_emmax_sge.
    rfn = os.path.join(ciepy.root, 'private_output', 'run_eqtl_analysis', 'results',
                       gene_id, '{}.tsv'.format(gene_id))
    # run_emmax_sge requires the output directory as its second argument.
    run_emmax_sge(gene_id, _results2_dy, queue='opt', res_files=[rfn])

In [483]:
s = glob.glob(os.path.join(private_outdir, 'results2', '*'))
s = [os.path.split(x)[1] for x in s]
fns = glob.glob(os.path.join(private_outdir, 'results2', '*', 'permuted_pvalues.tsv'))
g = [x.split('/')[-2] for x in fns]
jobs = !qstat -r | grep jobname | tr -s ' ' | cut -d ' ' -f 4
jobs = [x for x in jobs if 'emmax' in x]
jobs = [x.split('_')[1] for x in jobs]

# Write failed genes to file.
new_failed = set(s) - set(g) - set(jobs)
if len(new_failed) > 0:
    # new_failed = list(genes[(genes.status == 'incomplete') & (genes.job_status == 'finished')].index)
    if os.path.exists(os.path.join(private_outdir, 'failed2.tsv')):
        failed = pd.read_table(os.path.join(private_outdir, 'failed2.tsv'), squeeze=True, 
                               header=None, index_col=0)
        if len(set(failed.index) & set(new_failed)) > 0:
            failed[list(set(failed.index) & set(new_failed))] += 1
        t = pd.Series(1, index=list(set(new_failed) - set(failed.index)))
        failed = pd.concat([t, failed])
    else:
        failed = pd.Series(1, index=new_failed)
    failed.to_csv(os.path.join(private_outdir, 'failed2.tsv'), header=None, sep='\t')

    # Remove output directories.
    dys = [os.path.join(private_outdir, 'results2', g) for g in new_failed]
    c = ' ; '.join(['if [ -d "{0}" ]; then rm -r {0} ; fi'.format(dy) for dy in dys])
    subprocess.check_call(c, shell=True)

    # Delete temp directories if they exist.
    dys = ['/dev/shm/{}'.format(g) for g in new_failed]
    s = ' ; '.join(['if [ -d "{0}" ]; then rm -r {0} ; fi'.format(dy) for dy in dys])
    c = 'pdsh -g n "{}"'.format(s)
    subprocess.check_call(c, shell=True)

In [None]:
# Submit failed genes with more memory.
if os.path.exists(os.path.join(private_outdir, 'failed2.tsv')):
    jobs = !qstat -r | grep jobname | tr -s ' ' | cut -d ' ' -f 4
    jobs = [x for x in jobs if 'emmax' in x]
    jobs = [x.split('_')[1] for x in jobs]
    failed = pd.read_table(os.path.join(private_outdir, 'failed2.tsv'), index_col=0, 
                           header=None, squeeze=True)
    todo = list(set(failed.index) - set(jobs))
    for gene in todo:
        mem = (failed[gene] + 1) * 16
        print(gene, mem)
        rfn = os.path.join(ciepy.root, 'private_output', 'run_eqtl_analysis', 'results',
                           gene, '{}.tsv'.format(gene))
        run_emmax_sge(gene, mem=mem * 2, queue='opt', res_files=[rfn])
        #run_emmax_sge(gene, mem=mem, res_files=[rfn])

('ENSG00000231389.3', 32)
('ENSG00000196126.6', 32)
('ENSG00000182372.6', 32)
('ENSG00000213760.6', 32)
('ENSG00000224557.3', 32)
('ENSG00000254870.1', 32)
('ENSG00000223865.6', 32)
('ENSG00000263020.1', 32)
('ENSG00000230313.1', 32)
('ENSG00000179344.12', 32)
('ENSG00000204287.9', 32)