In [1]:
import glob
import gzip
import os
import random
import shutil
import subprocess
import uuid

import cdpybio as cpb
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pybedtools as pbt
import seaborn as sns
import statsmodels.api as sm

import ciepy as cpy
import projectpy as ppy

%matplotlib inline
%load_ext rpy2.ipython

random.seed(20150605)

In [2]:
outdir = os.path.join(cpy.root, 'output',
                      'run_eqtl_analysis')
ppy.makedir(outdir)

private_outdir = os.path.join(cpy.root, 'private_output',
                              'run_eqtl_analysis')
ppy.makedir(private_outdir)

In [None]:

#gene_to_regions = 

vsd = pd.read_table(os.path.join(cpy.root, 'eqtl_input', 'vst_counts.tsv'), index_col=0)

In [3]:
array_meta = pd.read_table(os.path.join(cpy.root, 'data', 'array_metadata.tsv'), index_col=0)
wgs_meta = pd.read_table(os.path.join(cpy.root, 'data', 'wgs_metadata.tsv'), index_col=0, 
                         squeeze=True)
rna_meta = pd.read_table(os.path.join(cpy.root, 'data', 'rna_seq_metadata.tsv'), index_col=0)

os.path.join(cpy.root, 'private_data', 'DI_6a_db_gap_pedigree_04292015_UUIDs_only_V1.xlsx')
pedigree = pd.read_excel(fn)
pedigree.index = pedigree.SUBJECT_ID

gene_counts = pd.read_table(os.path.join(cpy.root, 'data', 'gene_counts.tsv'),
                            index_col=0)
gene_info = pd.read_table('/raid3/projects/CARDIPS/data/public/gencode_v19/gene_info.tsv',
                          index_col=0)
genes = pbt.BedTool('/raid3/projects/CARDIPS/data/public/gencode_v19/genes.bed')



NameError: name 'fn' is not defined

In [None]:
def run_emmax_permutations(gene_id, regions, ped, kinship_matrix, tempdir, outdir, 
                           permuted_peds):
    """
    Run EMMAX for a single gene given a ped file and permuted ped files.
    
    Parameters
    ----------
    gene_id : str
        Gencode gene ID for gene to test.
        
    regions : list
        List of strings of the form 'chr1:100-200'. Biallelic SNVs in these regions
        will be tested.
    
    ped : str
        Path to PED file with actual genotypes (i.e. not permuted).
        
    kinship_matrix : str
        Path to kinship matrix file.
        
    tempdir : str
        Path to directory where temp directory should be made.
        
    outdir : str
        Path to directory where results will be saved.
        
    permuted_peds : list
        List of strings of paths to permuted PED files.

    """
    tempdir = os.path.join(tempdir, gene_id)
    ppy.makedir(tempdir)

    curdir = os.path.realpath(os.curdir)
    os.chdir(tempdir)
    
    # Make VCF file.
    vcf = _make_emmax_vcf(gene_id, tempdir, regions)
    
    # Run EMMAX for real data.
    _emmax(gene_id, ped, kinship_matrix, vcf, gene_id)
    
    # Run EMMAX for permuted data.
    names = []
    pvalues = []
    for fn in permuted_peds:
        prefix = os.path.splitext(os.path.split(fn)[1])[0]
        names.append(prefix)
        _emmax(gene_id, fn, kinship_matrix, vcf, prefix)
        out = '{}.epacts.gz'.format(prefix)
        res = _read_emmax_output(out)
        res.index = 'chr' + res.CHROM.astype(str) + ':' + res.BEG.astype(str)
        pvalues.append(res.PVALUE)
        res.PVALUE.to_csv('{}_pvalues.tsv'.format(prefix), sep='\t')
        c = 'rm {}.epacts.gz'.format(prefix)
        subprocess.check_call(c, shell=True)
    pvalues = pd.DataFrame(pvalues, index=names).T
    pvalues.to_csv('permuted_pvalues.tsv', sep='\t')
    
    # Remove VCF file.
    c = 'rm {0}.vcf.gz {0}.vcf.gz.tbi'.format(gene_id)
    subprocess.check_call(c, shell=True)
    
    # Copy output to outdir.
    outdir = os.path.join(outdir, gene_id)
    ppy.makedir(outdir)
    shutil.move('{}.epacts.gz'.format(gene_id), outdir)
    shutil.move('permuted_pvalues.tsv', outdir)
    shutil.rmtree(tempdir)
    
    os.chdir(curdir)
    
def _emmax(gene_id, ped, kinship_matrix, vcf, prefix):
    """
    Execute EMMAX command.

    Parameters
    ----------
    gene_id : str
        Gencode gene ID for gene to test.
    
    ped : str
        Path to PED file with actual genotypes (i.e. not permuted).
        
    kinship_matrix : str
        Path to kinship matrix file.
        
    vcf : str
        Path to VCF file with SNVs to test.
        
    prefix : str
        Prefix for naming output files.

    """
    c = ('{} single --vcf {} --ped {} --min-maf 0.1 --kin {} --pheno {} '
         '--cov FC1 --cov FC2 --test q.emmax --out {} --run 4'.format(
        ppy.epacts,
        vcf,
        ped,
        kinship_matrix,
        gene_id,
        prefix))
    subprocess.check_call(c, shell=True)
    _delete_extra_files(prefix)
    
def _make_emmax_vcf(gene_id, tempdir, regions):
    fn = os.path.join(tempdir, '{}.vcf.gz'.format(gene_id))
    c = ('{} view {} -q 0.1:minor -m2 -M2 -v snps -r {} | '
         '{} annotate --rename-chrs {} -O z > {}'.format(
            ppy.bcftools, 
            os.path.join(cpy.root, 'private_data', 'wgs', 'merged_pass_uuid_ann_ref.vcf.gz'),
            ','.join(regions),
            ppy.bcftools,
            os.path.join(cpy.root, 'data', 'chromosome_conversion.txt'),
            fn))
    subprocess.check_call(c, shell=True)

    c = ('{} index --tbi {}'.format(ppy.bcftools, fn))
    subprocess.check_call(c, shell=True)
    return fn
    
def _delete_extra_files(prefix):
    """
    Delete extra EMMAX files that we don't need.
    """
    to_delete = ['cov', 'eigR', 'epacts.conf', 'epacts.gz.tbi', 'epacts.mh.pdf', 'epacts.OK', 
                 'epacts.qq.pdf', 'epacts.R', 'epacts.top5000', 'ind', 'Makefile', 'phe', 'reml']
    for suffix in to_delete:
        c = 'rm {}.{}'.format(prefix, suffix)
        subprocess.check_call(c, shell=True)
        
def _read_emmax_output(fn):
    with gzip.open(fn) as f:
        lines = [x.strip().split('\t') for x in f.readlines()]
    lines[0][0] = lines[0][0][1:]
    res = pd.DataFrame(lines[1:], columns=lines[0])
    res = res.convert_objects(convert_numeric=True)
    return res

TODO: Move permutations to separate function.

In [None]:
ped_fn = os.path.join(cpy.root, 'private_output', 'eqtl_input', 'vsd.ped')
kinship_matrix_fn = os.path.join(cpy.root, 'output', 'kinship_matrix', 'wgs.kinf')
tempdir = '/dev/shm'

In [None]:
os.chdir('/raid3/projects/CARDIPS/analysis/cardips-ipsc-eqtl/notebooks')

gene_id = test_gene_id
regions = gene_to_regions[test_gene_id]
ped = ped_fn
kinship_matrix = os.path.join(cpy.root, 'output', 'kinship_matrix', 'wgs.kinf')
tempdir = '/dev/shm'
outdir = outdir
permuted_peds = glob.glob(os.path.join(private_outdir, 'permuted_*.ped'))
#run_emmax_permutations(gene_id, regions, ped, kinship_matrix, tempdir, outdir, permuted_peds)