# Input Data

This notebook parses some of the CARDiPS files to take only data that we need for this project. 
This notebook will only run on the Frazer lab cluster.

In [1]:
import glob
import os
import subprocess

import cdpybio as cpb
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

import ciepy
import cardipspy as cpy

%matplotlib inline

In [2]:
# Result directories for this notebook: one under the project's `output` tree
# and one under its `private_output` tree.
outdir, private_outdir = [
    os.path.join(ciepy.root, tree, 'input_data')
    for tree in ('output', 'private_output')
]
for directory in (outdir, private_outdir):
    cpy.makedir(directory)

In [3]:
# Load the CARDiPS inventory tables. Every table is indexed by its first
# column except the subject table, which is indexed by its second column.
family = pd.read_table(
    '/raid3/projects/CARDIPS/data/database/inventory_family.tsv',
    index_col=0)
pedigree = pd.read_table(
    '/raid3/projects/CARDIPS/data/database/inventory_pedigree.tsv',
    index_col=0)
rnaseq = pd.read_table(
    '/raid3/projects/CARDIPS/data/database/inventory_rnaseq.tsv',
    index_col=0)
sample = pd.read_table(
    '/raid3/projects/CARDIPS/data/database/inventory_sample.tsv',
    index_col=0)
sequence = pd.read_table(
    '/raid3/projects/CARDIPS/data/database/inventory_sequence.tsv',
    index_col=0)
snparray = pd.read_table(
    '/raid3/projects/CARDIPS/data/database/inventory_snparray.tsv',
    index_col=0)
subject = pd.read_table(
    '/raid3/projects/CARDIPS/data/database/inventory_subject.tsv',
    index_col=1)
wgs = pd.read_table(
    '/raid3/projects/CARDIPS/data/database/inventory_wgs.tsv',
    index_col=0)

In [4]:
# Headerless censor table keyed by its first column; `squeeze=True` collapses
# a single remaining data column into a Series.
censor_fn = '/raid3/projects/CARDIPS/pipeline/RNAseq/combined_files/censor.tsv'
censor = pd.read_table(censor_fn, header=None, index_col=0, squeeze=True)

## Samples for this Study

In [5]:
# Tripwire for database updates: if the number of array records changes, the
# code below has to be revisited.
assert len(snparray) == 444

# Build a "<subject_id>:C<clone>:P<passage>" key for every iPSC array record.
ipsc_arrays = snparray[snparray['cell'] == 'iPSC']
clone_tag = ':C' + ipsc_arrays['clone'].astype(int).astype(str)
passage_tag = ':P' + ipsc_arrays['passage'].astype(int).astype(str)
samples222 = ipsc_arrays['subject_id'] + clone_tag + passage_tag

## Metadata

In [6]:
# Array metadata: drop the iPSC records and keep only the descriptive columns.
not_ipsc = snparray['cell'] != 'iPSC'
snparray = snparray[not_ipsc][['subject_id', 'cell', 'array_id', 'pos']]

In [7]:
# Restrict to RNA-seq records whose sequence_id is 6, 7 or 8.
rnaseq = rnaseq[rnaseq.sequence_id.isin([6, 7, 8])]

# Mark censored samples with status 2 so they are removed together with any
# record that already carries that status.
censored_ids = censor[censor == True].index
rnaseq.ix[censored_ids, 'status'] = 2

# Keep iPSC records whose status is not 2.
usable = (rnaseq.status != 2) & (rnaseq.cell == 'iPSC')
rnaseq = rnaseq[usable]

In [8]:
# Same "<subject_id>:C<clone>:P<passage>" key as used for the array samples,
# so RNA-seq records can be matched to them.
clone_tag = ':C' + rnaseq['clone'].astype(int).astype(str)
passage_tag = ':P' + rnaseq['passage'].astype(int).astype(str)
rnaseq['key'] = rnaseq['subject_id'] + clone_tag + passage_tag

# Flag RNA-seq records whose key also appears among the array sample keys.
array_keys = samples222.values
rnaseq['in_222'] = rnaseq['key'].apply(lambda key: key in array_keys)

In [9]:
# Array sample keys that have no RNA-seq record with the same key.
unmatched_keys = set(samples222).difference(rnaseq.key)
n = len(unmatched_keys)
print('Number of 222 samples that we don\'t have '
      'RNA-seq for (with correct clone and passage): '
      '{}.'.format(n))

Number of 222 samples that we don't have RNA-seq for (with correct clone and passage): 26.


In [10]:
# Start the in_eqtl flag as a copy of in_222; later cells set it to True for
# some unmatched subjects and to False for samples without WGS.
rnaseq['in_eqtl'] = rnaseq.in_222

In [11]:
# Subjects that have RNA-seq records but none currently flagged in_eqtl.
set(rnaseq.subject_id) - set(rnaseq.ix[rnaseq.in_eqtl == True, 'subject_id'])

{'0bf3da28-3985-4c34-8197-5816fd73b588',
 'b9e30469-13ad-4792-878a-889cd6480f91',
 'bd04a8cc-5d63-45bc-a2cc-91b0c7cb6e01',
 'fbabd331-fc4f-47de-b4d0-a9ee5434fa5b'}

There are four subjects for whom we have RNA-seq, although the clone/passage numbers
don't match those of the arrays. For each of these subjects, I'll include the first
RNA-seq sample listed.

In [12]:
# For every subject with RNA-seq but no in_eqtl sample, flag that subject's
# first listed RNA-seq record.
flagged_subjects = set(rnaseq.ix[rnaseq.in_eqtl == True, 'subject_id'])
for unmatched_subject in set(rnaseq.subject_id) - flagged_subjects:
    subject_records = rnaseq[rnaseq.subject_id == unmatched_subject]
    first_record = subject_records.index[0]
    rnaseq.ix[first_record, 'in_eqtl'] = True

# Drop the working columns; keep only the metadata that gets exported.
metadata_columns = ['subject_id', 'clone', 'passage', 'sequence_id',
                    'in_222', 'in_eqtl']
rnaseq = rnaseq[metadata_columns]

In [13]:
# Attach one WGS record (status 0 only) to each RNA-seq sample via its subject.
rnaseq['wgs_id'] = ''
passing_wgs = wgs[wgs.status == 0]

for rna_id in rnaseq.index:
    subject_id = rnaseq.ix[rna_id, 'subject_id']
    candidates = passing_wgs[passing_wgs.subject_id == subject_id]

    # No WGS at all: report it and exclude the sample from the eQTL set.
    if candidates.shape[0] == 0:
        print('No WGS: {}'.format(rna_id))
        rnaseq.ix[rna_id, 'in_eqtl'] = False
        continue

    # Several WGS records: prefer Blood, otherwise fall back to iPSC.
    if candidates.shape[0] > 1:
        for preferred_cell in ('Blood', 'iPSC'):
            if preferred_cell in candidates.cell.values:
                candidates = candidates[candidates.cell == preferred_cell]
                break

    if candidates.shape[0] == 1:
        rnaseq.ix[rna_id, 'wgs_id'] = candidates.index[0]
    else:
        # Still ambiguous; wgs_id stays empty.
        # NOTE(review): in_eqtl is left unchanged here, unlike the no-WGS case.
        print('?: {}'.format(rna_id))

# Represent "no WGS assigned" as NaN instead of an empty string.
rnaseq.ix[rnaseq['wgs_id'] == '', 'wgs_id'] = np.nan

No WGS: 1c568951-4308-4270-b40a-0380bffe699c
No WGS: 2c2697a7-584f-4767-bc64-23a833648e81
No WGS: 4ebf16ec-bcdb-47f3-aefe-5e14cd1735d5
No WGS: 9809009f-63db-4a16-8fe3-a11a474f896f
No WGS: c29ee90a-9cc0-4552-9f76-5d00f9ed0335


In [14]:
# Keep only the WGS records assigned to an RNA-seq sample, with the columns
# that get exported. The IDs are sorted into a list because a set has no
# defined order, and indexing with it would make the row order of the
# exported table arbitrary.
assigned_wgs_ids = sorted(set(rnaseq.wgs_id.dropna()))
wgs = wgs.ix[assigned_wgs_ids, ['subject_id', 'cell', 'sequence_id']]

In [20]:
# Keep only the subjects that have a retained RNA-seq sample. The subject IDs
# are sorted into a list because a set has no defined order, and indexing with
# it would make the row order of the exported table arbitrary.
rnaseq_subject_ids = sorted(set(rnaseq.subject_id))
subject = subject.ix[rnaseq_subject_ids]
subject = subject[['sex', 'age', 'estimated_ethnicity', 'disease', 'family_id', 'father_id', 'mother_id', 'twin_id']]

In [21]:
# Write each metadata table to the output directory as a tab-separated file.
metadata_tables = [
    (snparray, 'array_metadata.tsv'),
    (rnaseq, 'rnaseq_metadata.tsv'),
    (subject, 'subject_metadata.tsv'),
    (wgs, 'wgs_metadata.tsv'),
]
for table, basename in metadata_tables:
    table.to_csv(os.path.join(outdir, basename), sep='\t')

## RNA-seq Data

In [17]:
# STAR logs, restricted to the rows for the retained RNA-seq samples.
star_logs_fn = '/raid3/projects/CARDIPS/pipeline/RNAseq/combined_files/star_logs.tsv'
all_logs = pd.read_table(star_logs_fn, index_col=0, low_memory=False)
logs = all_logs.ix[rnaseq.index]
logs.to_csv(os.path.join(outdir, 'star_logs.tsv'), sep='\t')

In [18]:
# Expression values.
# Only the RSEM TPM matrix is exported. The triple-quoted strings below hold
# disabled code for the gene counts, RSEM expected counts and RSEM FPKM
# matrices; as bare string literals they are evaluated and discarded.
"""
fn = '/raid3/projects/CARDIPS/pipeline/RNAseq/combined_files/gene_counts.tsv'
counts = pd.read_table(fn, index_col=0, low_memory=False)
counts = counts[rnaseq.index]
counts.to_csv(os.path.join(outdir, 'gene_counts.tsv'), sep='\t')

fn = '/raid3/projects/CARDIPS/pipeline/RNAseq/combined_files/rsem_expected_counts.tsv'
ecounts = pd.read_table(fn, index_col=0, low_memory=False)
ecounts = ecounts[rnaseq.index]
ecounts.to_csv(os.path.join(outdir, 'rsem_expected_counts.tsv'), sep='\t')
"""
# Select the TPM columns named by the retained RNA-seq sample IDs and export.
fn = '/raid3/projects/CARDIPS/pipeline/RNAseq/combined_files/rsem_tpm.tsv'
tpm = pd.read_table(fn, index_col=0, low_memory=False)
tpm = tpm[rnaseq.index]
tpm.to_csv(os.path.join(outdir, 'rsem_tpm.tsv'), sep='\t')
# The trailing semicolon keeps the final string from being echoed as cell output.
"""
fn = '/raid3/projects/CARDIPS/pipeline/RNAseq/combined_files/rsem_fpkm.tsv'
fpkm = pd.read_table(fn, index_col=0, low_memory=False)
fpkm = fpkm[rnaseq.index]
fpkm.to_csv(os.path.join(outdir, 'rsem_fpkm.tsv'), sep='\t')
""";

In [57]:
# Allele counts: symlink each per-sample counts file into the private output
# directory, skipping files whose sample-ID prefix is not a retained sample.
allele_dir = os.path.join(private_outdir, 'allele_counts')
cpy.makedir(allele_dir)
counts_pattern = ('/raid3/projects/CARDIPS/pipeline/RNAseq/*/'
                  'results/*alignment_compare/*_counts.tsv')
for fn in glob.glob(counts_pattern):
    basename = os.path.basename(fn)
    if basename.split('_')[0] not in rnaseq.index:
        continue
    new_fn = os.path.join(allele_dir, basename)
    if not os.path.exists(new_fn):
        os.symlink(fn, new_fn)

In [60]:
def _retained_sample_ids(pattern):
    """Return the sample-ID prefix of every file matching ``pattern`` whose
    prefix (the text before the first underscore of the file name) is in
    ``rnaseq.index``."""
    prefixes = (os.path.basename(path).split('_')[0]
                for path in glob.glob(pattern))
    return [prefix for prefix in prefixes if prefix in rnaseq.index]


# Sample IDs that have allele counts (fns) and MBASED locus results (fns2).
fns = _retained_sample_ids('/raid3/projects/CARDIPS/pipeline/RNAseq/*/'
                           'results/*alignment_compare/*_counts.tsv')
fns2 = _retained_sample_ids('/raid3/projects/CARDIPS/pipeline/RNAseq/*/results/*mbased/*_locus.tsv')

In [61]:
# Samples with allele counts but no MBASED locus results (empty when every
# sample with allele counts also has MBASED output).
set(fns) - set(fns2)

set()

In [58]:
# MBASED ASE results.
cpy.makedir(os.path.join(outdir, 'mbased_locus'))
fns = glob.glob('/raid3/projects/CARDIPS/pipeline/RNAseq/*/results/*mbased/*_locus.tsv')
fns = [x for x in fns if os.path.split(x)[1].split('_')[0] in rnaseq.index]
for fn in fns:
    new_fn = os.path.join(outdir, 'mbased_locus', os.path.split(fn)[1])
    if not os.path.exists(new_fn):
        os.symlink(fn, new_fn)
        
cpy.makedir(os.path.join(private_outdir, 'mbased_snv'))
fns = glob.glob('/raid3/projects/CARDIPS/pipeline/RNAseq/*/results/*mbased/*_snv.tsv')
fns = [x for x in fns if os.path.split(x)[1].split('_')[0] in rnaseq.index]
for fn in fns:
    new_fn = os.path.join(private_outdir, 'mbased_snv', os.path.split(fn)[1])
    if not os.path.exists(new_fn):
        os.symlink(fn, new_fn)

## ATAC-seq Peaks

In [21]:
# Symlink the merged day-0 MACS2 peak and summit files for three samples.
atac_dir = os.path.join(outdir, 'atac_seq')
cpy.makedir(atac_dir)
samples = ['83dacb11-4180-4807-b099-05fd1561a722',
           'e932e556-59a6-4f70-9b4c-ef5f69dac3ce',
           'f549b5fa-a6c0-49fb-8a07-dda4f72ff076']
atac_templates = [
    ('/raid3/projects/CARDIPS/pipeline/ATACseq/'
     'merged/{0}_time_course_day_0_merged_macs2/'
     '{0}_time_course_day_0_merged_peaks.narrowPeak'),
    ('/raid3/projects/CARDIPS/pipeline/ATACseq/'
     'merged/{0}_time_course_day_0_merged_macs2/'
     '{0}_time_course_day_0_merged_summits.bed'),
]
for s in samples:
    for template in atac_templates:
        fn = template.format(s)
        new_fn = os.path.join(atac_dir, os.path.basename(fn))
        if not os.path.exists(new_fn):
            os.symlink(fn, new_fn)

## Chain Files

I need to convert some mouse (mm9) coordinates to hg19 and some hg38
(sometimes called hg20) coordinates to hg19 as well.

In [26]:
import shutil
from urllib2 import urlopen

def download_and_gunzip(url, dest):
    """
    Download a gzipped file url to dest and gunzip it.

    Parameters
    ----------
    url : str
        URL for gzipped file to download.

    dest : str
        Full path to save gzipped file to. This file will be gunzipped;
        gunzip replaces it with the decompressed file, whose name is dest
        without the trailing ``.gz``.

    Raises
    ------
    OSError
        If the destination directory cannot be created.
    subprocess.CalledProcessError
        If ``gunzip`` exits with a non-zero status.

    """
    # Local import so the function does not depend on an extra notebook-level
    # import.
    from contextlib import closing

    dest_dir = os.path.dirname(dest)
    if dest_dir:
        try:
            os.makedirs(dest_dir)
        except OSError:
            # An already-existing directory is fine; anything else (e.g. a
            # permission problem) must not be silently ignored.
            if not os.path.isdir(dest_dir):
                raise
    # closing() releases the connection even if the copy fails, and works
    # whether or not the response object is itself a context manager.
    with closing(urlopen(url)) as req:
        # Binary mode: the payload is gzip-compressed bytes, not text.
        with open(dest, 'wb') as d:
            shutil.copyfileobj(req, d)
    subprocess.check_call(['gunzip', dest])

In [27]:
# download_and_gunzip leaves only the decompressed file behind (gunzip removes
# the .gz), so the "already downloaded" check has to look for the decompressed
# chain file. Checking for the .gz path would trigger a fresh download on
# every run, after which gunzip fails because its output file already exists.
url = 'http://hgdownload.cse.ucsc.edu/goldenPath/mm9/vsHg19/mm9.hg19.all.chain.gz'
dest = os.path.join(outdir, 'mm9.hg19.all.chain.gz')
if not os.path.exists(os.path.splitext(dest)[0]):
    download_and_gunzip(url, dest)
url = 'http://hgdownload.cse.ucsc.edu/goldenPath/hg38/liftOver/hg38ToHg19.over.chain.gz'
dest = os.path.join(outdir, 'hg38ToHg19.over.chain.gz')
if not os.path.exists(os.path.splitext(dest)[0]):
    download_and_gunzip(url, dest)