# Input Data

This notebook parses some of the CARDiPS files to take only data that we need for this project. 
This notebook will only run on the Frazer lab cluster.

In [1]:
import glob
import os
import subprocess

import cdpybio as cpb
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

import ciepy
import cardipspy as cpy

%matplotlib inline



In [2]:
outdir = os.path.join(ciepy.root, 'output',
                      'input_data')
cpy.makedir(outdir)

private_outdir = os.path.join(ciepy.root, 'private_output',
                              'input_data')
cpy.makedir(private_outdir)

In [107]:
dy = '/projects/CARDIPS/data/database/20151201'

fn = os.path.join(dy, 'baseline_analyte.tsv')
baseline_analyte = pd.read_table(fn, index_col=0)
fn = os.path.join(dy, 'baseline_wgsisaac.tsv')
baseline_wgsisaac = pd.read_table(fn, index_col=0)
fn = os.path.join(dy, 'baseline_ipsc.tsv')
baseline_ipsc = pd.read_table(fn, index_col=0)
fn = os.path.join(dy, 'baseline_wgs.tsv')
baseline_wgs = pd.read_table(fn, index_col=0)
fn = os.path.join(dy, 'baseline_cnv.tsv')
baseline_cnv = pd.read_table(fn, index_col=0)
fn = os.path.join(dy, 'baseline_rnas.tsv')
baseline_rnas = pd.read_table(fn, index_col=0)
fn = os.path.join(dy, 'baseline_ibd.tsv')
baseline_ibd = pd.read_table(fn, index_col=0)
fn = os.path.join(dy, 'baseline_manifest.tsv')
baseline_manifest = pd.read_table(fn, index_col=0)
fn = os.path.join(dy, 'baseline_snpa.tsv')
baseline_snpa = pd.read_table(fn, index_col=0)
fn = os.path.join(dy, 'baseline_tissue.tsv')
baseline_tissue = pd.read_table(fn, index_col=0)

fn = os.path.join(dy, 'family1070_rnas.tsv')
family1070_rnas = pd.read_table(fn, index_col=0)
fn = os.path.join(dy, 'family1070_tissue.tsv')
family1070_tissue = pd.read_table(fn, index_col=0)

fn = os.path.join(dy, 'subject_pedigree.tsv')
subject_pedigree = pd.read_table(fn, index_col=0)
fn = os.path.join(dy, 'subject_family.tsv')
subject_family = pd.read_table(fn, index_col=0)
fn = os.path.join(dy, 'subject_subject.tsv')
subject_subject = pd.read_table(fn, index_col=0)

#fn = os.path.join(dy, 'data_wgs.tsv')
#data_wgs = pd.read_table(fn, index_col=0)
#fn = os.path.join(dy, 'data_snpa.tsv')
#data_snpa = pd.read_table(fn, index_col=0)
#fn = os.path.join(dy, 'data_array.tsv')
#data_array = pd.read_table(fn, index_col=0)
#fn = os.path.join(dy, 'data_chips.tsv')
#data_chips = pd.read_table(fn, index_col=0)
#fn = os.path.join(dy, 'data_atacs.tsv')
#data_atacs = pd.read_table(fn, index_col=0)
#fn = os.path.join(dy, 'data_metha.tsv')
#data_metha = pd.read_table(fn, index_col=0)
#fn = os.path.join(dy, 'data_hic.tsv')
#data_hic = pd.read_table(fn, index_col=0)
#fn = os.path.join(dy, 'data_rnas.tsv')
#data_rnas = pd.read_table(fn, index_col=0)
#fn = os.path.join(dy, 'data_sequence.tsv')
#data_sequence = pd.read_table(fn, index_col=0)

In [4]:
dy = '/projects/CARDIPS/pipeline/RNAseq/combined_files'
censor = pd.read_table(os.path.join(dy, 'censor.tsv'),
                       index_col=0, header=None, squeeze=True)

## Array CNVs

I'm going to make a table of all CNVs identified by arrays. Some iPSC didn't have
any CNVs. For now, if an iPSC is in the CNV table, that means that it either
didn't have CNVs or we didn't test that clone/passage number for CNVs. 

In [5]:
cnv = baseline_cnv.merge(baseline_snpa, left_on='snpa_id', right_index=True,
                         suffixes=['_cnv', '_snpa'])
cnv = cnv.merge(baseline_analyte, left_on='analyte_id', right_index=True,
                suffixes=['_cnv', '_analyte'])
cnv = cnv.merge(baseline_tissue, left_on='tissue_id', right_index=True,
                suffixes=['_cnv', '_tissue'])
cnv = cnv[['type', 'chr', 'start', 'end', 'len', 'primary_detect_method', 
           'clone', 'passage', 'subject_id']]

## RNA-seq Samples for this Study

I'm going to use baseline and family 1070 samples.

In [89]:
# Get family1070 samples.
tdf = family1070_rnas[family1070_rnas.comment.isnull()]
tdf = tdf.merge(family1070_tissue, left_on='tissue_id', right_index=True, 
                suffixes=['_rna', '_tissue'])
tdf = tdf[tdf.cell_type == 'iPSC']
tdf.index = tdf.rnas_id
tdf = tdf[['ipsc_clone_number', 'ipsc_passage', 'subject_id']]
tdf.columns = ['clone', 'passage', 'subject_id']
tdf['isolated_by'] = 'p'
tdf.index.name = 'rna_id'

In [95]:
# Get the iPSC eQTL samples.
rna = baseline_rnas[baseline_rnas.rnas_id.isnull() == False]
rna.index = rna.rnas_id
rna.index.name = 'rna_id'
# TODO: update this to use table status column eventually.
rna = rna.ix[censor[censor == False].index]
rna = rna.merge(baseline_analyte, left_on='analyte_id', right_index=True,
                suffixes=['_rnas', '_analyte'])
rna = rna.merge(baseline_tissue, left_on='tissue_id', right_index=True,
                suffixes=['_rnas', '_tissue'])
rna = rna[['clone', 'passage', 'subject_id']]
rna['isolated_by'] = 'a'

TODO: working here. I need to redo the above when the rest of the samples
are added into baseline_rnas.

In [10]:
# Get subjects from Roy's paper.
cohort222 = baseline_ipsc.merge(baseline_tissue, left_on='tissue_id', 
                                right_index=True,  suffixes=['_ipsc', '_tissue'])
n = len(set(rna.subject_id) - set(cohort222.subject_id))
print('{} subjects not in the 222 cohort.'.format(n))

0 subjects not in the 222 cohort.


In [11]:
# Drop unnecessary/private columns.
rna = rna.drop(['name', 'cell', 'day', 'rep', 'status', 'comment'], axis=1)

I'm can use all of these samples that passed QC for various expression analyses.

### eQTL samples

Now I'm going to identify one sample per subject to use for eQTL analysis.

I'll start by keeping samples whose clone/passage number matches up with 
those from the 222 cohort.

In [12]:
rna['in_eqtl'] = False

In [13]:
samples = (cohort222.subject_id + ':' + cohort222.clone.astype(int).astype(str) + 
           ':' + cohort222.passage.astype(int).astype(str))

t = rna.dropna(subset=['passage'])
t.loc[:, ('sample')] = (t.subject_id + ':' + t.clone.astype(int).astype(str) + 
                        ':' + t.passage.astype(int).astype(str))
t = t[t['sample'].apply(lambda x: x in samples.values)]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [14]:
# These samples are in the 222 cohort and the eQTL analysis.
rna['in_222'] = False
rna.ix[t.index, 'in_222'] = True
rna.ix[t.index, 'in_eqtl'] = True

Now I'll add in any samples for which we have CNVs but weren't in the 222.

In [15]:
samples = (cnv.subject_id + ':' + cnv.clone.astype(int).astype(str) + 
           ':' + cnv.passage.astype(int).astype(str))

t = rna.dropna(subset=['passage'])
t.loc[:, ('sample')] = (t.subject_id + ':' + t.clone.astype(int).astype(str) + 
                        ':' + t.passage.astype(int).astype(str))
t = t[t['sample'].apply(lambda x: x in samples.values)]
t = t[t.subject_id.apply(lambda x: x not in rna.ix[rna.in_eqtl, 'subject_id'].values)]

# These samples aren't in the 222 but we have a measured CNV for them.
rna.ix[t.index, 'in_eqtl'] = True

Now I'll add in samples where the clone was in the 222 but we don't have the same passage
number.

In [16]:
samples = (cohort222.subject_id + ':' + cohort222.clone.astype(int).astype(str))

t = rna[rna.in_eqtl == False]
t = t[t.subject_id.apply(lambda x: x not in rna.ix[rna.in_eqtl, 'subject_id'].values)]
t['samples'] = t.subject_id + ':' + t.clone.astype(int).astype(str)
t = t[t.samples.apply(lambda x: x in samples.values)]

# These clones are in the 222, we just have a different passage number.
rna['clone_in_222'] = False
rna.ix[rna.in_222, 'clone_in_222'] = True
rna.ix[t.index, 'clone_in_222'] = True
rna.ix[t.index, 'in_eqtl'] = True

Now I'll add in any samples from subjects we don't yet have in the eQTL analysis.

In [17]:
t = rna[rna.in_eqtl == False]
t = t[t.subject_id.apply(lambda x: x not in rna.ix[rna.in_eqtl, 'subject_id'].values)]

rna.ix[t.index, 'in_eqtl'] = True

In [18]:
n = rna.in_eqtl.value_counts()[True]
print('We have {} distinct subjects in the eQTL analysis.'.format(n))

We have 215 distinct subjects in the eQTL analysis.


## WGS Samples

Now I'll assign WGS IDs for each RNA-seq sample. Some subjects have multiple WGS samples
for different cell types. I'll preferentially use blood, fibroblast, and finally iPSC WGS.

In [19]:
rna['wgs_id'] = ''

In [20]:
# Remove censored samples.
wgs = data_wgs[data_wgs.status == 0]

In [21]:
for i in rna.index:
    s = rna.ix[i, 'subject_id']
    t = wgs[wgs.subject_id == s]
    if t.shape[0] == 1:
        rna.ix[i, 'wgs_id'] = t.index[0]
    elif t.shape[0] > 1:
        if 'Blood' in t.cell.values:
            t = t[t.cell == 'Blood']
        elif 'iPSC' in t.cell.values:
            t = t[t.cell == 'iPSC']
        if t.shape[0] == 1:
            rna.ix[i, 'wgs_id'] = t.index[0]
        else:
            print('?: {}'.format(i))
    else:
        print('No WGS: {}'.format(i))
        rna.ix[i, 'in_eqtl'] = False

rna.ix[rna['wgs_id'] == '', 'wgs_id'] = np.nan

No WGS: 4ebf16ec-bcdb-47f3-aefe-5e14cd1735d5
No WGS: 2c2697a7-584f-4767-bc64-23a833648e81
No WGS: 9809009f-63db-4a16-8fe3-a11a474f896f
No WGS: 1c568951-4308-4270-b40a-0380bffe699c
No WGS: c29ee90a-9cc0-4552-9f76-5d00f9ed0335
No WGS: 629211b6-49d7-4024-879c-3ac5bc86f9d9
No WGS: c5bc184d-3853-4cf4-92fd-bd561704154a
No WGS: eb9cd395-ccf7-4b0c-a9cd-b48129992972


In [22]:
n = len(set(rna[rna.in_eqtl].index) & set(rna[rna.wgs_id == ''].index))
print('{} samples in eQTL analysis without WGS data.'.format(n))

0 samples in eQTL analysis without WGS data.


I'm going to keep one WGS sample per person in the cohort 
(preferentially blood, fibroblast, and finally iPSC) even if we don't
have RNA-seq in case we want to look at phasing etc.

In [23]:
wgs.shape

(276, 9)

In [24]:
vc = wgs.subject_id.value_counts()
vc = vc[vc > 1]

keep = []
for s in vc.index:
    t = wgs[wgs.subject_id == s]
    if t.shape[0] == 1:
        keep.append(t.index[0])
    elif t.shape[0] > 1:
        if 'Blood' in t.cell.values:
            t = t[t.cell == 'Blood']
        elif 'iPSC' in t.cell.values:
            t = t[t.cell == 'iPSC']
        if t.shape[0] == 1:
            keep.append(t.index[0])
        else:
            print('?: {}'.format(i))

wgs = wgs.drop(set(wgs[wgs.subject_id.apply(lambda x: x in vc.index)].index) - set(keep))

In [25]:
wgs = wgs.drop(['name', 'clone', 'passage', 'day', 'status', 'comment', 'sequence_id'], axis=1)

In [26]:
subject = subject_subject.copy(deep=True)
subject.index = subject['id']
subject = subject.ix[set(rna.subject_id) | set(wgs.subject_id)]
subject = subject[['sex', 'age', 'family_id', 'father_id', 'mother_id', 
                   'twin_id', 'ethnicity_group']]

In [27]:
fn = os.path.join(outdir, 'cnvs.tsv')
if not os.path.exists(fn):
    cnv.to_csv(fn, sep='\t')
    
rna.index.name = 'sample_id'
fn = os.path.join(outdir, 'rnaseq_metadata.tsv')
if not os.path.exists(fn):
    rna.to_csv(fn, sep='\t')
    
fn = os.path.join(outdir, 'subject_metadata.tsv')
if not os.path.exists(fn):
    subject.to_csv(fn, sep='\t')
    
fn = os.path.join(outdir, 'wgs_metadata.tsv')
if not os.path.exists(fn):
    wgs.to_csv(fn, sep='\t')

## RNA-seq Data

In [28]:
dy = '/projects/CARDIPS/pipeline/RNAseq/combined_files'
# STAR logs.
fn = os.path.join(dy, 'star_logs.tsv')
logs = pd.read_table(fn, index_col=0, low_memory=False)
logs = logs.ix[rna.index]
logs.index.name = 'sample_id'

fn = os.path.join(outdir, 'star_logs.tsv')
if not os.path.exists(fn):
    logs.to_csv(fn, sep='\t')
    
# Expression values.
fn = os.path.join(dy, 'rsem_tpm.tsv')
tpm = pd.read_table(fn, index_col=0, low_memory=False)
tpm = tpm[rna.index]
fn = os.path.join(outdir, 'rsem_tpm.tsv')
if not os.path.exists(fn):
    tpm.to_csv(fn, sep='\t')

TODO: Add ASE results when finished.

## ATAC-seq Peaks

## Chain Files

I need to convert some mouse coordinates to hg19 and some hg20 coordinates
to hg19 as well.

## Variant Calls

In [81]:
fn = os.path.join(private_outdir, 'autosomal_variants.vcf.gz')
if not os.path.exists(fn):
    os.symlink('/projects/CARDIPS/pipeline/WGS/mergedVCF/CARDIPS_201512.PASS.vcf.gz',
               fn)
    os.symlink('/projects/CARDIPS/pipeline/WGS/mergedVCF/CARDIPS_201512.PASS.vcf.gz.tbi',
               fn + '.tbi')