# Input Data

This notebook parses some of the CARDiPS files and keeps only the data that we need for this project.
It reads from absolute paths under `/raid3`, so it will only run on the Frazer lab cluster.

In [2]:
import glob
import os
import subprocess

import cdpybio as cpb
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

import ciepy
import cardipspy as cpy

%matplotlib inline

In [3]:
# Inventory tables exported from the CARDiPS database. Each of these is
# indexed by its first column.
database_dir = '/raid3/projects/CARDIPS/data/database'
family, pedigree, rnaseq, sample, sequence, snparray = [
    pd.read_table(
        os.path.join(database_dir, 'inventory_{}.tsv'.format(table_name)),
        index_col=0,
    )
    for table_name in ('family', 'pedigree', 'rnaseq', 'sample', 'sequence',
                       'snparray')
]

# The subject table is the exception: it is indexed by its second column.
# NOTE(review): presumably that column holds the subject ID, since the table
# is later looked up with wgs.subject_id -- confirm.
subject = pd.read_table(os.path.join(database_dir, 'inventory_subject.tsv'),
                        index_col=1)

# The WGS inventory comes from the WGS pipeline directory rather than from
# database_dir/inventory_wgs.tsv.
fn = '/raid3/projects/CARDIPS/pipeline/WGS/inventory_wgs.tsv'
wgs = pd.read_table(fn, index_col=0)

In [4]:
# Censor flags for the RNA-seq samples, read as a Series indexed by the first
# column of the (header-less) file.
# The `squeeze=True` keyword of read_table was deprecated in pandas 1.4 and
# removed in 2.0; squeezing the column axis after reading is the documented
# replacement and yields the same Series.
censor = pd.read_table('/raid3/projects/CARDIPS/pipeline/RNAseq/combined_files/censor.tsv',
                       index_col=0, header=None).squeeze('columns')

## Samples for this Study

In [5]:
# Guard against silent database changes: the code below was written for an
# array inventory with exactly 444 rows. If this assertion fails, the database
# has been updated and this notebook needs to be revisited.
assert snparray.shape[0] == 444

# Build a "<subject_id>:C<clone>:P<passage>" key for every iPSC array.
ipsc_arrays = snparray[snparray.cell == 'iPSC']
clone_label = ipsc_arrays.clone.astype(int).astype(str)
passage_label = ipsc_arrays.passage.astype(int).astype(str)
samples222 = ipsc_arrays.subject_id + ':C' + clone_label + ':P' + passage_label

## Metadata

In [128]:
# Array metadata: keep only the descriptive columns, and only the arrays whose
# cell type is not iPSC.
array_columns = ['subject_id', 'cell', 'array_id', 'pos', 'type']
snparray = snparray.loc[snparray.cell != 'iPSC', array_columns]

In [9]:
# Keep only the RNA-seq samples with sequence_id 6, 7 or 8.
rnaseq = rnaseq[rnaseq.sequence_id.isin([6, 7, 8])]

# Drop every sample flagged in censor.tsv, plus any sample whose inventory
# status is already 2. Filtering with masks (instead of writing status = 2
# through the removed `.ix` indexer and filtering afterwards) gives the same
# rows, never assigns into a filtered slice, and is unaffected by censored IDs
# that are absent from this table.
# NOTE(review): an earlier, commented-out version censored three samples by
# hand (50dbe4d7-00f7-489a-b3b4-a8386ed8f621,
# 5bf68d53-4da9-46c4-8d52-9a7e46d2d39d, 73f71ee5-269f-4cc6-ae49-1ea8093456c2);
# presumably censor.tsv now covers them -- confirm.
censored_ids = censor[censor == True].index
rnaseq = rnaseq[~rnaseq.index.isin(censored_ids) & (rnaseq.status != 2)]

# Only iPSC samples are used.
rnaseq = rnaseq[rnaseq.cell == 'iPSC']

In [12]:
# Same "<subject_id>:C<clone>:P<passage>" key that is built for the iPSC
# arrays, so RNA-seq samples can be matched to arrays on subject, clone and
# passage.
rnaseq['key'] = (rnaseq.subject_id + ':C' + rnaseq.clone.astype(int).astype(str) +
                 ':P' + rnaseq.passage.astype(int).astype(str))
# Vectorised membership test; replaces a per-row `x in samples222.values` scan.
rnaseq['in_222'] = rnaseq.key.isin(samples222)

In [13]:
# Array keys for which no RNA-seq sample has the same subject/clone/passage.
unmatched_keys = set(samples222).difference(rnaseq.key)
n = len(unmatched_keys)
print("Number of 222 samples that we don't have RNA-seq for "
      "(with correct clone and passage): {}.".format(n))

Number of 222 samples that we don't have RNA-seq for (with correct clone and passage): 27.


In [14]:
# Start by keeping exactly the samples whose key matches one in samples222.
rnaseq['keep'] = rnaseq['in_222']

In [15]:
# Subjects that have RNA-seq samples but none currently marked as kept.
# (`.loc` replaces the `.ix` indexer, which was removed in pandas 1.0.)
set(rnaseq.subject_id) - set(rnaseq.loc[rnaseq.keep == True, 'subject_id'])

{'0bf3da28-3985-4c34-8197-5816fd73b588',
 'b9e30469-13ad-4792-878a-889cd6480f91',
 'bd04a8cc-5d63-45bc-a2cc-91b0c7cb6e01',
 'fbabd331-fc4f-47de-b4d0-a9ee5434fa5b'}

There are four subjects for whom we have RNA-seq data, but whose clone/passage
numbers don't match those of the arrays. For each of these subjects, I'll keep
the first RNA-seq sample listed.

In [16]:
# Subjects that would otherwise be dropped because none of their samples is
# marked as kept: rescue the first listed sample for each of them.
# (`.loc` replaces the `.ix` indexer, which was removed in pandas 1.0.)
kept_subjects = set(rnaseq.loc[rnaseq.keep == True, 'subject_id'])
for s in set(rnaseq.subject_id) - kept_subjects:
    first_sample = rnaseq[rnaseq.subject_id == s].index[0]
    rnaseq.loc[first_sample, 'keep'] = True

# Keep the selected samples and only the columns needed downstream.
rnaseq = rnaseq[rnaseq.keep]
rnaseq = rnaseq[['subject_id', 'clone', 'passage', 'sequence_id', 'in_222']]

In [165]:
# `wgs` is the WGS inventory read from inventory_wgs.tsv at the top of the
# notebook. This cell used to rebuild it from `seqsample`, a table that is
# never loaded in this notebook, so it raised a NameError on a fresh kernel.
# NOTE(review): this assumes the WGS inventory carries the `status`,
# `subject_id`, `cell`, `received` and `sequence_id` columns used below --
# confirm against the file.

# Keep samples with status 0, except one sample that is excluded by hand
# (equivalent to setting its status to 2 before filtering).
excluded_wgs = ['648cab41-59ff-42e3-80ca-8398c57aa7a0']
wgs = wgs[(wgs.status == 0) & ~wgs.index.isin(excluded_wgs)]

In [175]:
# Choose one WGS sample per RNA-seq subject:
#   * exactly one WGS sample      -> use it;
#   * several WGS samples         -> use the blood sample if there is exactly one;
#   * otherwise (none/ambiguous)  -> print the subject ID and skip it.
# NOTE(review): the loop runs once per row of rnaseq, so a subject that appears
# more than once in rnaseq yields repeated rows in wgs -- confirm this is
# intended.
keep = []
for s in rnaseq.subject_id:
    t = wgs[wgs.subject_id == s]
    if t.shape[0] == 1:
        keep.append(t.index[0])
    elif t.shape[0] > 1:
        t = t[t.cell == 'Blood']
        if t.shape[0] == 1:
            keep.append(t.index[0])
        else:
            print(s)
    else:
        print(s)

# `.loc` replaces the `.ix` indexer, which was removed in pandas 1.0.
wgs = wgs.loc[keep, ['subject_id', 'cell', 'received', 'sequence_id']]

In [184]:
# Subject metadata for the subjects of the selected WGS samples, in the same
# order as wgs. (`.loc` replaces the `.ix` indexer, which was removed in
# pandas 1.0; unlike `.ix`, it raises a KeyError for an unknown subject ID
# instead of inserting an all-NaN row.)
subject = subject.loc[wgs.subject_id]
subject = subject[['sex', 'ethnicity', 'disease', 'family_id', 'father_id', 'mother_id', 'twin_id']]

In [185]:
# Write each metadata table as a tab-separated file in the project's data
# directory.
metadata_dir = os.path.join(ciepy.root, 'data')
metadata_tables = [
    ('array_metadata.tsv', snparray),
    ('rnaseq_metadata.tsv', rnaseq),
    ('subject_metadata.tsv', subject),
    ('wgs_metadata.tsv', wgs),
]
for filename, table in metadata_tables:
    table.to_csv(os.path.join(metadata_dir, filename), sep='\t')

## RNA-seq Data

In [186]:
def _subset_and_save(filename):
    """Copy one combined RNA-seq table, restricted to the selected samples.

    Reads `filename` from the RNA-seq pipeline's combined_files directory,
    keeps only the columns named by rnaseq.index, writes the result under the
    same file name to the project's data directory, and returns it.
    """
    source = os.path.join(
        '/raid3/projects/CARDIPS/pipeline/RNAseq/combined_files', filename)
    table = pd.read_table(source, index_col=0)
    table = table[rnaseq.index]
    table.to_csv(os.path.join(ciepy.root, 'data', filename), sep='\t')
    return table


counts = _subset_and_save('gene_counts.tsv')
ecounts = _subset_and_save('rsem_expected_counts.tsv')
tpm = _subset_and_save('rsem_tpm.tsv')