# Input Data

This notebook extracts from the CARDiPS inventory and pipeline files only the data needed for this project.
It reads from absolute paths under `/raid3/projects/CARDIPS`, so it will only run on the Frazer lab cluster.

In [2]:
import glob
import os
import subprocess

import cdpybio as cpb
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

import ciepy
import cardipspy as cpy

%matplotlib inline

In [127]:
# Load the CARDiPS inventory tables (tab-separated). Each table is indexed by
# its first column, except `subject`, which is indexed by its second column.
# NOTE(review): confirm index_col=1 is intended for `subject`; that table is
# later looked up by the values of wgs.subject_id.
array = pd.read_table('/raid3/projects/CARDIPS/data/database/inventory_array.tsv', index_col=0)
family = pd.read_table('/raid3/projects/CARDIPS/data/database/inventory_family.tsv', index_col=0)
rnaseq = pd.read_table('/raid3/projects/CARDIPS/data/database/inventory_rnaseq.tsv', index_col=0)
run = pd.read_table('/raid3/projects/CARDIPS/data/database/inventory_run.tsv', index_col=0)
subject = pd.read_table('/raid3/projects/CARDIPS/data/database/inventory_subject.tsv', index_col=1)
tissue = pd.read_table('/raid3/projects/CARDIPS/data/database/inventory_tissue.tsv', index_col=0)
# Zeros in these columns are turned into missing values
# (presumably 0 encodes "not applicable" in the database -- TODO confirm).
for c in ['clone', 'passage', 'day']:
    tissue[c] = tissue[c].replace(0, np.nan)
wgs = pd.read_table('/raid3/projects/CARDIPS/data/database/inventory_wgs.tsv', index_col=0)
# Manually flag this WGS record as failed so the later `failed == 0` filter
# drops it. `.loc` is used because the `.ix` indexer was removed in pandas 1.0.
wgs.loc['648cab41-59ff-42e3-80ca-8398c57aa7a0', 'failed'] = 1

## Samples for this Study

In [128]:
# Tripwire for database updates: if the number of array records is no longer
# 444, the inventory has changed and the code below needs to be revisited.
assert array.shape[0] == 444

# Join each array record to its tissue record, then keep the iPSC tissues.
array_with_tissue = array.merge(tissue, left_on='tissue_id', right_index=True,
                                suffixes=['', '_tissue'])
is_ipsc = array_with_tissue.type_tissue == 'iPSC'
tdf = array_with_tissue[is_ipsc]

# Sample names have the form <subject_id>:C<clone>:P<passage>.
# astype(int) requires clone and passage to be non-missing for these rows.
clone_part = ':C' + tdf.clone.astype(int).astype(str)
passage_part = ':P' + tdf.passage.astype(int).astype(str)
samples222 = tdf.subject_id + clone_part + passage_part

## Metadata

In [129]:
# Attach tissue information to every array record and keep only the columns
# written to the array metadata file.
array = array.merge(tissue, left_on='tissue_id', right_index=True, suffixes=['', '_tissue'])
array = array[['type_tissue', 'clone', 'passage', 'array', 'coord', 'subject_id']]
array = array.rename(columns={'type_tissue': 'tissue'})
# Keep the iPSC arrays, matching the `type_tissue == 'iPSC'` selection used to
# define the study samples. The previous `!=` comparison kept every tissue
# except iPSC.
# NOTE(review): confirm that iPSC-only array metadata is what downstream
# notebooks expect.
array = array[array.tissue == 'iPSC']

In [130]:
# Annotate each RNA-seq record with the columns of its tissue record; tissue
# columns whose names collide with RNA-seq columns get a '_tissue' suffix.
rnaseq = pd.merge(rnaseq, tissue, left_on='tissue_id', right_index=True,
                  suffixes=['', '_tissue'])

In [131]:
# Keep only samples from sequencing runs 6, 7 and 8. The copy makes the
# result an independent frame, so the assignment below does not write into a
# slice of the original table.
rnaseq = rnaseq[rnaseq.run_id.isin([6, 7, 8])].copy()

# Manually flag these samples as failed. A boolean mask with .loc (rather than
# the removed .ix indexer) only touches IDs that are actually present, so an
# ID that is absent after the run filter is ignored instead of raising an error.
failed_ids = [
    '50dbe4d7-00f7-489a-b3b4-a8386ed8f621',
    '5bf68d53-4da9-46c4-8d52-9a7e46d2d39d',
    '73f71ee5-269f-4cc6-ae49-1ea8093456c2',
]
rnaseq.loc[rnaseq.index.isin(failed_ids), 'failed'] = 1

# Drop everything flagged as failed.
rnaseq = rnaseq[rnaseq.failed == 0]

In [132]:
# RNA-seq samples whose tissue_id also appears in the current `tdf` (when the
# notebook is run top to bottom, that is the iPSC array/tissue table).
# `isin` replaces a per-row `x in ndarray` scan with a single vectorised lookup.
tdf = rnaseq[rnaseq.tissue_id.isin(tdf.tissue_id)]
# NOTE(review): `tdf` is now a row subset of `rnaseq`, so this difference is
# empty by construction and cannot detect missing subjects. If the goal is to
# find array subjects without RNA-seq, capture the array subject_ids before
# `tdf` is overwritten and compare against those.
set(tdf.subject_id) - set(rnaseq.subject_id)

set()

In [133]:
# Keep the RNA-seq samples whose tissue_id appears in `tdf`, and only the
# columns written to the RNA-seq metadata file. `isin` replaces the per-row
# `x in ndarray` scan with a single vectorised lookup.
in_tdf = rnaseq.tissue_id.isin(tdf.tissue_id)
rnaseq = rnaseq.loc[in_tdf, ['type', 'clone', 'passage', 'subject_id']]

In [151]:
# Annotate the WGS records with tissue information and drop failed records.
tdf = wgs.merge(tissue, left_on='tissue_id', right_index=True,
                suffixes=['', '_tissue'])
tdf = tdf[tdf.failed == 0]

# Choose one WGS record per subject that has RNA-seq data:
#   * exactly one record  -> use it;
#   * several records     -> use the 'Blood' one if that is unique;
#   * otherwise           -> print the subject_id so it can be inspected.
# Iterating over the unique subject_ids (first-seen order) means a subject
# with more than one RNA-seq sample contributes one WGS row, not one per sample.
keep = []
for s in rnaseq.subject_id.unique():
    t = tdf[tdf.subject_id == s]
    if t.shape[0] == 1:
        keep.append(t.index[0])
    elif t.shape[0] > 1:
        t = t[t.type == 'Blood']
        if t.shape[0] == 1:
            keep.append(t.index[0])
        else:
            # Zero or several blood-derived records: cannot choose.
            print(s)
    else:
        # No usable WGS record for this subject.
        print(s)
# Every label in `keep` comes from tdf.index, so .loc (the replacement for the
# removed .ix indexer) finds all of them.
tdf = tdf.loc[keep]
wgs = tdf[['type', 'subject_id', 'received']]

In [155]:
# Look up one subject row per selected WGS record, using the values of
# wgs.subject_id as index labels. `.loc` replaces the removed `.ix` indexer;
# on current pandas a subject_id missing from the subject index raises
# KeyError rather than passing silently.
subject = subject.loc[wgs.subject_id]
# Keep the columns written to the subject metadata file and expose 'gender'
# under the name 'sex'.
subject = subject[['gender', 'disease', 'family_id', 'father_id', 'mother_id', 'twin_id']]
subject = subject.rename(columns={'gender': 'sex'})

In [161]:
# Write each metadata table as a tab-separated file under <ciepy.root>/data.
metadata_outputs = [
    ('array_metadata.tsv', array),
    ('rnaseq_metadata.tsv', rnaseq),
    ('subject_metadata.tsv', subject),
    ('wgs_metadata.tsv', wgs),
]
for out_name, frame in metadata_outputs:
    frame.to_csv(os.path.join(ciepy.root, 'data', out_name), sep='\t')

## RNA-seq Data

In [173]:
# For each combined expression matrix: read it, keep only the columns named by
# rnaseq.index (the samples retained above), and write the subset as a
# tab-separated file under <ciepy.root>/data.
out_dir = os.path.join(ciepy.root, 'data')

# Gene-level counts.
fn = '/raid3/projects/CARDIPS/pipeline/RNAseq/combined_files/gene_counts.tsv'
counts = pd.read_table(fn, index_col=0)[rnaseq.index]
counts.to_csv(os.path.join(out_dir, 'gene_counts.tsv'), sep='\t')

# RSEM expected counts.
fn = '/raid3/projects/CARDIPS/pipeline/RNAseq/combined_files/rsem_expected_counts.tsv'
ecounts = pd.read_table(fn, index_col=0)[rnaseq.index]
ecounts.to_csv(os.path.join(out_dir, 'rsem_expected_counts.tsv'), sep='\t')

# RSEM TPM.
fn = '/raid3/projects/CARDIPS/pipeline/RNAseq/combined_files/rsem_tpm.tsv'
tpm = pd.read_table(fn, index_col=0)[rnaseq.index]
tpm.to_csv(os.path.join(out_dir, 'rsem_tpm.tsv'), sep='\t')