This notebook parses some of the CARDiPS files and extracts only the data that we need for this project.

In [67]:
import glob
import os
import subprocess

import cdpybio as cpb
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

import ciepy as cpy
import projectpy as ppy

%matplotlib inline

In [135]:
# Load the three metadata tables used below: array, WGS and RNA-seq.
data_dir = os.path.join(cpy.root, 'data')

array_meta = pd.read_table(os.path.join(data_dir, 'array_metadata.tsv'), index_col=0)

# squeeze=True asks pandas to return a Series when the table has a single data
# column. Presumably that column is 'subject' (the later merge joins on it) --
# TODO confirm against the file.
wgs_meta = pd.read_table(os.path.join(data_dir, 'wgs_metadata.tsv'),
                         index_col=0, squeeze=True)
# Name the index so that reset_index() later yields a 'wgs_id' column.
wgs_meta.index.name = 'wgs_id'

# RNA-seq metadata lives with the CARDiPS analysis output, not under cpy.root.
fn = '/raid3/projects/CARDIPS/analysis/RNAseq/150512_150522_150527_metadata.tsv'
rna_meta = pd.read_table(fn, index_col=0)

# Metadata

In [136]:
# Tabulate eQTL-study membership (rows) against censor status (columns).
pd.crosstab(rna_meta['study_ipsc_eqtl'], rna_meta['censor'])

censor,False,True
study_ipsc_eqtl,Unnamed: 1_level_1,Unnamed: 2_level_1
0,45,1
1,197,27


In [137]:
# Keep only uncensored samples that belong to the iPSC eQTL study.
rna_meta = rna_meta[(rna_meta.censor == False) & (rna_meta.study_ipsc_eqtl == 1)]

# Attach each sample's WGS ID by joining on subject. merge() defaults to an
# inner join, so samples whose subject is absent from wgs_meta are removed
# here; dropna() then removes any rows whose wgs_id is missing.
# NOTE(review): set_index('index') relies on rna_meta's index being unnamed so
# that reset_index() produces a column called 'index' -- confirm.
rna_meta = (
    pd.merge(rna_meta.reset_index(), wgs_meta.reset_index(), on='subject')
    .set_index('index')
    .dropna(subset=['wgs_id'])
)

# Save the filtered metadata for downstream notebooks.
rna_meta.to_csv(os.path.join(cpy.root, 'data', 'rna_seq_metadata.tsv'), sep='\t')

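I'm currently missing WGS data for 11 RNA-seq samples. Of the 224 RNA-seq samples in the iPSC eQTL study, 27 are censored (see the table above); the censored samples and any samples without a WGS ID are removed by the cell above.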

# Gene Counts

In [138]:
# Find every per-sample gene count file produced by the RNA-seq pipeline.
fns = glob.glob('/raid3/projects/CARDIPS/analysis/RNAseq/*XX/cdeboever/'
                'results/*_counts/gene_counts.tsv')

# The sample ID is taken as the text before the first underscore in the name of
# the directory that holds the file. Keep only files whose sample is still
# present in the filtered RNA-seq metadata.
count_fns = [fn for fn in fns
             if fn.split(os.path.sep)[-2].split('_')[0] in rna_meta.index]

In [139]:
# Sample ID for each file: the parent directory name up to its first underscore.
sample_names = [fn.split(os.path.sep)[-2].split('_')[0] for fn in count_fns]

# Read each headerless count file with its first column as the index;
# squeeze=True asks pandas for a Series when only one data column remains.
per_sample_counts = [
    pd.read_table(fn, index_col=0, header=None, squeeze=True)
    for fn in count_fns
]

# Key the Series by sample ID. If two files map to the same sample ID, the
# later one replaces the earlier one in the dict.
d = dict(zip(sample_names, per_sample_counts))

# Assemble a table with one column per sample and write it out.
counts = pd.DataFrame(d)
counts.index.name = 'gene'
counts.to_csv(os.path.join(cpy.root, 'data', 'gene_counts.tsv'), sep='\t')