# Introduction

So just how close are peng's files to what's at the DCC?

In [86]:
import pandas
import os
import sys
import numpy
import random
import itertools

In [48]:
HTSW = os.path.expanduser('~/proj/htsworkflow')
if HTSW not in sys.path:
    sys.path.append(HTSW)
from htsworkflow.submission.encoded import ENCODED

In [50]:
server = ENCODED('test.encodedcc.org')
server.load_netrc()

In [63]:
def sum_square_error(left, right):
    return sum((left-right)**2)

In [2]:
peng_root = '/woldlab/loxcyc/home/phe/160922BrianWilliamsRNAseq/161204FourthRun'

peng_fb_1 = os.path.join(peng_root, 'FB_1_E14_5-mm10-M4-male_anno_rsem.genes.results')
peng_fb_2 = os.path.join(peng_root, 'FB_2_E14_5-mm10-M4-male_anno_rsem.genes.results')
peng_hb_1 = os.path.join(peng_root, 'HB_1_E14_5-mm10-M4-male_anno_rsem.genes.results')
peng_hb_2 = os.path.join(peng_root, 'HB_2_E14_5-mm10-M4-male_anno_rsem.genes.results')
peng_mb_1 = os.path.join(peng_root, 'MB_1_E14_5-mm10-M4-male_anno_rsem.genes.results')
peng_mb_2 = os.path.join(peng_root, 'MB_2_E14_5-mm10-M4-male_anno_rsem.genes.results')


In [3]:
diane_root = os.path.expanduser('~/proj/mouse-epigenome2/embryonic/14.5/gene quantifications/')

diane_fb_1 = os.path.join(
    diane_root,
    'PolyA RNA-Seq from oligo-dT primed Total RNA on embryonic 14.5 day mouse forebrain',
    'ENCFF192XRT.tsv'
)
diane_fb_2 = os.path.join(
    diane_root,
    'PolyA RNA-Seq from oligo-dT primed Total RNA on embryonic 14.5 day mouse forebrain',
    'ENCFF809YXL.tsv'
)

In [4]:

right = pandas.read_csv(peng_fb_2, sep='\t', index_col=0)

In [5]:
def compare_sample(left):
    for i, a in enumerate([peng_fb_1, peng_fb_2, peng_hb_1, peng_hb_1, peng_hb_2, peng_mb_1, peng_mb_2]):
        right = pandas.read_csv(a, sep='\t', index_col=0)
        print("{} {:,}".format(i, sum((left['FPKM'] - right['FPKM'])**2)))
                      

In [6]:
left = pandas.read_csv(diane_fb_1, sep='\t', index_col=0)
compare_sample(left)

0 636,557,812.9531969
1 888,505,758.4678006
2 761,518,345.3222989
3 761,518,345.3222989
4 645,692,432.7442954
5 501,613,763.44019777
6 541,595,789.0818961


In [7]:
left = pandas.read_csv(diane_fb_2, sep='\t', index_col=0)
compare_sample(left)

0 658,954,288.2110978
1 936,760,783.8657001
2 772,159,240.7116046
3 772,159,240.7116046
4 667,102,963.4612011
5 548,433,195.5440974
6 580,169,910.4707956


In [8]:
left.shape, right.shape

((69690, 14), (69690, 14))

In [9]:
left.columns

Index(['transcript_id(s)', 'length', 'effective_length', 'expected_count',
       'TPM', 'FPKM', 'posterior_mean_count',
       'posterior_standard_deviation_of_count', 'pme_TPM', 'pme_FPKM',
       'TPM_ci_lower_bound', 'TPM_ci_upper_bound', 'FPKM_ci_lower_bound',
       'FPKM_ci_upper_bound'],
      dtype='object')

In [10]:
right.columns

Index(['transcript_id(s)', 'length', 'effective_length', 'expected_count',
       'TPM', 'FPKM', 'posterior_mean_count',
       'posterior_standard_deviation_of_count', 'pme_TPM', 'pme_FPKM',
       'TPM_ci_lower_bound', 'TPM_ci_upper_bound', 'FPKM_ci_lower_bound',
       'FPKM_ci_upper_bound'],
      dtype='object')

In [11]:
left = pandas.read_csv(os.path.expanduser('~/proj/mouse-epigenome2/ENCFF745ZJF.tsv'), sep='\t', index_col=0)

compare_sample(left.drop('gSpikein_phiX174'))

0 660,011,014.3739983
1 930,975,251.8072013
2 773,682,343.4713042
3 773,682,343.4713042
4 668,581,583.054301
5 549,170,055.5835979
6 580,970,197.4494964


In [65]:
left = pandas.read_csv(os.path.expanduser('~/proj/mouse-epigenome2/ENCFF745ZJF.tsv'), sep='\t', index_col=0)
right = pandas.read_csv(diane_fb_1, sep='\t', index_col=0)
sum_square_error(left['FPKM'].drop('gSpikein_phiX174'), right['FPKM'])

55999303.45219869

In [13]:
sum((left['FPKM']-left['FPKM'])**2)

0.0

# Load bulk metadata

In [17]:
metadata = pandas.read_csv('bulk_peng/RSEMcounttime.txt', sep='\t', header=None, usecols=[0,1], names=['header', 'name'])
metadata.set_index('header', inplace=True)
metadata.head()
stage = []
tissue = []
experiment = []
file = []
for header, row in metadata.iterrows():
    fields = row['name'].split('_')
    stage.append(' '.join(fields[0:2]))
    tissue.append(' '.join(fields[2:-4]))
    experiment.append(fields[-4])
    file.append(fields[-1][:-4])
metadata['stage'] = stage
metadata['tissue'] = tissue
metadata['experiment'] = experiment
metadata['file'] = file

# Spot check bulk matrix

In [41]:
def load_gene_table(filename):
    return pandas.read_csv(
        filename, 
        sep=' ', 
        header=None,
        dtype={0: str, 1: str},
        index_col=0)

In [42]:
for filename in ['bulk_peng/RSEMTPMtime.gene','bulk_peng/RSEMFPKMtime.gene','bulk_peng/RSEMcounttime.gene']:
    matrix = load_gene_table(filename)
    print(filename, matrix.shape)

bulk_peng/RSEMTPMtime.gene (69690, 157)
bulk_peng/RSEMFPKMtime.gene (69690, 157)
bulk_peng/RSEMcounttime.gene (69690, 157)


In [31]:
headers = list(pandas.read_csv('bulk_peng/header.gene', sep='\t', header=None).values[0])
bulk_headers = ['gene_name'] + headers
file_headers = ['gene_name'] + [metadata.loc[x, 'file'] for x in headers]

In [80]:
bulk_matrix = load_gene_table('bulk_peng/RSEMTPMtime.gene')

In [44]:
len(bulk_headers), len(file_headers), bulk_matrix.shape

(157, 157, (69690, 157))

In [78]:
bulk_files = [('bulk_peng/RSEMTPMtime.gene', 'TPM'),
              ('bulk_peng/RSEMFPKMtime.gene', 'FPKM'),
              ('bulk_peng/RSEMcounttime.gene', 'expected_count')]

In [79]:
for filename, column_name in bulk_files:
    bulk_matrix = load_gene_table(filename)
    bulk_matrix.columns = file_headers

    columns = [x for x in bulk_matrix.columns if x.startswith('EN')]
    numpy.random.shuffle(columns)
    for accession in columns[:10]:
        obj = server.get_json(accession)
        tsv = pandas.read_csv(
            server.prepare_url(
                obj['href']), sep='\t', index_col=0, header=0)
        error = sum_square_error(
            bulk_matrix[accession], 
            tsv[column_name].drop(('gSpikein_phiX174')))
        print(column_name, filename, accession, error)

TPM bulk_peng/RSEMTPMtime.gene ENCFF226ILJ 0.0
TPM bulk_peng/RSEMTPMtime.gene ENCFF298WHK 0.0
TPM bulk_peng/RSEMTPMtime.gene ENCFF319WZT 0.0
TPM bulk_peng/RSEMTPMtime.gene ENCFF485CJB 0.0
TPM bulk_peng/RSEMTPMtime.gene ENCFF049EIV 0.0
TPM bulk_peng/RSEMTPMtime.gene ENCFF861GUP 0.0
TPM bulk_peng/RSEMTPMtime.gene ENCFF210MWH 0.0
TPM bulk_peng/RSEMTPMtime.gene ENCFF336VTP 0.0
TPM bulk_peng/RSEMTPMtime.gene ENCFF413OJO 0.0
TPM bulk_peng/RSEMTPMtime.gene ENCFF824DCQ 0.0
FPKM bulk_peng/RSEMFPKMtime.gene ENCFF795XBQ 0.0
FPKM bulk_peng/RSEMFPKMtime.gene ENCFF052THP 0.0
FPKM bulk_peng/RSEMFPKMtime.gene ENCFF184ZAV 0.0
FPKM bulk_peng/RSEMFPKMtime.gene ENCFF971KZC 0.0
FPKM bulk_peng/RSEMFPKMtime.gene ENCFF672DDJ 0.0
FPKM bulk_peng/RSEMFPKMtime.gene ENCFF196WAD 0.0
FPKM bulk_peng/RSEMFPKMtime.gene ENCFF049EIV 0.0
FPKM bulk_peng/RSEMFPKMtime.gene ENCFF127FPD 0.0
FPKM bulk_peng/RSEMFPKMtime.gene ENCFF155GNG 0.0
FPKM bulk_peng/RSEMFPKMtime.gene ENCFF114YCL 0.0
expected_count bulk_peng/RSEMcounttime.g

In [113]:
def compare_two_replicates(matrix, accession_left, accession_right, column_name):
    obj = server.get_json(accession_left)
    tsv = pandas.read_csv(
        server.prepare_url(
            obj['href']), sep='\t', index_col=0, header=0)
    error = sum_square_error(
        bulk_matrix[accession_right], 
        tsv[column_name].drop(('gSpikein_phiX174')))
    print("{:4} {} {} {:16,.2f}".format(column_name, accession_left, accession_right, error))

In [95]:
column_name='TPM'
bulk_matrix = load_gene_table('bulk_peng/RSEMTPMtime.gene')
bulk_matrix.columns = file_headers

In [114]:
paired = zip([file_headers[i] for i in range(1, len(file_headers), 2)],
             [file_headers[i] for i in range(2, len(file_headers), 2)])

for i, (accession_left, accession_right) in enumerate(paired):
    if i > 5:
        break
    compare_two_replicates(bulk_matrix, accession_left, accession_right, column_name)

TPM  ENCFF465YOS ENCFF516EUX 2,938,193,339.96
TPM  ENCFF385MJV ENCFF923YGS   523,615,065.64
TPM  ENCFF672DDJ ENCFF929KWG    26,456,043.48
TPM  ENCFF262TXH ENCFF772UWT    13,582,761.33
TPM  ENCFF129XUH ENCFF146HIO    14,347,523.32
TPM  ENCFF132NQU ENCFF867TKM   308,410,077.18


In [117]:
for i in range(5):
    accession_left = random.choice(file_headers[1:])
    accession_right = random.choice(file_headers[1:])
    compare_two_replicates(bulk_matrix, accession_left, accession_right, column_name)

TPM  ENCFF972NMO ENCFF875HIA 6,672,213,305.68
TPM  ENCFF867TKM ENCFF967SJG   790,448,134.81
TPM  ENCFF413OJO ENCFF390KQM   626,745,221.91
TPM  ENCFF706XGJ ENCFF918QNL   930,030,469.16
TPM  ENCFF870YWY ENCFF684GSP   621,790,466.35


In [123]:
[bulk_headers[file_headers.index(x)] for x in ['ENCFF972NMO', 'ENCFF875HIA']]

['stomach_e15.5_2', 'liver_p0_2']

In [124]:
[bulk_headers[file_headers.index(x)] for x in ['ENCFF385MJV', 'ENCFF923YGS']]

['bladder_p0_1', 'bladder_p0_2']

In [125]:
[bulk_headers[file_headers.index(x)] for x in ['ENCFF972NMO', 'ENCFF875HIA']]

['stomach_e15.5_2', 'liver_p0_2']