In [1]:
%matplotlib inline

import functools
import os
from collections import defaultdict
from tqdm import tqdm, tqdm_notebook
tqdm.pandas("Progress: ")

import numpy as np
import matplotlib
import pandas as pd
import pybedtools
import pysam
import seaborn as sns
import matplotlib
from matplotlib import gridspec
import scipy
from gscripts import qtools
from Bio import SeqIO
from gscripts.general import dataviz
from IPython.core.display import HTML
import urllib
import datetime
from gscripts.encode import encode_helpers
from gscripts.rnaseq import helpers
from gscripts.general import region_helpers

# Directory that the figure cells further down join their output file names onto.
img_dir = "/home/gpratt/Dropbox/encode_integration/qc_work/"

sns.set_style("ticks")
# Tick formatter: divides the tick value by 1,000,000 and formats it with ',' as thousands separator.
xfmt = matplotlib.ticker.FuncFormatter(lambda x, p: format(float(x) / 1000000, ','))

# NOTE(review): RESET is not referenced in the visible part of the notebook — confirm it is still needed.
RESET = False



In [2]:
import matplotlib as mpl
# NOTE(review): the tuple built below captures the value `legend` has right now (None);
# rebinding `legend` later does not update OUTSIDE_LEGEND_SAVEFIG_KWS.
legend = None

# Keyword arguments bundle (judging by the name, meant for savefig calls on figures
# whose legend sits outside the axes).
OUTSIDE_LEGEND_SAVEFIG_KWS = dict(bbox_extra_artists=(legend,),
                                  bbox_inches='tight')
from matplotlib import rc

# Emit text in SVG files as text elements rather than converting it to paths.
mpl.rcParams['svg.fonttype'] = 'none'

# Plain (non-LaTeX) text rendering with a Helvetica sans-serif font.
rc('text', usetex=False) 
rc('font',**{'family':'sans-serif','sans-serif':['Helvetica']})

In [3]:
# Both lookups are built from the same GENCODE v19 annotation database.
gencode_db = "/projects/ps-yeolab/genomes/hg19/gencode_v19/gencode.v19.annotation.gtf.db"
gene_id_to_name = region_helpers.gene_id_to_name(gencode_db)
gene_id_to_type = region_helpers.gene_id_to_type(gencode_db)

# Reverse lookup (gene name -> gene id). When several ids share one name,
# the id iterated last is the one that is kept.
name_to_gene_id = dict((name, gene_id) for gene_id, name in gene_id_to_name.items())

In [4]:
def get_rpkm_uid(cell_type):
    """Return the uID of the rep1 total RNA-seq BAM used as RPKM reference.

    The returned accessions are the rep1 entries of the featureCounts
    total RNA-seq table loaded later in this notebook.

    Parameters
    ----------
    cell_type : str
        Cell line name; only "K562" and "HepG2" are known.

    Returns
    -------
    str
        The accession for ``cell_type``.

    Raises
    ------
    ValueError
        If ``cell_type`` is not a known cell line. (The earlier version
        referenced an undefined name and then returned an unbound variable.)
    """
    uid_by_cell_type = {
        "K562": 'ENCFF131IST',
        "HepG2": 'ENCFF770XVY',
    }
    if cell_type not in uid_by_cell_type:
        raise ValueError("unknown cell type: {!r}".format(cell_type))
    return uid_by_cell_type[cell_type]

def get_tpm_uid(cell_type):
    """Return the uID of the rep1 gene-level TPM quantification for a cell line.

    NOTE(review): 'ENCFF286GLL' (K562) is not among the files this notebook
    loads into ``gene_rsem`` — confirm the accession before using the "tpm"
    expression type.

    Parameters
    ----------
    cell_type : str
        Cell line name; only "K562" and "HepG2" are known.

    Returns
    -------
    str
        The accession for ``cell_type``.

    Raises
    ------
    ValueError
        If ``cell_type`` is not a known cell line. (The earlier version
        referenced an undefined name and then returned an unbound variable.)
    """
    uid_by_cell_type = {
        "K562": 'ENCFF286GLL',
        "HepG2": 'ENCFF533XPJ',
    }
    if cell_type not in uid_by_cell_type:
        raise ValueError("unknown cell type: {!r}".format(cell_type))
    return uid_by_cell_type[cell_type]

def get_tpm_transcript_uid(cell_type):
    """Return the uID of the rep1 transcript-level TPM quantification for a cell line.

    The returned accessions are the rep1 entries of the per-transcript
    quantification tables loaded later in this notebook.

    Parameters
    ----------
    cell_type : str
        Cell line name; only "K562" and "HepG2" are known.

    Returns
    -------
    str
        The accession for ``cell_type``.

    Raises
    ------
    ValueError
        If ``cell_type`` is not a known cell line. (The earlier version
        referenced an undefined name and then returned an unbound variable.)
    """
    uid_by_cell_type = {
        "K562": 'ENCFF424CXV',
        "HepG2": 'ENCFF205WUQ',
    }
    if cell_type not in uid_by_cell_type:
        raise ValueError("unknown cell type: {!r}".format(cell_type))
    return uid_by_cell_type[cell_type]

def dict_to_corr_dataframe(expression_corr):
    """Turn a ``{key: r}`` mapping into a table of r and r-squared.

    The result is indexed by the mapping's keys (sorted), with the columns
    ``'r-value'`` and ``'r-squared'`` (the square of ``'r-value'``).
    """
    r_values = pd.Series(expression_corr, name="r-value")
    table = r_values.sort_values(ascending=False).to_frame()
    table['r-squared'] = table['r-value'] ** 2
    return table.sort_index()

In [5]:
# Experiment table from the lab helper package; later cells read the base names
# of its CLIP and INPUT columns and use its index entries as column keys.
merged_data = encode_helpers.get_merged_data()

# For peak analysis we only want to analyze datasets that have been submitted
# (this filter is currently disabled).
# merged_data = merged_data[merged_data.submitted]
# merged_data = merged_data[['CLIP', 'INPUT', 'input_norm']]

In [6]:
# Index-sorted copy of the experiment table; the file-name -> index-key maps are built from it.
total_bedtools = merged_data.sort_index()

In [7]:
#in order
#Count reads in all exons, reads each exon, reads in entire gene

#in /projects/ps-yeolab3/encode/analysis/encode_master/

#featureCounts -a /projects/ps-yeolab/genomes/hg19/gencode_v19/gencode.v19.annotation.gtf -o ../gene_expression.txt *{merged,unassigned}*.r2.bam -s 1 -T 16
#featureCounts -a /projects/ps-yeolab/genomes/hg19/gencode_v19/gencode.v19.annotation.gtf -o ../gene_expression_exon_level.txt *{merged,unassigned}*.r2.bam -s 1 -f -T 16
#featureCounts -a /projects/ps-yeolab/genomes/hg19/gencode_v19/gencode.v19.annotation.gtf -o ../gene_expression_whole_gene_level.txt *{merged,unassigned}*.r2.bam -s 1 -f -t gene -T 16

#in /projects/ps-yeolab3/encode/analysis/rnaseq_bams_v2

#featureCounts -a /projects/ps-yeolab/genomes/hg19/gencode_v19/gencode.v19.annotation.gtf -o encode_total_rna.txt -p ENCFF131IST.bam ENCFF726SMY.bam ENCFF770XVY.bam ENCFF002PXG.bam -S rf -C -B --primary -p -s 1 -T 16
#featureCounts -a /projects/ps-yeolab/genomes/hg19/gencode_v19/gencode.v19.annotation.gtf -o encode_total_rna_exon_level.txt -p ENCFF131IST.bam ENCFF726SMY.bam ENCFF770XVY.bam ENCFF002PXG.bam  -S rf -C -B --primary -p -s 1 -f -T 16
#featureCounts -a /projects/ps-yeolab/genomes/hg19/gencode_v19/gencode.v19.annotation.gtf -o encode_total_rna_whole_gene_level.txt -p ENCFF131IST.bam ENCFF726SMY.bam ENCFF770XVY.bam ENCFF002PXG.bam  -S rf -C -B --primary -p -s 1 -t gene -f -T 16

# Get the data I'll use to ask the follow-up questions

In [8]:
# @{$rnaseq_datasets{"K562"}} = ("ENCFF424CXV","ENCFF073NHK");
# @{$rnaseq_datasets{"HepG2"}} = ("ENCFF205WUQ","ENCFF915JUZ");

# !wget -P /projects/ps-yeolab3/encode/analysis/rnaseq_bams_v2/ https://www.encodeproject.org/files/ENCFF424CXV/@@download/ENCFF424CXV.tsv
# !wget -P /projects/ps-yeolab3/encode/analysis/rnaseq_bams_v2/ https://www.encodeproject.org/files/ENCFF073NHK/@@download/ENCFF073NHK.tsv
# !wget -P /projects/ps-yeolab3/encode/analysis/rnaseq_bams_v2/ https://www.encodeproject.org/files/ENCFF205WUQ/@@download/ENCFF205WUQ.tsv
# !wget -P /projects/ps-yeolab3/encode/analysis/rnaseq_bams_v2/ https://www.encodeproject.org/files/ENCFF915JUZ/@@download/ENCFF915JUZ.tsv

In [9]:
# NOTE(review): `df` is not referenced again in the visible notebook
# (get_best_transcript uses its own local `df`) — confirm this cell is still needed.
df = pd.read_table("/projects/ps-yeolab3/encode/analysis/rnaseq_bams_v2/ENCFF424CXV.tsv")

In [10]:
def get_best_transcript(fn):
    """Collapse a per-transcript quantification table to one row per gene.

    The tab-separated file ``fn`` is read with its first column as index,
    ordered by descending ``TPM`` and grouped on ``gene_id``. For each gene,
    ``GroupBy.first`` keeps the first non-null entry of every column in that
    order, i.e. the values of the highest-TPM transcript when nothing is missing.

    Returns a DataFrame indexed by ``gene_id``.
    """
    quantification = pd.read_table(fn, index_col=0)
    highest_tpm_first = quantification.sort_values("TPM", ascending=False)
    return highest_tpm_first.groupby("gene_id").first()

In [11]:
# Per-gene "best transcript" tables for two replicates of each cell line,
# keyed by (uID, cell line, assay, replicate).
rnaseq_dir = "/projects/ps-yeolab3/encode/analysis/rnaseq_bams_v2/"
transcript_samples = [('ENCFF424CXV', "K562", 'rnaseq', "rep1"),
                      ('ENCFF073NHK', "K562", 'rnaseq', "rep2"),
                      ('ENCFF205WUQ', "HepG2", 'rnaseq', "rep1"),
                      ('ENCFF915JUZ', "HepG2", 'rnaseq', "rep2")]

# Every table is transposed before concatenation and the result is transposed
# back, so genes are rows and (sample key..., field) are the column levels.
transcript_rsem = pd.concat(
    dict((sample, get_best_transcript(rnaseq_dir + sample[0] + ".tsv").T)
         for sample in transcript_samples)
).T

# Keep only the TPM field (column level 4) of every sample.
transcript_tpm = transcript_rsem.xs("TPM", level=4, axis=1)


In [12]:
# Quantification tables keyed by (uID, cell line, assay, replicate). Each file is read
# with its first column as index, transposed, concatenated and transposed back, so the
# field names (e.g. TPM) form column level 4.
# NOTE(review): the K562 rep1 entry reads ENCFF424CXV.tsv — the same file that is loaded
# as the K562 rep1 *transcript* table in the previous cell — while get_tpm_uid("K562")
# returns 'ENCFF286GLL'. Confirm which accession is intended for gene-level K562 rep1.
gene_rsem = pd.concat({('ENCFF424CXV', "K562", 'rnaseq', "rep1"): pd.read_table("/projects/ps-yeolab3/encode/analysis/rnaseq_bams_v2/ENCFF424CXV.tsv", index_col=0).T,
('ENCFF986DBN', "K562", 'rnaseq', "rep2"): pd.read_table("/projects/ps-yeolab3/encode/analysis/rnaseq_bams_v2/ENCFF986DBN.tsv", index_col=0).T,
('ENCFF533XPJ', "HepG2", 'rnaseq', "rep1"): pd.read_table("/projects/ps-yeolab3/encode/analysis/rnaseq_bams_v2/ENCFF533XPJ.tsv", index_col=0).T,
('ENCFF321JIT', "HepG2", 'rnaseq', "rep2"): pd.read_table("/projects/ps-yeolab3/encode/analysis/rnaseq_bams_v2/ENCFF321JIT.tsv", index_col=0).T,}).T

# Keep only the TPM field (column level 4) of every sample.
gene_tpm = gene_rsem.xs("TPM", level=4, axis=1)


In [13]:
#Get total rna RPKMs
# BAM file name (column label in the featureCounts tables) -> column key.
new_keys = {'ENCFF131IST.bam': ['ENCFF131IST', 'K562', 'rnaseq', 'rep1'],
            'ENCFF726SMY.bam': ['ENCFF726SMY', 'K562', 'rnaseq', 'rep2'],
            'ENCFF770XVY.bam': ['ENCFF770XVY', 'HepG2', 'rnaseq', 'rep1'],
            'ENCFF002PXG.bam': ['ENCFF002PXG', 'HepG2', 'rnaseq', 'rep2'],
}

rnaseq_level_names = ['uID', 'Cell line', 'rbp', 'rep']


def _relabel_rnaseq_columns(frame):
    """Swap the BAM-name column labels of ``frame`` (in place) for their ``new_keys`` entries."""
    frame.columns = pd.MultiIndex.from_tuples([new_keys[bam] for bam in frame.columns],
                                              names=rnaseq_level_names)
    return frame


def _load_rnaseq_counts_and_rpkms(count_file):
    """Read one featureCounts table (first line skipped) and return ``(counts, rpkms)``."""
    raw_counts = pd.read_table(count_file, skiprows=1, index_col=0)
    # RPKMs are derived from the complete table before the counts are
    # restricted to the BAM columns.
    rpkms = _relabel_rnaseq_columns(helpers.counts_to_rpkm(raw_counts))
    counts = _relabel_rnaseq_columns(raw_counts[new_keys.keys()])
    return counts, rpkms


cell_expression_counts, cell_expression_rpkms = _load_rnaseq_counts_and_rpkms(
    "/projects/ps-yeolab3/encode/analysis/rnaseq_bams_v2/encode_total_rna.txt")

#All counts
whole_cell_expression_counts, whole_cell_expression_rpkms = _load_rnaseq_counts_and_rpkms(
    "/projects/ps-yeolab3/encode/analysis/rnaseq_bams_v2/encode_total_rna_whole_gene_level.txt")

In [14]:
# Base names of the CLIP and INPUT files, computed once and reused below.
clip_basenames = total_bedtools.CLIP.apply(os.path.basename)
input_basenames = total_bedtools.INPUT.apply(os.path.basename)

# File name -> index entry of total_bedtools. When a file name occurs in several
# rows, the last row wins. Series.items() is used instead of iteritems(), which
# was removed in pandas 2.0.
clip_keys = {value: key for key, value in clip_basenames.items()}
input_keys = {value: key for key, value in input_basenames.items()}


# Unique file names, sorted so the column order of the tables built from them
# is the same on every run.
clips = sorted(set(clip_basenames.values))
inputs = sorted(set(input_basenames.values))

def get_counts_and_rpkms(fn):
    """Load one featureCounts table and split it into CLIP/INPUT counts and RPKMs.

    ``fn`` is read as a tab-separated table (first line skipped, first column
    as index). Columns named in the module-level ``clips`` / ``inputs`` lists
    are selected and relabelled through ``clip_keys`` / ``input_keys``. RPKMs
    come from ``helpers.counts_to_rpkm`` applied to the first five columns of
    the table plus the selected sample columns.

    Returns ``(clip_counts, input_counts, clip_rpkms, input_rpkms)``.
    """
    level_names = ['uID', 'Cell line', 'RBP', 'rep']
    table = pd.read_table(fn, skiprows=1, index_col=0)
    # presumably the featureCounts annotation columns — TODO confirm
    leading_cols = list(table.columns[:5])

    def relabel(frame, key_lookup):
        # File-name labels -> index keys, assigned in place on ``frame``.
        frame.columns = pd.MultiIndex.from_tuples(
            [key_lookup[file_name] for file_name in frame.columns],
            names=level_names)
        return frame

    clip_counts = relabel(table[clips], clip_keys)
    input_counts = relabel(table[inputs], input_keys)
    clip_rpkms = relabel(helpers.counts_to_rpkm(table[leading_cols + clips]), clip_keys)
    input_rpkms = relabel(helpers.counts_to_rpkm(table[leading_cols + inputs]), input_keys)

    return clip_counts, input_counts, clip_rpkms, input_rpkms

# Counts/RPKMs from gene_expression.txt (stored under the "exon" label later on).
clip_exon_counts, input_exon_counts, clip_exon_rpkm, input_exon_rpkm = get_counts_and_rpkms("/projects/ps-yeolab3/encode/analysis/gene_expression.txt")
# Counts/RPKMs from the whole-gene-level table.
clip_whole_gene_counts, input_whole_gene_counts, clip_whole_gene_rpkm, input_whole_gene_rpkm = get_counts_and_rpkms("/projects/ps-yeolab3/encode/analysis/gene_expression_whole_gene_level.txt")

In [15]:
def get_exon_counts(count_file):
    """Sum the '5utr', '3utr' and 'CDS' entries per replicate of a reads-by-location table.

    ``count_file`` is read as a tab-separated table with its first column as
    index. Column labels are rewritten to ``(rep, element)`` pairs, the table
    is transposed and grouped by ``rep``, and within each group the rows
    labelled '5utr', '3utr' and 'CDS' are summed.

    NOTE(review): this function is only called from commented-out code in the
    visible notebook.
    """
    counts = pd.read_table(count_file, index_col=0)
    # NOTE(review): hard-coded label — presumably an empty trailing column of the file; confirm.
    counts = counts.drop('Unnamed: 43', axis=1)

    new_columns = []
    for col in counts.columns:
        # Labels are either "<path>|<element>" or just "<element>".
        col = col.split("|")
        if len(col) == 2:
            rep, element = col
            # Keep the file's base name up to its first dot as the replicate label.
            rep = os.path.basename(rep).split(".")[0]
        elif len(col) == 1:
            # No replicate part: `rep` keeps the value set by an earlier column
            # (and is undefined if this is the very first column).
            element = col[0]
            # Drop anything after the first dot (presumably pandas' ".1"-style
            # suffix for duplicated labels — TODO confirm).
            element = element.split(".")[0]
        # Labels with more than one "|" fall through both branches and reuse the previous values.
        new_columns.append((rep, element))
    counts.columns = pd.MultiIndex.from_tuples(new_columns)
    # NOTE(review): .ix was removed in pandas 1.0; this line needs .loc on newer versions.
    return counts.T.groupby(level=0).apply(lambda x: x.swaplevel(0,1).ix[['5utr', '3utr', 'CDS']].sum(axis=0))

In [16]:
#At the end of the day I use featureCounts
# loc_dir = "/home/elvannostrand/data/clip/CLIPseq_analysis/ENCODE_FINALforpapers_20170325"

# merged_data = merged_data.reset_index()
# merged_data['loc_counts'] = merged_data.apply(lambda x: os.path.join(loc_dir, "{}_{}_ReadsByLoc_combined.csv".format(x.uID, x.RBP)), axis=1)
# merged_data = merged_data.set_index(["uID", 'Cell line', 'RBP', 'rep'])

# eric_exon_counts = merged_data.xs("rep1", level="rep").loc_counts.progress_apply(get_exon_counts)
# eric_exon_counts = pd.concat(dict(eric_exon_counts.iteritems()))

# eric_exon_counts = eric_exon_counts.T

# new_cols = []
# for x, col in enumerate(eric_exon_counts.columns):
#     uid, cell_type, rbp, fn = col
#     if x % 3 == 0:
#         rep = "rep1"
#         ip_type = "ip"
#     elif x % 3 == 1:
#         rep = "rep2"
#         ip_type = "ip"
#     elif x % 3 == 2:
#         rep = "rep2"
#         ip_type = "input"
    
#     new_cols.append((ip_type, "eric", "exon", "count", uid, cell_type, rbp, rep))
# eric_exon_counts.columns = pd.MultiIndex.from_tuples(new_cols)


In [17]:
# Assemble every table into one frame. Each dict key
# (enrichment, count method, count location, value type) is prepended to the
# table's own four column levels, so every column of merged_rpkms is an 8-tuple.
merged_rpkms = pd.concat({("ip", "featurecounts", "exon", 'rpkm'): clip_exon_rpkm,
                          ("ip", "featurecounts", "exon", 'count'): clip_exon_counts,
                          ("input", "featurecounts", "exon", 'rpkm'): input_exon_rpkm,
                          ("input", "featurecounts", "exon", 'count'): input_exon_counts,
                          ("ip", "featurecounts", "whole_gene", 'rpkm'): clip_whole_gene_rpkm,
                          ("ip", "featurecounts", "whole_gene", 'count'): clip_whole_gene_counts,
                          ("input", "featurecounts", "whole_gene", 'rpkm'): input_whole_gene_rpkm,
                          ("input", "featurecounts", "whole_gene", 'count'): input_whole_gene_counts,
                          ('rnaseq', "featurecounts", 'exon', 'rpkm'): cell_expression_rpkms,
                          ('rnaseq', "featurecounts", 'exon', 'count'): cell_expression_counts,
                          ('rnaseq', "featurecounts", 'whole_gene', 'rpkm'): whole_cell_expression_rpkms,
                          ('rnaseq', "featurecounts", 'whole_gene', 'count'): whole_cell_expression_counts,
                          # NOTE(review): the two 'tpm_length_normalized' references below reuse the
                          # whole-gene RPKM table (for 'exon') and the whole-gene count table
                          # (for 'whole_gene') — confirm these are the intended references.
                          ('rnaseq', "featurecounts", 'exon', 'tpm_length_normalized'): whole_cell_expression_rpkms,
                          ('rnaseq', "featurecounts", 'whole_gene', 'tpm_length_normalized'): whole_cell_expression_counts,

                          # The same TPM table is registered under every (count location, value type)
                          # pair; the correlation functions build the reference column key from the
                          # IP/input column's own location and value type.
                          ('rnaseq', "tpm", 'exon', 'rpkm'): gene_tpm,
                          ('rnaseq', "tpm", 'exon', 'count'): gene_tpm,
                          ('rnaseq', "tpm", 'whole_gene', 'rpkm'): gene_tpm,
                          ('rnaseq', "tpm", 'whole_gene', 'count'): gene_tpm,
                          ('rnaseq', "tpm", 'exon', 'tpm_length_normalized'): gene_tpm,
                          ('rnaseq', "tpm", 'whole_gene', 'tpm_length_normalized'): gene_tpm,

                          ('rnaseq', "tpm_transcript", 'exon', 'rpkm'): transcript_tpm,
                          ('rnaseq', "tpm_transcript", 'exon', 'count'): transcript_tpm,
                          ('rnaseq', "tpm_transcript", 'whole_gene', 'rpkm'): transcript_tpm,
                          ('rnaseq', "tpm_transcript", 'whole_gene', 'count'): transcript_tpm,
                          ('rnaseq', "tpm_transcript", 'exon', 'tpm_length_normalized'): transcript_tpm,
                          ('rnaseq', "tpm_transcript", 'whole_gene', 'tpm_length_normalized'): transcript_tpm,

                          # Length fields come from the per-gene best-transcript tables, although
                          # they are filed under the "tpm" count method.
                          ('rnaseq', "tpm", 'exon', 'length'): transcript_rsem.xs("length", level=4, axis=1),
                          ('rnaseq', "tpm", 'exon', 'effective_length'): transcript_rsem.xs("effective_length", level=4, axis=1),
                          ('rnaseq', "tpm", 'whole_gene', 'length'): transcript_rsem.xs("length", level=4, axis=1),
                          ('rnaseq', "tpm", 'whole_gene', 'effective_length'): transcript_rsem.xs("effective_length", level=4, axis=1),

                         }, axis=1)

In [18]:
#merged_rpkms = pd.concat([merged_rpkms, eric_exon_counts], axis=1)

In [19]:
# Add a 'tpm_length_normalized' column for every IP/input count column:
# the counts divided by the rep1 transcript length of the same cell line.
for col in merged_rpkms.columns:
    enrichment, count_location, value_type, cell_line = col[0], col[2], col[3], col[5]
    if enrichment not in ("ip", 'input') or value_type != "count":
        continue
    uid = get_tpm_transcript_uid(cell_line)
    length_col = ('rnaseq', 'tpm', count_location, 'length', uid, cell_line, 'rnaseq', 'rep1')
    # Same key as the count column, with only the value type (position 3) replaced.
    new_col = col[:3] + ("tpm_length_normalized",) + col[4:]
    merged_rpkms[new_col] = merged_rpkms[col] / merged_rpkms[length_col]
        

# All Genes

In [20]:
# Keep only the genes present in the featureCounts RNA-seq table (the GENCODE genes),
# ignoring the other rows contributed by the remaining tables.
# .loc replaces .ix, which was removed in pandas 1.0; every label requested here is
# part of merged_rpkms' index because that table was one of the concatenated inputs.
merged_rpkms_all_genes = merged_rpkms.loc[cell_expression_counts.index].copy()

In [21]:
#Expression Correlation
def full_expression_corr(expression_type="featurecounts", do_log2=False):
    """Correlate every IP/input column with its rep1 RNA-seq reference over all genes.

    For each column under 'ip' and 'input' in ``merged_rpkms_all_genes`` the
    reference column is the RNA-seq column with the same count location and
    value type, the rep1 uID of the column's cell line and the requested
    ``expression_type``. Missing values are filled with 0 before correlating.

    Parameters
    ----------
    expression_type : str
        "featurecounts", "tpm" or "tpm_transcript".
    do_log2 : bool
        When True, both columns are log-transformed and rows that are not
        finite afterwards (i.e. zeros) are dropped. The transform is
        ``np.log10`` despite the parameter name; Pearson's r does not depend
        on the base of the logarithm.

    Returns
    -------
    pandas.DataFrame
        Output of ``dict_to_corr_dataframe`` for the per-column r values.

    Raises
    ------
    ValueError
        If ``expression_type`` is not one of the supported names (this used to
        surface as an unbound-variable error inside the loop).
    """
    if expression_type not in ("featurecounts", "tpm", "tpm_transcript"):
        raise ValueError("unknown expression type: {!r}".format(expression_type))
    # The uID lookup depends only on expression_type, so pick it once.
    get_uid = {"featurecounts": get_rpkm_uid,
               "tpm": get_tpm_uid,
               "tpm_transcript": get_tpm_transcript_uid}[expression_type]

    expression_corr_all_genes = {}
    for col in tqdm(merged_rpkms_all_genes[['ip', 'input']].columns):
        cell_line = col[5]
        expression_values = ('rnaseq', expression_type, col[2], col[3],
                             get_uid(cell_line), cell_line, 'rnaseq', 'rep1')
        tmp_rpkms = merged_rpkms_all_genes[[expression_values, col]].fillna(0)
        if do_log2:
            tmp_rpkms = np.log10(tmp_rpkms)
            # log10(0) is -inf; those rows are removed here.
            tmp_rpkms = tmp_rpkms.replace([np.inf, -np.inf], np.nan).dropna()
        expression_corr_all_genes[col] = scipy.stats.linregress(tmp_rpkms[col], tmp_rpkms[expression_values]).rvalue

    return dict_to_corr_dataframe(expression_corr_all_genes)

# Correlations on the untransformed values, featureCounts RNA-seq reference.
expression_corr_all_genes = full_expression_corr()
# expression_corr_all_genes_tpm = full_expression_corr("tpm")
# expression_corr_all_genes_tpm_transcript = full_expression_corr("tpm_transcript")

# Same comparison after log-transforming both columns.
expression_corr_all_genes_log2 = full_expression_corr(do_log2=True)
# expression_corr_all_genes_tpm_log2 = full_expression_corr("tpm", do_log2=True)
# expression_corr_all_genes_tpm_transcript_log2 = full_expression_corr("tpm_transcript", do_log2=True)

100%|██████████| 5940/5940 [00:45<00:00, 131.40it/s]
100%|██████████| 5940/5940 [01:50<00:00, 53.86it/s]


All the stuff below is important, but if you just log-transform the values before computing the correlations, they get so much better.  I guess that's what I'll do.

In [22]:
# Mean r and r-squared per (ip/input, count method, count location, value type),
# i.e. the first four levels of the column keys.
expression_corr_all_genes_grp = expression_corr_all_genes.groupby(level=[0,1,2,3])
expression_corr_all_genes_grp.mean()

Unnamed: 0,Unnamed: 1,Unnamed: 2,Unnamed: 3,r-value,r-squared
input,featurecounts,exon,count,0.154869,0.031582
input,featurecounts,exon,rpkm,0.206153,0.069546
input,featurecounts,exon,tpm_length_normalized,0.200157,0.063412
input,featurecounts,whole_gene,count,0.192449,0.045472
input,featurecounts,whole_gene,rpkm,0.209158,0.071725
input,featurecounts,whole_gene,tpm_length_normalized,0.201106,0.0605
ip,featurecounts,exon,count,0.146438,0.029676
ip,featurecounts,exon,rpkm,0.129062,0.034145
ip,featurecounts,exon,tpm_length_normalized,0.114096,0.02764
ip,featurecounts,whole_gene,count,0.185167,0.04258


In [23]:
# expression_corr_all_genes_tpm_grp = expression_corr_all_genes_tpm.groupby(level=[0,1,2,3])
# expression_corr_all_genes_tpm_grp.mean()

In [24]:
# expression_corr_all_genes_tpm_transcript_grp = expression_corr_all_genes_tpm_transcript.groupby(level=[0,1,2,3])
# expression_corr_all_genes_tpm_transcript_grp.mean()

In [25]:
#Protein Coding genes

In [26]:
# Boolean mask over the rows of merged_rpkms: True where the gene id is known
# to gene_id_to_type and annotated as protein coding.
protein_coding_genes = [
    gene_id in gene_id_to_type and gene_id_to_type[gene_id] == "protein_coding"
    for gene_id in merged_rpkms.index
]
merged_rpkms_pc = merged_rpkms.loc[protein_coding_genes]

In [27]:
#Expression Correlation
def pc_expression_corr(expression_type="featurecounts"):
    """Correlate every IP/input column with its rep1 RNA-seq reference over protein-coding genes.

    Works on ``merged_rpkms_pc``. For each column under 'ip' and 'input' the
    reference column is the RNA-seq column with the same count location and
    value type, the rep1 uID of the column's cell line and the requested
    ``expression_type``. Missing values are filled with 0 before correlating.

    Parameters
    ----------
    expression_type : str
        "featurecounts", "tpm" or "tpm_transcript".

    Returns
    -------
    pandas.DataFrame
        Output of ``dict_to_corr_dataframe`` for the per-column r values.

    Raises
    ------
    ValueError
        If ``expression_type`` is not one of the supported names (this used to
        surface as an unbound-variable error inside the loop).
    """
    if expression_type not in ("featurecounts", "tpm", "tpm_transcript"):
        raise ValueError("unknown expression type: {!r}".format(expression_type))
    # The uID lookup depends only on expression_type, so pick it once.
    get_uid = {"featurecounts": get_rpkm_uid,
               "tpm": get_tpm_uid,
               "tpm_transcript": get_tpm_transcript_uid}[expression_type]

    expression_corr_pc = {}
    for col in tqdm(merged_rpkms_pc[['ip', 'input']].columns):
        cell_line = col[5]
        expression_values = ('rnaseq', expression_type, col[2], col[3],
                             get_uid(cell_line), cell_line, 'rnaseq', 'rep1')

        tmp_rpkms = merged_rpkms_pc[[expression_values, col]].fillna(0)
        expression_corr_pc[col] = scipy.stats.linregress(tmp_rpkms[col], tmp_rpkms[expression_values]).rvalue

    return dict_to_corr_dataframe(expression_corr_pc)

# Protein-coding genes only, featureCounts RNA-seq reference.
expression_corr_pc = pc_expression_corr()
# expression_corr_pc_tpm = pc_expression_corr("tpm")
# expression_corr_pc_tpm_transcript = pc_expression_corr("tpm_transcript")

100%|██████████| 5940/5940 [00:28<00:00, 207.50it/s]


In [28]:
# Mean r and r-squared per (ip/input, count method, count location, value type),
# i.e. the first four levels of the column keys.
expression_corr_pc_grp = expression_corr_pc.groupby(level=[0,1,2, 3])
expression_corr_pc_grp.mean()

Unnamed: 0,Unnamed: 1,Unnamed: 2,Unnamed: 3,r-value,r-squared
input,featurecounts,exon,count,0.570855,0.338724
input,featurecounts,exon,rpkm,0.361632,0.140632
input,featurecounts,exon,tpm_length_normalized,0.282,0.087083
input,featurecounts,whole_gene,count,0.485216,0.23962
input,featurecounts,whole_gene,rpkm,0.300377,0.098252
input,featurecounts,whole_gene,tpm_length_normalized,0.282519,0.082631
ip,featurecounts,exon,count,0.495427,0.276623
ip,featurecounts,exon,rpkm,0.387067,0.177923
ip,featurecounts,exon,tpm_length_normalized,0.226986,0.061579
ip,featurecounts,whole_gene,count,0.433491,0.210035


In [29]:
# expression_corr_pc_tpm_grp = expression_corr_pc_tpm.groupby(level=[0,1,2, 3])
# expression_corr_pc_tpm_grp.mean()

In [30]:
# expression_corr_pc_tpm_transcript_grp = expression_corr_pc_tpm_transcript.groupby(level=[0,1,2, 3])
# expression_corr_pc_tpm_transcript_grp.mean()

# Protein Coding Genes with only genes that have counts in everything

In [31]:
#Expression Correlation
def expression_corr_specific(expression_type="featurecounts"):
    """Correlate IP/input columns with RNA-seq, using only genes covered in both.

    Works on ``merged_rpkms_pc``. For every column under 'ip' and 'input' the
    reference column is looked up as in the other correlation functions; rows
    where either value is zero or missing are discarded before correlating.

    Parameters
    ----------
    expression_type : str
        "featurecounts", "tpm" or "tpm_transcript". Any other value is reduced
        to the part before its first underscore, as before. Previously
        "tpm_transcript" was also reduced to "tpm", which made its branch
        unreachable.

    Returns
    -------
    pandas.DataFrame
        Output of ``dict_to_corr_dataframe`` for the per-column r values.

    Raises
    ------
    ValueError
        If the (possibly reduced) ``expression_type`` is not supported.
    """
    known_types = ("featurecounts", "tpm", "tpm_transcript")
    if expression_type not in known_types:
        expression_type = expression_type.split("_")[0]
    if expression_type not in known_types:
        raise ValueError("unknown expression type: {!r}".format(expression_type))
    # The uID lookup depends only on expression_type, so pick it once.
    get_uid = {"featurecounts": get_rpkm_uid,
               "tpm": get_tpm_uid,
               "tpm_transcript": get_tpm_transcript_uid}[expression_type]

    expression_corr = {}
    for col in tqdm(merged_rpkms_pc[['ip', 'input']].columns):
        cell_line = col[5]
        expression_values = ('rnaseq', expression_type, col[2], col[3],
                             get_uid(cell_line), cell_line, 'rnaseq', 'rep1')

        tmp_rpkms = merged_rpkms_pc[[expression_values, col]]
        # Vectorised "both values non-zero" test; same result as the
        # element-wise applymap it replaces, without the per-cell Python call.
        tmp_rpkms = tmp_rpkms[(tmp_rpkms != 0).all(axis=1)].dropna()
        expression_corr[col] = scipy.stats.linregress(tmp_rpkms[col], tmp_rpkms[expression_values]).rvalue

    return dict_to_corr_dataframe(expression_corr)

# Protein-coding genes with non-zero values in both columns, featureCounts RNA-seq reference.
expression_corr = expression_corr_specific()
# expression_corr_tpm = expression_corr_specific("tpm")
# expression_corr_tpm_transcript = expression_corr_specific("tpm_transcript")

100%|██████████| 5940/5940 [07:34<00:00, 12.56it/s]


In [32]:
# Mean r and r-squared per (ip/input, count method, count location, value type),
# i.e. the first four levels of the column keys.
expression_corr_grp = expression_corr.groupby(level=[0,1,2,3])
expression_corr_grp.mean()

Unnamed: 0,Unnamed: 1,Unnamed: 2,Unnamed: 3,r-value,r-squared
input,featurecounts,exon,count,0.551738,0.318384
input,featurecounts,exon,rpkm,0.354125,0.135434
input,featurecounts,exon,tpm_length_normalized,0.280427,0.086455
input,featurecounts,whole_gene,count,0.462725,0.218816
input,featurecounts,whole_gene,rpkm,0.30418,0.100779
input,featurecounts,whole_gene,tpm_length_normalized,0.261877,0.07106
ip,featurecounts,exon,count,0.4737,0.256503
ip,featurecounts,exon,rpkm,0.380474,0.172969
ip,featurecounts,exon,tpm_length_normalized,0.225667,0.061259
ip,featurecounts,whole_gene,count,0.40743,0.18914


In [33]:
# expression_corr_tpm_gpr = expression_corr_tpm.groupby(level=[0,1,2,3])
# expression_corr_tpm_gpr.mean()

In [34]:
# expression_corr_tpm_transcript_grp = expression_corr_tpm_transcript.groupby(level=[0,1,2,3])
# expression_corr_tpm_transcript_grp.mean()

I can push correlations higher or lower depending on how I parse the data.  Higher correlations if I throw out things that aren't covered in both comparisons, lower correlations if I keep them.

Eric's counts, on average, look like mine now, which is good.  Can't get quite ... (note left unfinished)

In [35]:
# num_rows = 1
# num_cols = 1
# with dataviz.Figure(os.path.join(img_dir, "tpm_input_expression_corr.svg"), figsize=(4* num_rows, 4*num_cols)) as fig:
#     ax = fig.add_subplot(num_cols,num_rows,1)
#     sns.violinplot(expression_corr.loc[('input', 'whole_gene')]['r-squared'], ax=ax)
#     ax.set_xlim(0,1)
#     ax.set_xlabel("$R^2$")
#     ax.set_title("Gene Expression (TPM)\nand reads in input correlation")
#     ax.set_ylabel("All Experiments")
#     sns.despine(ax=ax)
    
#     sns.violinplot(expression_corr.loc[('ip', 'exon', 'count')]['r-squared'], ax=ax)

#     sns.violinplot(rpkm_input_expression_corr.loc[('ip', 'gene', 'rpkm')]['r-squared'], ax=ax)


In [36]:
# rpkm_input_expression_corr.loc[('input', 'whole_gene')].mean()

Note the x-axis: expression counts are uncorrelated with input counts. TPM is somehow better correlated, but I don't know why.

In [37]:
# num_rows = 1
# num_cols = 1
# with dataviz.Figure(os.path.join(img_dir, "tpm_input_expression_corr.svg"), figsize=(4* num_rows, 4*num_cols)) as fig:
#     ax = fig.add_subplot(num_cols,num_rows,1)
#     sns.violinplot(tpm_input_expression_corr.loc[('input', 'gene')]['r-squared'], ax=ax)
#     ax.set_xlim(0,1)
#     ax.set_xlabel("$R^2$")
#     ax.set_title("Gene Expression (TPM)\nand reads in input correlation")
#     ax.set_ylabel("All Experiments")
#     sns.despine(ax=ax)

In [38]:

# num_rows = 1
# num_cols = 1
# with dataviz.Figure(os.path.join(img_dir, "rpkm_input_expression_corr.svg"), figsize=(4* num_rows, 4*num_cols)) as fig:
#     ax = fig.add_subplot(num_cols,num_rows,1)
#     sns.violinplot(rpkm_input_expression_corr.loc[('input', 'gene')]['r-squared'], ax=ax)
#     ax.set_xlim(0,1)
#     sns.despine(ax=ax)

# Correlations between IP and Input 

In [39]:
# Correlate every IP column with the rep2 input column that shares the rest of its key.
ip_values = merged_rpkms['ip']

ip_input_correlations = {}
for col in tqdm(ip_values.columns):
    ip_col = ('ip',) + tuple(col)
    # Same key as the IP column, with the replicate (last element) set to "rep2".
    input_col = ('input',) + tuple(col[:-1]) + ("rep2",)
    tmp_rpkms = merged_rpkms[[ip_col, input_col]]
    # Drop rows with a zero in either column BEFORE taking the log. Filtering
    # after the transform compared log10 values with 0 and therefore threw
    # away rows whose value was exactly 1 instead of rows that were 0.
    tmp_rpkms = tmp_rpkms[(tmp_rpkms != 0).all(axis=1)]
    tmp_rpkms = np.log10(tmp_rpkms)
    # Remove anything that is still not finite (missing values).
    tmp_rpkms = tmp_rpkms.replace([np.inf, -np.inf], np.nan).dropna()
    ip_input_correlations[col] = scipy.stats.linregress(tmp_rpkms[ip_col], tmp_rpkms[input_col]).rvalue

# One row per IP column key, with r ('rvalue') and its square ('r-squared').
ip_input_correlations = pd.Series(ip_input_correlations, name="rvalue")
ip_input_correlations = pd.DataFrame(ip_input_correlations)
ip_input_correlations['r-squared'] = ip_input_correlations['rvalue']**2

 91%|█████████ | 3585/3960 [13:20<01:28,  4.22it/s]

ValueError: I/O operation on closed file

In [None]:
# Mean r and r-squared per (count method, count location, value type): index levels 0-2.
ip_input_correlations.groupby(level=[0,1,2]).mean()

Eric's counts in this case are slightly less well correlated.  If it's all the same, I'll just use my results to improve the baseline.

In [None]:
# Distribution of the IP-vs-input r-squared values over all columns, as a violin plot.
num_rows = num_cols = 1
violin_path = os.path.join(img_dir, "ip_input_rpkm_correlation.svg")
with dataviz.Figure(violin_path, figsize=(4 * num_rows, 4 * num_cols)) as fig:
    ax = fig.add_subplot(num_cols, num_rows, 1)
    sns.violinplot(ip_input_correlations['r-squared'], ax=ax)
    ax.set_xlim(0, 1)
    sns.despine(ax=ax)

In [None]:
# Overall mean of r and r-squared across all IP/input column pairs.
ip_input_correlations.mean()

IP and input are better correlated; in particular, abundant RBPs are possibly well correlated with input, whereas more lowly expressed RBPs are not. Let me check this real quick.

In [None]:
def get_expression(row):
    """Return the RNA-seq TPM of the gene named in ``row`` for the row's cell type.

    Parameters
    ----------
    row : pandas.Series
        One row of ``ip_input_correlations``.  ``row.name`` is the index tuple,
        whose element 4 is read as the cell type; ``row.gene_id`` is the gene
        looked up in the expression table.

    Returns
    -------
    The ``TPM`` entry for ``row.gene_id`` in the rep1 RNA-seq table
    (``gene_rsem``) of that cell type.

    Raises
    ------
    ValueError
        If the cell type is neither "K562" nor "HepG2".  (Previously this
        surfaced as an UnboundLocalError on ``expression``.)
    """
    # gene_rsem column key of the rep1 RNA-seq experiment for each supported cell line.
    rnaseq_columns = {
        "K562": ('ENCFF424CXV', 'K562', 'rnaseq', 'rep1'),
        "HepG2": ('ENCFF533XPJ', 'HepG2', 'rnaseq', 'rep1'),
    }

    cell_type = row.name[4]
    if cell_type not in rnaseq_columns:
        raise ValueError("no RNA-seq expression table for cell type %r" % (cell_type,))

    expression = gene_rsem[rnaseq_columns[cell_type]]
    # Label-based lookup; `.loc` replaces the deprecated/removed `.ix` indexer.
    return expression.loc[row.gene_id].TPM

# Index level 5 holds the RBP name; translate it to a gene id, then look up
# that gene's expression row by row with get_expression (progress bar via tqdm).
rbp_names = ip_input_correlations.index.get_level_values(level=5)
ip_input_correlations['gene_id'] = [name_to_gene_id[rbp_name] for rbp_name in rbp_names]
ip_input_correlations['rbp_expression'] = ip_input_correlations.progress_apply(
    get_expression,
    axis=1,
)

In [None]:
# Show the five rows with the highest r-squared.
ip_input_correlations.sort_values("r-squared", ascending=False).head()

In [None]:
# Scatter of IP-vs-input r-squared against the expression of the RBP itself.
num_rows = 1
num_cols = 1
# Use a dedicated file name: this cell previously wrote to
# "ip_input_rpkm_correlation.svg", silently overwriting the violin plot
# saved under that same name by an earlier cell.
scatter_path = os.path.join(img_dir, "ip_input_rpkm_correlation_vs_rbp_expression.svg")
with dataviz.Figure(scatter_path, figsize=(4* num_rows, 4*num_cols)) as fig:
    ax = fig.add_subplot(num_cols,num_rows,1)
    ax.scatter(ip_input_correlations['r-squared'], ip_input_correlations['rbp_expression'])
    ax.set_xlim(0,1)
    ax.set_xlabel("R-Squared")
    ax.set_ylabel("RBP Expression")
    sns.despine(ax=ax)
    

Well, it isn't expression that's causing the high input correlations.

# IP Correlated with both measures of expression

In [None]:
# k562_clip_rpkms = clip_rpkms.xs("K562", level="Cell line", axis=1)
# hepg2_clip_rpkms = clip_rpkms.xs("HepG2", level="Cell line", axis=1)

# k562_expression = pd.merge(k562_rep1, k562_clip_rpkms, left_index=True, right_index=True, how="inner")
# hepg2_expression = pd.merge(hepg2_rep1, hepg2_clip_rpkms, left_index=True, right_index=True, how="inner")

In [None]:
# expression_corr = {}
# for col in k562_clip_rpkms.columns:
#      expression_corr[col] = scipy.stats.linregress(k562_expression.TPM, k562_expression[col]).rvalue
# for col in hepg2_clip_rpkms.columns:
#      expression_corr[col] = scipy.stats.linregress(hepg2_expression.TPM, hepg2_expression[col]).rvalue
        
# tmp_ip_expression_corr = pd.Series(expression_corr, name="r-value")
# tmp_ip_expression_corr = pd.DataFrame(tmp_ip_expression_corr.sort_values(ascending=False))
# tmp_ip_expression_corr['r-squared'] = tmp_ip_expression_corr['r-value'] ** 2

In [None]:
# num_rows = 1
# num_cols = 1
# with dataviz.Figure(os.path.join(img_dir, "tpm_ip_expression_corr.svg"), figsize=(4* num_rows, 4*num_cols)) as fig:
#     ax = fig.add_subplot(num_cols,num_rows,1)
#     sns.violinplot(tmp_ip_expression_corr['r-squared'], ax=ax)
#     ax.set_xlim(0,1)
#     sns.despine(ax=ax)
    

In [None]:
# tmp_ip_expression_corr.mean()

In [None]:
# #Expression Correlation
# expression_corr = {}
# for col in k562_clip_rpkms.columns:
#      expression_corr[col] = scipy.stats.linregress(cell_expression_rpkms[('K562', "rep1")], k562_expression[col]).rvalue
# for col in hepg2_clip_rpkms.columns:
#      expression_corr[col] = scipy.stats.linregress(cell_expression_rpkms[('HepG2', "rep1")], hepg2_expression[col]).rvalue
        
# rpkm_ip_expression_corr = pd.Series(expression_corr, name="r-value")
# rpkm_ip_expression_corr = pd.DataFrame(rpkm_ip_expression_corr.sort_values(ascending=False))
# rpkm_ip_expression_corr['r-squared'] = rpkm_ip_expression_corr['r-value'] ** 2

In [None]:
# num_rows = 1
# num_cols = 1
# with dataviz.Figure(os.path.join(img_dir, "tpm_ip_expression_corr.svg"), figsize=(4* num_rows, 4*num_cols)) as fig:
#     ax = fig.add_subplot(num_cols,num_rows,1)
#     sns.violinplot(rpkm_ip_expression_corr['r-squared'], ax=ax)
#     ax.set_xlim(0,1)
#     sns.despine(ax=ax)
    

In [None]:
# rpkm_ip_expression_corr.mean()

In [None]:
# no_counts = (clip_rpkms == 0).sum()
# no_counts.sort_values(ascending=False)

This is important because it explains LARP7, which should just bind one thing, but correlates well.  It correlates well because it binds so few sites in the genome, leading to an apparently better correlation, I guess.  

I want to show that gene expression is uncorrelated with reads in peaks.  This argument will allow me to claim that the peaks picked up are the strongest peaks, regardless of the expression level of the gene they fall in.  Alternatively, if you sequence deeply.

Step 1: Show size matched controls correlate well to TPM
Step 2: Show IPs correlate less well to TPM; this indicates that the data we pick up is enriching more for binding strength than for gene expression
Step 3: Show peaks are basically uncorrelated to gene expression.  

Next we were interested to see the quality of peaks that were recovered, we found that peaks recovered as we added back in reads had a small reduction in the number of motifs or regions discovered, but still had fairly high quality binding sites.  This leads us to believe that deeper sequencing will recover confident but less strongly bound sites

Maybe look into this conservation thing: less strongly bound peaks are possibly less conserved?  

# Final Figure?

In [None]:
# Constant label columns for plotting: 'exp' is used as the x category of the
# violin plots in the next cell; 'similar' is a single placeholder category.
ip_input_correlations['exp'] = 'IP Input Correlation'
ip_input_correlations['similar'] = "foo"

In [None]:
num_rows = 1
num_cols = 1

# Keep only the rows whose first two index levels are ('featurecounts', 'whole_gene').
# The previous `.ix[['featurecounts', 'whole_gene']]` passed a *list*, which
# selects by level-0 labels only ('whole_gene' is a level-1 label), and `.ix`
# itself is removed in current pandas.
tmp_correlation = ip_input_correlations.xs(('featurecounts', 'whole_gene'), level=[0, 1])


def _plot_ip_input_r2(filename, panel_size, fontsize):
    """Draw the IP-vs-input r-squared violin and save it as ``filename`` in img_dir.

    filename   -- SVG file name, joined onto img_dir
    panel_size -- inches per panel (multiplied by num_rows / num_cols)
    fontsize   -- font size used for the labels, title and tick labels
    """
    figsize = (panel_size * num_rows, panel_size * num_cols)
    with dataviz.Figure(os.path.join(img_dir, filename), figsize=figsize) as fig:
        ax = fig.add_subplot(num_cols, num_rows, 1)
        sns.violinplot(y='r-squared', x='exp', data=tmp_correlation,
                       linewidth=.5, ax=ax)
        ax.set_ylim(0, 1)
        # Despine this axis explicitly, as the other plotting cells do.
        sns.despine(ax=ax)
        ax.set_ylabel("R2", fontsize=fontsize)
        ax.set_xlabel("All Experiments", fontsize=fontsize)
        ax.set_xticklabels([])
        ax.set_title("R2 of Input to IP Experiments", fontsize=fontsize)
        for tick in list(ax.get_xticklabels()) + list(ax.get_yticklabels()):
            tick.set_fontsize(fontsize)


# Large version for presentations, small version for the paper figure.
_plot_ip_input_r2("tpm_ip_expression_corr_presentation.svg", panel_size=4, fontsize=20)
_plot_ip_input_r2("tpm_ip_expression_corr.svg", panel_size=2.5, fontsize=8)

In [None]:
# Flatten the index, copy the first index level ('level_0') into an 'exp'
# column for use as a plotting hue, then restore the first eight columns as the index.
flat_expression_corr = expression_corr_all_genes_log2.reset_index()
flat_expression_corr['exp'] = flat_expression_corr['level_0']
index_columns = list(flat_expression_corr.columns[:8])
expression_corr_all_genes_log2 = flat_expression_corr.set_index(index_columns)
# Single placeholder category so that a split violin can be drawn on one x position.
expression_corr_all_genes_log2['similar'] = "foo"

In [None]:
num_rows = 1
num_cols = 1

# Restrict to whole-gene counts: take "whole_gene" at level 2, then "count"
# at level 2 of the remaining index.
tmp_expression_corr = expression_corr_all_genes_log2.xs("whole_gene", level=2).xs("count", level=2)


def _plot_expression_r2_split_violin(filename, panel_size, fontsize):
    """Draw the split violin of r-squared by 'exp' and save it as ``filename`` in img_dir.

    filename   -- SVG file name, joined onto img_dir
    panel_size -- inches per panel (multiplied by num_rows / num_cols)
    fontsize   -- font size used for the labels, title, tick labels and legend
    """
    figsize = (panel_size * num_rows, panel_size * num_cols)
    with dataviz.Figure(os.path.join(img_dir, filename), figsize=figsize) as fig:
        ax = fig.add_subplot(num_cols, num_rows, 1)
        # One x position ('similar' is constant); the two halves are the 'exp' groups.
        sns.violinplot(y='r-squared', x='similar', hue="exp",
                       split=True, data=tmp_expression_corr, inner="quart",
                       linewidth=.5, ax=ax)
        ax.set_ylim(0, 1)
        sns.despine(ax=ax)
        ax.set_ylabel("R2", fontsize=fontsize)
        ax.set_xlabel("All Experiments", fontsize=fontsize)
        ax.set_xticklabels([])
        ax.set_title("R2 of Input or IP Experiments\nand gene expression", fontsize=fontsize)
        for tick in list(ax.get_xticklabels()) + list(ax.get_yticklabels()):
            tick.set_fontsize(fontsize)
        # loc=0 is matplotlib's "best" placement.
        ax.legend(loc=0, fontsize=fontsize)


# Large version for presentations, small version for the paper figure.
_plot_expression_r2_split_violin("tpm_ip_and_input_expression_corr_presentation.svg",
                                 panel_size=4, fontsize=20)
_plot_expression_r2_split_violin("tpm_ip_and_input_expression_corr.svg",
                                 panel_size=2.5, fontsize=8)

# Why can't I reproduce Eric's results?

In [None]:
# All columns of experiment '417' (column level 4), aligned to the rows of
# cell_expression_counts; rows absent from merged_rpkms are filled with 0.
test = merged_rpkms.xs('417', level=4, axis=1)
# `.reindex` states the align-and-fill intent directly and replaces the
# deprecated/removed `.ix` indexer, whose missing-label behaviour this relied on.
test = test.reindex(cell_expression_counts.index).fillna(0)

In [None]:
# featurecounts = ('ip', 'featurecounts', 'exon', 'count', '417' 'K562', 'POLR2G', 'rep1')
# eric = ('ip', 'eric', 'exon', 'count', '417', 'K562', 'POLR2G', 'rep1')
# exon_counts = ('417', 'K562', 'POLR2G', 'rep1')

# foo = clip_exon_counts[exon_counts]
# bar = eric_exon_counts[eric]

# featurecounts = ('ip', 'featurecounts', 'exon', 'count', 'K562', 'POLR2G', 'rep1')
# eric = ('ip', 'eric', 'exon', 'count', 'K562', 'POLR2G', 'rep1')
# corr = test[[featurecounts, eric]]

# print scipy.stats.linregress(corr[corr.columns[0]], corr[corr.columns[1]]).rvalue
# corr_all = corr[corr.applymap(lambda x: x != 0).all(axis=1)]
# corr_all = corr_all.dropna()

# print scipy.stats.linregress(corr_all[corr_all.columns[0]], corr_all[corr_all.columns[1]]).rvalue

# corr_any = corr[corr.applymap(lambda x: x != 0).any(axis=1)]
# corr_any = corr_any.dropna()

# print scipy.stats.linregress(corr_any[corr_any.columns[0]], corr_any[corr_any.columns[1]]).rvalue


In [None]:
# corr_all.columns[0]

In [None]:
# num_rows = 1
# num_cols = 1
# with dataviz.Figure(os.path.join(img_dir, "foo.svg"), figsize=(4* num_rows, 4*num_cols)) as fig:
#     ax = fig.add_subplot(num_cols,num_rows,1)
#     ax.scatter(np.log10(corr_all[corr_all.columns[0]]), np.log10(corr_all[corr_all.columns[1]]))
#     ax.set_xlabel("FeatureCounts")
#     ax.set_xlabel("EricsCounts")

#     sns.despine(ax=ax)
    
# print scipy.stats.linregress(corr_all[corr_all.columns[0]], corr_all[corr_all.columns[1]]).rvalue

The really important thing here is to remove any sites that have at least one 0 coverage.  Really drives up correlation

OK, so I can count correctly; now let's see what the correlations look like.

In [None]:
# merged_expression = pd.merge(k562_rep1, counts, left_index=True, right_index=True, how="inner")
# merged_expression['rep1_normalized'] = merged_expression.rep1 / merged_expression.effective_length
# merged_expression['mine_normalized_my_length'] = merged_expression['/projects/ps-yeolab3/encode/analysis/encode_master/417_01_POLR2G.merged.r2.bam'] / merged_expression.Length
# merged_expression['mine_normalized_tpm_length'] = merged_expression['/projects/ps-yeolab3/encode/analysis/encode_master/417_01_POLR2G.merged.r2.bam'] / merged_expression.effective_length
# merged_expression = merged_expression[~np.isinf(merged_expression['mine_normalized_tpm_length'])]

# merged_expression = merged_expression.fillna(0)

In [None]:
# counts_test = ('input', 'featurecounts', 'exon', 'count', '417', 'K562', 'POLR2G', 'rep2')
# counts_test_e = ('input', 'eric', 'exon', 'count', '417', 'K562', 'POLR2G', 'rep2')


In [None]:
# length_normalized = ('input', 'featurecounts', 'exon', 'length_normalized', '417', 'K562', 'POLR2G', 'rep2')
# merged_rpkms[length_normalized] = merged_rpkms[counts_test] / merged_rpkms[length]

# effective_length_normalized = ('input', 'featurecounts', 'exon', 'effective_length_normalized', '417', 'K562', 'POLR2G', 'rep2')
# merged_rpkms[effective_length_normalized] = merged_rpkms[counts_test] / merged_rpkms[effective_length]

# length_normalized_e = ('input', 'eric', 'exon', 'length_normalized', '417', 'K562', 'POLR2G', 'rep2')
# merged_rpkms[length_normalized_e] = merged_rpkms[counts_test_e] / merged_rpkms[length]

# effective_length_normalized_e = ('input', 'eric', 'exon', 'effective_length_normalized', '417', 'K562', 'POLR2G', 'rep2')
# merged_rpkms[effective_length_normalized_e] = merged_rpkms[counts_test_e] / merged_rpkms[effective_length]

In [None]:
# rnaseq_tpm = ('rnaseq', 'tpm', 'exon', 'tpm_length_normalized', 'ENCFF986DBN', 'K562', 'rnaseq', 'rep2')
# rnaseq_tpm_transcript = ('rnaseq', 'tpm_transcript', 'exon', 'tpm_length_normalized', 'ENCFF424CXV', 'K562', 'rnaseq', 'rep1')

# length = ('rnaseq', 'tpm', 'exon', 'length', 'ENCFF424CXV', 'K562', 'rnaseq', 'rep1')
# effective_length = ('rnaseq', 'tpm', 'exon', 'effective_length', 'ENCFF424CXV', 'K562', 'rnaseq', 'rep1')

# #Basic
# tpm_expression = merged_rpkms[[counts_test, rnaseq_tpm]].dropna()
# tpm_expression = tpm_expression[tpm_expression.applymap(lambda x: x != 0).all(axis=1)]
# print "basic", scipy.stats.linregress(tpm_expression[counts_test], tpm_expression[rnaseq_tpm]).rvalue

# #Length normalized
# tpm_expression = merged_rpkms[[length_normalized, rnaseq_tpm]].dropna()
# tpm_expression = tpm_expression[tpm_expression.applymap(lambda x: x != 0).all(axis=1)]
# print "length normalized", scipy.stats.linregress(tpm_expression[length_normalized], tpm_expression[rnaseq_tpm]).rvalue

# #effective length normalized
# tpm_expression = merged_rpkms[[effective_length_normalized, rnaseq_tpm]].dropna()
# tpm_expression = tpm_expression[tpm_expression.applymap(lambda x: x != 0).all(axis=1)]
# tpm_expression = tpm_expression[tpm_expression.applymap(np.isfinite).all(axis=1)]
# print "effective length", scipy.stats.linregress(tpm_expression[effective_length_normalized], tpm_expression[rnaseq_tpm]).rvalue

# #effective length normalized taking only protein coding genes
# tpm_expression = merged_rpkms[[length_normalized, rnaseq_tpm]].dropna()
# tpm_expression = tpm_expression[tpm_expression.applymap(lambda x: x != 0).all(axis=1)]

# protein_coding_genes = [(col in gene_id_to_type) and (gene_id_to_type[col] == "protein_coding") for col in tpm_expression.index]
# tpm_expression = tpm_expression[protein_coding_genes]
# print "protein coding", scipy.stats.linregress(tpm_expression[length_normalized], tpm_expression[rnaseq_tpm]).rvalue

# #length normalized with protein coding genes and highest expressed transcript
# tpm_expression = merged_rpkms[[length_normalized, rnaseq_tpm_transcript]].dropna()
# tpm_expression = tpm_expression[tpm_expression.applymap(lambda x: x != 0).all(axis=1)]

# protein_coding_genes = [(col in gene_id_to_type) and (gene_id_to_type[col] == "protein_coding") for col in tpm_expression.index]
# tpm_expression = tpm_expression[protein_coding_genes]
# print "transcript based", scipy.stats.linregress(tpm_expression[length_normalized], tpm_expression[rnaseq_tpm_transcript]).rvalue


In [None]:
# #Length normalized
# tpm_expression = merged_rpkms[[length_normalized_e, rnaseq_tpm]].dropna()
# tpm_expression = tpm_expression[tpm_expression.applymap(lambda x: x != 0).all(axis=1)]
# print "length normalized", scipy.stats.linregress(tpm_expression[length_normalized_e], tpm_expression[rnaseq_tpm]).rvalue


In [None]:
# tpm_expression = merged_rpkms[[length_normalized_e, rnaseq_tpm_transcript]].dropna()
# tpm_expression = tpm_expression[tpm_expression.applymap(lambda x: x != 0).all(axis=1)]

# tpm_expression.to_csv("/home/gpratt/correlations.csv")

In [None]:
# print "transcript based", scipy.stats.linregress(np.log10(tpm_expression[length_normalized_e]), np.log10(tpm_expression[rnaseq_tpm_transcript])).rvalue


In [None]:
# #Basic
# tpm_expression = merged_rpkms[[counts_test_e, rnaseq_tpm]].dropna()
# tpm_expression = tpm_expression[tpm_expression.applymap(lambda x: x != 0).all(axis=1)]
# print "basic", scipy.stats.linregress(tpm_expression[counts_test_e], tpm_expression[rnaseq_tpm]).rvalue

# #Length normalized
# tpm_expression = merged_rpkms[[length_normalized_e, rnaseq_tpm]].dropna()
# tpm_expression = tpm_expression[tpm_expression.applymap(lambda x: x != 0).all(axis=1)]
# print "length normalized", scipy.stats.linregress(tpm_expression[length_normalized_e], tpm_expression[rnaseq_tpm]).rvalue

# #effective length normalized
# tpm_expression = merged_rpkms[[effective_length_normalized_e, rnaseq_tpm]].dropna()
# tpm_expression = tpm_expression[tpm_expression.applymap(lambda x: x != 0).all(axis=1)]
# tpm_expression = tpm_expression[tpm_expression.applymap(np.isfinite).all(axis=1)]
# print "effective length", scipy.stats.linregress(tpm_expression[effective_length_normalized_e], tpm_expression[rnaseq_tpm]).rvalue

# #effective length normalized taking only protein coding genes
# tpm_expression = merged_rpkms[[length_normalized_e, rnaseq_tpm]].dropna()
# tpm_expression = tpm_expression[tpm_expression.applymap(lambda x: x != 0).all(axis=1)]

# protein_coding_genes = [(col in gene_id_to_type) and (gene_id_to_type[col] == "protein_coding") for col in tpm_expression.index]
# tpm_expression = tpm_expression[protein_coding_genes]
# print "protein coding", scipy.stats.linregress(tpm_expression[length_normalized_e], tpm_expression[rnaseq_tpm]).rvalue

# #length normalized with protein coding genes and highest expressed transcript
# tpm_expression = merged_rpkms[[length_normalized_e, rnaseq_tpm_transcript]].dropna()
# tpm_expression = tpm_expression[tpm_expression.applymap(lambda x: x != 0).all(axis=1)]

# # protein_coding_genes = [(col in gene_id_to_type) and (gene_id_to_type[col] == "protein_coding") for col in tpm_expression.index]
# # tpm_expression = tpm_expression[protein_coding_genes]
# print "transcript based", scipy.stats.linregress(tpm_expression[length_normalized_e], tpm_expression[rnaseq_tpm_transcript]).rvalue


In [None]:
# #effective length normalized taking only protein coding genes
# tpm_expression = merged_rpkms[[length_normalized, rnaseq_tpm]].dropna()
# tpm_expression = tpm_expression[tpm_expression.applymap(lambda x: x != 0).all(axis=1)]

# protein_coding_genes = [(col in gene_id_to_type) and (gene_id_to_type[col] == "protein_coding") for col in tpm_expression.index]
# tpm_expression = tpm_expression[protein_coding_genes]

# num_rows = 1
# num_cols = 1
# with dataviz.Figure(os.path.join(img_dir, "foo.svg"), figsize=(4* num_rows, 4*num_cols)) as fig:
#     ax = fig.add_subplot(num_cols,num_rows,1)
#     ax.scatter(np.log10(tpm_expression[length_normalized]), np.log10(tpm_expression[rnaseq_tpm].astype(float)))
#     sns.despine(ax=ax)
# print scipy.stats.linregress(tpm_expression[length_normalized], tpm_expression[rnaseq_tpm]).rvalue


In [None]:
# num_rows = 1
# num_cols = 1
# with dataviz.Figure(os.path.join(img_dir, "foo.svg"), figsize=(4* num_rows, 4*num_cols)) as fig:
#     ax = fig.add_subplot(num_cols,num_rows,1)
#     ax.scatter(np.log10(merged_expression['rep1_normalized']), np.log10(merged_expression['TPM']))
#     sns.despine(ax=ax)
# print scipy.stats.linregress(merged_expression['rep1_normalized'], merged_expression['TPM']).rvalue


In [None]:
# num_rows = 1
# num_cols = 1
# with dataviz.Figure(os.path.join(img_dir, "foo.svg"), figsize=(4* num_rows, 4*num_cols)) as fig:
#     ax = fig.add_subplot(num_cols,num_rows,1)
#     ax.scatter(np.log10(merged_expression['mine_normalized']), np.log10(merged_expression['TPM']))
#     sns.despine(ax=ax)
# print scipy.stats.linregress(merged_expression['mine_normalized'], merged_expression['TPM']).rvalue


In [None]:
# print scipy.stats.linregress(merged_expression['mine_normalized_my_length'], merged_expression['TPM']).rvalue
# print scipy.stats.linregress(merged_expression['mine_normalized_tpm_length'], merged_expression['TPM']).rvalue


In [None]:
# ip_counts = pd.read_table("/home/gpratt/ad-hoc/test.txt", skiprows=1, index_col=0)
# rnaseq_counts = pd.read_table("/home/gpratt/ad-hoc/expression.txt", skiprows=1, index_col=0)
# rnaseq_counts['ip_counts'] = ip_counts['/projects/ps-yeolab3/encode/analysis/encode_master/417_01_POLR2G.merged.r2.bam']

# # foo = [gene_id_to_type[col] == "protein_coding" for col in rnaseq_counts.index ]
# # rnaseq_counts= rnaseq_counts[foo]

# Plot the heatmap of IP and input correlations

In [None]:
# Leftover smoke-test output; written in the call form so the cell parses on
# both Python 2 and Python 3 (prints the same text on either).
print("Foo")

In [None]:
#Fold Enrichment

# Duplicate the input columns with the replicate label (4th column-key element)
# rewritten to "rep1", so that after concatenation there is an input column
# labelled rep1 as well as the original ones.
rep1_inputs = input_whole_gene_counts.copy()
new_cols = [tuple(col[:3]) + ("rep1",) + tuple(col[4:]) for col in rep1_inputs.columns]
rep1_inputs.columns = pd.MultiIndex.from_tuples(new_cols, names=['uID', 'Cell line', 'RBP', 'rep'])

combined_inputs = pd.concat([rep1_inputs, input_whole_gene_counts], axis=1)
# log2 fold enrichment of IP over input with a pseudocount of 1 (no zeros / infs).
fold_enrichedment = np.log2((clip_whole_gene_counts + 1) / (combined_inputs + 1))

# Pairwise Pearson r between all columns.  This replaces a nested Python loop
# that called scipy.stats.linregress once per ordered column pair (N**2 calls,
# each spawning its own progress bar); linregress's rvalue is the Pearson r.
# NOTE: DataFrame.corr() ignores NaNs pairwise, whereas linregress returned
# NaN for any pair containing a NaN.
fold_enrichment_correlations = fold_enrichedment.corr()

In [None]:
# Disabled: deleting fold_enrichment_correlations here makes the following
# cells (the to_csv export and the fold-enrichment clustermap) fail with a
# NameError when the notebook is run top to bottom.  Re-enable only after the
# last cell that uses the variable if the memory needs to be freed.
# del fold_enrichment_correlations

In [None]:
# Expand "~" explicitly; not every pandas version expands the user directory
# for paths passed to to_csv.
fold_enrichment_correlations.to_csv(os.path.expanduser("~/scratch/fold_enrichment.csv"))

In [None]:
#IP Correlation
# log10 of the IP counts; zeros become -inf and are turned into NaN so that
# they are excluded from every pair they take part in.
log_clip_whole_gene_counts = np.log10(clip_whole_gene_counts).replace([np.inf, -np.inf], np.nan)

# Pairwise Pearson r between all IP columns.  DataFrame.corr() drops NaNs
# pair by pair, which matches the per-pair dropna() of the nested loop it
# replaces (one scipy.stats.linregress call per ordered column pair, whose
# rvalue is the Pearson r).
clip_ip_correlations = log_clip_whole_gene_counts.corr()

# #Input correlation
# result = defaultdict(dict)
# for name_1, row_1 in tqdm_notebook(list(input_whole_gene_counts.iteritems())):
#     for name_2, row_2 in tqdm_notebook(list(input_whole_gene_counts.iteritems())):
#         both_names = list(set([name_1, name_2]))
#         tmp_rpkms = np.log10(input_whole_gene_counts[both_names])
#         tmp_rpkms = tmp_rpkms.replace([np.inf, -np.inf], np.nan).dropna()

#         result[name_1][name_2] = scipy.stats.linregress(tmp_rpkms[name_1], tmp_rpkms[name_2]).rvalue
# clip_input_correlations = pd.DataFrame(result)

In [None]:
# Large, fully labelled clustermap of the IP-vs-IP correlations, with a
# colour strip on each side keyed on column level 1 (read as the cell type).
cell_type = clip_ip_correlations.columns.get_level_values(1)
colors = dict(zip(set(cell_type), sns.color_palette("Set2", 2)))

col_colors = pd.Series(cell_type, index=clip_ip_correlations.columns).map(colors)
row_colors = pd.Series(cell_type, index=clip_ip_correlations.index).map(colors)

foo = sns.clustermap(clip_ip_correlations, cmap='Greys', vmin=0, vmax=1, 
                     col_colors=col_colors, row_colors=row_colors,
                     #xticklabels=False,
                     #yticklabels=False,
                     figsize=(90,90),
                    )
# Rasterize the heatmap body so the SVG stays a manageable size.
foo.ax_heatmap.set_rasterized(True)
# Saved under its own name: the next cell writes the small, unlabelled version
# to 'ip_clustermap.svg', which previously overwrote this figure.
foo.savefig(os.path.join(img_dir, 'ip_clustermap_labeled.svg'))

In [None]:
# Compact, unlabelled clustermap of the IP-vs-IP correlations.
cell_type = clip_ip_correlations.columns.get_level_values(1)
palette = sns.color_palette("Set2", 2)
colors = {label: shade for label, shade in zip(set(cell_type), palette)}

# Colour strips for both axes, keyed on the column-level-1 labels.
col_colors, row_colors = [
    pd.Series(cell_type, index=axis_labels).map(colors)
    for axis_labels in (clip_ip_correlations.columns, clip_ip_correlations.index)
]

clustermap_kws = dict(
    cmap='Greys',
    vmin=0,
    vmax=1,
    col_colors=col_colors,
    row_colors=row_colors,
    xticklabels=False,
    yticklabels=False,
    figsize=(5, 5),
)
foo = sns.clustermap(clip_ip_correlations, **clustermap_kws)
# Rasterize the heatmap body before writing the SVG.
foo.ax_heatmap.set_rasterized(True)
foo.savefig(os.path.join(img_dir, 'ip_clustermap.svg'))

In [None]:
# Clustermap of the input-vs-input correlations.
# NOTE(review): in this part of the notebook clip_input_correlations is only
# built inside a commented-out block -- confirm it is defined elsewhere before running.
cell_type = clip_input_correlations.columns.get_level_values(1)
palette = sns.color_palette("Set2", 2)
colors = {label: shade for label, shade in zip(set(cell_type), palette)}

# Colour strips for both axes, keyed on the column-level-1 labels.
col_colors, row_colors = [
    pd.Series(cell_type, index=axis_labels).map(colors)
    for axis_labels in (clip_input_correlations.columns, clip_input_correlations.index)
]

clustermap_kws = dict(
    cmap='Greys',
    vmin=0,
    vmax=1,
    col_colors=col_colors,
    row_colors=row_colors,
    figsize=(40, 40),
)
foo = sns.clustermap(clip_input_correlations, **clustermap_kws)

foo.savefig(os.path.join(img_dir, 'input_clustermap.svg'))

In [None]:
# Clustermap of the pairwise fold-enrichment correlations.
cell_type = fold_enrichment_correlations.columns.get_level_values(1)
palette = sns.color_palette("Set2", 2)
colors = {label: shade for label, shade in zip(set(cell_type), palette)}

# Colour strips for both axes, keyed on the column-level-1 labels.
col_colors, row_colors = [
    pd.Series(cell_type, index=axis_labels).map(colors)
    for axis_labels in (fold_enrichment_correlations.columns, fold_enrichment_correlations.index)
]

clustermap_kws = dict(
    cmap='Greys',
    vmin=0,
    vmax=1,
    col_colors=col_colors,
    row_colors=row_colors,
    figsize=(40, 40),
)
foo = sns.clustermap(fold_enrichment_correlations, **clustermap_kws)

foo.savefig(os.path.join(img_dir, 'fold_enrichment_clustermap.svg'))

The last thing I should do with this is entropy it, show I can get much more specific plots when I use entropy rather than just counts