In [1]:
%load_module autoreload 
%autoreload 2

In [2]:
%matplotlib inline

import functools
from itertools import izip
import os
import datetime

from IPython.core.display import HTML
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pybedtools
import pysam
import scipy.stats
import seaborn as sns
from tqdm import tqdm, tqdm_notebook

from clipper.src import CLIP_analysis
from gscripts import qtools
from gscripts.encode import encode_helpers
from gscripts.general import dataviz
from gscripts.rnaseq import splicing_map

tqdm.pandas(desc="Progress")
sns.set_style("ticks")
img_dir = "/home/gpratt/Dropbox/Pratt_Gabriel/PapersInProgress/eCLIP_qc/working_figures/fig_2/"

In [3]:
legend = None

OUTSIDE_LEGEND_SAVEFIG_KWS = dict(bbox_extra_artists=(legend,),
                                  bbox_inches='tight')
from matplotlib import rc

matplotlib.rcParams['svg.fonttype'] = 'none'

rc('text', usetex=False) 
rc('font',**{'family':'sans-serif','sans-serif':['Helvetica']})

In [4]:
input_norm_dir = "/projects/ps-yeolab3/encode/analysis/Eric_Input_Norm/"
idr_peak_dir = "/projects/ps-yeolab3/encode/analysis/Eric_IDR"
split_dir = "/home/gpratt/projects/idr/analysis/idr_v1/"
out_dir = "/home/gpratt/projects/encode/analysis/peak_reanalysis_v14/"
frip_out_dir = "/home/gpratt/projects/encode/analysis/frip_calculations/"
downsample_path = "/home/gpratt/projects/idr/analysis/downsample_v2/"

In [5]:
merged_data = encode_helpers.get_merged_data()
#Filter out anything sequenced after 16/7/18
merged_data = merged_data[merged_data['Submitted Date'] < datetime.date(2016, 7, 18)]
#Filter out anything not_qced
merged_data = merged_data[merged_data.is_qced]
#Filter out any data we are planning on submitting, but haven't yet submitted
merged_data = merged_data[merged_data.annotation != "Submit"]

# Merge rmrep Counts

In [6]:
#Get all the counts
counts_list = pd.read_csv("downsample_counts_full_v4.csv", dtype={"RBP_ID": str, "bio_rep": str, "tech_rep": str},
                         index_col=["RBP_ID", "RBP", "bio_rep", "tech_rep", "fraction"])
counts_list = counts_list.xs(1.0, level="fraction").groupby(level=['RBP_ID', "RBP", "bio_rep"]).sum()
counts_list.index.rename(["uID", "RBP", "rep"], inplace=True)
counts_list.index = counts_list.index.droplevel("RBP")

counts_list['rep'] = ["rep1" if rep == 1 else "rep2" for rep in counts_list.index.get_level_values(level="rep")]
counts_list.index = counts_list.index.droplevel("rep")
counts_list = counts_list.set_index("rep", append=True)


In [7]:
merged_data = merged_data.reset_index(['Cell line', 'RBP'])

merged_data = pd.merge(merged_data, counts_list,
        left_index=True, right_index=True, how="left")

merged_data = merged_data.set_index(["RBP", "Cell line"], append=True)
merged_data = merged_data.reorder_levels(['uID','RBP', 'Cell line','rep'])

In [8]:
print "number of datasets without reads merged"
print len(merged_data[merged_data.unique.isnull()])

number of datasets without reads merged
0


# Merge regular counts (non-repmapped counts)

In [9]:
final_qc_frame = pd.read_csv("/home/gpratt/Dropbox/EricGabe_ENCODE/encode_master_qc.csv")
final_qc_frame = final_qc_frame.fillna("")
final_qc_frame['Reads Written Round 2'] = final_qc_frame['Reads Written Round 2'].astype(int)
final_qc_frame = final_qc_frame.set_index(keys=["rbp", "encode_id", "rep", "cell_type", "full_name", "barcode"])
filtered_final_qc_frame = final_qc_frame[[ "Input Reads", "Reads Written", "repetitive_count", "Reads Passing Quality Filter",
                                          "Uniquely Mapped Reads", "Uniquely mapped reads %", 'Number of reads mapped to too many loci',
                                          '% of reads unmapped: too short', '% of reads mapped to too many loci', "Usable Reads",
                                          "Fraction Collapsed", "Fraction Usable", "Num Peaks", "Reads Written Round 2"]]

grouped_final_qc_frame = filtered_final_qc_frame.groupby(level=['rbp', 'encode_id', 'rep', 'cell_type', 'full_name']).sum()
grouped_final_qc_frame["Fraction Collapsed"] = grouped_final_qc_frame['Usable Reads'] / grouped_final_qc_frame['Uniquely Mapped Reads'].astype(float)
grouped_final_qc_frame["Fraction Usable"] = grouped_final_qc_frame['Usable Reads'] / grouped_final_qc_frame['Input Reads'].astype(float)
grouped_final_qc_frame = grouped_final_qc_frame.dropna()
grouped_final_qc_frame['full_name'] = grouped_final_qc_frame.index.get_level_values(level="full_name")


In [10]:
#Make sure we can join input counts
fn_label_map = pd.read_table("/home/gpratt/projects/encode/scripts/encode_GRCh38_v1.txt", header=None,
              names=['fn', 'species', 'label', 'foo', 'bar', 'baz', 'biz'])
fn_label_map['fn_basename'] = fn_label_map.fn.apply(lambda x: os.path.basename(x.split(";")[0]).split(".")[0])
fn_label_map['label'] = fn_label_map.label.apply(lambda x: "_".join(x.split("_")[:-1]))

#Need to set the full name of the inputs to the actual name of the file, sadly I'm going to do this badly
inputs = grouped_final_qc_frame.xs("INPUT", level="rep")    
inputs = pd.merge(inputs, fn_label_map,
         left_on="full_name", right_on="label")

In [11]:
new_full_name = []
for name, row in grouped_final_qc_frame.iterrows():
    #print name, row
    name = inputs[inputs.label == row.full_name]
    if len(name) == 1:
        new_full_name.append(name.iloc[0].fn_basename)
    elif len(name) == 0:
        new_full_name.append(row.full_name)
    else:
        new_full_name.append(np.nan)
        print "error", row.full_name
grouped_final_qc_frame['full_file_name'] = new_full_name

error LNG8-M_INPUT_A04F05
error LNG8-S_INPUT_C01D08
error LNG12-N_INPUT_A04F05
error LNG10-M_INPUT_A04F05
error LNG10-S_INPUT_C01D08
error LNG11-M_INPUT_A04F05
error LNG11-S_INPUT_C01D08
error LNG9-M_INPUT_A04F05
error LNG9-S_INPUT_C01D08


In [12]:
merged_data['full_name'] = merged_data.CLIP.apply(lambda x: "_".join(os.path.basename(x).split("_")[:-1]))
merged_data = merged_data.reset_index()

merged_data = pd.merge(merged_data, grouped_final_qc_frame, 
               left_on="full_name", right_on="full_name", 
               how="left")

merged_data = merged_data.set_index(['uID', 'RBP', 'Cell line', 'rep'])

# Get Input Reads

In [13]:
input_names = grouped_final_qc_frame.set_index("full_file_name")
merged_data['input_file_name'] = merged_data.INPUT.apply(lambda x: os.path.basename(x).split(".")[0])
merged_data['input_usable'] = merged_data['input_file_name'].apply(lambda x: input_names.ix[x]['Usable Reads'])
merged_data['input_name'] = merged_data['input_file_name'].apply(lambda x: input_names.ix[x]['full_name'])

# Get Erics rmduped input reads

In [14]:
eric_reads = pd.read_table("/home/elvannostrand/data/clip/CLIPseq_analysis/scripts/inline_processing/ENCODE_20170429_newannotations_FINAL.readnumbers.csv",
             names=["uID", "rep", "full_name", "same", "input_reads_name", "input_reads", "usable_name", "usable", 
              "unique_genome_nonrep_name", "unique_genome_nonrep"])
eric_reads = eric_reads.drop(["usable_name", "unique_genome_nonrep_name", "input_reads_name", "same"],axis=1)
eric_reads['uid'] = eric_reads.full_name.apply(lambda x: "_".join(x.split("_")[:-1]))
eric_reads = eric_reads.set_index("uid")

merged_data['family_map_input_usable'] = merged_data['input_name'].apply(lambda x: eric_reads.ix[x]['input_reads'])
#Don't forget that family map input usable is different than unique counts by a bit because I collapse more agressaveily than Eric does (stipping strange muiti-mappers)

# flag by min read number

In [15]:
arbitray_threshold = 1500000
merged_data['passed_ip_read_filter'] = merged_data.unique >= arbitray_threshold
merged_data['passed_input_read_filter'] = merged_data.family_map_input_usable > arbitray_threshold

In [16]:
make_and_filter_clipper_stringent = functools.partial(encode_helpers.make_and_filter_clipper, l2fc=5, pval=3)
make_and_filter_clipper_very_lineant = functools.partial(encode_helpers.make_and_filter_clipper, l2fc=0, pval=1)
make_and_filter_clipper_lineant = functools.partial(encode_helpers.make_and_filter_clipper, l2fc=2, pval=1.3)
make_and_filter_clipper_moderate = functools.partial(encode_helpers.make_and_filter_clipper, l2fc=3, pval=3)

In [17]:
merged_data['filtered_moderate'] = merged_data['input_norm'].progress_apply(make_and_filter_clipper_moderate)

Progress: 100%|██████████| 498/498 [00:17<00:00, 27.81it/s]


In [18]:
#Can't make this a progress_apply statement because something strange is going on with opening and closing of files
merged_data["CLIP_counts"] = merged_data.CLIP.apply(encode_helpers.get_mapped_reads)
merged_data["INPUT_counts"] = merged_data.INPUT.apply(encode_helpers.get_mapped_reads)

# Calculate FRiP scores

In [19]:
def format_metrics_file(bed_file, out_file=None):
    path, ext = os.path.splitext(os.path.basename(bed_file))
    if not out_file:
        metrics = os.path.join(frip_out_dir, path + ".metrics")
    else:
        metrics = out_file
    return metrics

def format_frip_analysis(bam_file, bed_file, out_file=None):
    metrics = format_metrics_file(bed_file, out_file)

    return "python /home/gpratt/gscripts/gscripts/clipseq/calculate_frip.py --bed {} --bam {} --out_file {}".format(bed_file, bam_file, metrics)

In [20]:
results = []
input_out_names = []
for name, row in merged_data.iterrows():
    if not os.path.exists(format_metrics_file(row.filtered_moderate)):
        results.append(format_frip_analysis(row.CLIP, row.filtered_moderate))
        
    input_out_name = row['filtered_moderate'] + "input.frip.metrics"
    input_out_names.append(input_out_name)
    if not os.path.exists(input_out_name):
        results.append(format_frip_analysis(row.INPUT, row.filtered_moderate, input_out_name))

#lazy
merged_data['input_frip_name'] = input_out_names

In [21]:
job_name = "frip_calculation"
job = qtools.Submitter(commands=results, 
                 job_name="{}".format(job_name), 
                sh_filename="/home/gpratt/projects/idr/scripts/{}.sh".format(job_name),
                array=True,
                walltime="2:00:00",
                out_filename="/home/gpratt/projects/idr/scripts/{}.out".format(job_name),
                err_filename="/home/gpratt/projects/idr/scripts/{}.err".format(job_name),
                queue="home-yeo")
job.job()

print "/home/gpratt/projects/idr/scripts/{}.sh".format(job_name)

/home/gpratt/projects/idr/scripts/frip_calculation.sh


running 0 tasks as an array-job.


In [22]:
def parse_frip_metrics(fn):
    return pd.read_table(fn).ix[0]

def parse_frip_metrics_ip(fn):
    return parse_frip_metrics(format_metrics_file(fn))

In [23]:
ip_frip_df = merged_data.filtered_moderate.progress_apply(parse_frip_metrics_ip)
input_frip_df = merged_data.input_frip_name.progress_apply(parse_frip_metrics)

Progress: 100%|██████████| 498/498 [00:16<00:00, 29.85it/s]
Progress: 100%|██████████| 498/498 [00:13<00:00, 35.69it/s]


In [24]:
merged_data = pd.merge(merged_data, ip_frip_df,
        left_index=True, right_index=True)

merged_data = pd.merge(merged_data, input_frip_df, 
                       left_index=True, right_index=True,
                      suffixes=("_ip", "_input"))

# Add Entropy to dataframe

In [25]:
annotated_bedtool_header = ['chrom', 'start', "stop", "name", "score", "strand", "annotation", "gene_id"]
full_header = ["chrom", "start", "stop", "full_name", "ip_reads", "input_reads", "p_val", "chisq", "test_type", 
               "enrichment", "log10_p_val", "log2_fold_change"]

In [26]:
def get_full_from_annotated(fn):
    stripped_fn = ".".join(fn.split(".")[:-3])
    return stripped_fn + ".full.compressed2.bed.full"

def calculate_entropy(row, total_ip_reads, total_input_reads):
    p_ip = float(row.ip_reads) / total_ip_reads
    p_input = float(row.input_reads) / total_input_reads
    
    return p_ip * np.log2(p_ip / p_input)

def get_entropy_from_annotated(fn):
    fn = os.path.basename(fn)
    stripped_fn = ".".join(fn.split(".")[:-3])
    stripped_fn = os.path.join(out_dir, stripped_fn)
    return stripped_fn + ".full.compressed2.bed.full.entropy.bed"

def sum_entropy(filtered_peaks, original_peaks):
    entropy = pd.read_table(get_entropy_from_annotated(original_peaks))
    filtered_peaks = pd.read_table(filtered_peaks, names=annotated_bedtool_header)

    merged_peaks = pd.merge(filtered_peaks, entropy, 
             left_on=['chrom', 'start', 'stop'],
             right_on=['chrom', 'start', 'stop'])

    return merged_peaks.entropy.sum()

def sum_entropy_row(row):
    #Sadly the majority of the time in this operation is opening the files, can't make it faster :(
    return sum_entropy(row.filtered_moderate, row['input_norm'])

In [27]:
for name, row in tqdm(list(merged_data.iterrows())):    
    full_fn = get_full_from_annotated(row['input_norm'])
    out_fn = os.path.join(out_dir, os.path.basename(full_fn) + ".entropy.bed")
    if os.path.exists(out_fn):
        continue

    ip_reads = row['CLIP_counts']
    input_reads = row['INPUT_counts']

    read_counts = pd.read_table(full_fn, names=full_header)
    
    tool = functools.partial(calculate_entropy, total_ip_reads=ip_reads, total_input_reads=input_reads)
    read_counts['entropy'] = read_counts.apply(tool, axis=1)
    read_counts.to_csv(out_fn, sep="\t", index=False, header=True)

100%|██████████| 498/498 [00:15<00:00, 31.92it/s]


In [28]:
merged_data['entropy'] = merged_data.progress_apply(sum_entropy_row, axis=1)

Progress: 498it [15:49,  3.71s/it]                       


# Combine Reproducable Information with reads mapping to repetitive elements

In [29]:
rep_elements = pd.read_table("/home/elvannostrand/data/clip/CLIPseq_analysis/scripts/inline_processing/20170505.ALLENCODEinclnotsubmitted.txt.nopipes.txt",
                            header=0, names= ["element", "file","something", "clip_count", "clip_rpr", "input_count", "input_rpr", "fold-enrichment", "information content"])
merged_data['ip_raw_names'] = merged_data.CLIP.apply(lambda x: os.path.basename(x).split(".")[0])
important_reads = merged_data[['ip_raw_names', 'family_map_input_usable', 'unique']]

In [30]:
entropy_to_ignore = set([
    'unique_3utr',
    'unique_5utr',
    'unique_5utr_and_3utr',
    'unique_CDS',
    'unique_antisense_gencode',
    'unique_distintron',
    'unique_intergenic',
    'unique_noncoding_distintron',
    'unique_noncoding_exon',
    'unique_noncoding_proxintron',
    'unique_proxintron',
    'chrM',
    'antisense_chrM',])

entropy_to_use = list(set(rep_elements.element) - entropy_to_ignore)
rep_elements = rep_elements[rep_elements.element.isin(entropy_to_use)]

In [31]:
def get_chisq_value(row):
    "Calculates Chiseq values for all the reptitive elements"
    clip_total = row['unique'] - row.clip_count 
    input_total = row.family_map_input_usable - row.input_count

    g, p, dof, expctd = scipy.stats.chi2_contingency([[row.clip_count, clip_total], [row.input_count, input_total]])
    return p

In [32]:
rep_elements = pd.merge(rep_elements, important_reads, left_on="file", right_on="ip_raw_names")
rep_elements['p_val'] = rep_elements.apply(get_chisq_value, axis=1)
rep_elements['log10_p_val'] = rep_elements['p_val'].apply(lambda x: np.log10(x) * -1)
rep_elements.loc[np.isinf(rep_elements.log10_p_val), "log10_p_val"] = 400

In [33]:
sig_rep_elements = rep_elements[((rep_elements['log10_p_val'] > 3) & (rep_elements['fold-enrichment'] > 3)) | ((rep_elements.clip_rpr > .4) & (rep_elements['fold-enrichment'] > 1))]
rep_element_information = sig_rep_elements.groupby("file").sum()
rep_element_information = rep_element_information.drop(["fold-enrichment", "family_map_input_usable", "unique", "p_val", "log10_p_val"], axis=1)

In [34]:
merged_data = merged_data.reset_index()

#Some RBPs don't have any rep elements enriched, so I'm just saying they have 0 entropy
merged_data = pd.merge(merged_data, rep_element_information, left_on="ip_raw_names", right_index=True, how="left")
merged_data = merged_data.set_index(['uID', 'RBP', 'Cell line', 'rep'])
merged_data = merged_data.fillna(0)

# Create Analysis for General Plotting

In [35]:
def thresholding_plotter(threshold_col, actual_classification, df, out_fig=""):
    df['submitted_plot'] = ["Passed" if submitted else "Failed" for submitted in df[actual_classification]]
    df['group'] = 1
    
    true_positive_array, false_positive_array, threshold_array, best_threshold = encode_helpers.get_best_f_score(threshold_col, 
                                                                                              true_clasification_col= actual_classification,
                                                                                              df=df)

        
    num_rows = 1
    num_cols = 2
    with dataviz.Figure(os.path.join(img_dir, "{}_distribution.svg".format(out_fig)), figsize=(2.5 * num_cols, 2.5*num_rows)) as fig:
        ax = fig.add_subplot(num_rows, num_cols, 1)
        sns.swarmplot(x="group", y=threshold_col, hue="submitted_plot", data=df, ax=ax, alpha=.7, size=2, linewidth=.1)
        sns.despine(ax=ax)
        ax.axhline(best_threshold, color=".7", linewidth=1.5, linestyle="--")
        ax.set_ylim(0,)
        [tick.set_fontsize(8) for tick in ax.get_xticklabels()]
        [tick.set_fontsize(8) for tick in ax.get_yticklabels()]
        ax.set_ylabel("Entropy", fontsize=8)
        ax.set_title("Entropy in eCLIP datasets", fontsize=8)
        
        ax = fig.add_subplot(num_rows, num_cols, 2)
        sns.swarmplot(x="group", y=threshold_col, hue="submitted_plot", 
                      data=df[df[threshold_col] < .7], alpha=.7, size=2, linewidth=.1, ax=ax)
        sns.despine(ax=ax)
        ax.set_ylabel("", fontsize=8)

        ax.axhline(best_threshold, color=".7", linewidth=1.5, linestyle="--")
        ax.set_ylim(0,.7)
        ax.axhline()
        [tick.set_fontsize(8) for tick in ax.get_xticklabels()]
        [tick.set_fontsize(8) for tick in ax.get_yticklabels()]
        
    num_rows = 1
    num_cols = 3
    with dataviz.Figure(os.path.join(img_dir, "{}_reproducibility_test_passed_qc.svg".format(out_fig)), figsize=(3 * num_cols, 3 * num_rows)) as fig:
        ax = fig.add_subplot(1,3,1)
        ax.plot(false_positive_array, true_positive_array, label="ROC Curve")
        ax.set_ylabel("True Positive Rate")
        ax.set_xlabel("False Positive Rate")
        ax.plot([0,1], [0,1], label="Null Expectation") 
        sns.despine(ax=ax)
        ax.set_title("ROC Curve")
        ax.legend()

        ax = fig.add_subplot(1,3,2)
        ax.plot(threshold_array, true_positive_array)
        ax.set_ylabel("True Positive Rate")
        ax.set_xlabel("Threshold")
        sns.despine(ax=ax)
        ax.set_title("ROC Curve")
        ax.axvline(best_threshold)

        ax = fig.add_subplot(1,3,3)
        ax.plot(threshold_array, false_positive_array)
        ax.set_ylabel("False Positive Rate")
        ax.set_xlabel("Threshold")
        sns.despine(ax=ax)
        ax.set_title("ROC Curve")
        ax.axvline(best_threshold)

    print encode_helpers.confusion_numbers(best_threshold, threshold_col, true_clasification_col= actual_classification, df=df)

In [36]:
def thresholding_plotter_presentation(threshold_col, actual_classification, df, out_fig=""):
    df['submitted_plot'] = ["Passed" if submitted else "Failed" for submitted in df[actual_classification]]
    df['group'] = 1
    
    true_positive_array, false_positive_array, threshold_array, best_threshold = encode_helpers.get_best_f_score(threshold_col, 
                                                                                              true_clasification_col= actual_classification,
                                                                                              df=df)

        
    num_rows = 1
    num_cols = 2
    with dataviz.Figure(os.path.join(img_dir, "{}_distribution_presentation.svg".format(out_fig)), figsize=(4 * num_cols, 4*num_rows)) as fig:
        ax = fig.add_subplot(num_rows, num_cols, 1)
        sns.swarmplot(x="group", y=threshold_col, hue="submitted_plot", data=df, 
                      ax=ax, alpha=.7, size=5, linewidth=0)
        sns.despine(ax=ax)
        ax.axhline(best_threshold, color=".7", linewidth=3, linestyle="--")
        ax.set_ylim(0,)
        ax.legend(fontsize=20)
        [tick.set_fontsize(20) for tick in ax.get_xticklabels()]
        [tick.set_fontsize(20) for tick in ax.get_yticklabels()]
        ax.set_ylabel("Entropy", fontsize=20)
        ax.set_title("Entropy in eCLIP datasets", fontsize=20)
        
        ax = fig.add_subplot(num_rows, num_cols, 2)
        sns.swarmplot(x="group", y=threshold_col, hue="submitted_plot", 
                      data=df[df[threshold_col] < .7], alpha=.7, size=7, linewidth=0, ax=ax)
        sns.despine(ax=ax)
        ax.set_ylabel("", fontsize=20)

        ax.axhline(best_threshold, color=".7", linewidth=3, linestyle="--")
        ax.set_ylim(0,.5)
        ax.axhline()
        [tick.set_fontsize(20) for tick in ax.get_xticklabels()]
        [tick.set_fontsize(20) for tick in ax.get_yticklabels()]
        ax.legend(fontsize=20)

    num_rows = 1
    num_cols = 3
    with dataviz.Figure(os.path.join(img_dir, "{}_reproducibility_test_passed_qc_presentation.svg".format(out_fig)), figsize=(4 * num_cols, 4 * num_rows)) as fig:
        ax = fig.add_subplot(1,3,1)
        ax.plot(false_positive_array, true_positive_array, label="ROC Curve", linewidth=5)
        ax.set_ylabel("True Positive Rate", fontsize=20)
        ax.set_xlabel("False Positive Rate", fontsize=20)
        ax.plot([0,1], [0,1], label="Null Expectation") 
        sns.despine(ax=ax)
        ax.set_title("ROC Curve", fontsize=20)
        ax.legend(fontsize=20)
        [tick.set_fontsize(20) for tick in ax.get_xticklabels()]
        [tick.set_fontsize(20) for tick in ax.get_yticklabels()]

        ax = fig.add_subplot(1,3,2)
        ax.plot(threshold_array, true_positive_array, linewidth=5)
        ax.set_ylabel("True Positive Rate", fontsize=20)
        ax.set_xlabel("Threshold", fontsize=20)
        sns.despine(ax=ax)
        ax.set_title("ROC Curve", fontsize=20)
        ax.axvline(best_threshold)

        ax = fig.add_subplot(1,3,3)
        ax.plot(threshold_array, false_positive_array)
        ax.set_ylabel("False Positive Rate", fontsize=20)
        ax.set_xlabel("Threshold", fontsize=20)
        sns.despine(ax=ax)
        ax.set_title("ROC Curve", fontsize=20)
        ax.axvline(best_threshold)

    print encode_helpers.confusion_numbers(best_threshold, threshold_col, true_clasification_col= actual_classification, df=df)

# Single Count Entropy

In [37]:
def calc_entropy(row):
    p = row.reads_in_peaks_ip / row.total_reads_ip
    q = row.reads_in_peaks_input / row.total_reads_input

    return p * np.log2(p/q)
merged_data['single_count_entropy'] = merged_data.apply(calc_entropy, axis=1)

# Reproducable Entropy

In [38]:
#This doesn't work because Eric hasn't run his entropy annotator for 374
def get_entropy(fn):
    if not os.path.exists(fn):
        print fn
        return 0
    
    df = pd.read_table(fn, names=['chrom', 'start', 'stop', 
                                  'l2fc', 'pval', 'strand', 
                                  'annotation', 'annotation_v2', 
                                  'gene_id', 'entropy'])
    return df.entropy.sum()

merged_data['idr_peaks'] = merged_data.index.get_level_values(level="uID").map(lambda x: os.path.join(idr_peak_dir, "{}.01v02.IDR.out.0102merged.bed.annotated_proxdist.entropy".format(x)))

idr_datasets = merged_data.xs("rep1", level=3).copy()
idr_datasets['rep_entropy'] = merged_data['information content'].groupby(level=['uID', 'RBP', 'Cell line']).mean()
idr_datasets['total_entropy'] = idr_datasets['idr_peaks'].apply(get_entropy)

/projects/ps-yeolab3/encode/analysis/Eric_IDR/374.01v02.IDR.out.0102merged.bed.annotated_proxdist.entropy


# EM Entropy

In [39]:
# em_peaks_dir = "/home/gpratt/projects/idr/analysis/input_em_peaks"

# def get_entropy_bed(fn):
#     if not os.path.exists(fn):
#         return 0
#     df = pd.read_table(fn, names=['chrom', 'start', 'stop', 'l2fc', 
#                                   'pval', 'strand', 'entropy'])
#     return df.entropy.sum()

# merged_data['input_norm_em'] = merged_data['input_norm'].apply(lambda x: os.path.join(em_peaks_dir, ".".join(os.path.basename(x).split(".")[:2]) + ".peaks.em.v3.bed"))
# merged_data['input_norm_em_filtered'] = merged_data['input_norm_em'].progress_apply(make_and_filter_clipper_moderate)
# merged_data['input_norm_em_entropy'] = merged_data['input_norm_em_filtered'].progress_apply(get_entropy_bed)

# Add IDR Label to merged data

In [40]:
idr_results

NameError: name 'idr_results' is not defined

In [None]:
idr_results = pd.read_csv("/home/gpratt/ipython_notebook/encode/idr_results.csv", header=[0,1], index_col=[0,1,2])
result = []
for index_1, index_2 in idr_results.columns:
    if index_2.startswith("Unnamed"):
        index_2 = ""
    result.append([index_1, index_2])
idr_results.columns = pd.MultiIndex.from_tuples(result)

important_cols = ['pesudoreplicate_count_v2',
                  'rep1_count_v2', 
                  'rep2_count_v2', 
                  'replicate_count_v2', 
                  'reproducibility_test_v2',
                  'rescue_ratio_v2',
                  'self_consistency_ratio_v2',
                 ]
idr_results = idr_results[important_cols]
idr_results.columns = idr_results.columns.get_level_values(level=0)


In [None]:
idr_results = idr_results.reset_index()
merged_data = merged_data.reset_index()
merged_data = pd.merge(merged_data, idr_results, 
         left_on=['uID', 'Cell line', 'RBP'], right_on=['uID', 'Cell line', 'RBP'], how="left")
merged_data = merged_data.set_index(['uID', 'RBP', 'Cell line', 'rep'])

# Add in rep element data to analysis

In [None]:
merged_data["frip_rep"] = merged_data['FRiP_ip'] + merged_data['clip_rpr']
merged_data['entropy_rep_exp'] = merged_data['entropy'] + merged_data['information content']
# merged_data['clip_rep_entropy_em'] = merged_data['input_norm_em_entropy'] + merged_data['information content']
idr_datasets['peak_rep_entropy'] = idr_datasets['rep_entropy'] + idr_datasets['total_entropy']

# Filter Merged Data for plotting of frip-like analysis

In [None]:
merged_data_read_filtered = merged_data[merged_data.passed_ip_read_filter & merged_data.passed_input_read_filter].copy()

filtered_data = merged_data_read_filtered.groupby(level=["uID", 'RBP', "Cell line"]).count().CLIP
idr_datasets_read_filtered = idr_datasets.ix[filtered_data[filtered_data == 2]]

# Basic FRIP Analysis 

In [None]:
thresholding_plotter("FRiP_ip", "generally_submittable", merged_data_read_filtered, out_fig="FRiP")

I ask given FRiP scores calculated for each dataset, and our hand annotated success vs fail metric we defined, can we use FRiP to predict if a dataset will pass or fail.  I generated a confusion matrix, and calculated an f-score for each cutoff possible.  I Also plotted the true positive and false positive rate.  I was able to achieve a fairly high true positive rate, but the descriminatory power was low overall.  The strongest thing I can say about this, is we've got a method that might be useful as a smell test.  

I'll try explanied entropy next.  

Long story short, the FRiP score isn't a good tool to identify good and bad datasets

# FRiP Analysis + Repetitive Elements

In [None]:
thresholding_plotter("frip_rep", "generally_submittable", merged_data_read_filtered, out_fig="FRiP_rep")

# Does Entropy Work better than FRiP?

In [None]:
thresholding_plotter("entropy", "generally_submittable", merged_data_read_filtered, out_fig="entropy")

In [None]:
thresholding_plotter_presentation("entropy", "generally_submittable", merged_data_read_filtered, out_fig="entropy")

In [None]:
df =merged_data_read_filtered
threshold_col = "entropy"
actual_classification = "generally_submittable"
out_fig = "foo_unlabeled"

df['submitted_plot'] = ["Passed" if submitted else "Failed" for submitted in df[actual_classification]]
df['group'] = 1


num_rows = 1
num_cols = 2
best_threshold = .044

plotting_cutoff = .5
with dataviz.Figure(os.path.join(img_dir, "{}_distribution_presentation.svg".format(out_fig)), figsize=(4 * num_cols, 4*num_rows)) as fig:
    ax = fig.add_subplot(num_rows, num_cols, 1)
    sns.swarmplot(x="group", y=threshold_col, hue="submitted_plot", data=df, 
                  ax=ax, alpha=.7, size=6, linewidth=0)
    sns.despine(ax=ax)
    #ax.axhline(best_threshold, color=".7", linewidth=5, linestyle="--")
    ax.set_ylim(0,)
    ax.legend(fontsize=20)
    [tick.set_fontsize(20) for tick in ax.get_xticklabels()]
    [tick.set_fontsize(20) for tick in ax.get_yticklabels()]
    ax.set_ylabel("Entropy", fontsize=20)
    ax.set_title("Entropy in eCLIP datasets", fontsize=20)

    ax = fig.add_subplot(num_rows, num_cols, 2)
    sns.swarmplot(x="group", y=threshold_col, hue="submitted_plot", 
                  data=df[df[threshold_col] < plotting_cutoff], alpha=.7, size=5, linewidth=0, ax=ax)
    sns.despine(ax=ax)
    ax.set_ylabel("", fontsize=20)

    #ax.axhline(best_threshold, color=".7", linewidth=6, linestyle="--")
    ax.set_ylim(0, plotting_cutoff)
    ax.axhline()
    [tick.set_fontsize(20) for tick in ax.get_xticklabels()]
    [tick.set_fontsize(20) for tick in ax.get_yticklabels()]
    ax.legend(fontsize=20)

In [None]:
df = merged_data_read_filtered
threshold_col = "entropy"
actual_classification = "generally_submittable"
out_fig = "foo"

df['submitted_plot'] = ["Passed" if submitted else "Failed" for submitted in df[actual_classification]]
df['group'] = 1


num_rows = 1
num_cols = 2
best_threshold = .044

plotting_cutoff = .5
with dataviz.Figure(os.path.join(img_dir, "{}_distribution_presentation.svg".format(out_fig)), figsize=(4 * num_cols, 4*num_rows)) as fig:
    ax = fig.add_subplot(num_rows, num_cols, 1)
    sns.swarmplot(x="group", y=threshold_col, hue="submitted_plot", data=df, 
                  ax=ax, alpha=.7, size=6, linewidth=0)
    sns.despine(ax=ax)
    ax.axhline(best_threshold, color=".7", linewidth=5, linestyle="--")
    ax.set_ylim(0,)
    ax.legend(fontsize=20)
    [tick.set_fontsize(20) for tick in ax.get_xticklabels()]
    [tick.set_fontsize(20) for tick in ax.get_yticklabels()]
    ax.set_ylabel("Entropy", fontsize=20)
    ax.set_title("Entropy in eCLIP datasets", fontsize=20)

    ax = fig.add_subplot(num_rows, num_cols, 2)
    sns.swarmplot(x="group", y=threshold_col, hue="submitted_plot", 
                  data=df[df[threshold_col] < plotting_cutoff], alpha=.7, size=5, linewidth=0, ax=ax)
    sns.despine(ax=ax)
    ax.set_ylabel("", fontsize=20)

    ax.axhline(best_threshold, color=".7", linewidth=6, linestyle="--")
    ax.set_ylim(0, plotting_cutoff)
    ax.axhline()
    [tick.set_fontsize(20) for tick in ax.get_xticklabels()]
    [tick.set_fontsize(20) for tick in ax.get_yticklabels()]
    ax.legend(fontsize=20)

In [None]:
#Try with just true positives that we pass due to peaks
peaks_and_negatives = merged_data_read_filtered[~merged_data_read_filtered.family_mapping_submitable]
thresholding_plotter("entropy", "generally_submittable", peaks_and_negatives)

In [None]:
fp = peaks_and_negatives[(peaks_and_negatives.entropy >= 0.044) & ~peaks_and_negatives.generally_submittable]
fp_grp = fp.groupby(level=['uID', "RBP", 'Cell line']).count().CLIP >= 2
fp_grp[fp_grp]

15 experiments in total are false positives

In [None]:
fn = peaks_and_negatives[(peaks_and_negatives.entropy < 0.044) & peaks_and_negatives.generally_submittable]
fn.groupby(level=['uID', "RBP", 'Cell line']).count().CLIP >= 2

15 datasets are false negatives, 7 are reproducably bad in both reps, and are broad binders, 8 are negatives in one dataset or another, I'll need to look into why these datasets are bad.  

# Entropy + Repetitive Elements


In [None]:
thresholding_plotter("entropy_rep_exp", "generally_submittable", merged_data_read_filtered, out_fig="entropy_rep")

# Single Count Entropy

In [None]:
thresholding_plotter("single_count_entropy", "generally_submittable", merged_data_read_filtered)

# EM Entropy

In [None]:
# thresholding_plotter("input_norm_em_entropy", "generally_submittable", merged_data_read_filtered)

# EM Entropy + Repetitive Elements

In [None]:
# thresholding_plotter("clip_rep_entropy_em", "generally_submittable", merged_data_read_filtered)

# Reproducable peaks

In [None]:
# thresholding_plotter("total_entropy", "generally_submittable", idr_datasets_read_filtered)

# Reproducable Entropy + Repetitive Elements

In [None]:
# thresholding_plotter("peak_rep_entropy", "generally_submittable", idr_datasets_read_filtered)

In [None]:
#merged_data.to_csv("/home/gpratt/Dropbox/ENCODE_FINAL_ANNOTATIONS_PEAKS_IDR.csv")

Various families have very different information content paramaters, so I think I'll probably have to classify for each family independently?

1. 246 -- need to check QC numbers, MT-trna binding, am I capturing this? Is this being lost due to my re-assignments?
1. 311 -- no notes as to why passed
1. 358 rep2 -- tRNAs not quite enriched... maybe need to create a better thresholding system for each element?
1. 461 rep1 -- rep1 peaks aren't as enriched as rep2 peaks.  Also the repetitive elements kind of suck.  I might 1. actually want to fail this dataset due to lack of repdoucabilty...
1. 470 rep1 -- rep1 peaks aren't as enriched as rep2 peaks.  I might actually want to fail this dataset due to lack of repdoucabilty...
1. 550 rep2 -- rep2 peaks aren't as enriched as rep1 peaks.  I might actually want to fail this dataset due to lack of repdoucabilty...

# Passed entropy datasets

In [None]:
merged_data['passed_entropy'] = merged_data.entropy >= .044

passed_exp_entropy = merged_data.groupby(level=['uID', 'RBP', 'Cell line']).apply(lambda x: all([all(x['passed_entropy']),
                                                                                                 all(x['passed_ip_read_filter']),
                                                                                                 all(x['passed_input_read_filter'])]))
passed_exp_entropy = passed_exp_entropy.rename("passed_exp")
passed_exp_entropy = passed_exp_entropy.reset_index()

merged_data = merged_data.reset_index()
merged_data = pd.merge(merged_data, passed_exp_entropy,
         left_on=['uID', 'Cell line', 'RBP'], right_on=['uID', 'Cell line', 'RBP'], how="left")
merged_data = merged_data.set_index(['uID', 'RBP', 'Cell line', 'rep'])

# Understanding False Positives and False Negatives for Basic Entropy

In [None]:
peak_level_fn = merged_data[merged_data.passed_ip_read_filter & merged_data.passed_input_read_filter & merged_data.generally_submittable & ~merged_data.passed_exp]
peak_level_fp = merged_data[merged_data.passed_ip_read_filter & merged_data.passed_input_read_filter & ~merged_data.generally_submittable & merged_data.passed_exp]
peak_level_tp = merged_data[merged_data.passed_ip_read_filter & merged_data.passed_input_read_filter & merged_data.generally_submittable & merged_data.passed_exp]

In [None]:
peak_level_fn[['entropy', 'notes']]

In [None]:
peak_level_fp[['entropy', 'notes']]

# Of the datasets passed, how many of them would then fail via IDR?

In [None]:
peak_level_fp_grp = peak_level_fp.groupby(level=['uID', 'RBP', 'Cell line'])
peak_level_fp_rep1 = peak_level_fp_grp.first()
peak_level_fp_rep1 = peak_level_fp_rep1[peak_level_fp_grp.count().CLIP == 2]

In [None]:
peak_level_fp_rep1

In [None]:
num_rows = 1
num_cols = 1
with dataviz.Figure(os.path.join(img_dir, "false_positives.svg"), figsize=(2.5 * num_cols, 2.5 * num_rows)) as fig:
    ax = fig.add_subplot(1,1,1)

    ax.set_ylabel("Count")
    ax.set_xlabel("Reproducablity Test")
    #ax.set_title("Number of replicates\npassing reproducibility test", fontsize=18)
    sns.despine(ax=ax)

    sns.factorplot(x='reproducibility_test_v2', 
                   kind='count', 
                   data=peak_level_fp_rep1, 
                   order=["pass", "borderline", "fail"],
                   ax=ax
                  )

    ax.set_ylabel("Number of Datasets", fontsize=18)
    ax.set_xlabel("")
    [tick.set_fontsize(14) for tick in ax.get_xticklabels()]
    [tick.set_fontsize(14) for tick in ax.get_yticklabels()]

In [None]:
#peak_level_fp_rep1[peak_level_fp_rep1.reproducibility_test_v2 == "pass"]['notes']

In [None]:
peak_level_tp_grp = peak_level_tp.groupby(level=['uID', 'RBP', 'Cell line'])
peak_level_tp_rep1 = peak_level_tp_grp.first()
peak_level_tp_rep1 = peak_level_tp_rep1[peak_level_tp_grp.count().CLIP == 2]

In [None]:
num_rows = 1
num_cols = 1
with dataviz.Figure(os.path.join(img_dir, "reproducibility_test_tp.svg"), figsize=(2.5 * num_cols, 2.5 * num_rows)) as fig:
    ax = fig.add_subplot(1,1,1)
    
    ax.set_ylabel("Count")
    ax.set_xlabel("Reproducablity Test")
    #ax.set_title("True Positives\npassing reproducibility test", fontsize=18)
    sns.despine(ax=ax)
    
    sns.factorplot(x='reproducibility_test_v2', 
                   kind='count', 
                   data=peak_level_tp_rep1, 
                   order=["pass", "borderline", "fail"],
                   ax=ax
                  )
    
    #ax.set_ylabel("Number of Datasets", fontsize=18)
    ax.set_xlabel("")
    [tick.set_fontsize(14) for tick in ax.get_xticklabels()]
    [tick.set_fontsize(14) for tick in ax.get_yticklabels()]

In [None]:
#peak_level_tp_rep1[peak_level_tp_rep1.reproducibility_test_v2_y == "fail"]['notes']

In [None]:
exp_data = merged_data.xs("rep1", level="rep")

In [None]:
exp_data.loc[(exp_data.generally_submittable & exp_data.passed_exp), 'classification'] = 'True Positive'
exp_data.loc[(~exp_data.generally_submittable & exp_data.passed_exp), 'classification'] = 'False Positive'
exp_data.loc[(exp_data.generally_submittable & ~exp_data.passed_exp), 'classification'] = 'False Negative'
exp_data.loc[(~exp_data.generally_submittable & ~exp_data.passed_exp), 'classification'] = 'True Negative'

In [None]:
num_rows = 1
num_cols = 1
with dataviz.Figure(os.path.join(img_dir, "reproducibility_test_presentation.svg"), figsize=(5 * num_cols, 5 * num_rows)) as fig:
    ax = fig.add_subplot(1,1,1)
    
    ax.set_ylabel("Count")
    ax.set_xlabel("Reproducablity Test")
    ax.set_title("Number of replicates\npassing reproducibility test", fontsize=18)
    sns.despine(ax=ax)
    
    sns.factorplot(x='reproducibility_test_v2', 
                   hue="classification",
                   kind='count', 
                   data=exp_data, 
                   order=["pass", "borderline", "fail"],
                   hue_order=['True Positive', 'False Positive', 'False Negative', 'True Negative'],
                   ax=ax
                  )
    
    ax.set_ylabel("Number of Datasets", fontsize=18)
    ax.set_xlabel("")
    ax.legend(fontsize=18)
    [tick.set_fontsize(14) for tick in ax.get_xticklabels()]
    [tick.set_fontsize(14) for tick in ax.get_yticklabels()]

In [None]:
num_rows = 1
num_cols = 1
with dataviz.Figure(os.path.join(img_dir, "reproducibility_test.svg"), figsize=(2.5 * num_cols, 2.5 * num_rows)) as fig:
    ax = fig.add_subplot(1,1,1)
    
    ax.set_ylabel("Count")
    ax.set_xlabel("Reproducablity Test")
    ax.set_title("Number of replicates\npassing reproducibility test", fontsize=8)
    sns.despine(ax=ax)
    
    sns.factorplot(x='reproducibility_test_v2', 
                   hue="classification",
                   kind='count', 
                   data=exp_data, 
                   order=["pass", "borderline", "fail"],
                   hue_order=['True Positive', 'False Positive', 'False Negative', 'True Negative'],
                   ax=ax
                  )
    
    ax.set_ylabel("Number of Datasets", fontsize=8)
    ax.set_xlabel("")
    ax.legend(fontsize=8)
    [tick.set_fontsize(8) for tick in ax.get_xticklabels()]
    [tick.set_fontsize(8) for tick in ax.get_yticklabels()]

1. Use just basic entropy to classify RBPs.
1. Need to look into why the false negatives are false negatives
    - Broad Binders generally, don't need to worry
1. How many false positives are kept around after only taking both replicates 
    - on par with IDR filtering 
1. Need to look at IDR filtering on all datasets after, how many datasets get passed or failed due to the new IDR approach. 
    - done slightly better
1. Need to see if its worth coming up with a different classifier for just the datasets with repetitive elements
    - No they get classified nicely 
1. Need to see if there is some sort of other filtering criteria I can apply to the false positives
   - Still looking into this
1. Need to check on that final non-submitted dataset --re-running, very slow...
    - done
1. Need to re-do the final model I come up with on a training and test dataset to show accuracy
    - done

# Create Matrix of QC results for all data

In [None]:
output_frame = merged_data[['passed_ip_read_filter',
                            'passed_input_read_filter', 
                            'passed_entropy', 
                            #'reproducibility_test_v2', 
                            #'passed_exp_entropy'
                           ]]

In [None]:
HTML(output_frame.to_html())

In [None]:
#Fscore for final thing

In [None]:
encode_helpers.get_best_f_score??

In [None]:
foo = merged_data.xs("rep1", level="rep")

In [None]:
import sklearn

In [None]:
sklearn.metrics.f1_score(foo.generally_submittable, (foo.reproducibility_test_v2.isin(["pass"]) & foo.passed_exp))

In [None]:
sklearn.metrics.f1_score(foo.generally_submittable, (foo.reproducibility_test_v2.isin(["pass", "borderline"]) & foo.passed_exp))

In [None]:
tp_final = foo[foo.generally_submittable & (foo.reproducibility_test_v2.isin(["pass", "borderline"]) & foo.passed_exp)]
fp_final = foo[~foo.generally_submittable & (foo.reproducibility_test_v2.isin(["pass", "borderline"]) & foo.passed_exp)]
fn_final = foo[foo.generally_submittable & ~(foo.reproducibility_test_v2.isin(["pass", "borderline"]) & foo.passed_exp)]
tn_final = foo[~foo.generally_submittable & ~(foo.reproducibility_test_v2.isin(["pass", "borderline"]) & foo.passed_exp)]

In [None]:
with dataviz.Figure(os.path.join(img_dir, "submittable.svg"), figsize=(2.5 * num_cols, 2.5 * num_rows)) as fig:
    ax = fig.add_subplot(1,1,1)

    sns.countplot(x="generally_submittable", data=foo, ax=ax)
    [tick.set_fontsize(18) for tick in ax.get_xticklabels()]
    [tick.set_fontsize(18) for tick in ax.get_yticklabels()]
    ax.set_ylabel("All Experiments", fontsize=18)
    ax.set_xticklabels(['Rejected', 'Passed'])
    ax.set_xlabel("")
    sns.despine(ax=ax)

In [None]:
print len(foo[foo.generally_submittable])
print len(foo[~foo.generally_submittable])

In [None]:
tp_final

In [None]:
len(tp_final), len(fp_final), len(fn_final), len(tn_final)

In [None]:
fn_final.entropy < .04

In [None]:
fn_final.reproducibility_test_v2.isin(["pass", "borderline"])

In [None]:
fn_final.passed_exp

In [None]:
fn_final.unique