In [1]:
%matplotlib inline
from collections import defaultdict, Counter
import glob
import os
import json

from IPython.core.display import HTML, Image
from matplotlib_venn import venn3
import pandas as pd
import pybedtools
import pysam
import gspread
from gscripts.general import parsers
from gscripts.general import dataviz
%load_ext autoreload
import numpy as np
%autoreload 2
reload(parsers)
reload(pybedtools)
import seaborn as sns
import matplotlib
from oauth2client.client import SignedJwtAssertionCredentials

In [2]:
from matplotlib import rc
rc('text', usetex=False)
matplotlib.rcParams['svg.fonttype'] = 'none'

rc('font',**{'family':'sans-serif','sans-serif':['Helvetica']})

In [3]:
# Load the Google service-account key. The `with` block closes the key file
# as soon as it has been parsed instead of leaving the handle open.
with open("../public clip-588adbc137f3.json") as key_file:
    json_key = json.load(key_file)
scope = ['https://spreadsheets.google.com/feeds']

credentials = SignedJwtAssertionCredentials(json_key['client_email'], json_key['private_key'], scope)
gc = gspread.authorize(credentials)

# Read the first worksheet; row 0 supplies the column names, the rest the data.
sht1 = gc.open_by_url("https://docs.google.com/spreadsheets/d/1ZU2mQh54jentqvhR_oMnviLGWR8Nw_x338gULzKjNDI/edit#gid=0")
ws = sht1.worksheet("Sheet1")
list_of_lists = ws.get_all_values()
manifest = pd.DataFrame(list_of_lists[1:], columns=list_of_lists[0])
# qc_id is "<ENCODE_ID>_<RBP>"; later cells join on it.
manifest['qc_id'] = manifest.apply(lambda x: "{}_{}".format(x.ENCODE_ID, x.RBP), axis=1)
# The sheet delivers booleans as the text "TRUE"/"FALSE"; convert to real booleans.
# Bracket assignment is used so this always targets the column.
manifest['is_encode'] = manifest['is_encode'] == "TRUE"

In [None]:
def format_file(index, adapter):
    """Build the path of a fastq.gz file for one sample row.

    index   -- row-like object; the attributes index_1, index_2, Lane,
               Hiseq_file_name and file_location are read from it.
    adapter -- value placed last in the file name, before ".fastq.gz".

    Returns file_location / <directory> / <file name>. The naming layout is
    chosen from index.Lane: empty, starting with "S", or anything else.
    Relies on the notebook-level `illumina_adapters` table.
    """
    # Both adapter lookups happen up front for every row, including the
    # "S" layout that does not use them in the file name.
    first_barcode = illumina_adapters.ix[index.index_1].values[0]
    second_barcode = illumina_adapters.ix[index.index_2].values[0]
    lane = index.Lane
    run_name = index.Hiseq_file_name

    if lane == "":
        # Per the original author's note: a rapid run with no lane info.
        file_name = "{}_{}-{}_{}.fastq.gz".format(run_name, second_barcode, first_barcode, adapter)
        return os.path.join(index.file_location, "Sample_{}".format(run_name), file_name)

    if lane.startswith("S"):
        # File prefix: text before the first "-", with "_" turned into "-".
        prefix = run_name.split("-")[0].replace("_", "-")
        file_name = "{}_{}_{}.fastq.gz".format(prefix, lane, adapter)
        return os.path.join(index.file_location, run_name, file_name)

    file_name = "{}_{}-{}_{}_{}.fastq.gz".format(run_name, second_barcode, first_barcode, lane, adapter)
    return os.path.join(index.file_location, "Sample_{}".format(run_name), file_name)

# Power Curve

In [None]:
# Parse CLIP-seq QC metrics from the v12 and v13 analysis directories.
encode_qc_v12 = parsers.clipseq_metrics("/projects/ps-yeolab2/encode/analysis/encode_v12/", iclip=True)
encode_qc_v13 = parsers.clipseq_metrics("/projects/ps-yeolab/encode/analysis/encode_v13/", iclip=True)

# Stack both releases into a single frame.
encode_qc = pd.concat([encode_qc_v12, encode_qc_v13])

#encode_qc = parsers.clipseq_metrics(analysis_dir, iclip=True)

# Derived ratios: usable reads over uniquely mapped reads, and over input reads.
encode_qc["Fraction Collapsed"] = encode_qc['Usable Reads'] / encode_qc['Uniquely Mapped Reads'].astype(float)
encode_qc["Fraction Usable"] = encode_qc['Usable Reads'] / encode_qc['Input Reads'].astype(float)
# NOTE(review): this marks every row as v12, including rows parsed from the
# encode_v13 directory — confirm that is intended.
encode_qc['is_v12'] = 1

# Keep only rows whose index label does not contain "merged".
unmerged_clip_manifest = encode_qc[["merged" not in index for index in encode_qc.index]]

# Headerless pipeline manifest; the column names are supplied here.
manifest_df = pd.read_table("/home/gpratt/projects/encode/scripts/encode_v12.txt", header=None,
             names=['fastq', 'species', 'encode_id', 'barcodes', 'barcodes_len', 'more_barcodes', "randomer_length"])
# qc_name: take the first ";"-separated fastq path, keep its basename, and
# keep the first two "."-separated fields of that basename.
manifest_df['qc_name'] = manifest_df.fastq.apply(lambda x: ".".join(os.path.basename(x.split(";")[0]).split(".")[:2]))

# Join manifest rows to QC rows where qc_name equals the QC index label.
tmp = pd.merge(manifest_df, unmerged_clip_manifest, left_on="qc_name", right_index=True)

# Attach the spreadsheet annotations: manifest.qc_id must equal tmp.encode_id.
final_qc_frame = pd.merge(manifest, tmp, left_on='qc_id', right_on='encode_id')
# exp_id is the part of ENCODE_ID before the first "_".
final_qc_frame['exp_id'] = final_qc_frame.ENCODE_ID.apply(lambda x:x.split("_")[0])

def get_rep_num(encode_id):
    """Return the second "_"-separated field of an identifier.

    Example: "204_01" -> "01".

    Returns np.nan when the identifier has no second field, or when it is
    not a splittable string (for example None or a float NaN).
    """
    try:
        return encode_id.split("_")[1]
    except (AttributeError, IndexError, TypeError):
        # AttributeError: value has no .split (None, NaN, numbers).
        # IndexError: no "_" in the identifier.
        # TypeError: .split rejected the separator (e.g. a bytes value).
        # Anything else (KeyboardInterrupt included) now propagates.
        return np.nan
# Derive the replicate label of every row from its ENCODE_ID.
final_qc_frame['rep_num'] = final_qc_frame.ENCODE_ID.apply(get_rep_num)
# Replace missing input-read counts with 0.
final_qc_frame['Input Reads'] = final_qc_frame['Input Reads'].fillna(0)


In [None]:
# Export the read-count and ratio columns of encode_qc (no row filtering is
# applied here) to CSV.
encode_qc[[ "Input Reads",
                                #'Trimmed bases',
                            "Reads Written",
                            "repetitive_count",
                            "Uniquely Mapped Reads",
                            "Uniquely mapped reads %",
                            "Usable Reads",
                            "Fraction Collapsed",
                            "Fraction Usable",
                            #"spot",
                            #"Num Peaks",
                            #"Passed QC"
                            ]].to_csv("/home/gpratt/Dropbox/encode_integration/for_eric/unannoated_qc_filtered_v13.csv")

In [None]:
# analysis_dir = "/home/gpratt/projects/encode/analysis/encode_v8/"
# encode_v8 = parsers.clipseq_metrics(analysis_dir, iclip=True)

# encode_v8["Fraction Collapsed"] = encode_v8['Usable Reads'] / encode_v8['Uniquely Mapped Reads'].astype(float)
# encode_v8["Fraction Usable"] = encode_v8['Usable Reads'] / encode_v8['Input Reads'].astype(float)

In [None]:
# Export every column of encode_qc.
# NOTE(review): when the cells above run in order, encode_qc holds the v12 and
# v13 rows concatenated, while this file name says v12 — confirm the name.
encode_qc.to_csv("/home/gpratt/Dropbox/encode_integration/for_eric/unannoated_qc_v12.csv")

In [None]:
#encode_qc = pd.read_csv("/home/gpratt/Dropbox/encode_integration/for_eric/unannoated_qc_v9_stable.csv", index_col=0)
#encode_qc['is_v12'] = 0

# Same merge as the earlier encode_v12.txt cell, now against encode_v13.txt.
# It rebinds unmerged_clip_manifest, manifest_df, tmp and final_qc_frame.
unmerged_clip_manifest = encode_qc[["merged" not in index for index in encode_qc.index]]

# Headerless pipeline manifest; the column names are supplied here.
manifest_df = pd.read_table("/home/gpratt/projects/encode/scripts/encode_v13.txt", header=None,
             names=['fastq', 'species', 'encode_id', 'barcodes', 'barcodes_len', 'more_barcodes', "randomer_length"])
# qc_name: first two "."-separated fields of the basename of the first
# ";"-separated fastq path.
manifest_df['qc_name'] = manifest_df.fastq.apply(lambda x: ".".join(os.path.basename(x.split(";")[0]).split(".")[:2]))

# Join manifest rows to QC rows where qc_name equals the QC index label.
tmp = pd.merge(manifest_df, unmerged_clip_manifest, left_on="qc_name", right_index=True)

# Attach the spreadsheet annotations: manifest.qc_id must equal tmp.encode_id.
final_qc_frame = pd.merge(manifest, tmp, left_on='qc_id', right_on='encode_id')
# exp_id is the part of ENCODE_ID before the first "_".
final_qc_frame['exp_id'] = final_qc_frame.ENCODE_ID.apply(lambda x:x.split("_")[0])

def get_rep_num(encode_id):
    """Return the second "_"-separated field of an identifier.

    Example: "204_01" -> "01".

    Returns np.nan when the identifier has no second field, or when it is
    not a splittable string (for example None or a float NaN).
    """
    try:
        return encode_id.split("_")[1]
    except (AttributeError, IndexError, TypeError):
        # AttributeError: value has no .split (None, NaN, numbers).
        # IndexError: no "_" in the identifier.
        # TypeError: .split rejected the separator (e.g. a bytes value).
        # Anything else (KeyboardInterrupt included) now propagates.
        return np.nan
# Derive the replicate label of every row from its ENCODE_ID.
final_qc_frame['rep_num'] = final_qc_frame.ENCODE_ID.apply(get_rep_num)
# Replace missing input-read counts with 0.
final_qc_frame['Input Reads'] = final_qc_frame['Input Reads'].fillna(0)

In [None]:
# Build one (rbp, encode_id, rep, cell_type, full_name) tuple per row by
# walking the five identifying columns in parallel, install the tuples as a
# five-level row index, then sort the rows by that index.
new_index = list(zip(final_qc_frame.RBP,
                     final_qc_frame.exp_id,
                     final_qc_frame.rep_num,
                     final_qc_frame.cell_type,
                     final_qc_frame.ENCODE_ID))

index_level_names = ['rbp', 'encode_id', 'rep', 'cell_type', "full_name"]
final_qc_frame.index = pd.MultiIndex.from_tuples(new_index, names=index_level_names)
final_qc_frame = final_qc_frame.sort_index()

In [None]:
# Rows of encode_qc whose index label contains "LF_".
lf_frame = encode_qc[["LF_" in item for item in encode_qc.index]]

# Render the selected columns as an HTML table, passing parsers.commas as the
# formatter for the count columns.
# NOTE(review): "Reads after Quality Filtering" and "Num Peaks" have formatter
# entries but are not among the selected columns.
HTML(lf_frame[[ "Input Reads",
                                #'Trimmed bases',
                            "Reads Written",
                            "repetitive_count",
                            "Uniquely Mapped Reads",
                            "Uniquely mapped reads %",
                            "Usable Reads",
                            "Fraction Collapsed",
                            "Fraction Usable",
                            #"spot",
                            #"Num Peaks",
                            #"Passed QC"
                            ]].to_html(formatters={"Input Reads" : parsers.commas,
                                     "Reads Written" : parsers.commas,
                                     "repetitive_count": parsers.commas,
                                     "Reads after Quality Filtering" : parsers.commas,
                                     "Uniquely Mapped Reads" : parsers.commas,
                                     "Usable Reads" : parsers.commas,
                                     "Num Peaks": parsers.commas}))

In [None]:
# Rows of encode_qc whose index label contains "LF-" (the preceding cell
# matches "LF_" instead); this rebinds lf_frame.
lf_frame = encode_qc[["LF-" in item for item in encode_qc.index]]

# Render the selected columns as an HTML table, passing parsers.commas as the
# formatter for the count columns.
HTML(lf_frame[[ "Input Reads",
                                #'Trimmed bases',
                            "Reads Written",
                            "repetitive_count",
                            "Uniquely Mapped Reads",
                            "Uniquely mapped reads %",
                            "Usable Reads",
                            "Fraction Collapsed",
                            "Fraction Usable",
                            #"spot",
                            #"Num Peaks",
                            #"Passed QC"
                            ]].to_html(formatters={"Input Reads" : parsers.commas,
                                     "Reads Written" : parsers.commas,
                                     "repetitive_count": parsers.commas,
                                     "Reads after Quality Filtering" : parsers.commas,
                                     "Uniquely Mapped Reads" : parsers.commas,
                                     "Usable Reads" : parsers.commas,
                                     "Num Peaks": parsers.commas}))

In [None]:
# Column subset of final_qc_frame used by the tables and exports below.
# Besides raw counts it contains percentage columns, the two derived
# fractions, and the is_v12 flag.
filtered_final_qc_frame = final_qc_frame[[ "Input Reads",
                                #'Trimmed bases',
                            "Reads Written",
                            "repetitive_count",
                            "Reads Passing Quality Filter",
                            "Uniquely Mapped Reads",
                            "Uniquely mapped reads %",
                            'Number of reads mapped to too many loci',
                            '% of reads unmapped: too short',
                            '% of reads mapped to too many loci',
                            "Usable Reads",
                            "Fraction Collapsed",
                            "Fraction Usable",
                            #"spot",
                            "Num Peaks",
                            'is_v12',
                            #"Passed QC"
                            ]]

#filtered_final_qc_frame = filtered_final_qc_frame.astype(float)

In [None]:
# Rows whose index label contains "CVB", written to a CSV in the working
# directory. The next cell rebuilds cvb_frame and writes the same file again.
cvb_frame = unmerged_clip_manifest[["CVB" in item for item in unmerged_clip_manifest.index]]
cvb_frame.to_csv("CBV_stats_all.csv")

In [None]:
# Rows whose index label contains "CVB". The explicit .copy() makes cvb_frame
# an independent frame, so adding a column below does not raise pandas'
# SettingWithCopyWarning and cannot touch unmerged_clip_manifest.
cvb_frame = unmerged_clip_manifest[["CVB" in item for item in unmerged_clip_manifest.index]].copy()
# Despite the name this is a ratio (repetitive_count / Reads Written), not a
# value scaled to 0-100.
cvb_frame['percent_repetitive'] = cvb_frame['repetitive_count'] / cvb_frame['Reads Written']
cvb_frame.to_csv("CBV_stats_all.csv")

# Column subset shown in the table below.
filtered_cbv_frame = cvb_frame[[ "Input Reads",
                                #'Trimmed bases',
                            "Reads Written",
                            "repetitive_count",
                            'percent_repetitive',
                            "Uniquely Mapped Reads",
                            'Reads Passing Quality Filter',
                            "Uniquely mapped reads %",
                            'Number of reads mapped to too many loci',
                            '% of reads unmapped: too short',
                            '% of reads mapped to too many loci',
                            "Usable Reads",
                            "Fraction Collapsed",
                            "Fraction Usable",
                            #"spot",
                            "Num Peaks",

                            #"Passed QC"
                            ]]

# Render as HTML, passing parsers.commas as the formatter for count columns.
HTML(filtered_cbv_frame.to_html(formatters={"Input Reads" : parsers.commas,
                                     "Reads Written" : parsers.commas,
                                     "repetitive_count": parsers.commas,
                                     "Reads after Quality Filtering" : parsers.commas,
                                     "Uniquely Mapped Reads" : parsers.commas,
                                     "Usable Reads" : parsers.commas,
                                     "Num Peaks": parsers.commas
                                     } ))

In [None]:
HTML(filtered_final_qc_frame.sort_index(level="encode_id", ascending=False).to_html(formatters={"Input Reads" : parsers.commas,
                                     "Reads Written" : parsers.commas,
                                     "repetitive_count": parsers.commas,
                                     "Reads after Quality Filtering" : parsers.commas,
                                     "Uniquely Mapped Reads" : parsers.commas,
                                     "Usable Reads" : parsers.commas,
                                     "Num Peaks": parsers.commas
                                     } ))

In [None]:
final_qc_frame.to_csv("/home/gpratt/Dropbox/encode_integration/for_eric/master_qc_v13.csv")

In [None]:
HTML(filtered_final_qc_frame.to_html())

In [None]:
filtered_final_qc_frame.to_csv("/home/gpratt/Dropbox/encode_integration/for_eric/master_qc_filtered_v13.csv")

In [None]:
# Collapse rows that share all five index levels by summing every column.
grouped_final_qc_frame = filtered_final_qc_frame.groupby(level=['rbp', 'encode_id', 'rep', 'cell_type', 
                                                                'full_name'
                                                               ]).sum()
# The two fraction columns were summed as well; overwrite them with ratios
# recomputed from the summed counts.
grouped_final_qc_frame["Fraction Collapsed"] = grouped_final_qc_frame['Usable Reads'] / grouped_final_qc_frame['Uniquely Mapped Reads'].astype(float)
grouped_final_qc_frame["Fraction Usable"] = grouped_final_qc_frame['Usable Reads'] / grouped_final_qc_frame['Input Reads'].astype(float)
# NOTE(review): the "%"-named columns and 'is_v12' are also summed above and
# are not recomputed here — confirm nothing downstream reads them as-is.
grouped_final_qc_frame = grouped_final_qc_frame.drop("Num Peaks", axis=1)

In [None]:
grouped_final_qc_frame = grouped_final_qc_frame.dropna()

In [None]:
# Keep rows whose "encode_id" level value starts and ends with a digit.
# NOTE(review): an empty encode_id string would raise IndexError on item[0].
encode_only_qc = grouped_final_qc_frame[[item[0].isdigit() and item[-1].isdigit() for item in grouped_final_qc_frame.index.get_level_values(level="encode_id")]]
#encode_only_qc = encode_only_qc.drop("R60")
#encode_only_qc = encode_only_qc.drop('INPUT', level="rep")
#encode_only_qc['repetitive_count'] = (encode_only_qc['Reads Written'] - encode_only_qc['Reads Passing Quality Filter']).astype(int)

encode_only_qc.to_csv("/home/gpratt/Dropbox/encode_integration/for_eric/encode_master_qc_v13.csv")

In [None]:
HTML(grouped_final_qc_frame.to_html(formatters={"Input Reads" : parsers.commas,
                                     "Reads Written" : parsers.commas,
                                     "repetitive_count": parsers.commas,
                                     "Reads after Quality Filtering" : parsers.commas,
                                     "Uniquely Mapped Reads" : parsers.commas,
                                     "Usable Reads" : parsers.commas,
                                     "Num Peaks": parsers.commas
                                     } ))

In [None]:
# Export the grouped (summed) QC frame.
# NOTE(review): the other exports around this cell use a v13 suffix while this
# one writes master_qc_v12.csv — confirm the file name.
grouped_final_qc_frame.to_csv("/home/gpratt/Dropbox/encode_integration/for_eric/master_qc_v12.csv")

In [None]:
grouped_final_qc_frame.dropna()

In [None]:
# Rows whose index label contains "CVB". The explicit .copy() makes cvb_frame
# an independent frame, so adding a column below does not raise pandas'
# SettingWithCopyWarning and cannot touch unmerged_clip_manifest.
cvb_frame = unmerged_clip_manifest[["CVB" in item for item in unmerged_clip_manifest.index]].copy()
# Despite the name this is a ratio (repetitive_count / Reads Written), not a
# value scaled to 0-100.
cvb_frame['percent_repetitive'] = cvb_frame['repetitive_count'] / cvb_frame['Reads Written']
cvb_frame.to_csv("CBV_stats_all.csv")

# Column subset shown in the table below.
filtered_cbv_frame = cvb_frame[[ "Input Reads",
                                #'Trimmed bases',
                            "Reads Written",
                            "repetitive_count",
                            'percent_repetitive',
                            "Uniquely Mapped Reads",
                            "Uniquely mapped reads %",
                            "Usable Reads",
                            "Fraction Collapsed",
                            "Fraction Usable",
                            #"spot",
                            "Num Peaks",
                            #"Passed QC"
                            ]]

# Render as HTML, passing parsers.commas as the formatter for count columns.
HTML(filtered_cbv_frame.to_html(formatters={"Input Reads" : parsers.commas,
                                     "Reads Written" : parsers.commas,
                                     "repetitive_count": parsers.commas,
                                     "Reads after Quality Filtering" : parsers.commas,
                                     "Uniquely Mapped Reads" : parsers.commas,
                                     "Usable Reads" : parsers.commas,
                                     "Num Peaks": parsers.commas
                                     } ))

# QC for Kris' data only

Good:
NUMA1 
RANGAP1
ZNF217 
ZNF184 
RNF219 
NUMA1 

Borderline:
RNF219 -- low mapping %, and doesn't hit 1M reads
AIFM1  -- low mapping %, and barely hits 1M reads
DHX30 
BAG2 -- might bind repetitive elements?

Bad:
VIM -- low mapping %, and doesn't hit 1M reads
AIFM1 -- low mapping %, and doesn't hit 1M reads
RANGAP1 -- KB1: odd result


In [None]:
# Rows whose "encode_id" index level contains "KB".
kb_filtered_final_qc_frame = filtered_final_qc_frame[['KB' in item for item in filtered_final_qc_frame.index.get_level_values("encode_id")]]
# Drop the cell_type and rep levels from the row index.
kb_filtered_final_qc_frame.index = kb_filtered_final_qc_frame.index.droplevel(["cell_type"])
kb_filtered_final_qc_frame.index = kb_filtered_final_qc_frame.index.droplevel(["rep"])
# Replace the parsed repetitive_count with
# Reads Written - Reads Passing Quality Filter, cast to int.
kb_filtered_final_qc_frame = kb_filtered_final_qc_frame.drop("repetitive_count", axis=1)
kb_filtered_final_qc_frame['repetitive_count'] = (kb_filtered_final_qc_frame['Reads Written'] - kb_filtered_final_qc_frame['Reads Passing Quality Filter']).astype(int)

# Show the table sorted by the "rbp" level, descending.
HTML(kb_filtered_final_qc_frame.sort_index(level="rbp", ascending=False).to_html(formatters={"Input Reads" : parsers.commas,
                                     "Reads Written" : parsers.commas,
                                     "repetitive_count": parsers.commas,
                                     "Reads after Quality Filtering" : parsers.commas,
                                     "Reads Passing Quality Filter" : parsers.commas,
                                     "Uniquely Mapped Reads" : parsers.commas,
                                     "Usable Reads" : parsers.commas,
                                     "Num Peaks": parsers.commas
                                     } ))

In [None]:
kb_filtered_final_qc_frame.sort_index(level="rbp", ascending=False).to_csv("/home/gpratt/Dropbox/kris_qc_table.csv")

# For Natasha

In [None]:
# Rows whose "encode_id" index level contains "NM".
# NOTE(review): this reuses the kb_filtered_final_qc_frame name from the "KB"
# cell, so the KB table is no longer available after this cell runs.
kb_filtered_final_qc_frame = filtered_final_qc_frame[['NM' in item for item in filtered_final_qc_frame.index.get_level_values("encode_id")]]
# Drop the cell_type and rep levels from the row index.
kb_filtered_final_qc_frame.index = kb_filtered_final_qc_frame.index.droplevel(["cell_type"])
kb_filtered_final_qc_frame.index = kb_filtered_final_qc_frame.index.droplevel(["rep"])
# Replace the parsed repetitive_count with
# Reads Written - Reads Passing Quality Filter, cast to int.
kb_filtered_final_qc_frame = kb_filtered_final_qc_frame.drop("repetitive_count", axis=1)
kb_filtered_final_qc_frame['repetitive_count'] = (kb_filtered_final_qc_frame['Reads Written'] - kb_filtered_final_qc_frame['Reads Passing Quality Filter']).astype(int)

# Show the table sorted by the "rbp" level, descending.
HTML(kb_filtered_final_qc_frame.sort_index(level="rbp", ascending=False).to_html(formatters={"Input Reads" : parsers.commas,
                                     "Reads Written" : parsers.commas,
                                     "repetitive_count": parsers.commas,
                                     "Reads after Quality Filtering" : parsers.commas,
                                     "Reads Passing Quality Filter" : parsers.commas,
                                     "Uniquely Mapped Reads" : parsers.commas,
                                     "Usable Reads" : parsers.commas,
                                     "Num Peaks": parsers.commas
                                     } ))

In [None]:
kb_filtered_final_qc_frame.sort_index(level="rbp", ascending=False).to_csv("/home/gpratt/Dropbox/Natasha_qc_table.csv")

In [None]:
grouped_final_qc_frame[grouped_final_qc_frame['Usable Reads'] < 1000000]

# Public CLIP QC

In [None]:
# Parse QC metrics from the public CLIP analysis directory.
analysis_dir = "/home/gpratt/projects/public_clip/analysis/public_clip_v9/"
public_clip = parsers.clipseq_metrics(analysis_dir, iclip=True)

# Derived ratios: usable reads over uniquely mapped reads, and over input reads.
public_clip["Fraction Collapsed"] = public_clip['Usable Reads'] / public_clip['Uniquely Mapped Reads'].astype(float)
public_clip["Fraction Usable"] = public_clip['Usable Reads'] / public_clip['Input Reads'].astype(float)
# Keep only rows whose index label does not contain "merged".
unmerged_public_clip_manifest = public_clip[["merged" not in index for index in public_clip.index]]


In [35]:
# Column subset of the public CLIP metrics.
filtered_unmerged_public_clip_manifest = unmerged_public_clip_manifest[[ "Input Reads",
                                #'Trimmed bases',
                            "Reads Written",
                            "repetitive_count",
                            "Uniquely Mapped Reads",
                            #"Uniquely mapped reads %",
                            "Usable Reads",
                            "Fraction Collapsed",
                            "Fraction Usable",
                            #"spot",
                            "Num Peaks",
                            #"Passed QC"
                            ]]

# Cast every selected column to float, then shorten each index label to the
# text before its first ".".
filtered_unmerged_public_clip_manifest = filtered_unmerged_public_clip_manifest.astype(float)
filtered_unmerged_public_clip_manifest.index = filtered_unmerged_public_clip_manifest.index.map(lambda x: x.split(".")[0])

In [36]:
HTML(filtered_unmerged_public_clip_manifest.to_html(formatters={"Input Reads" : parsers.commas,
                                     "Reads Written" : parsers.commas,
                                     "repetitive_count": parsers.commas,
                                     "Reads after Quality Filtering" : parsers.commas,
                                     "Uniquely Mapped Reads" : parsers.commas,
                                     "Usable Reads" : parsers.commas,
                                     "Num Peaks": parsers.commas
                                     } ))

Unnamed: 0,Input Reads,Reads Written,repetitive_count,Uniquely Mapped Reads,Usable Reads,Fraction Collapsed,Fraction Usable,Num Peaks
ERR018282,2610554,2582733,2593238,1431706,313534,0.218993,0.120102,5869
ERR018283,66022,65400,117905,37144,6122,0.164818,0.092727,38
ERR018284,17669,17521,30536,9419,1429,0.151715,0.080876,13
ERR039833,266483,263875,1071,4513,1646,0.364724,0.006177,9
ERR039834,292233,287271,7647,5874,1085,0.184712,0.003713,17
ERR039835,95517,93031,7183,7643,2434,0.318461,0.025482,87
ERR039836,1161916,1141954,345543,765320,338656,0.442502,0.291463,1423
ERR039837,8238622,8144369,2557742,3949372,774362,0.196072,0.093992,8009
ERR039838,1094384,1070309,459870,696175,83171,0.119469,0.075998,914
ERR039839,823710,798132,245971,332118,116000,0.349273,0.140826,925


In [37]:
# Parse QC metrics from the public iCLIP analysis directory (rebinds analysis_dir).
analysis_dir = "/home/gpratt/projects/public_clip/analysis/public_iclip_v1/"
public_iclip = parsers.clipseq_metrics(analysis_dir, iclip=True)

# Derived ratios: usable reads over uniquely mapped reads, and over input reads.
public_iclip["Fraction Collapsed"] = public_iclip['Usable Reads'] / public_iclip['Uniquely Mapped Reads'].astype(float)
public_iclip["Fraction Usable"] = public_iclip['Usable Reads'] / public_iclip['Input Reads'].astype(float)
# Keep only rows whose index label does not contain "merged".
unmerged_public_iclip_manifest = public_iclip[["merged" not in index for index in public_iclip.index]]

# Split each index label on "." into a two-level (sra_id, barcode) index.
# NOTE(review): assumes every label has exactly one "." — confirm.
unmerged_public_iclip_manifest.index = pd.MultiIndex.from_tuples([item.split(".") for item in unmerged_public_iclip_manifest.index], 
                          names=["sra_id", "barcode"])

In [38]:
# Sum all rows that share an sra_id, then overwrite the two fraction columns
# with ratios recomputed from the summed counts.
# This cell rebinds its own input, so running it twice fails: the "sra_id"
# level no longer exists after the first run.
unmerged_public_iclip_manifest = unmerged_public_iclip_manifest.groupby(level="sra_id").sum()
unmerged_public_iclip_manifest["Fraction Collapsed"] = unmerged_public_iclip_manifest['Usable Reads'] / unmerged_public_iclip_manifest['Uniquely Mapped Reads'].astype(float)
unmerged_public_iclip_manifest["Fraction Usable"] = unmerged_public_iclip_manifest['Usable Reads'] / unmerged_public_iclip_manifest['Input Reads'].astype(float)


In [39]:
filtered_unmerged_public_iclip_manifest = unmerged_public_iclip_manifest[[ "Input Reads",
                                #'Trimmed bases',
                            "Reads Written",
                            "repetitive_count",
                            "Uniquely Mapped Reads",
                            #"Uniquely mapped reads %",
                            "Usable Reads",
                            "Fraction Collapsed",
                            "Fraction Usable",
                            #"spot",
                            "Num Peaks",
                            #"Passed QC"
                            ]]

HTML(filtered_unmerged_public_iclip_manifest.to_html(formatters={"Input Reads" : parsers.commas,
                                     "Reads Written" : parsers.commas,
                                     "repetitive_count": parsers.commas,
                                     "Reads after Quality Filtering" : parsers.commas,
                                     "Uniquely Mapped Reads" : parsers.commas,
                                     "Usable Reads" : parsers.commas,
                                     "Num Peaks": parsers.commas
                                     } ))

Unnamed: 0_level_0,Input Reads,Reads Written,repetitive_count,Uniquely Mapped Reads,Usable Reads,Fraction Collapsed,Fraction Usable,Num Peaks
sra_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
ERR018283,66022,62866,120426,38040,6188,0.162671,0.093726,38
ERR018284,17669,16066,30832,9545,1451,0.152017,0.082121,16
ERR039833,266483,196342,3302,2795,1493,0.534168,0.005603,33
ERR039834,292233,262220,10360,6338,1192,0.188072,0.004079,41
ERR039835,95517,48117,20371,10224,3211,0.314065,0.033617,105
ERR039836,1161916,1134233,453178,848002,401071,0.47296,0.345181,1664
ERR039837,8238622,7923278,3708214,4540004,899570,0.198143,0.109189,9219
ERR039838,1094384,1062718,517192,716667,86915,0.121277,0.079419,946
ERR039839,823710,775221,341660,359604,128309,0.356806,0.15577,1023
ERR039840,4868365,4291887,2134540,2327262,93997,0.04039,0.019308,2054


In [40]:
# Load the "public_clip_database" worksheet into a DataFrame (row 0 supplies
# the column names). The name public_clip_database is first bound to the
# worksheet object and then rebound to the DataFrame.
# The recorded output of this cell is an error (BadStatusLine), so the names
# below were not assigned in the saved run.
public_clip_database = sht1.worksheet("public_clip_database")
list_of_lists = public_clip_database.get_all_values()
public_clip_database = pd.DataFrame(list_of_lists[1:], columns=list_of_lists[0])

# Join database rows to the public CLIP metrics where SRA equals the metrics index.
merged_public_df = pd.merge(public_clip_database, filtered_unmerged_public_clip_manifest , left_on="SRA", right_index=True)
# NOTE(review): "mn9" may be a typo for the mouse assembly "mm9" — check the
# Species values actually present in the sheet before changing it.
merged_public_df = merged_public_df[merged_public_df.Species.isin(["hg19", "mn9"])]
merged_public_df.index = merged_public_df.SRA

# NOTE(review): public_iclip and public_clip are rebound here, replacing the
# metric frames assigned under the same names in earlier cells.
public_iclip = merged_public_df[merged_public_df['type'] == 'iclip']
public_clip = merged_public_df[merged_public_df['type'].isin({'clip', 'par-clip', 'par-clip 4SU'})]

BadStatusLine: ''

In [None]:
# Parse QC metrics from the older ENCODE v10 analysis directory (rebinds analysis_dir).
analysis_dir = "/home/gpratt/projects/encode/analysis/v10/"
old_encode = parsers.clipseq_metrics(analysis_dir, iclip=True)

# Derived ratios: usable reads over uniquely mapped reads, and over input reads.
old_encode["Fraction Collapsed"] = old_encode['Usable Reads'] / old_encode['Uniquely Mapped Reads'].astype(float)
old_encode["Fraction Usable"] = old_encode['Usable Reads'] / old_encode['Input Reads'].astype(float)
# Keep rows whose second "."-separated label field contains at least one "_".
# A label without any "." raises IndexError here.
unmerged_old_encode_manifest = old_encode[[len(index.split(".")[1].split("_")) > 1 for index in old_encode.index]]

In [None]:
img_dir = "/home/gpratt/Dropbox/encode_integration/for_eric/"

# Show that the total number of usable reads is far higher than in other, non-ENCODE datasets

In [None]:
publication_list = pd.read_table("/home/elvannostrand/data/clip/CLIPseq_analysis/Method_paper_figures/MethodPaperFileList_FINAL100415.txt")

In [None]:
# Keep encode_only_qc rows whose encode_id, converted to int, appears in
# publication_list.uID. int() raises ValueError for a non-numeric encode_id.
stats_for_publication = encode_only_qc[[int(encode_id) in publication_list.uID.values for encode_id in encode_only_qc.index.get_level_values(level="encode_id")]]
# Remove rows whose "rep" level is "INPUT".
stats_for_publication = stats_for_publication[["INPUT" != rep for rep in stats_for_publication.index.get_level_values(level="rep")]]
# Remove the row(s) keyed by this (rbp, encode_id, rep, cell_type) prefix,
# then any row that still contains a missing value.
stats_for_publication = stats_for_publication.drop(("SF3B4", '228', '01', ''), axis=0)
stats_for_publication = stats_for_publication.dropna()

In [None]:
911499388/ 4

In [None]:
# NOTE(review): submitted_old_encode is first assigned in a later cell, so on
# a fresh kernel run top-to-bottom this cell raises NameError.
HTML(submitted_old_encode.to_html())

In [None]:
# Labels containing any of these substrings, or ending in one of these
# suffixes, are excluded from the v10 ENCODE set.
excluded_substrings = ("DirectIP", "LiCLIP", "ZNL", "ZNH")
excluded_suffixes = ("_a", "_b")

# One boolean mask replaces the original chain of seven `.ix` filters. Each
# filter was an independent per-label test, so combining them with `and`
# selects the same rows. `.loc` is used because `.ix` is deprecated and has
# been removed from current pandas.
real_encode_v10 = unmerged_old_encode_manifest.loc[[
    not any(token in name for token in excluded_substrings)
    and not name.endswith(excluded_suffixes)
    and len(name.split(".")[1].split("_")) > 1
    for name in unmerged_old_encode_manifest.index
]]

# Rows for FUS, EIF4G1 and TAL1 are kept per-sample rather than merged below.
fus = real_encode_v10.loc[[("FUS" in name) or ("EIF4G1" in name) or ("TAL1" in name) for name in real_encode_v10.index]]

# Headerless table of file path, species and the group ("merge") each file
# belongs to.
imp_data = pd.read_table("/home/gpratt/projects/encode/scripts/encode_clipseq_imp.txt", header=None, names=['path', 'species', 'merge'])
# full_name: first two "."-separated fields of the file's basename.
imp_data['full_name'] = imp_data.path.apply(os.path.basename).apply(lambda x: ".".join(x.split(".")[:2]))
# Attach each file's merge group to its QC row, then sum within each group.
merged_imp = pd.merge(real_encode_v10, imp_data, left_index=True, right_on="full_name")
merged_imp = merged_imp.groupby("merge").sum()

# Fractions were summed too; overwrite them with ratios of the summed counts.
merged_imp["Fraction Collapsed"] = merged_imp['Usable Reads'] / merged_imp['Uniquely Mapped Reads'].astype(float)
merged_imp["Fraction Usable"] = merged_imp['Usable Reads'] / merged_imp['Input Reads'].astype(float)
submitted_old_encode = pd.concat([fus, merged_imp])

In [None]:
(327047516 + 296048148 + 209800956 + 250986560 + 328070348 + 5394964) / 4

In [None]:
81761879 
354337123

In [None]:
HTML(submitted_old_encode.to_html())

In [None]:
# Pool the old ENCODE rows and the public iCLIP rows under one shared label.
all_iclip = pd.concat([submitted_old_encode, unmerged_public_iclip_manifest])
all_iclip['annotation'] = "All iCLIP"

In [None]:
# Label each source frame; the label becomes the x-axis category in the plots.
# These assignments add a column to the source frames themselves.
stats_for_publication['annotation'] = "ENCODE eCLIP"
submitted_old_encode['annotation'] = "ENCODE iCLIP Submitted"
#unmerged_old_encode_manifest['annotation'] = "ENCODE iCLIP"
unmerged_public_iclip_manifest['annotation'] = "Public iCLIP"
public_clip['annotation'] = "Public CLIP"

# Stack all groups. all_iclip repeats the rows of submitted_old_encode and
# unmerged_public_iclip_manifest under its own "All iCLIP" label, so those
# samples appear twice in master_df.
master_df = pd.concat([stats_for_publication,
                       submitted_old_encode,
                       #unmerged_old_encode_manifest,
                       unmerged_public_iclip_manifest,
                       public_clip,
                       all_iclip])

In [None]:
master_df = master_df[master_df['Uniquely Mapped Reads'] > 100000]

In [None]:
# Keep only the columns used by the plots and the exported summary table.
master_df = master_df[[ "Input Reads",
                                #'Trimmed bases',
                            #"Reads Written",
                            #"repetitive_count",
                            "Uniquely Mapped Reads",
                            #"Uniquely mapped reads %",
                            "Usable Reads",
                            "Fraction Collapsed",
                            "Fraction Usable",
                            #"spot",
                            #"Num Peaks",
                       "annotation"
                            #"Passed QC"
                            ]]

In [None]:
# Figure size is (2.5 * num_cols) x (2.5 * num_rows); only one subplot is added.
num_rows = 1 
num_cols = 2 
# dataviz.Figure presumably writes the figure to the given path when the
# block exits — TODO confirm against gscripts.
with dataviz.Figure(os.path.join(img_dir, "usable_reads_plots_publication.svg"), figsize=(2.5 * num_cols,2.5*num_rows)) as fig:
    ax = fig.add_subplot(1,1,1)
    # One violin of "Usable Reads" per annotation group.
    sns.violinplot(y="Usable Reads", x="annotation", data=master_df,
                   ax=ax,
                   fontsize=10,
                  inner="box",
                   bw=.4,
                  )
    # Cap the y axis at 15,000,000 and relabel the ticks as comma-grouped integers.
    ax.set_ylim(0,15000000)
    xx = ax.get_yticks()
    ll = ['{:,d}'.format(int(a)) for a in xx]
    ax.set_yticklabels(ll, fontsize=10)
    sns.despine(ax=ax)
    ax.set_title("Number of Usable Reads", fontsize=12)
    ax.set_ylabel("Number of Usable Reads")
    ax.set_xlabel("")
    #[tick.set_rotation(90) for tick in ax.get_xticklabels()]
    #[tick.set_fontsize(8) for tick in ax.get_xticklabels()]

In [None]:
# Mean (first line) and median (second line) of "Usable Reads" for, in order:
# stats_for_publication, unmerged_old_encode_manifest, public_iclip, public_clip.
# Which frames public_iclip / public_clip refer to depends on whether the
# cell that rebinds them from merged_public_df has run.
print stats_for_publication['Usable Reads'].dropna().mean(),unmerged_old_encode_manifest['Usable Reads'].dropna().mean(), public_iclip['Usable Reads'].dropna().mean(), public_clip['Usable Reads'].dropna().mean()
print stats_for_publication['Usable Reads'].dropna().median(),unmerged_old_encode_manifest['Usable Reads'].dropna().median(), public_iclip['Usable Reads'].dropna().median(), public_clip['Usable Reads'].dropna().median()

In [None]:
# Figure size is (2.5 * num_cols) x (2.5 * num_rows); only one subplot is added.
num_rows = 1 
num_cols = 2 
with dataviz.Figure(os.path.join(img_dir, "fraction_usable_reads_plots_publication.svg"), figsize=(2.5 * num_cols,2.5*num_rows)) as fig:
    ax = fig.add_subplot(1,1,1)
    # One violin of "Fraction Usable" per annotation group.
    sns.violinplot(y="Fraction Usable", x="annotation", data=master_df,
                   ax=ax,
                   fontsize=10,
                  inner="box")

    # The plotted value is a ratio, so the y axis is fixed to [0, 1].
    ax.set_ylim(0,1)
    #xx = ax.get_yticks()
    #ll = ['{:,d}'.format(int(a)) for a in xx]
    #ax.set_yticklabels(ll, fontsize=10)
    sns.despine(ax=ax)
    ax.set_title("Fraction of Usable / Input", fontsize=12)
    ax.set_ylabel("Fraction of Usable / Input")
    ax.set_xlabel("")

In [None]:
print stats_for_publication['Fraction Usable'].dropna().mean(), unmerged_old_encode_manifest['Fraction Usable'].dropna().mean(), public_iclip['Fraction Usable'].dropna().mean(), public_clip['Fraction Usable'].dropna().mean()
print stats_for_publication['Fraction Usable'].dropna().median(), unmerged_old_encode_manifest['Fraction Usable'].dropna().median(), public_iclip['Fraction Usable'].dropna().median(), public_clip['Fraction Usable'].dropna().median()

In [None]:
# Figure size is (2.5 * num_cols) x (2.5 * num_rows); only one subplot is added.
num_rows = 1 
num_cols = 2 
with dataviz.Figure(os.path.join(img_dir, "fraction_collapsed_reads_publication.svg"), figsize=(2.5 * num_cols,2.5*num_rows)) as fig:
    ax = fig.add_subplot(1,1,1)

    # Individual samples as jittered points, drawn first on the shared axes.
    sns.stripplot(y="Fraction Collapsed", x="annotation", data=master_df,
                   ax=ax,
                   #fontsize=10,
                  #inner="box",
                   # bw=.3,
                   #scale="width"
                  jitter=1,
                  edgecolor=None,
                  size=3
                  )
    # Violins for the same column and grouping, drawn on the same axes.
    sns.violinplot(y="Fraction Collapsed", x="annotation", data=master_df,
                   ax=ax,
                   fontsize=10,
                  inner="box",
                    bw=.3,
                   scale="width"
                  )
    
    # The plotted value is a ratio, so the y axis is fixed to [0, 1].
    ax.set_ylim(0,1)
    #xx = ax.get_yticks()
    #ll = ['{:,d}'.format(int(a)) for a in xx]
    #ax.set_yticklabels(ll, fontsize=10)
    sns.despine(ax=ax)
    ax.set_title("Fraction of Collapsed Reads / Mapped", fontsize=12)
    ax.set_ylabel("Fraction Usable / Mapped")
    ax.set_xlabel("")

In [None]:
print stats_for_publication['Fraction Collapsed'].dropna().mean(),unmerged_old_encode_manifest['Fraction Collapsed'].dropna().mean(), public_iclip['Fraction Collapsed'].dropna().mean(), public_clip['Fraction Collapsed'].dropna().mean()
print stats_for_publication['Fraction Collapsed'].dropna().median(),unmerged_old_encode_manifest['Fraction Collapsed'].dropna().median(), public_iclip['Fraction Collapsed'].dropna().median(), public_clip['Fraction Collapsed'].dropna().median()

In [None]:
len(stats_for_publication), len(submitted_old_encode), len(unmerged_old_encode_manifest), len(public_iclip), len(public_clip)

In [None]:
encode_only_qc.to_csv("/home/gpratt/for_eric/encode_only_qc.csv")

In [None]:
submitted_old_encode.to_csv('/home/gpratt/Dropbox/encode_integration/for_eric/old_encode_clips.csv')

In [None]:
public_clip.to_csv('/home/gpratt/Dropbox/encode_integration/for_eric/old_public_clips.csv')

In [None]:
public_iclip.to_csv('/home/gpratt/Dropbox/encode_integration/for_eric/old_public_iclips.csv')

In [None]:
# NOTE(review): this writes to the same old_public_iclips.csv path as the
# previous cell's public_iclip export and therefore overwrites it — confirm
# which frame the file is meant to hold.
unmerged_public_iclip_manifest.to_csv('/home/gpratt/Dropbox/encode_integration/for_eric/old_public_iclips.csv')

In [None]:
def join_if_tuple(item):
    """Collapse a tuple label into a single "_"-delimited string.

    Parameters
    ----------
    item : object
        A label. When it is a tuple (or an instance of a tuple subclass)
        every element must be a string.

    Returns
    -------
    object
        "_".join(item) when item is a tuple, otherwise item unchanged.

    Raises
    ------
    TypeError
        Propagated from str.join when a tuple holds a non-string element.
    """
    # isinstance instead of an exact type comparison, so that tuple
    # subclasses are flattened too rather than passed through untouched.
    if isinstance(item, tuple):
        return "_".join(item)
    return item
    
# Rewrite master_df's row labels: tuple labels become their "_"-joined string,
# every other label is passed through join_if_tuple unchanged.
master_df.index = list(map(join_if_tuple, master_df.index))

In [None]:
# Re-load the saved master QC table for inspection. The file is written with
# master_df.to_csv(..., sep="\t"), so it has to be parsed with the same tab
# separator; the first column holds the saved index, which is also how
# master_qc_stable.csv is loaded further down in this notebook.
pd.read_csv("/home/gpratt/Dropbox/encode_integration/for_eric/master_qc.csv", sep="\t", index_col=0)

In [None]:
# Save master_df to disk. The delimiter is a tab even though the file carries
# a ".csv" extension, so anything reading it back must pass sep="\t".
master_df.to_csv("/home/gpratt/Dropbox/encode_integration/for_eric/master_qc.csv", sep="\t")

In [None]:
# Render the whole of master_df as an HTML table in the cell output.
HTML(master_df.to_html())

In [None]:
# Collect the QC metrics reported for the encode_mouse_v8 analysis directory.
analysis_dir = "/home/gpratt/projects/encode/analysis/encode_mouse_v8/"
encode_mouse = parsers.clipseq_metrics(analysis_dir, iclip=True)

# Both ratios share the 'Usable Reads' numerator; each denominator is cast to
# float before the division.
for ratio_column, denominator_column in (("Fraction Collapsed", 'Uniquely Mapped Reads'),
                                         ("Fraction Usable", 'Input Reads')):
    encode_mouse[ratio_column] = encode_mouse['Usable Reads'] / encode_mouse[denominator_column].astype(float)

# Keep only the rows whose index label does not contain "merged".
unmerged_clip_manifest = encode_mouse[["merged" not in index for index in encode_mouse.index]]

# Header-less, tab-delimited run manifest; column names are supplied here.
manifest_df = pd.read_table(
    "/home/gpratt/projects/encode/scripts/encode_mouse_v8.txt",
    header=None,
    names=['fastq', 'species', 'encode_id', 'barcodes',
           'barcodes_len', 'more_barcodes', "randomer_length"],
)
# qc_name: basename of the first ";"-separated fastq entry, cut down to its
# first two "."-separated fields.
manifest_df['qc_name'] = manifest_df.fastq.apply(
    lambda fastq_field: ".".join(os.path.basename(fastq_field.split(";")[0]).split(".")[:2]))

# Attach the QC metrics (keyed by index) to the manifest rows via qc_name.
tmp = manifest_df.merge(unmerged_clip_manifest, left_on="qc_name", right_index=True)

# Attach the spreadsheet manifest: its qc_id against the run manifest's encode_id.
final_qc_frame = manifest.merge(tmp, left_on='qc_id', right_on='encode_id')
# exp_id is the part of ENCODE_ID before the first "_".
final_qc_frame['exp_id'] = final_qc_frame.ENCODE_ID.apply(lambda encode_id: encode_id.split("_")[0])

def get_rep_num(encode_id):
    """Return the second "_"-separated field of encode_id, or NaN if there is none.

    Parameters
    ----------
    encode_id : str
        Identifier of the form "<first>_<second>[_...]". Values that are not
        strings (for example None or a float NaN) are tolerated.

    Returns
    -------
    str or float
        The second field when present; np.nan when encode_id contains no "_"
        or is not a string.
    """
    try:
        return encode_id.split("_")[1]
    except (AttributeError, IndexError):
        # AttributeError: the value has no .split (e.g. None or a float NaN).
        # IndexError: no "_" in the id, so a second field does not exist.
        # Anything else (KeyboardInterrupt included) is allowed to propagate
        # instead of being silently turned into NaN.
        return np.nan
# rep_num: result of get_rep_num applied to each ENCODE_ID value.
final_qc_frame['rep_num'] = final_qc_frame.ENCODE_ID.apply(get_rep_num)
# Replace missing 'Input Reads' values with 0.
final_qc_frame['Input Reads'] = final_qc_frame['Input Reads'].fillna(0)

In [None]:
# Columns kept for the QC summary table, in display order.
filtered_final_qc_frame = final_qc_frame[[
    "Hiseq_file_name", "ENCODE_ID", "RBP",
    "Input Reads",
    "Reads Written",
    "repetitive_count",
    "Uniquely Mapped Reads",
    "Uniquely mapped reads %",
    "Usable Reads",
    "Fraction Collapsed",
    "Fraction Usable",
    "Num Peaks",
]]

# Render the table as HTML, passing each listed column through parsers.commas.
# "Reads after Quality Filtering" is listed here although it is not among the
# columns selected above.
HTML(filtered_final_qc_frame.to_html(formatters=dict.fromkeys(
    ["Input Reads",
     "Reads Written",
     "repetitive_count",
     "Reads after Quality Filtering",
     "Uniquely Mapped Reads",
     "Usable Reads",
     "Num Peaks"],
    parsers.commas)))

In [None]:
# Save the column-filtered QC table as CSV.
# NOTE(review): this goes to Dropbox/Rbfox3, not the
# Dropbox/encode_integration/for_eric directory used by the other exports in
# this notebook -- confirm that is intended.
filtered_final_qc_frame.to_csv("/home/gpratt/Dropbox/Rbfox3/qc_table.csv")

In [None]:
# Load the ENCODE_v9 file list (tab-delimited) into `foo`.
foo = pd.read_table("/home/elvannostrand/data/clip/CLIPseq_analysis/ENCODE_v9_20151209/encode_v9_filelist.ENCODE.20151209_newsubset.txt")

In [None]:
# Stack the CLIP_rep1, CLIP_rep2 and INPUT columns end to end and drop missing
# entries. NOTE: this rebinds `foo` to the stacked result, so the column
# attributes used on the right-hand side are no longer available afterwards
# and the cell cannot be re-run without reloading `foo`.
foo = pd.concat([foo.CLIP_rep1, foo.CLIP_rep2, foo.INPUT]).dropna()

In [None]:
# Show the file names with their directories stripped. By the time this cell
# runs, `foo` has already been rebound to the concatenated
# CLIP_rep1/CLIP_rep2/INPUT Series, which has no CLIP_rep1 attribute, so the
# basename is applied to that Series directly.
foo.apply(os.path.basename)

In [None]:
# Build (and display) an scp command that fetches every file in `foo` by
# basename from the encode_v9 analysis directory, using {a,b,...} brace syntax.
"".join(["scp tscc-login1.sdsc.edu:/home/gpratt/projects/encode/analysis/encode_v9/{",
         ",".join(os.path.basename(remote_path) for remote_path in foo),
         "} ."])

# Remaking data from stable QC numbers

In [None]:
# Load the stable copy of the master QC table (tab-delimited), using its first
# column as the index. This rebinds master_df.
master_df = pd.read_table("/home/gpratt/Dropbox/encode_integration/for_eric/master_qc_stable.csv", index_col=0)

In [None]:
# Display the current master_df.
master_df

In [None]:
# Rows whose annotation is neither "ENCODE eCLIP" nor "ENCODE iCLIP Submitted"
# (rows annotated "All iCLIP" are kept).
public_df = master_df[~master_df.annotation.isin(["ENCODE eCLIP", "ENCODE iCLIP Submitted"])]

# One subset per annotation value.
public_clip_df = master_df.loc[master_df.annotation == "Public CLIP"]
public_iclip_df = master_df.loc[master_df.annotation == "Public iCLIP"]
all_iclip_df = master_df.loc[master_df.annotation == "All iCLIP"]

In [None]:
# Load sheet "Sheet1" of SupTable1.xlsx, using its first column as the index.
sup_table_1 = pd.read_excel("/home/gpratt/Dropbox/encode_integration/for_eric/SupTables/SupTable1.xlsx", "Sheet1", index_col=0)
# Independent copies of the rows for two 'experiment type (general)' values.
sup_table_1_public_clip = sup_table_1[sup_table_1['experiment type (general)'] == 'Public CLIP'].copy()
sup_table_1_all_iclip = sup_table_1[sup_table_1['experiment type (general)'] == 'All iCLIP'].copy()

In [None]:
# Join the supplementary-table rows (left side) with the matching QC rows on
# their index. Each *_df name is rebound to the joined result, so re-running
# this cell joins an already-joined frame.
public_clip_df = sup_table_1_public_clip.join(public_clip_df)
# NOTE(review): the two joins below use the full sup_table_1 on the left
# rather than a subset filtered by experiment type; sup_table_1_all_iclip is
# built in the preceding cell but is not used here -- confirm this is intended.
public_iclip_df = sup_table_1.join(public_iclip_df)
all_iclip_df = sup_table_1.join(all_iclip_df)

In [None]:
# Distinct values present in the 'experiment type (general)' column.
set(sup_table_1['experiment type (general)'])

In [None]:
print len(public_clip_df), len(public_clip_df.dropna())
print len(all_iclip_df), len(all_iclip_df.dropna())

In [None]:
# Join key on the supplementary-table side: RBP concatenated with 'Cell type'.
all_iclip_df['iclip_joiner'] = all_iclip_df['RBP'] + all_iclip_df['Cell type']
# Join key on the QC side: first and last "_"-separated fields of each row
# label, concatenated (each label is split only once).
master_df['iclip_joiner'] = [fields[0] + fields[-1]
                             for fields in (label.split("_") for label in master_df.index)]

In [None]:
# id: the elements of each row label joined with "_".
grouped_final_qc_frame['id'] = list(map("_".join, grouped_final_qc_frame.index))
# Save the table, including the new id column, as CSV.
grouped_final_qc_frame.to_csv("/home/gpratt/Dropbox/encode_integration/for_eric/encode_v9_qc.csv")

In [None]:
# Collect the QC metrics for the encode_iclip_submitted_v1 analysis directory
# (this rebinds analysis_dir).
# NOTE(review): iclip=False is passed although the directory name mentions
# iclip -- confirm the flag.
analysis_dir = "/home/gpratt/projects/encode/analysis/encode_iclip_submitted_v1/"
encode_qc = parsers.clipseq_metrics(analysis_dir, iclip=False)

# Both ratios share the 'Usable Reads' numerator; each denominator is cast to
# float before the division.
for ratio_column, denominator_column in (("Fraction Collapsed", 'Uniquely Mapped Reads'),
                                         ("Fraction Usable", 'Input Reads')):
    encode_qc[ratio_column] = encode_qc['Usable Reads'] / encode_qc[denominator_column].astype(float)
#encode_qc['is_v12'] = 1

In [None]:
# Save encode_qc as CSV.
encode_qc.to_csv("/home/gpratt/Dropbox/encode_integration/for_eric/encode_iclip_submitted_qc_v12.csv")