In [1]:
%matplotlib inline
from collections import defaultdict, Counter
import glob
import os
import json

from IPython.core.display import HTML, Image
from matplotlib_venn import venn3
import pandas as pd
import pybedtools
import pysam
import gspread
from gscripts.general import parsers
from gscripts.general import dataviz
%load_ext autoreload
import numpy as np
%autoreload 2
reload(parsers)
reload(pybedtools)
import seaborn as sns
import matplotlib
from oauth2client.client import SignedJwtAssertionCredentials
from matplotlib.ticker import ScalarFormatter, FormatStrFormatter

# Hard-coded absolute output directory; presumably where QC figures are saved
# (not referenced by any cell visible in this section — confirm before removing).
img_dir = "/home/gpratt/Dropbox/encode_integration/qc_work/"

In [2]:
from matplotlib import rc

# Plot styling for every figure produced by this notebook:
#   - 'svg.fonttype' is set to 'none'
#   - LaTeX text rendering is switched off
#   - the font family is sans-serif with Helvetica as the preferred face
matplotlib.rcParams['svg.fonttype'] = 'none'
rc('text', usetex=False)
rc('font', family='sans-serif', **{'sans-serif': ['Helvetica']})

In [3]:
# Authenticate against Google Sheets with the service-account key file and load
# the eCLIP manifest (worksheet "Sheet1") into the `manifest` DataFrame.
# The key file is read inside a `with` block so the handle is closed right away
# instead of being left open for the life of the kernel.
with open("../public clip-588adbc137f3.json") as key_file:
    json_key = json.load(key_file)
scope = ['https://spreadsheets.google.com/feeds']

credentials = SignedJwtAssertionCredentials(json_key['client_email'], json_key['private_key'], scope)
gc = gspread.authorize(credentials)

sht1 = gc.open_by_url("https://docs.google.com/spreadsheets/d/1ZU2mQh54jentqvhR_oMnviLGWR8Nw_x338gULzKjNDI/edit#gid=0")
ws = sht1.worksheet("Sheet1")
list_of_lists = ws.get_all_values()

# The first row of the sheet is used as the header, the rest as data rows.
manifest = pd.DataFrame(list_of_lists[1:], columns=list_of_lists[0])

# qc_id has the form "<ENCODE_ID>_<RBP>"; format_qc_frame joins on this column.
manifest['qc_id'] = manifest.apply(lambda x: "{}_{}".format(x.ENCODE_ID, x.RBP), axis=1)

# Convert the sheet's "TRUE" text flag into a real boolean column. Bracket
# assignment is used because attribute-style assignment silently creates a plain
# Python attribute (not a column) if the column name is ever misspelled.
manifest['is_encode'] = manifest['is_encode'] == "TRUE"

In [4]:
def get_rep_num(encode_id):
    """Return the replicate token of an ENCODE id, i.e. the text after the first "_".

    encode_id: string such as "<experiment>_<replicate>"; only the second
        "_"-separated field is returned.

    Returns the second field as a string, or np.nan when the id has no "_"
    (IndexError) or is not a string at all, e.g. None or a float NaN
    (AttributeError). Any other exception is allowed to propagate instead of
    being silently swallowed.
    """
    try:
        return encode_id.split("_")[1]
    except (AttributeError, IndexError):
        return np.nan
    
def format_qc_frame(qc_frame, metadata_table, manifest_table=None):
    """Takes dataframe from clipseq metrics and formats it for more general aggregation.

    qc_frame: dataframe from the clipseq_metrics function. Its index entries are
        expected to contain a "." followed by "<barcode>_..." (the barcode is
        parsed from that position). NOTE: this frame is modified in place --
        the "Fraction Collapsed", "Fraction Usable" and "barcode" columns are
        added to the caller's object.
    metadata_table: file location generated from eCLIP manifest generator
        (tab-separated, no header, seven columns; might make more general later)
    manifest_table: optional dataframe providing at least the qc_id, ENCODE_ID,
        RBP and cell_type columns. Defaults to the notebook-level ``manifest``
        so existing two-argument calls behave exactly as before.

    Returns a new dataframe (sorted by index) joining the manifest, the
    metadata table and the non-merged QC rows, indexed by
    (rbp, encode_id, rep, cell_type, full_name, barcode).
    """
    if manifest_table is None:
        # Fall back to the global manifest loaded from the Google sheet.
        manifest_table = manifest

    qc_frame["Fraction Collapsed"] = qc_frame['Usable Reads'] / qc_frame['Uniquely Mapped Reads'].astype(float)
    qc_frame["Fraction Usable"] = qc_frame['Usable Reads'] / qc_frame['Input Reads'].astype(float)
    qc_frame['barcode'] = qc_frame.index.map(lambda x: x.split(".")[1].split("_")[0])

    #If its a merged file remove it, can have merged in the file name
    unmerged_clip_manifest = qc_frame[["merged" not in index.split(".")[1] for index in qc_frame.index]]

    manifest_df = pd.read_table(metadata_table, header=None,
                 names=['fastq', 'species', 'encode_id', 'barcodes', 'barcodes_len', 'more_barcodes', "randomer_length"])
    # qc_name = first two "."-separated fields of the first fastq's basename;
    # this is the key that matches the QC frame's index.
    manifest_df['qc_name'] = manifest_df.fastq.apply(lambda x: ".".join(os.path.basename(x.split(";")[0]).split(".")[:2]))

    tmp = pd.merge(manifest_df, unmerged_clip_manifest, left_on="qc_name", right_index=True)

    final_qc_frame = pd.merge(manifest_table, tmp, left_on='qc_id', right_on='encode_id')
    final_qc_frame['exp_id'] = final_qc_frame.ENCODE_ID.apply(lambda x:x.split("_")[0])

    final_qc_frame['rep_num'] = final_qc_frame.ENCODE_ID.apply(get_rep_num)
    final_qc_frame['Input Reads'] = final_qc_frame['Input Reads'].fillna(0)
    final_qc_frame['repetitive_count'] = final_qc_frame['Number of input reads rmRep'] - final_qc_frame['Reads Passing Quality Filter']

    # Build the MultiIndex straight from the columns: no Python-level row loop,
    # and an empty merge result yields an empty index instead of an error.
    index_columns = ['RBP', 'exp_id', 'rep_num', 'cell_type', 'ENCODE_ID', 'barcode']
    final_qc_frame.index = pd.MultiIndex.from_arrays(
        [final_qc_frame[column].values for column in index_columns],
        names=['rbp', 'encode_id', 'rep', 'cell_type', "full_name", 'barcode'])
    final_qc_frame = final_qc_frame.sort_index()

    return final_qc_frame

# Load and save hg19 QC values

#Command to find and remove dangling symlinks

find -L . -type l -delete

In [None]:
# Parse QC metrics for the encode_master analysis, annotate them with the
# manifest, and save both the raw and the annotated tables.
# NOTE: format_qc_frame adds "Fraction Collapsed", "Fraction Usable" and
# "barcode" to encode_qc in place, so the "unannotated" CSV written below
# already contains those extra columns.
# NOTE(review): the metadata table passed here is encode_GRCh38_v1.txt even
# though this section is labelled hg19 / encode_master — confirm it is the
# intended manifest.
encode_qc = parsers.clipseq_metrics("/projects/ps-yeolab3/encode/analysis/encode_master/", iclip=True)
final_qc_frame = format_qc_frame(encode_qc, "/home/gpratt/projects/encode/scripts/encode_GRCh38_v1.txt")
encode_qc.to_csv("/home/gpratt/Dropbox/EricGabe_ENCODE/unannoated_encode_master_qc.csv")
final_qc_frame.to_csv("/home/gpratt/Dropbox/EricGabe_ENCODE/encode_master_qc.csv")

# Load and save single end QC values

In [None]:
# Single-end run: parse the metrics, derive the ratio/count columns, then write
# a column-filtered table followed by the complete table.
encode_single_end_qc = parsers.clipseq_metrics("/projects/ps-yeolab3/encode/analysis/encode_single_end_v1/", iclip=True)

# Denominators are cast to float before dividing.
encode_single_end_qc["Fraction Collapsed"] = encode_single_end_qc['Usable Reads'].div(
    encode_single_end_qc['Uniquely Mapped Reads'].astype(float))
encode_single_end_qc["Fraction Usable"] = encode_single_end_qc['Usable Reads'].div(
    encode_single_end_qc['Input Reads'].astype(float))
encode_single_end_qc['repetitive_count'] = encode_single_end_qc['Number of input reads rmRep'].sub(
    encode_single_end_qc['Reads Passing Quality Filter'])

# Reduced column set; 'Trimmed bases', "spot", "Num Peaks" and "Passed QC" are
# deliberately left out of this export.
encode_single_end_qc[[
    "Input Reads",
    "Reads Written",
    "repetitive_count",
    "Uniquely Mapped Reads",
    "Uniquely mapped reads %",
    "Usable Reads",
    "Fraction Collapsed",
    "Fraction Usable",
]].to_csv("/home/gpratt/Dropbox/encode_integration/for_eric/unannoated_single_end_qc_filtered_v1.csv")

encode_single_end_qc.to_csv("/home/gpratt/Dropbox/encode_integration/for_eric/unannoated_single_end_qc_v1.csv")

# single_end_qc_frame = format_qc_frame(encode_single_end_qc, "/home/gpratt/projects/encode/scripts/encode_single_end_v1.txt")
# single_end_qc_frame.to_csv("/home/gpratt/Dropbox/EricGabe_ENCODE/encode_single_end_qc_v1.csv")

# Load and save GRCh38 QC values

In [None]:
# Export a reduced-column view and the full view of `encode_qc`.
# NOTE(review): this cell relies on kernel state — `encode_qc` must already
# exist and must already carry "Fraction Collapsed"/"Fraction Usable" (which
# format_qc_frame adds in place). "repetitive_count" is not added to encode_qc
# by any cell visible here; confirm clipseq_metrics provides it, otherwise this
# selection fails.
encode_qc[[ "Input Reads",
           #'Trimmed bases',
           "Reads Written",
           "repetitive_count",
           "Uniquely Mapped Reads",
           "Uniquely mapped reads %",
           "Usable Reads",
           "Fraction Collapsed",
           "Fraction Usable",
           #"spot",
           #"Num Peaks",
           #"Passed QC"
          ]].to_csv("/home/gpratt/Dropbox/encode_integration/for_eric/unannoated_qc_filtered_v13.csv")

encode_qc.to_csv("/home/gpratt/Dropbox/encode_integration/for_eric/unannoated_qc_v13.csv")

In [None]:
# Parse QC metrics for the GRCh38 analysis, annotate them, and save both tables.
# The annotated frame is built from encode_GRCh38_qc (the frame loaded on the
# line above); previously `encode_qc` from a different analysis was passed by
# mistake, so the "GRCh38" CSV silently contained the other run's numbers.
encode_GRCh38_qc = parsers.clipseq_metrics("/projects/ps-yeolab3/encode/analysis/encode_GRCh38_v1/", iclip=True)
final_GRCh38_qc_frame = format_qc_frame(encode_GRCh38_qc, "/home/gpratt/projects/encode/scripts/encode_GRCh38_v1.txt")
encode_GRCh38_qc.to_csv("/home/gpratt/Dropbox/EricGabe_ENCODE/unannoated_encode_GRCh38_qc.csv")
final_GRCh38_qc_frame.to_csv("/home/gpratt/Dropbox/EricGabe_ENCODE/encode_GRCh38_qc.csv")

# Load Public CLIP QC Metrics

In [None]:
# Parse QC metrics for the public CLIP analysis and derive the ratio columns.
analysis_dir = "/home/gpratt/projects/public_clip/analysis/public_clip_v9/"
public_clip = parsers.clipseq_metrics(analysis_dir, iclip=True)

public_clip["Fraction Collapsed"] = public_clip['Usable Reads'] / public_clip['Uniquely Mapped Reads'].astype(float)
public_clip["Fraction Usable"] = public_clip['Usable Reads'] / public_clip['Input Reads'].astype(float)

# A bare `public_clip.to_csv` (attribute reference, never called, no path) used
# to sit here and did nothing; it has been removed.
# TODO: add `public_clip.to_csv(<path>)` if the raw table should be saved.

# Keep only rows whose index label does not contain "merged".
unmerged_public_clip_manifest = public_clip[["merged" not in index for index in public_clip.index]]


In [None]:
# Reduce the unmerged public CLIP metrics to the reporting columns and cast them
# to float in one step. 'Trimmed bases', "Uniquely mapped reads %", "spot" and
# "Passed QC" are intentionally not selected.
filtered_unmerged_public_clip_manifest = unmerged_public_clip_manifest[[
    "Input Reads",
    "Reads Written",
    "repetitive_count",
    "Uniquely Mapped Reads",
    "Usable Reads",
    "Fraction Collapsed",
    "Fraction Usable",
    "Num Peaks",
]].astype(float)

# Index labels are cut down to the text before the first "."; the later merge
# with the public CLIP database matches its SRA column against this index.
filtered_unmerged_public_clip_manifest.index = filtered_unmerged_public_clip_manifest.index.map(
    lambda sample_name: sample_name.split(".")[0])

In [None]:
# HTML(filtered_unmerged_public_clip_manifest.to_html(formatters={"Input Reads" : parsers.commas,
#                                      "Reads Written" : parsers.commas,
#                                      "repetitive_count": parsers.commas,
#                                      "Reads after Quality Filtering" : parsers.commas,
#                                      "Uniquely Mapped Reads" : parsers.commas,
#                                      "Usable Reads" : parsers.commas,
#                                      "Num Peaks": parsers.commas
#                                      } ))

In [None]:
# Public iCLIP analysis: parse the metrics, drop merged rows, sum the remaining
# rows per SRA id, then recompute the ratios from the summed counts.
analysis_dir = "/home/gpratt/projects/public_clip/analysis/public_iclip_v1/"
public_iclip = parsers.clipseq_metrics(analysis_dir, iclip=True)

public_iclip["Fraction Collapsed"] = public_iclip['Usable Reads'].div(
    public_iclip['Uniquely Mapped Reads'].astype(float))
public_iclip["Fraction Usable"] = public_iclip['Usable Reads'].div(
    public_iclip['Input Reads'].astype(float))

unmerged_public_iclip_manifest = public_iclip[["merged" not in index for index in public_iclip.index]]

# Each index label is split on "." into the (sra_id, barcode) levels.
unmerged_public_iclip_manifest.index = pd.MultiIndex.from_tuples(
    [item.split(".") for item in unmerged_public_iclip_manifest.index],
    names=["sra_id", "barcode"])
unmerged_public_iclip_manifest = unmerged_public_iclip_manifest.groupby(level="sra_id").sum()

# The summed ratio columns are meaningless, so overwrite them with ratios of the sums.
unmerged_public_iclip_manifest["Fraction Collapsed"] = unmerged_public_iclip_manifest['Usable Reads'].div(
    unmerged_public_iclip_manifest['Uniquely Mapped Reads'].astype(float))
unmerged_public_iclip_manifest["Fraction Usable"] = unmerged_public_iclip_manifest['Usable Reads'].div(
    unmerged_public_iclip_manifest['Input Reads'].astype(float))
unmerged_public_iclip_manifest.to_csv('/home/gpratt/Dropbox/encode_integration/for_eric/public_iclip_qc.csv')

In [None]:
# Pull the "public_clip_database" worksheet and join it to the filtered public
# CLIP QC metrics on SRA id. The worksheet handle gets its own name so
# `public_clip_database` is only ever the DataFrame.
public_clip_worksheet = sht1.worksheet("public_clip_database")
list_of_lists = public_clip_worksheet.get_all_values()
# First sheet row is the header, remaining rows are data.
public_clip_database = pd.DataFrame(list_of_lists[1:], columns=list_of_lists[0])

merged_public_df = pd.merge(public_clip_database, filtered_unmerged_public_clip_manifest , left_on="SRA", right_index=True)
# NOTE(review): the original filter accepted "hg19" and "mn9". "mn9" looks like
# a typo for the mouse assembly "mm9"; both spellings are accepted here so rows
# are kept whichever way the sheet spells it. Confirm against the sheet and drop
# the wrong one.
merged_public_df = merged_public_df[merged_public_df.Species.isin(["hg19", "mn9", "mm9"])]
merged_public_df.index = merged_public_df.SRA

# NOTE: these assignments replace the raw `public_iclip` / `public_clip` metric
# frames created in earlier cells with subsets of the merged table.
public_iclip = merged_public_df[merged_public_df['type'] == 'iclip']
public_clip = merged_public_df[merged_public_df['type'].isin({'clip', 'par-clip', 'par-clip 4SU'})]


In [None]:
# Save the database-annotated CLIP and iCLIP subsets.
# NOTE: this depends on kernel state — `public_clip` / `public_iclip` must be the
# merged subsets from the database-merge cell, not the raw metric frames that
# earlier cells bind to the same names.
public_clip.to_csv('/home/gpratt/Dropbox/encode_integration/for_eric/public_clip_qc.csv')
public_iclip.to_csv('/home/gpratt/Dropbox/encode_integration/for_eric/public_iclip_cleaned_qc.csv')

In [None]:
# Reduce the per-SRA iCLIP metrics to the reporting columns.
# NOTE(review): the result is not used by any cell visible in this section (the
# HTML rendering that consumed it is commented out). "repetitive_count" and
# "Num Peaks" must be present in unmerged_public_iclip_manifest for this
# selection to succeed — confirm clipseq_metrics provides them.
filtered_unmerged_public_iclip_manifest = unmerged_public_iclip_manifest[[ "Input Reads",
                                #'Trimmed bases',
                            "Reads Written",
                            "repetitive_count",
                            "Uniquely Mapped Reads",
                            #"Uniquely mapped reads %",
                            "Usable Reads",
                            "Fraction Collapsed",
                            "Fraction Usable",
                            #"spot",
                            "Num Peaks",
                            #"Passed QC"
                            ]]

# HTML(filtered_unmerged_public_iclip_manifest.to_html(formatters={"Input Reads" : parsers.commas,
#                                      "Reads Written" : parsers.commas,
#                                      "repetitive_count": parsers.commas,
#                                      "Reads after Quality Filtering" : parsers.commas,
#                                      "Uniquely Mapped Reads" : parsers.commas,
#                                      "Usable Reads" : parsers.commas,
#                                      "Num Peaks": parsers.commas
#                                      } ))

# Load encode v10 data (legacy code)

In [None]:
# Legacy encode_v10 analysis: parse the metrics, add the two ratio columns
# (float denominators), and save the table.
analysis_dir = "/projects/ps-yeolab3/encode/analysis/encode_v10/"
old_encode = parsers.clipseq_metrics(analysis_dir, iclip=True)

old_encode["Fraction Collapsed"] = old_encode['Usable Reads'].div(
    old_encode['Uniquely Mapped Reads'].astype(float))
old_encode["Fraction Usable"] = old_encode['Usable Reads'].div(
    old_encode['Input Reads'].astype(float))
old_encode.to_csv("/home/gpratt/Dropbox/EricGabe_ENCODE/old_encode_qc.csv")

# Load Mouse Data

In [19]:
# Parse QC metrics for the mouse analysis, annotate them with the mouse
# metadata table, and save both the raw and the annotated tables.
# NOTE: this rebinds `final_qc_frame`, which other cells also assign for other
# datasets; later cells see whichever assignment ran last. format_qc_frame also
# adds its derived columns to `encode_mouse` in place before it is saved.
analysis_dir = "/projects/ps-yeolab3/encode/analysis/encode_mouse_v9"
encode_mouse = parsers.clipseq_metrics(analysis_dir, iclip=True)
final_qc_frame = format_qc_frame(encode_mouse, "/home/gpratt/projects/encode/scripts/encode_mouse_v9.txt")
encode_mouse.to_csv("/home/gpratt/Dropbox/EricGabe_ENCODE/mouse_clip_qc.csv")
final_qc_frame.to_csv("/home/gpratt/Dropbox/EricGabe_ENCODE/mouse_master_qc.csv")

KeyboardInterrupt: 

In [22]:
# Select the reporting columns from `final_qc_frame` and save them as the
# filtered mouse QC table.
# NOTE(review): `final_qc_frame` is whatever the kernel last bound to that name.
# The recorded output of the preceding mouse-loading cell is a KeyboardInterrupt,
# so verify that this frame really holds the mouse data before trusting the CSV.
filtered_final_qc_frame = final_qc_frame[[ "Input Reads", "Reads Written", "repetitive_count", "Reads Passing Quality Filter",
                                          "Uniquely Mapped Reads", "Uniquely mapped reads %", 'Number of reads mapped to too many loci',
                                          '% of reads unmapped: too short', '% of reads mapped to too many loci', "Usable Reads",
                                          "Fraction Collapsed", "Fraction Usable", "Num Peaks",]]

filtered_final_qc_frame.to_csv("/home/gpratt/Dropbox/EricGabe_ENCODE/mouse_clip_qc_filtered.csv")

In [None]:
# Identification columns plus the reporting metrics; 'Trimmed bases', "spot" and
# "Passed QC" are intentionally not selected, and no float cast is applied.
filtered_final_qc_frame = final_qc_frame[[
    "Hiseq_file_name", "ENCODE_ID", "RBP",
    "Input Reads",
    "Reads Written",
    "repetitive_count",
    "Uniquely Mapped Reads",
    "Uniquely mapped reads %",
    "Usable Reads",
    "Fraction Collapsed",
    "Fraction Usable",
    "Num Peaks",
]]

# Render the table as HTML; every listed column is formatted with parsers.commas.
HTML(filtered_final_qc_frame.to_html(formatters=dict.fromkeys(
    ["Input Reads",
     "Reads Written",
     "repetitive_count",
     "Reads after Quality Filtering",
     "Uniquely Mapped Reads",
     "Usable Reads",
     "Num Peaks"],
    parsers.commas)))

In [None]:
# Save the currently bound filtered QC table; which dataset this is depends on
# the cell that last assigned `filtered_final_qc_frame`.
filtered_final_qc_frame.to_csv("/home/gpratt/Dropbox/Rbfox3/qc_table.csv")

In [None]:
# Parse QC metrics for the singapore_clip analysis, annotate them, and save both
# the raw and the annotated tables.
# NOTE: this rebinds `encode_qc` and `final_qc_frame`, replacing the frames that
# earlier cells stored under the same names.
# NOTE(review): the metadata table is encode_GRCh38_v1.txt, the same file used
# for the GRCh38 run — confirm it is the right manifest for this dataset.
encode_qc = parsers.clipseq_metrics("/projects/ps-yeolab3/encode/analysis/singapore_clip/", iclip=True)
final_qc_frame = format_qc_frame(encode_qc, "/home/gpratt/projects/encode/scripts/encode_GRCh38_v1.txt")
encode_qc.to_csv("/home/gpratt/Dropbox/EricGabe_ENCODE/singapore_clip_qc.csv")
final_qc_frame.to_csv("/home/gpratt/Dropbox/EricGabe_ENCODE/singapore_clip_master_qc.csv")

In [None]:
# Select the reporting columns from the currently bound `final_qc_frame`
# (same column list as the filtered mouse export).
filtered_final_qc_frame = final_qc_frame[[ "Input Reads", "Reads Written", "repetitive_count", "Reads Passing Quality Filter",
                                          "Uniquely Mapped Reads", "Uniquely mapped reads %", 'Number of reads mapped to too many loci',
                                          '% of reads unmapped: too short', '% of reads mapped to too many loci', "Usable Reads",
                                          "Fraction Collapsed", "Fraction Usable", "Num Peaks",]]

In [None]:
# Write the filtered table to its own file. The previous target,
# singapore_clip_master_qc.csv, is where the full annotated frame is saved, so
# writing the column subset there overwrote the master table. The new name
# follows the mouse export's "<dataset>_clip_qc_filtered.csv" pattern.
filtered_final_qc_frame.to_csv("/home/gpratt/Dropbox/EricGabe_ENCODE/singapore_clip_qc_filtered.csv")

In [None]:
# Display the whole filtered QC table as the cell output (use .head() for a
# shorter preview if the table grows large).
filtered_final_qc_frame