In [87]:
%matplotlib inline
from collections import defaultdict, Counter
import glob
import os
import json

from IPython.core.display import HTML, Image
from matplotlib_venn import venn3
import pandas as pd
import pybedtools
import pysam
import gspread
from gscripts.general import parsers
from gscripts.general import dataviz
%load_ext autoreload
%autoreload 2
reload(parsers)
reload(pybedtools)

from oauth2client.client import SignedJwtAssertionCredentials

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [88]:
# Read the JSON key file inside a context manager so the file handle is
# closed as soon as the key has been parsed (the previous bare open() call
# never closed it).
with open("../public clip-588adbc137f3.json") as key_file:
    json_key = json.load(key_file)
scope = ['https://spreadsheets.google.com/feeds']

# Authorize gspread with the e-mail / private key pair stored in the key file.
credentials = SignedJwtAssertionCredentials(json_key['client_email'], json_key['private_key'], scope)
gc = gspread.authorize(credentials)

sht1 = gc.open_by_url("https://docs.google.com/spreadsheets/d/1ZU2mQh54jentqvhR_oMnviLGWR8Nw_x338gULzKjNDI/edit#gid=0")
ws = sht1.worksheet("Sheet1")

# The first returned row supplies the column names; every later row is data.
list_of_lists = ws.get_all_values()
manifest = pd.DataFrame(list_of_lists[1:], columns=list_of_lists[0])

# Compare against the literal "TRUE" to obtain a boolean column. Explicit
# [] assignment is used so this always writes the DataFrame column and can
# never silently create a plain Python attribute instead.
manifest["is_encode"] = manifest["is_encode"] == "TRUE"

In [89]:
def format_file(index, adapter):
    """Build the fastq.gz path for one manifest row.

    index: row-like object providing ``index_1``, ``index_2``, ``Lane``,
        ``Hiseq_file_name`` and ``file_location``.
    adapter: value placed last in the file name, right before ``.fastq.gz``.

    Relies on the module-level ``illumina_adapters`` table to translate the
    row's ``index_1`` / ``index_2`` keys into the values used in the file name.
    Returns ``file_location``, the sample directory and the file name joined
    with ``os.path.join``.
    """
    # Both lookups run up front, including for the "S" lane layout that does
    # not use them in its file name.
    # NOTE(review): `.ix` was removed in pandas 1.0; if the environment is
    # upgraded this presumably becomes `.loc` (assumes label keys - confirm).
    barcode_1 = illumina_adapters.ix[index.index_1].values[0]
    barcode_2 = illumina_adapters.ix[index.index_2].values[0]
    lane = index.Lane

    if lane == "":
        # In case we are doing a rapid run and there isn't lane info.
        directory = "Sample_{}".format(index.Hiseq_file_name)
        file_name = "{}_{}-{}_{}.fastq.gz".format(index.Hiseq_file_name, barcode_2, barcode_1, adapter)
        return os.path.join(index.file_location, directory, file_name)

    if lane.startswith("S"):
        # Lane values starting with "S": the file name uses the part of
        # Hiseq_file_name before the first "-", with "_" swapped for "-",
        # and the directory is Hiseq_file_name itself.
        prefix = index.Hiseq_file_name.split("-")[0].replace("_", "-")
        file_name = "{}_{}_{}.fastq.gz".format(prefix, lane, adapter)
        return os.path.join(index.file_location, index.Hiseq_file_name, file_name)

    # Any other lane value: barcodes and lane both appear in the file name.
    directory = "Sample_{}".format(index.Hiseq_file_name)
    file_name = "{}_{}-{}_{}_{}.fastq.gz".format(index.Hiseq_file_name, barcode_2, barcode_1, lane, adapter)
    return os.path.join(index.file_location, directory, file_name)

In [90]:
# Load the two manifest CSVs stored one directory above the notebook.
input_manifest = pd.read_csv("../input_manifest.csv")
clip_manifest = pd.read_csv("../clipseq_manifest.csv")

In [167]:
# Load the metrics table through parsers.clipseq_metrics.
# NOTE(review): the frame is named encode_v7 but is read from the encode_v8
# analysis directory - confirm the name is just historical.
analysis_dir = "/home/gpratt/projects/encode/analysis/encode_v8/"
encode_v7 = parsers.clipseq_metrics(analysis_dir, iclip=True)

# Both derived fractions share the "Usable Reads" numerator; each denominator
# is cast to float before dividing.
usable_reads = encode_v7["Usable Reads"]
encode_v7["Fraction Collapsed"] = usable_reads / encode_v7["Uniquely Mapped Reads"].astype(float)
encode_v7["Fraction Usable"] = usable_reads / encode_v7["Input Reads"].astype(float)

# Keep only the rows whose index label does not contain "merged".
is_unmerged = ["merged" not in sample_name for sample_name in encode_v7.index]
unmerged_clip_manifest = encode_v7[is_unmerged]

In [168]:
# Columns kept for the per-sample table rendered in the next cell.
# 'Trimmed bases', "spot", "Num Peaks" and "Passed QC" are not selected here.
summary_columns = [
    "Input Reads",
    "Reads Written",
    "repetitive_count",
    "Uniquely Mapped Reads",
    "Uniquely mapped reads %",
    "Usable Reads",
    "Fraction Collapsed",
    "Fraction Usable",
]
filtered_encode_v7 = encode_v7[summary_columns]

In [169]:
# Column names that get parsers.commas as their to_html formatter. The list
# mirrors the original mapping, including names ("Reads after Quality
# Filtering", "Num Peaks") that filtered_encode_v7 does not select.
comma_formatted = [
    "Input Reads",
    "Reads Written",
    "repetitive_count",
    "Reads after Quality Filtering",
    "Uniquely Mapped Reads",
    "Usable Reads",
    "Num Peaks",
]
HTML(filtered_encode_v7.to_html(formatters=dict.fromkeys(comma_formatted, parsers.commas)))

Unnamed: 0,Input Reads,Reads Written,repetitive_count,Uniquely Mapped Reads,Uniquely mapped reads %,Usable Reads,Fraction Collapsed,Fraction Usable
202_02_PTBP1.merged,0,0,0,0,,0,,
203-CLIPs_S6_R1.A01_203_02_HNRNPC,2859437,2735512,5298570,1702445,78.82%,1574516,0.924856,0.550638
203-CLIPs_S6_R1.B06_203_02_HNRNPC,2975593,2845499,5770278,1729375,77.68%,1587979,0.918239,0.533668
203-CLIPs_S6_R1.C01_203_01_HNRNPC,2183329,2073163,3680603,1435632,83.95%,0,,
203-CLIPs_S6_R1.D08fixed_203_01_HNRNPC,3222598,3098404,5092729,2163569,83.47%,260445,0.120377,0.080818
203-INPUT_S7_R1.unassigned,16965536,16776482,35357905,2581714,50.76%,2516566,0.974766,0.148334
203_02_HNRNPC.merged,0,0,0,0,,0,,
205_02_IGF2BP1.merged,0,0,0,0,,0,,
206-01_S8_R1.C01_206_01_HNRNPK,4228741,3784137,2946818,2335968,74.73%,2247791,0.962252,0.531551
206-01_S8_R1.D08fixed_206_01_HNRNPK,3173611,2967005,2158625,1886623,75.38%,1814884,0.961975,0.571867


In [225]:
def _qc_name(fastq_field):
    """Return the first two dot-separated parts of the first file's basename.

    ``fastq_field`` is split on ";" and only the first entry is used.
    """
    first_path = fastq_field.split(";")[0]
    return ".".join(os.path.basename(first_path).split(".")[:2])


# Headerless, tab-delimited manifest; column names are supplied here.
manifest_df = pd.read_table(
    "/home/gpratt/projects/encode/scripts/encode_v8.txt",
    header=None,
    names=['fastq', 'species', 'encode_id', 'barcodes', 'barcodes_len', 'more_barcodes'],
)
manifest_df["qc_name"] = manifest_df["fastq"].apply(_qc_name)

# First join the manifest rows to the unmerged metrics (qc_name against the
# metrics index), then join that result to the spreadsheet manifest
# (qc_id against encode_id).
tmp = manifest_df.merge(unmerged_clip_manifest, left_on="qc_name", right_index=True)
final_qc_frame = manifest.merge(tmp, left_on="qc_id", right_on="encode_id")

In [226]:
# Split ENCODE_ID on "_" once: the first field becomes exp_id and the second
# becomes rep_num.
encode_id_fields = final_qc_frame["ENCODE_ID"].apply(lambda encode_id: encode_id.split("_"))
final_qc_frame["exp_id"] = encode_id_fields.apply(lambda fields: fields[0])
final_qc_frame["rep_num"] = encode_id_fields.apply(lambda fields: fields[1])

In [227]:
# One [RBP, exp_id, rep_num, cell_type] entry per row, in row order; these
# become the four levels of the frame's index.
new_index = final_qc_frame[["RBP", "exp_id", "rep_num", "cell_type"]].values.tolist()

final_qc_frame.index = pd.MultiIndex.from_tuples(new_index)
# Sorting by the new index places rows that share the same key next to each other.
final_qc_frame = final_qc_frame.sort_index()

In [228]:
# Columns kept for the merged QC table rendered in the next cell.
# 'Trimmed bases' is not selected here.
qc_summary_columns = [
    "Input Reads",
    "Reads Written",
    "repetitive_count",
    "Uniquely Mapped Reads",
    "Uniquely mapped reads %",
    "Usable Reads",
    "Fraction Collapsed",
    "Fraction Usable",
    "spot",
    "Num Peaks",
    "Passed QC",
]
filtered_final_qc_frame = final_qc_frame[qc_summary_columns]

In [231]:
# Column names that get parsers.commas as their to_html formatter. The list
# mirrors the original mapping, including "Reads after Quality Filtering",
# which filtered_final_qc_frame does not select.
qc_comma_formatted = [
    "Input Reads",
    "Reads Written",
    "repetitive_count",
    "Reads after Quality Filtering",
    "Uniquely Mapped Reads",
    "Usable Reads",
    "Num Peaks",
]
HTML(filtered_final_qc_frame.to_html(formatters=dict.fromkeys(qc_comma_formatted, parsers.commas)))

Unnamed: 0,Unnamed: 1,Unnamed: 2,Unnamed: 3,Input Reads,Reads Written,repetitive_count,Uniquely Mapped Reads,Uniquely mapped reads %,Usable Reads,Fraction Collapsed,Fraction Usable,spot,Num Peaks,Passed QC
ASCC1,219,02,HepG2,3842217,3490186,6345625,968835,61.62%,0,,,,0,False
ASCC1,219,02,HepG2,3080568,2864636,5320957,770047,61.43%,0,,,,0,False
ASCC1,219,INPUT,HepG2,14060169,13742528,27753147,2567306,68.27%,2517257,0.980505,0.179035,,24771,True
ASCC1,323,01,K562,16325,10739,5824,742,8.35%,0,,,,0,False
ASCC1,323,01,K562,2598984,2469473,4347233,411996,39.64%,0,,,,0,False
ASCC1,323,02,K562,7003008,6612656,11209599,1374116,47.10%,0,,,,0,False
ASCC1,323,02,K562,2344342,2198486,3625561,472326,47.50%,0,,,,0,False
ASCC1,323,INPUT,K562,10914914,10558369,19399500,2255402,52.51%,2179690,0.966431,0.199698,0.112592,22853,True
AUH,246,01,K562,9848178,8304018,18251116,988286,41.83%,547917,0.554411,0.055636,,10016,True
AUH,246,01,K562,3783411,3268230,7081259,377375,39.98%,192642,0.510479,0.050918,0.073645,2572,False
