In [2]:
import datetime
import glob
import functools
import urllib2
from collections import defaultdict

import pandas as pd
import os
from IPython.core.display import HTML
import numpy as np
import shutil

from gscripts.encode import encode_helpers
import boto

In [3]:
class ArrayJob(object):
    """Writes PBS array-job submission scripts for a list of shell commands.

    Each generated script defines a bash array ``cmd[1..N]`` holding the
    commands and finishes with ``eval ${cmd[$PBS_ARRAYID]}`` so that every
    array task runs exactly one command.
    """

    def __init__(self):
        self._epilogue = "eval ${cmd[$PBS_ARRAYID]}"

    def _prologue(self, name, count, run_dir, ppn=1, walltime=8):
        """Return the ``#PBS`` header for a script holding ``count`` commands.

        name: script path or stem; only its basename is used for the job name
              and the .out/.err file names.
        count: number of commands, used as the upper bound of ``#PBS -t 1-count``.
        run_dir: directory the job changes into before running.
        ppn: processors per node requested.
        walltime: wall-clock limit in hours.
        """
        return """#!/bin/bash
#PBS -N {0}
#PBS -l nodes=1:ppn={3}
#PBS -o {0}.out
#PBS -e {0}.err
#PBS -V
#PBS -q home-yeo
#PBS -W group_list=yeo-group
#PBS -t 1-{1}
#PBS -l walltime={4}:00:00
cd {2}
echo "hello, starting"

""".format(os.path.basename(name), count, run_dir, ppn, walltime)

    def _write_script(self, stem, commands, run_dir, ppn, walltime):
        """Write one ``<stem>.sh`` script for ``commands`` and return its path."""
        path = "{}.sh".format(stem)
        with open(path, 'w') as out_file:
            out_file.write(self._prologue(stem, len(commands), run_dir, ppn, walltime))
            # PBS array ids start at 1, so the bash array is filled from index 1.
            # NOTE: commands are embedded inside double quotes unescaped, so a
            # command containing '"' would produce a broken script.
            for position, cmd in enumerate(commands, 1):
                out_file.write('cmd[{}]="{}"'.format(position, cmd) + "\n\n")
            out_file.write(self._epilogue + "\n")
        return path

    def make_script(self, commands, script_name, run_dir, ppn=1, walltime=8, batch_size=500):
        """Split ``commands`` into array-job scripts of at most ``batch_size`` commands.

        Scripts are written to ``<script_name>_0.sh``, ``<script_name>_1.sh``, ...
        A script is only written when it has at least one command: previously an
        empty command list, or a count that was an exact multiple of the batch
        size, produced a trailing script with the invalid range ``#PBS -t 1-0``.
        Scripts left over from earlier runs are not removed.

        commands: iterable of shell command strings.
        script_name: path prefix of the scripts to write.
        run_dir: directory each job changes into.
        ppn: processors per node requested.
        walltime: wall-clock limit in hours.
        batch_size: maximum number of commands per script (default 500).

        Returns the list of script paths written, in order.
        Raises ValueError if ``batch_size`` is smaller than 1.
        """
        if batch_size < 1:
            raise ValueError("batch_size must be at least 1")

        written = []
        batch = []
        for cmd in commands:
            batch.append(cmd)
            if len(batch) >= batch_size:
                stem = "{}_{}".format(script_name, len(written))
                written.append(self._write_script(stem, batch, run_dir, ppn, walltime))
                batch = []

        # Flush the final partial batch, if any.
        if batch:
            stem = "{}_{}".format(script_name, len(written))
            written.append(self._write_script(stem, batch, run_dir, ppn, walltime))
        return written

job_maker = ArrayJob()

# Important Data Metadata
Inline barcodes to split on, and their IDs.
Illumina adapters and their IDs, so I can make file-naming decisions.

In [4]:
barcodes = """AAGCAAT A01
GGCTTGT B06
ACAAGTT C01
TGGTCCT D08fixed
ATGACCNNNNT  A03
TCCTGTNNNNT  G07
CAGCTTNNNNT  A04
GGATACNNNNT  F05
NNNNNCCTATAT X1A
NNNNNTGCTATT X1B
NNNNNTATACTT X2A
NNNNNATCTTCT X2B""".split("\n")
barcodes = dict([item.split() for item in barcodes])


In [5]:
set([len(item) for item in barcodes.keys()])

{7, 11, 12}

In [6]:
barcodes = """AAGCAAT A01
GGCTTGT B06
ACAAGTT C01
TGGTCCT D08fixed
ATGACCNNNNT  A03
TCCTGTNNNNT  G07
CAGCTTNNNNT  A04
GGATACNNNNT  F05
NNNNNCCTATAT X1A
NNNNNTGCTATT X1B
NNNNNTATACTT X2A
NNNNNATCTTCT X2B""".split("\n")


# Each line above is "<sequence> <barcode ID>"; this maps sequence -> barcode ID.
barcodes = dict([item.split() for item in barcodes])
# Reverse lookup: barcode ID -> sequence.
barcode_name_to_sequence = {value.strip(): key.strip() for key, value in barcodes.items()}

#overlap is for the second trimming round, to throw out double-ligation events on the end of read 2 (we ignore triple-ligation events)
#the only issue is actually A03-F05, where we might be trimming Ts off the start of read 1 and causing dovetailing events
overlap = """A01 5
B06 5
C01 5
D08fixed 5
A03 9
G07 9 
A04 9
F05 9
none 5
X1A 10
X1B 10
X2A 10
X2B 10""".split("\n")


overlap = dict([item.split() for item in overlap])

illumina_adapters = """501 TATAGCCT 
502 ATAGAGGC 
503 CCTATCCT 
504 GGCTCTGA 
505 AGGCGAAG 
506 TAATCTTA 
505,506 eric_is_stupid
507 CAGGACGT 
508 GTACTGAC 
502s ATA 
503s CCT
504s GGC
505s AGG
506s TAA
507s CAG
508s GTA
701 ATTACTCG
702 TCCGGAGA
703 CGCTCATT
704 GAGATTCC
705 ATTCAGAA
706 GAATTCGT""".split("\n")

illumina_adapters = pd.DataFrame([item.strip().split() for item in illumina_adapters], columns=["label", "barcode"])
illumina_adapters = illumina_adapters.set_index("label")

adapters = """A01    ATTGCTTAGATCGGAAGAGCGTCGTGT
B06    ACAAGCCAGATCGGAAGAGCGTCGTGT
C01    AACTTGTAGATCGGAAGAGCGTCGTGT
D08fixed    AGGACCAAGATCGGAAGAGCGTCGTGT
A03    ANNNNGGTCATAGATCGGAAGAGCGTCGTGT
G07    ANNNNACAGGAAGATCGGAAGAGCGTCGTGT
A04    ANNNNAAGCTGAGATCGGAAGAGCGTCGTGT
F05    ANNNNGTATCCAGATCGGAAGAGCGTCGTGT
none   AGATCGGAAGAGCGTCGTGT
X1A    ATATAGGNNNNNAGATCGGAAGAGCGTCGTGTAG
X1B    AATAGCANNNNNAGATCGGAAGAGCGTCGTGTAG 
X2A    AAGTATANNNNNAGATCGGAAGAGCGTCGTGTAG
X2B    AGAAGATNNNNNAGATCGGAAGAGCGTCGTGTAG""".split("\n")
adapters = dict([item.split() for item in adapters])



five_prime_adapter = "CTTCCGATCT"

#Files that were generated pre the great-demux disaster of 2017

/projects/ps-yeolab3/encode/analysis/encode_master/already_demuxed_files.txt


# Get Manifest from Google Docs

#if you want to do this with other sheets you need to share with client email, which is a long string, not gpratt@ucsd.edu or other things

In [None]:
manifest = encode_helpers.get_lab_manifest()

In [8]:
def format_file(index, adapter):
    """Build the expected path of a raw fastq file for one manifest row.

    index: manifest row; reads Lane, Hiseq_file_name, file_location, is_4000
           and, for the naming schemes that embed index sequences, index_1 and
           index_2 (labels into the ``illumina_adapters`` table).
    adapter: read label placed in the file name, e.g. "R1" or "R2".

    Returns the joined path. The index sequences are only looked up for the
    naming schemes that use them, so rows of the other schemes no longer need
    valid index labels.
    Raises KeyError when a needed index label is missing from ``illumina_adapters``.
    """
    def index_sequence(label):
        # .loc is the label-based replacement for the deprecated/removed .ix indexer.
        return illumina_adapters.loc[label].values[0]

    if index.Lane == "": #In case we are doing a rapid run and there isn't lane info
        index_1 = index_sequence(index.index_1)
        index_2 = index_sequence(index.index_2)
        sample_name = "{}_{}-{}_{}.fastq.gz".format(index.Hiseq_file_name, index_2, index_1, adapter)
        dir_name = "Sample_{}".format(index.Hiseq_file_name)

    #For Sebastians samples Sample_G3BP-S-input/G3BP-S-input_S1_L001_R1.fastq.gz
    elif index.Lane.startswith("S") and "L" in index.Lane:
        name = index.Hiseq_file_name
        sample_name = "{}_{}_{}.fastq.gz".format(name, index.Lane, adapter)
        dir_name = index.Hiseq_file_name

    elif index.Lane.startswith("S"):
        # Only the part before the first "-" is kept, with "_" turned into "-".
        name = index.Hiseq_file_name.split("-")[0].replace("_", "-")
        sample_name = "{}_{}_{}.fastq.gz".format(name, index.Lane, adapter)
        dir_name = index.Hiseq_file_name

    #This needs to go above my other 4000 check because of name collisions
    elif index.is_4000:
        # These files sit directly in file_location, without a per-sample directory.
        sample_name = "{}_{}_{}_001.fastq.gz".format(index.Hiseq_file_name, index.Lane, adapter)
        return os.path.join(index.file_location, sample_name)

    #this is a bit hacky, keep an eye on it
    elif "4000" in index.Hiseq_file_name:
        # The manifest name already contains "R1"; swap it for the requested read.
        sample_name = index.Hiseq_file_name + ".fastq.gz"
        sample_name = sample_name.replace("R1", adapter)
        return os.path.join(index.file_location, sample_name)

    else:
        index_1 = index_sequence(index.index_1)
        index_2 = index_sequence(index.index_2)
        sample_name = "{}_{}-{}_{}_{}.fastq.gz".format(index.Hiseq_file_name, index_2, index_1, index.Lane, adapter)
        dir_name = "Sample_{}".format(index.Hiseq_file_name)
    return os.path.join(index.file_location, dir_name, sample_name)

In [9]:
manifest['r1'] = manifest.apply(functools.partial(format_file, adapter="R1"), axis=1)
manifest['r2'] = manifest.apply(functools.partial(format_file, adapter="R2"), axis=1)

# Make sure there are no duplicated names or ids in the encode manifest

In [10]:
input_manifest_check = manifest[manifest.inline_1 == "none"]
manifest_grouped_manifest = input_manifest_check.groupby(["Hiseq_file_name"]).count()
manifest_grouped_manifest[manifest_grouped_manifest.RBP > 1]

Unnamed: 0_level_0,ENCODE_ID,RBP,inline_1,inline_2,index_1,index_2,Lane,file_location,unmerged_location,original_file_name,is_encode,cell_type,hiseq_run_date,randomer_length,Method_Paper_flag,species,is_4000,exp_id,r1,r2
Hiseq_file_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
OG_IN_B_S59,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2
OG_IP_B_S60,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2


In [11]:
manifest_grouped_manifest = manifest.groupby(["Hiseq_file_name", "ENCODE_ID"]).count()
manifest_grouped_manifest[manifest_grouped_manifest.RBP > 1]

Unnamed: 0_level_0,Unnamed: 1_level_0,RBP,inline_1,inline_2,index_1,index_2,Lane,file_location,unmerged_location,original_file_name,is_encode,cell_type,hiseq_run_date,randomer_length,Method_Paper_flag,species,is_4000,exp_id,r1,r2
Hiseq_file_name,ENCODE_ID,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
CHH27433_qsort,CHH27433_qsort,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2


In [12]:
manifest_grouped_manifest = manifest.groupby(["ENCODE_ID"]).count()
manifest_grouped_manifest[manifest_grouped_manifest.RBP > 1]

Unnamed: 0_level_0,Hiseq_file_name,RBP,inline_1,inline_2,index_1,index_2,Lane,file_location,unmerged_location,original_file_name,is_encode,cell_type,hiseq_run_date,randomer_length,Method_Paper_flag,species,is_4000,exp_id,r1,r2
ENCODE_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
633_01,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2
664_01,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2
675_01,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2
702_02,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2
722_02,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2
723_01,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2
723_02,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2
CHH27433_qsort,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2
WT3P2I_caution,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2


In [13]:
# Manifest rows whose expected read-1 fastq does not exist on disk.
# .loc with a boolean mask replaces the deprecated/removed .ix indexer.
unprocessed_datasets = manifest.loc[~manifest['r1'].apply(os.path.exists)]

for x in unprocessed_datasets.r1:
    print(x)

/projects/ps-yeolab/seqdata/igm-storage1.ucsd.edu/160927_K00180_0248_AHCT55BBXX/Data/Fastq_v2/AS_FOX2_AFFSC_CLIP_S105_L008_R1_001.fastq.gz
/projects/ps-yeolab/seqdata/igm-storage1.ucsd.edu/160927_K00180_0248_AHCT55BBXX/Data/Fastq_v2/AS_FOX2_AFFSC_IN_S106_L008_R1_001.fastq.gz
/projects/ps-yeolab/seqdata/igm-storage1.ucsd.edu/160927_K00180_0248_AHCT55BBXX/Data/Fastq_v2/AS_FOX2_SS4_CLIP_S113_L008_R1_001.fastq.gz
/projects/ps-yeolab/seqdata/igm-storage1.ucsd.edu/160927_K00180_0248_AHCT55BBXX/Data/Fastq_v2/AS_FOX2_SS4_IN_S114_L008_R1_001.fastq.gz
/projects/ps-yeolab/seqdata/igm-storage1.ucsd.edu/160927_K00180_0248_AHCT55BBXX/Data/Fastq_v2/AS_FOX2_SS4_MN_CLIP_S117_L008_R1_001.fastq.gz
/projects/ps-yeolab/seqdata/igm-storage1.ucsd.edu/160927_K00180_0248_AHCT55BBXX/Data/Fastq_v2/AS_FOX2_SS4_MN_IN_S118_L008_R1_001.fastq.gz
/projects/ps-yeolab/seqdata/igm-storage1.ucsd.edu/160927_K00180_0248_AHCT55BBXX/Data/Fastq_v2/AS_FOX2_SSIII_CLIP_S109_L008_R1_001.fastq.gz
/projects/ps-yeolab/seqdata/igm-sto

# Figure out if old datasets that might have been deleted should be processed with all barcodes or just old barcodes

Order of priority
- If a demuxed file for X1A, X1B, X2A or X2B exists, then the sample was demuxed with the new barcodes; if not, we don't have any information
- If the metrics file exists and lists one of the new barcodes, then the sample has the new barcodes
- If the file was sequenced before 160229 then it does not have X1A; otherwise it does

I'll demux everything again to a different directory, and compare md5sums when I'm done...

In [15]:
# Every entry currently present in the (old) demux output directory.
all_demuxed_files = glob.glob("/home/gpratt/projects/encode/analysis/encode_split/*")
# Maps the first dot-separated field of each file name -> list of barcode labels seen for it.
result = defaultdict(list)
for fn in all_demuxed_files:
    # Drop the last two dot-separated fields and unpack what is left; exactly two
    # fields must remain, so a name with a different number of dots raises ValueError.
    # NOTE(review): the key keeps everything before the first dot (presumably including
    # lane/read suffixes), while a later cell looks up result[row.Hiseq_file_name] --
    # confirm the two actually match.
    fn, barcode = os.path.basename(fn).split(".")[:-2]
    # Keep only the text before the first underscore (presumably the barcode ID, e.g. "X1A" -- TODO confirm).
    barcode = barcode.split("_")[0]
    result[fn].append(barcode)

# Barcode IDs that count as the "new" set in the checks below.
added_barcodes = set(["X1A", "X1B", "X2A", "X2B"])

In [16]:
#This code gets the date sequenced (or, in rare cases, downloaded) for each sample
def parse_date(date):
    """Convert a run-date string into a ``datetime.date``.

    date: "YYMMDD" (two-digit year, interpreted as 20YY) or a string starting
          with "20" in "YYYYMMDD" form.

    A six-character string is always read as YYMMDD. Previously a YYMMDD date
    from 2020 onwards (e.g. "200315") matched the "20" prefix test, was read
    as YYYYMMDD and failed.

    Raises ValueError when the fields are not a valid calendar date.
    """
    if len(date) != 6 and date.startswith("20"):
        year = int(date[:4])
        month = date[4:6]
        day = date[6:]
    else:
        year = int(date[:2]) + 2000
        month = date[2:4]
        day = date[4:]

    return datetime.date(year, int(month), int(day))

# Rows stored in the merged directory carry their sequencing-run directory in
# unmerged_location; every other row carries it in file_location.
unmerged_location = []
for name, row in manifest.iterrows():
    if row.file_location == "/home/gpratt/projects/encode/data/encode_merged/":
        unmerged_location.append(row.unmerged_location)
    else:
        unmerged_location.append(row.file_location)

manifest['unified_location'] = unmerged_location
# Remove the storage-host path component so the run directory is always the
# fifth "/"-separated field, then keep the text before its first "_" (the date).
manifest['date'] = manifest.unified_location.apply(lambda x: x.replace("igm-storage1.ucsd.edu/", ""))
manifest['date'] = manifest['date'].apply(lambda x: x.split("/")[4].split("_")[0])
# (A second replace of "igm-storage1.ucsd.edu/" used to follow here; it could never
# match once the value had been reduced to a single path field, so it was dropped.)
manifest['date'] = manifest['date'].apply(parse_date)

# Samples dated on or after this day are treated as having the new barcodes.
added_date = datetime.date(2016, 2, 29)

In [17]:
metrics_dir = "/home/gpratt/projects/encode/scripts/"

In [18]:
# One flag per manifest row: True when any of the three checks below says the
# sample should be treated as having the new barcodes (X1A/X1B/X2A/X2B).
new_barcode_list = []

for name, row in manifest.iterrows():
    new_barcodes = False

    #If a demuxed file for one of the new barcodes exists then it was demuxed with them; if not, we don't have any information
    # NOTE(review): `result` is keyed by the first dot-separated field of each demuxed
    # file name -- confirm that equals Hiseq_file_name, otherwise this check never fires.
    existing_barcodes = set(result[row.Hiseq_file_name])
    if len(added_barcodes & existing_barcodes) > 0:
        new_barcodes = True

    #If the metrics file exists and it has a new barcode in it then it has the new barcodes
    metrics_file = os.path.join(metrics_dir, row.Hiseq_file_name + ".txt")
    if os.path.exists(metrics_file):
        metrics_df = pd.read_table(metrics_file, header=None,
                                   names=["barcode", "randomer", "real_barcode", "count"]
                                   )
        existing_barcodes_sequences = set(metrics_df['barcode'].values)
        # Translate sequences back to barcode IDs; a sequence missing from `barcodes`
        # raises KeyError here.
        existing_barcodes = set([barcodes[item] for item in existing_barcodes_sequences if item != "unassigned"])
        if len(added_barcodes & existing_barcodes) > 0:
            new_barcodes = True

    #If the file was done on or after 160229 then it has X1A, otherwise it is the old dataset
    if row.date >= added_date:
        new_barcodes = True
    new_barcode_list.append(new_barcodes)
manifest['has_new_barcodes'] = new_barcode_list

# Toggle Datasets to be processed

In [19]:
#manifest = manifest[manifest.method_Paper_flag]

In [20]:
# Make demuxing stuffs

In [21]:
clip_seq_manifest = manifest[manifest.inline_1 != "none"].copy()
input_manifest = manifest[manifest.inline_1 == "none"].copy()

In [22]:
tscc_dir = "/home/gpratt/projects/encode/scripts/barcodes/encode_barcodes"


In [23]:
clip_seq_manifest['inline_name_1'] = clip_seq_manifest.apply(lambda x: "{}_{}_{}".format(x.inline_1, x.ENCODE_ID, x.RBP), axis=1)
clip_seq_manifest['inline_name_2'] = clip_seq_manifest.apply(lambda x: "{}_{}_{}".format(x.inline_2, x.ENCODE_ID, x.RBP), axis=1)

In [24]:
#Writes barcodes to appropriate locations
# Write one barcode file per Hiseq_file_name. The output path uses only file_name,
# so the same file_name appearing under several lanes rewrites the same file.
for lane, manifest_by_lane in clip_seq_manifest.groupby("Lane"):
    for file_name, df in manifest_by_lane.groupby("Hiseq_file_name"):
        # Start from the generic sequence -> barcode ID table for every file.
        cur_barcodes = barcodes.copy()
    
        #This renames the barcodes to the correct name
        for key, series in df.iterrows():
            try:
                # Both lookups happen before either assignment, so an unknown
                # inline_2 leaves inline_1 un-renamed as well.
                inline_1 = barcode_name_to_sequence[series.inline_1]
                inline_2 = barcode_name_to_sequence[series.inline_2]
                cur_barcodes[inline_1] = series.inline_name_1
                cur_barcodes[inline_2] = series.inline_name_2
            except KeyError as e:
                print "Key Error, barcode", e, "doesn't exit, won't demux", series.ENCODE_ID
        barcode_file = os.path.join(tscc_dir, file_name + ".txt")
        with open(barcode_file, 'w') as fn:
            for key, value in cur_barcodes.items():
                #if the dataset didn't have new barcodes and the barcode is in there, skip it
                # Only entries still carrying a bare new-barcode ID are skipped; renamed
                # entries no longer match added_barcodes. The flag is read from the
                # first row of the group.
                if (not df.has_new_barcodes.values[0]) and (value in added_barcodes):
                    continue
                fn.write("{}\t{}\n".format(key, value))
                
# Generic table with every barcode.
barcode_file = os.path.join(tscc_dir, "new_barcodes.txt")
with open(barcode_file, 'w') as fn:
    for key, value in barcodes.items():
        fn.write("{}\t{}\n".format(key, value))
        
# Generic table without the barcodes listed in added_barcodes.
barcode_file = os.path.join(tscc_dir, "old_barcodes.txt")
with open(barcode_file, 'w') as fn:
    for key, value in barcodes.items():
        if (value not in added_barcodes):
            fn.write("{}\t{}\n".format(key, value))

In [25]:
new_metrics_dir = "/projects/ps-yeolab3/encode/analysis/barcode_metrics"

In [26]:
#Functions to format output
def create_demuxer(barcode, r1, r2, length, outdir, metrics_out_dir=None):
    """Build the ``demux_paired_end.py`` command line for one pair of fastq files.

    barcode: path of the barcode file (passed as -b); its basename is also
             used as the metrics file name.
    r1, r2: input fastq paths; the output files keep their basenames.
    length: value passed as --length.
    outdir: directory for the two output files.
    metrics_out_dir: directory for the metrics file; defaults to the
                     notebook-level ``new_metrics_dir``.

    Returns the command as a single string.
    """
    if metrics_out_dir is None:
        metrics_out_dir = new_metrics_dir

    out_file_1 = os.path.join(outdir, os.path.basename(r1))
    out_file_2 = os.path.join(outdir, os.path.basename(r2))
    metrics = os.path.join(metrics_out_dir, os.path.basename(barcode))

    return "demux_paired_end.py --fastq_1 {} --fastq_2 {} -b {} --out_file_1 {} --out_file_2 {} --length {} -m {}".format(r1, r2, barcode, out_file_1, out_file_2, length, metrics)


def create_demuxer_input(barcode, r1, r2, length, outdir, metrics_out_dir=None):
    """Build the demux command for an input sample.

    The command is identical to the one from ``create_demuxer``; this name is
    kept so existing callers continue to work.
    """
    return create_demuxer(barcode, r1, r2, length, outdir, metrics_out_dir)

# Make downstream file names

In [27]:
split_out_dir = "/home/gpratt/projects/encode/analysis/encode_split" #this is the old dir switching to stable ps-yeolab space
#split_out_dir = "/home/gpratt/projects/encode/analysis/encode_split_v2" #this is a test to make sure emilys stuff works / my old data is correctly processed

def _tagged_read_pair(series, tag):
    """Return "r1;r2" demuxed paths with ``tag`` inserted before the last two dot-separated fields."""
    def tagged_path(path):
        fields = os.path.basename(path).split(".")
        # e.g. name.fastq.gz -> name.<tag>.fastq.gz
        fields.insert(-2, tag)
        return os.path.join(split_out_dir, ".".join(fields))

    return ";".join([tagged_path(series.r1), tagged_path(series.r2)])

def make_processing_name(series, inline_name):
    """Demuxed file names for one inline barcode of a CLIP sample.

    series: manifest row with r1, r2 and the column named by ``inline_name``.
    inline_name: name of the column holding the barcode label to insert.

    Returns "r1_path;r2_path", both located in ``split_out_dir``.
    """
    return _tagged_read_pair(series, series[inline_name])

def make_input_processing_name(series):
    """Demuxed file names for an input sample, which are tagged "unassigned".

    series: manifest row with r1 and r2.

    Returns "r1_path;r2_path", both located in ``split_out_dir``.
    """
    return _tagged_read_pair(series, "unassigned")

In [28]:
clip_seq_manifest['processing_name_1'] = clip_seq_manifest.apply(functools.partial(make_processing_name, inline_name="inline_name_1"), axis=1)
clip_seq_manifest['processing_name_2'] = clip_seq_manifest.apply(functools.partial(make_processing_name, inline_name="inline_name_2"), axis=1)

input_manifest['processing_name'] = input_manifest.apply(make_input_processing_name, axis=1)

In [29]:
clip_seq_manifest.to_csv("clipseq_manifest.csv")
input_manifest.to_csv("input_manifest.csv")

# Write Demuxing Code

In [30]:
process_locations = ["/home/gpratt/projects/encode/data/encode_merged/",
                    "/projects/ps-yeolab/seqdata/igm-storage1.ucsd.edu/160927_K00180_0248_AHCT55BBXX/Data/Fastq",
                    "/projects/ps-yeolab/seqdata/igm-storage1.ucsd.edu/160927_K00180_0248_AHCT55BBXX/Data/Fastq_v2",
                    "/projects/ps-yeolab/seqdata/igm-storage1.ucsd.edu/170325_K00180_0336_BHHG57BBXX_Yeo/Data/Fastq",
                    '/projects/ps-yeolab3/seqdata/20170611_emily_julia_yan_ryan_kristopher_eric/170611_K00180_0382_AHJY7KBBXX_PE50_Yeo/Data/Fastq',
                    '/projects/ps-yeolab/seqdata/igm-storage1.ucsd.edu/160720_K00180_0223_BHCLHCBBXX/Data/Fastq',
            ]

In [31]:
series = clip_seq_manifest.groupby("Lane").get_group("L001").groupby("Hiseq_file_name").get_group("A5_IP_S6").iloc[0]

In [34]:
series

Hiseq_file_name                                                A5_IP_S6
ENCODE_ID                                                      A5_IP_01
RBP                                                               RBM11
inline_1                                                            A01
inline_2                                                            B06
index_1                                                             701
index_2                                                             501
Lane                                                               L001
file_location         /projects/ps-yeolab/seqdata/igm-storage1.ucsd....
unmerged_location                                                      
original_file_name                                                     
is_encode                                                         False
cell_type                                                              
hiseq_run_date                                                  

In [37]:
result = []
barcode_location = os.path.join(tscc_dir, series.Hiseq_file_name + ".txt")
result.append(create_demuxer(barcode_location, series.r1, series.r2, series.randomer_length, split_out_dir))

In [38]:
result

['demux_paired_end.py --fastq_1 /projects/ps-yeolab/seqdata/igm-storage1.ucsd.edu/160720_K00180_0223_BHCLHCBBXX/Data/Fastq/A5_IP_S6_L001_R1_001.fastq.gz --fastq_2 /projects/ps-yeolab/seqdata/igm-storage1.ucsd.edu/160720_K00180_0223_BHCLHCBBXX/Data/Fastq/A5_IP_S6_L001_R2_001.fastq.gz -b /home/gpratt/projects/encode/scripts/barcodes/encode_barcodes/A5_IP_S6.txt --out_file_1 /home/gpratt/projects/encode/analysis/encode_split/A5_IP_S6_L001_R1_001.fastq.gz --out_file_2 /home/gpratt/projects/encode/analysis/encode_split/A5_IP_S6_L001_R2_001.fastq.gz --length 10 -m /projects/ps-yeolab3/encode/analysis/barcode_metrics/A5_IP_S6.txt']

In [114]:
#output results in list, only outputs unprocessed datasets
# Collects one demux command per not-yet-demuxed fastq pair.
result = []
for lane, manifest_by_lane in clip_seq_manifest.groupby("Lane"):
    # `in` on a Series tests the index labels, so compare against the values.
    if "228_02" in manifest_by_lane.ENCODE_ID.values:
        print("found")
    for file_name, series in manifest_by_lane.groupby("Hiseq_file_name").first().iterrows():

        #Skip if the file has already been demuxed
        r1_file_1, r2_file_1 = series.processing_name_1.split(";")
        # The second barcode's outputs must be checked too (processing_name_1 used
        # to be read twice here, so they were never looked at).
        r1_file_2, r2_file_2 = series.processing_name_2.split(";")
        demuxed_files = (r1_file_1, r2_file_1, r1_file_2, r2_file_2)
        if all(os.path.exists(path) for path in demuxed_files):
            continue
        #If its not from the current processing run, ignore it
        if series.file_location not in process_locations:
            continue

        # After groupby(...).first() the row label (series.name) is the Hiseq_file_name.
        barcode_location = os.path.join(tscc_dir, series.name + ".txt")
        result.append(create_demuxer(barcode_location, series.r1, series.r2, series.randomer_length, split_out_dir))

for lane, manifest_by_lane in input_manifest.groupby("Lane"):
    for file_name, series in manifest_by_lane.groupby("Hiseq_file_name").first().iterrows():

        #Skip if the file has already been demuxed
        r1_file, r2_file = series.processing_name.split(";")
        if os.path.exists(r1_file) and os.path.exists(r2_file):
            continue

        #If its not from the current processing run, ignore it
        if series.file_location not in process_locations:
            continue

        # Inputs use one of the two generic barcode tables.
        if series.has_new_barcodes:
            barcode_location = "/home/gpratt/projects/encode/scripts/barcodes/encode_barcodes/new_barcodes.txt"
        else:
            barcode_location = "/home/gpratt/projects/encode/scripts/barcodes/encode_barcodes/old_barcodes.txt"
        result.append(create_demuxer_input(barcode_location, series.r1, series.r2, series.randomer_length, split_out_dir))

In [111]:
#When you're done demuxing rsync the files to a non-removable location
#!rsync -a . /projects/ps-yeolab3/encode/analysis/encode_split/

In [112]:
job_maker.make_script(result, 
                      "/home/gpratt/projects/encode/scripts/encode_demux", 
                      "/home/gpratt/projects/encode/data/encode_split",
                     walltime=8)

In [92]:
#kicked off 17/1/11 incase I really fucked up and need to roll back

# Make the files to process

In [93]:
def make_current_adapters(adapter1, adapter2, triming_len  = 15):
    """Return every ``triming_len``-long window of both adapters, joined by ";".

    Windows are emitted by start position, adapter1 before adapter2, and each
    distinct window appears once. For two adapters of equal length without
    repeated windows the result is the same as before.

    Each adapter is now windowed over its own length. Previously adapter2 was
    sliced with positions derived from adapter1's length, which produced
    truncated or empty "adapters" when adapter2 was shorter and skipped its
    tail windows when it was longer.

    adapter1, adapter2: adapter sequences.
    triming_len: window length (parameter name kept for existing callers).
    """
    windows = []
    seen = set()
    longest = max(len(adapter1), len(adapter2))
    for start in range(longest - triming_len + 1):
        for adapter in (adapter1, adapter2):
            window = adapter[start:start + triming_len]
            # A shorter slice means this adapter has run out of full windows.
            if len(window) == triming_len and window not in seen:
                seen.add(window)
                windows.append(window)
    return ";".join(windows)

def make_five_prime_adapters(adapter1, adapter2, prefix=None):
    """Return the 5' adapters for a barcode pair, joined by ";".

    Each adapter is ``prefix`` followed by the barcode sequence. Identical
    entries are collapsed, and the order is always adapter1 then adapter2
    (it used to depend on set iteration order).

    adapter1, adapter2: barcode sequences (may be empty strings).
    prefix: sequence placed in front of each barcode; defaults to the
            notebook-level ``five_prime_adapter``.
    """
    if prefix is None:
        prefix = five_prime_adapter

    #This adapter is in case read 1 and read 2 overlap badly
    result = [prefix + adapter1]
    second = prefix + adapter2
    if second not in result:
        result.append(second)
    return ";".join(result)

# Check if individual files exist
Only make the manifest if they do.

In [94]:
#out_dir_hg19 = "/projects/ps-yeolab/encode/analysis/encode_v13/"
out_dir_hg19 = "/projects/ps-yeolab3/encode/analysis/encode_master/"
#out_dir_hg19 = "/projects/ps-yeolab2/encode/analysis/encode_GRCh38_v1/"

out_dir_mm9 = "/projects/ps-yeolab2/encode/analysis/encode_mouse_v8/"

def define_basename(row):
    """Output path prefix for one input sample.

    The prefix is the species' output directory joined with the basename of
    the first file in ``row.processing_name``, minus its last two
    dot-separated fields. Returns None for any species other than "hg19" or
    "mm9".
    """
    species_dir = {"hg19": out_dir_hg19, "mm9": out_dir_mm9}.get(row.species)
    if species_dir is None:
        return None

    first_file = row.processing_name.split(";")[0]
    stem_fields = os.path.basename(first_file).split(".")[:-2]
    return os.path.join(species_dir, ".".join(stem_fields))

# Expected pipeline output paths for every input sample, all derived from base_name.
input_manifest['base_name'] = input_manifest.apply(define_basename, axis=1)
input_manifest['input_bam'] = input_manifest.base_name.apply(lambda x: ".".join([x, "adapterTrim.round2.rmRep.rmDup.sorted.bam"]))
input_manifest['input_peaks'] = input_manifest.base_name.apply(lambda x: ".".join([x, "adapterTrim.round2.rmRep.rmDup.sorted.r2.peaks.fixed.bb"]))
# NOTE(review): the three paths below add ".unassigned" after base_name, while the two
# above do not. base_name is cut from processing_name, which already has "unassigned"
# inserted, so these may end up as "...unassigned.unassigned..." -- confirm against the
# files the pipeline actually writes.
input_manifest['merged_qc'] = input_manifest.apply(lambda x:  "{}.unassigned.adapterTrim.round2.rmRep.rmDup.sorted.r2.peaks.bed.qc_fig.svg".format(x.base_name), axis=1) 
input_manifest['pos_bw'] = input_manifest.apply(lambda x:  "{}.unassigned.adapterTrim.round2.rmRep.rmDup.sorted.r2.norm.pos.bw".format(x.base_name), axis=1) 
input_manifest['neg_bw'] = input_manifest.apply(lambda x:  "{}.unassigned.adapterTrim.round2.rmRep.rmDup.sorted.r2.norm.neg.bw".format(x.base_name), axis=1) 

def define_basename_merged(row):
    """Output path prefix for one merged CLIP sample.

    Joins the species' output directory with "<ENCODE_ID>_<RBP>". Returns None
    for any species other than "hg19" or "mm9".
    """
    species_dir = {"hg19": out_dir_hg19, "mm9": out_dir_mm9}.get(row.species)
    if species_dir is None:
        return None

    sample_label = "{}_{}".format(row['ENCODE_ID'], row['RBP'])
    return os.path.join(species_dir, sample_label)

clip_seq_manifest['merged_base_name'] = clip_seq_manifest.apply(define_basename_merged, axis=1)
clip_seq_manifest['merged_bam'] = clip_seq_manifest.apply(lambda x: "{}.merged.r2.bam".format(x.merged_base_name), axis=1) 
clip_seq_manifest['merged_peaks'] = clip_seq_manifest.apply(lambda x:  "{}.merged.r2.peaks.fixed.bb".format(x.merged_base_name), axis=1)
clip_seq_manifest['merged_qc'] = clip_seq_manifest.apply(lambda x:  "{}.merged.r2.peaks.bed.qc_fig.svg".format(x.merged_base_name), axis=1) 
clip_seq_manifest['pos_bw'] = clip_seq_manifest.apply(lambda x:  "{}.merged.r2.norm.pos.bw".format(x.merged_base_name), axis=1) 
clip_seq_manifest['neg_bw'] = clip_seq_manifest.apply(lambda x:  "{}.merged.r2.norm.neg.bw".format(x.merged_base_name), axis=1) 

In [95]:
rerun = """459_CLIP_S29
459_CLIP_S29
459_INPUT_S30
235_CLIP_S53
235_CLIP_S53
235_INPUT_S54
284_CLIP_S55
284_CLIP_S55
284_INPUT_S56
285_CLIP_S57
285_CLIP_S57
390_CLIP_S59
390_CLIP_S59
390_INPUT_S60
534_CLIP_S1
534_CLIP_S1
534_INPUT_S2
567_CLIP_S51
567_CLIP_S51
567_INPUT_S52
570_CLIP_S3
570_CLIP_S3
570_INPUT_S4
580_CLIP_S11
580_CLIP_S11
580_INPUT_S12
584_CLIP_S21
584_CLIP_S21
584_INPUT_S22
590_CLIP_S7
590_CLIP_S7
590_INPUT_S8
591_CLIP_S13
591_CLIP_S13
591_INPUT_S14
592_CLIP_S5
592_CLIP_S5
592_INPUT_S6
593_CLIP_S15
593_CLIP_S15
593_INPUT_S16
594_CLIP_S23
594_CLIP_S23
594_INPUT_S24
598_CLIP_S17
598_CLIP_S17
598_INPUT_S18
614_CLIP_S25
614_CLIP_S25
614_INPUT_S26
622_CLIP_S27
622_CLIP_S27
622_INPUT_S28
624_CLIP_S29
624_CLIP_S29
624_INPUT_S30
627_CLIP_S9
627_CLIP_S9
627_INPUT_S10
631_CLIP_S19
631_CLIP_S19
631_INPUT_S20
LNG10-E_INPUT_S61
LNG10-E_CLIP_S62
LNG10-N_INPUT_S63
LNG10-N_CLIP_S64
LNG10-M_INPUT_S65
LNG10-M_CLIP_S66
LNG10-S_INPUT_S67
LNG10-S_CLIP_S68
LNG9-E_INPUT_S69
LNG9-E_CLIP_S70
LNG9-N_INPUT_S71
LNG9-N_CLIP_S72
LNG9-M_INPUT_S73
LNG9-M_CLIP_S74
LNG9-S_INPUT_S75
LNG9-S_CLIP_S76
LNG1-E_INPUT_S77
LNG1-E_CLIP_S78
LNG1-N_INPUT_S79
LNG1-N_CLIP_S80
LNG1-M_INPUT_S81
LNG1-M_CLIP_S82
LNG1-S_INPUT_S83
LNG1-S_CLIP_S84
LNG19-E_INPUT_S85
LNG19-E_CLIP_S86
LNG19-N_INPUT_S87
LNG19-N_CLIP_S88
LNG20-E_INPUT_S89
LNG20-E_CLIP_S90
LNG20-N_INPUT_S91
LNG20-N_CLIP_S92
323_01_CLIP_S8""".split("\n")

# Overrides the sample list assigned just above, so `rerun` is empty from here on.
rerun = []

In [96]:
#This is for just running stuff thats not done yet

input_manifest_run = input_manifest[~(input_manifest['input_bam'].apply(os.path.exists) & 
                                      input_manifest['input_peaks'].apply(os.path.exists) &
                                      input_manifest.merged_qc.apply(os.path.exists) &
                                      input_manifest.pos_bw.apply(os.path.exists) & 
                                      input_manifest.neg_bw.apply(os.path.exists) 
                                     )]

#input_manifest_run = input_manifest[~(input_manifest['input_bam'].apply(os.path.exists) & input_manifest['input_peaks'].apply(os.path.exists)) | (input_manifest.Hiseq_file_name.isin(rerun))]


clip_seq_manifest_run = clip_seq_manifest[~(clip_seq_manifest.merged_bam.apply(os.path.exists) & 
                                            clip_seq_manifest.merged_peaks.apply(os.path.exists) &
                                            clip_seq_manifest.merged_qc.apply(os.path.exists) &
                                            clip_seq_manifest.pos_bw.apply(os.path.exists) & 
                                            clip_seq_manifest.neg_bw.apply(os.path.exists) 
                                           )]

#debugging
#clip_seq_manifest_run = clip_seq_manifest[(clip_seq_manifest.ENCODE_ID == "204_01") | (clip_seq_manifest.ENCODE_ID == "204_02") ]
#clip_seq_manifest_run = clip_seq_manifest[~(clip_seq_manifest.merged_bam.apply(os.path.exists) & clip_seq_manifest.merged_peaks.apply(os.path.exists)) | clip_seq_manifest.Hiseq_file_name.isin(rerun)]

#This is for an allup run
# input_manifest_run = input_manifest
# clip_seq_manifest_run = clip_seq_manifest

In [97]:
# clip_seq_manifest_encode = clip_seq_manifest[clip_seq_manifest.is_encode]
# clip_seq_manifest_encode = clip_seq_manifest_encode[clip_seq_manifest_encode.exp_id.isin(ids_good_to_go)]

In [98]:
#Make the pipeline manifests: one tab-separated line per fastq set to process
MANIFEST_DIR = "/home/gpratt/projects/encode/scripts"


def _clip_manifest_lines(row, genome):
    """Return the manifest line(s) for one CLIP sample as a list of strings.

    row    -- one row of clip_seq_manifest
    genome -- genome label written in the second column (e.g. "hg19")

    One line is produced per inline barcode. Each line holds: fastq set,
    genome, "<ENCODE_ID>_<RBP>", adapters, barcode overlap, 5' adapters and
    randomer length. str.join needs every field to be a string, so
    randomer_length is written exactly as stored in the manifest.
    """
    current_adapters = make_current_adapters(adapters[row.inline_1], adapters[row.inline_2])
    five_prime_adapters = make_five_prime_adapters(barcode_name_to_sequence[row.inline_1],
                                                   barcode_name_to_sequence[row.inline_2])

    barcoded_inputs = [(row.processing_name_1, row.inline_1)]
    #Handles single inline barcode samples: the second barcode only gets its
    #own line when the two barcodes are different
    if row.inline_1 != row.inline_2:
        barcoded_inputs.append((row.processing_name_2, row.inline_2))

    return ["\t".join([processing_name,
                       genome,
                       row.ENCODE_ID + "_" + row.RBP,
                       current_adapters,
                       overlap[inline],
                       five_prime_adapters,
                       row.randomer_length])
            for processing_name, inline in barcoded_inputs]


def _input_manifest_line(row, genome):
    """Return the single manifest line for one input sample.

    Inputs are written with the "none" adapter/overlap entries and empty
    5' barcode sequences; the adapter helpers expect two arguments, so the
    same placeholder is passed twice.
    """
    current_adapters = make_current_adapters(adapters["none"], adapters["none"])
    five_prime_adapters = make_five_prime_adapters("", "")
    return "\t".join([row.processing_name,
                      genome,
                      row.ENCODE_ID + "_" + row.RBP,
                      current_adapters,
                      overlap["none"],
                      five_prime_adapters,
                      row.randomer_length])


def write_pipeline_manifest(file_name, genome, clip_rows, input_rows):
    """Write one manifest file into MANIFEST_DIR, CLIP samples first, then inputs.

    file_name  -- name of the manifest file to (over)write
    genome     -- genome label to put on every line
    clip_rows  -- slice of clip_seq_manifest to include
    input_rows -- slice of input_manifest to include
    """
    with open(os.path.join(MANIFEST_DIR, file_name), 'w') as out_file:
        for _, row in clip_rows.iterrows():
            for manifest_line in _clip_manifest_lines(row, genome):
                out_file.write(manifest_line + '\n')
        for _, row in input_rows.iterrows():
            out_file.write(_input_manifest_line(row, genome) + '\n')


#Make Manifest for human files (only the samples that still need to run)
write_pipeline_manifest("encode_v13.txt", "hg19",
                        clip_seq_manifest_run[clip_seq_manifest_run.species == "hg19"],
                        input_manifest_run[input_manifest_run.species == "hg19"])

# #Make manifest for downsampling (hopefully it works)
# write_pipeline_manifest("encode_downsample.txt", "hg19",
#                         clip_seq_manifest_encode[clip_seq_manifest_encode.species == "hg19"],
#                         input_manifest.iloc[0:0])  # CLIP samples only

#Make Manifest for human GRCh38 files: every sample annotated as hg19 is
#written again with the GRCh38 genome label
write_pipeline_manifest("encode_GRCh38_v1.txt", "GRCh38",
                        clip_seq_manifest[clip_seq_manifest.species == "hg19"],
                        input_manifest[input_manifest.species == "hg19"])

#Make Manifest for mouse files
# NOTE(review): the CLIP side uses the full manifest while the input side uses
# only the not-yet-done inputs; kept as found, confirm this mix is intended.
write_pipeline_manifest("encode_mouse_v9.txt", "mm9",
                        clip_seq_manifest[clip_seq_manifest.species == "mm9"],
                        input_manifest_run[input_manifest_run.species == "mm9"])
    
#     #get all the files in the processing directory over 1mb in size
#     all_split_files = glob.glob(os.path.join(split_out_dir, "*R2*.gz"))
#     all_large_files = [fn for fn in all_split_files if os.path.getsize(fn) > 6000000]
#     for large_file in all_large_files:
#         annotation = large_file.split(".")[1]
#         if (len(annotation.split("_")) == 1) and (annotation != "unassigned"):
#             large_files = ";".join([large_file.replace("R2", "R1"), large_file])
#             print os.path.basename(large_file), annotation, os.path.getsize(large_file)
#             out_data = [large_files, "hg19"]
            
#             #Now for special cases!
#             #If these are any of KK's sample we will assume that she added all 4 barcodes and 
#             #that they belong to the analysis group that the other two samples came from
#             if os.path.basename(large_file).startswith("KK"):
#                 line = manifest[manifest.r2.apply(os.path.basename) == (os.path.basename(large_file).split(".")[0] + ".fastq.gz")].irow(0)
#                 out_data.append(line.ENCODE_ID + "_" + line.RBP)
            
#             out_data.append(line.randomer_length)
#             print len(out_data)
#             out_file.write("\t".join(out_data) + '\n')

# Check to See If Generated Files Exist

Demuxed Files
---

In [99]:
def _demuxed_files_exist(processing_name):
    """True when every ';'-separated path in `processing_name` exists on disk."""
    return all(os.path.exists(path) for path in processing_name.split(';'))


# Rows where neither barcode has its complete set of demultiplexed files
# (De Morgan form of "not (barcode 1 complete or barcode 2 complete)").
barcode_1_incomplete = ~clip_seq_manifest.processing_name_1.apply(_demuxed_files_exist)
barcode_2_incomplete = ~clip_seq_manifest.processing_name_2.apply(_demuxed_files_exist)
clip_seq_manifest[barcode_1_incomplete & barcode_2_incomplete]

Unnamed: 0,Hiseq_file_name,ENCODE_ID,RBP,inline_1,inline_2,index_1,index_2,Lane,file_location,unmerged_location,...,inline_name_1,inline_name_2,processing_name_1,processing_name_2,merged_base_name,merged_bam,merged_peaks,merged_qc,pos_bw,neg_bw
49,Rhesus_RBFOX2,EVN_Rhesus_RBFOX2,RBFOX2,A04,F05,505,703,L004,/projects/ps-yeolab/seqdata/igm-storage1.ucsd....,,...,A04_EVN_Rhesus_RBFOX2_RBFOX2,F05_EVN_Rhesus_RBFOX2_RBFOX2,/home/gpratt/projects/encode/analysis/encode_s...,/home/gpratt/projects/encode/analysis/encode_s...,/projects/ps-yeolab3/encode/analysis/encode_ma...,/projects/ps-yeolab3/encode/analysis/encode_ma...,/projects/ps-yeolab3/encode/analysis/encode_ma...,/projects/ps-yeolab3/encode/analysis/encode_ma...,/projects/ps-yeolab3/encode/analysis/encode_ma...,/projects/ps-yeolab3/encode/analysis/encode_ma...
75,KK_8_CLIPip_Fus,KK_8_mnCLIP_Fus_IP_low_2,FUS,C01,D08fixed,504,702,,/projects/ps-yeolab/seqdata/igm-storage1.ucsd....,,...,C01_KK_8_mnCLIP_Fus_IP_low_2_FUS,D08fixed_KK_8_mnCLIP_Fus_IP_low_2_FUS,/home/gpratt/projects/encode/analysis/encode_s...,/home/gpratt/projects/encode/analysis/encode_s...,/projects/ps-yeolab3/encode/analysis/encode_ma...,/projects/ps-yeolab3/encode/analysis/encode_ma...,/projects/ps-yeolab3/encode/analysis/encode_ma...,/projects/ps-yeolab3/encode/analysis/encode_ma...,/projects/ps-yeolab3/encode/analysis/encode_ma...,/projects/ps-yeolab3/encode/analysis/encode_ma...
78,KK_11_CLIPip_Taf,KK_11_mnCLIP_Taf15_IP_high_1,TAF15,A01,B06,504,705,,/projects/ps-yeolab/seqdata/igm-storage1.ucsd....,,...,A01_KK_11_mnCLIP_Taf15_IP_high_1_TAF15,B06_KK_11_mnCLIP_Taf15_IP_high_1_TAF15,/home/gpratt/projects/encode/analysis/encode_s...,/home/gpratt/projects/encode/analysis/encode_s...,/projects/ps-yeolab3/encode/analysis/encode_ma...,/projects/ps-yeolab3/encode/analysis/encode_ma...,/projects/ps-yeolab3/encode/analysis/encode_ma...,/projects/ps-yeolab3/encode/analysis/encode_ma...,/projects/ps-yeolab3/encode/analysis/encode_ma...,/projects/ps-yeolab3/encode/analysis/encode_ma...
119,Stefan_IGF2BP3_040915,Stefan_IGF2BP3_040915,IGF2BP3,C01,D08fixed,501,702,,/projects/ps-yeolab/seqdata/igm-storage1.ucsd....,,...,C01_Stefan_IGF2BP3_040915_IGF2BP3,D08fixed_Stefan_IGF2BP3_040915_IGF2BP3,/home/gpratt/projects/encode/analysis/encode_s...,/home/gpratt/projects/encode/analysis/encode_s...,/projects/ps-yeolab3/encode/analysis/encode_ma...,/projects/ps-yeolab3/encode/analysis/encode_ma...,/projects/ps-yeolab3/encode/analysis/encode_ma...,/projects/ps-yeolab3/encode/analysis/encode_ma...,/projects/ps-yeolab3/encode/analysis/encode_ma...,/projects/ps-yeolab3/encode/analysis/encode_ma...
127,6_K5622_Infminus_AND_8_K5622_Infplus,6_K5622_Infminus,R60,A04,F05,501,706,,/projects/ps-yeolab/seqdata/igm-storage1.ucsd....,,...,A04_6_K5622_Infminus_R60,F05_6_K5622_Infminus_R60,/home/gpratt/projects/encode/analysis/encode_s...,/home/gpratt/projects/encode/analysis/encode_s...,/projects/ps-yeolab3/encode/analysis/encode_ma...,/projects/ps-yeolab3/encode/analysis/encode_ma...,/projects/ps-yeolab3/encode/analysis/encode_ma...,/projects/ps-yeolab3/encode/analysis/encode_ma...,/projects/ps-yeolab3/encode/analysis/encode_ma...,/projects/ps-yeolab3/encode/analysis/encode_ma...
136,14_GM12878_CLIP_2_Infminus_AND_16_GM12878_CLIP...,16_GM12878_CLIP_2_Infplus,R60,A01,B06,502,706,,/projects/ps-yeolab/seqdata/igm-storage1.ucsd....,,...,A01_16_GM12878_CLIP_2_Infplus_R60,B06_16_GM12878_CLIP_2_Infplus_R60,/home/gpratt/projects/encode/analysis/encode_s...,/home/gpratt/projects/encode/analysis/encode_s...,/projects/ps-yeolab3/encode/analysis/encode_ma...,/projects/ps-yeolab3/encode/analysis/encode_ma...,/projects/ps-yeolab3/encode/analysis/encode_ma...,/projects/ps-yeolab3/encode/analysis/encode_ma...,/projects/ps-yeolab3/encode/analysis/encode_ma...,/projects/ps-yeolab3/encode/analysis/encode_ma...
173,FOX2gel_pool1-22592619,RBFOX2_293XT_1to12_gel,RBFOX2,A01,A04,505,701,S1,/projects/ps-yeolab/seqdata/20150225_encode_CLIP,,...,A01_RBFOX2_293XT_1to12_gel_RBFOX2,A04_RBFOX2_293XT_1to12_gel_RBFOX2,/home/gpratt/projects/encode/analysis/encode_s...,/home/gpratt/projects/encode/analysis/encode_s...,/projects/ps-yeolab3/encode/analysis/encode_ma...,/projects/ps-yeolab3/encode/analysis/encode_ma...,/projects/ps-yeolab3/encode/analysis/encode_ma...,/projects/ps-yeolab3/encode/analysis/encode_ma...,/projects/ps-yeolab3/encode/analysis/encode_ma...,/projects/ps-yeolab3/encode/analysis/encode_ma...
215,283_CLIP,283_02,HNRNPA1,A04,F05,503,706,L002,/projects/ps-yeolab/seqdata/igm-storage1.ucsd....,,...,A04_283_02_HNRNPA1,F05_283_02_HNRNPA1,/home/gpratt/projects/encode/analysis/encode_s...,/home/gpratt/projects/encode/analysis/encode_s...,/projects/ps-yeolab3/encode/analysis/encode_ma...,/projects/ps-yeolab3/encode/analysis/encode_ma...,/projects/ps-yeolab3/encode/analysis/encode_ma...,/projects/ps-yeolab3/encode/analysis/encode_ma...,/projects/ps-yeolab3/encode/analysis/encode_ma...,/projects/ps-yeolab3/encode/analysis/encode_ma...
217,279_CLIP,279_01,FAM120A,A04,F05,503,702,L002,/projects/ps-yeolab/seqdata/igm-storage1.ucsd....,,...,A04_279_01_FAM120A,F05_279_01_FAM120A,/home/gpratt/projects/encode/analysis/encode_s...,/home/gpratt/projects/encode/analysis/encode_s...,/projects/ps-yeolab3/encode/analysis/encode_ma...,/projects/ps-yeolab3/encode/analysis/encode_ma...,/projects/ps-yeolab3/encode/analysis/encode_ma...,/projects/ps-yeolab3/encode/analysis/encode_ma...,/projects/ps-yeolab3/encode/analysis/encode_ma...,/projects/ps-yeolab3/encode/analysis/encode_ma...
232,246_CLIP,246_01,AUH,A03,G07,504,706,L003,/projects/ps-yeolab/seqdata/igm-storage1.ucsd....,,...,A03_246_01_AUH,G07_246_01_AUH,/home/gpratt/projects/encode/analysis/encode_s...,/home/gpratt/projects/encode/analysis/encode_s...,/projects/ps-yeolab3/encode/analysis/encode_ma...,/projects/ps-yeolab3/encode/analysis/encode_ma...,/projects/ps-yeolab3/encode/analysis/encode_ma...,/projects/ps-yeolab3/encode/analysis/encode_ma...,/projects/ps-yeolab3/encode/analysis/encode_ma...,/projects/ps-yeolab3/encode/analysis/encode_ma...


In [None]:
# Input samples with at least one of their ';'-separated fastq files missing
input_demuxed_present = input_manifest.processing_name.apply(
    lambda paths: all(os.path.exists(path) for path in paths.split(';')))
input_manifest[~input_demuxed_present]

Check whether the output files for each CLIP input sample exist
----

In [None]:
# Input samples whose input_bam path is not on disk
input_bam_present = input_manifest['input_bam'].apply(os.path.exists)
input_manifest[~input_bam_present]

In [None]:
# Input samples whose input_peaks path is not on disk
input_peaks_present = input_manifest['input_peaks'].apply(os.path.exists)
input_manifest[~input_peaks_present]

Check whether the merged output files for each individual CLIP sample exist
----

In [None]:
# Full (untruncated) table of CLIP samples whose merged_bam path is not on disk
merged_bam_present = clip_seq_manifest.merged_bam.apply(os.path.exists)
clip_without_merged_bam = clip_seq_manifest[~merged_bam_present]
HTML(clip_without_merged_bam.to_html())

In [None]:
# Full (untruncated) table of CLIP samples whose merged_peaks path is not on disk
merged_peaks_present = clip_seq_manifest.merged_peaks.apply(os.path.exists)
clip_without_merged_peaks = clip_seq_manifest[~merged_peaks_present]
HTML(clip_without_merged_peaks.to_html())

In [None]:
# Total number of CLIP samples in the manifest (row count)
clip_seq_manifest.shape[0]

In [173]:
# Dump both complete manifests to CSV, one file per manifest.
csv_exports = [
    (clip_seq_manifest, "/home/gpratt/Dropbox/encode_integration/for_eric/full_IP_file_list.csv"),
    (input_manifest, "/home/gpratt/Dropbox/encode_integration/for_eric/full_input_file_list.csv"),
]
for manifest, csv_path in csv_exports:
    manifest.to_csv(csv_path)

# Files that exist

In [174]:
# hg19 samples whose BAM and peak files are both on disk, then CLIP rows
# paired with the input rows that share their exp_id.
input_is_done = (input_manifest['input_bam'].apply(os.path.exists)
                 & input_manifest['input_peaks'].apply(os.path.exists)
                 & (input_manifest.species == "hg19"))
clip_is_done = (clip_seq_manifest.merged_bam.apply(os.path.exists)
                & clip_seq_manifest.merged_peaks.apply(os.path.exists)
                & (clip_seq_manifest.species == "hg19"))

v13_done_input = input_manifest[input_is_done]
v13_done_clip = clip_seq_manifest[clip_is_done]
merged_v13_done = v13_done_clip.merge(v13_done_input, left_on="exp_id", right_on="exp_id")

In [175]:
# NOTE(review): despite its name, from here on v13_done_clip holds the CLIP
# rows that are NOT finished hg19 samples (missing BAM, missing peaks, or a
# different species); it replaces the "done" frame of the same name.
clip_finished_hg19 = (clip_seq_manifest.merged_bam.apply(os.path.exists)
                      & clip_seq_manifest.merged_peaks.apply(os.path.exists)
                      & (clip_seq_manifest.species == "hg19"))
v13_done_clip = clip_seq_manifest[~clip_finished_hg19]

In [176]:
# Render the whole frame as an HTML table so no rows or columns are elided
v13_done_clip_table = v13_done_clip.to_html()
HTML(v13_done_clip_table)

Unnamed: 0,Hiseq_file_name,ENCODE_ID,RBP,inline_1,inline_2,index_1,index_2,Lane,file_location,unmerged_location,original_file_name,is_encode,cell_type,hiseq_run_date,randomer_length,Method_Paper_flag,species,is_4000,exp_id,r1,r2,unified_location,date,has_new_barcodes,inline_name_1,inline_name_2,processing_name_1,processing_name_2,merged_base_name,merged_bam,merged_peaks,merged_qc,pos_bw,neg_bw
482,374_CLIP,374_01,APOBEC3C,A01,B06,506,706,L005,/projects/ps-yeolab/seqdata/igm-storage1.ucsd....,,,True,,,10,False,hg19,False,374,/projects/ps-yeolab/seqdata/igm-storage1.ucsd....,/projects/ps-yeolab/seqdata/igm-storage1.ucsd....,/projects/ps-yeolab/seqdata/igm-storage1.ucsd....,2015-07-31,False,A01_374_01_APOBEC3C,B06_374_01_APOBEC3C,/home/gpratt/projects/encode/analysis/encode_s...,/home/gpratt/projects/encode/analysis/encode_s...,/projects/ps-yeolab3/encode/analysis/encode_ma...,/projects/ps-yeolab3/encode/analysis/encode_ma...,/projects/ps-yeolab3/encode/analysis/encode_ma...,/projects/ps-yeolab3/encode/analysis/encode_ma...,/projects/ps-yeolab3/encode/analysis/encode_ma...,/projects/ps-yeolab3/encode/analysis/encode_ma...
483,374_CLIP,374_02,APOBEC3C,C01,D08fixed,506,706,L005,/projects/ps-yeolab/seqdata/igm-storage1.ucsd....,,,True,,,10,False,hg19,False,374,/projects/ps-yeolab/seqdata/igm-storage1.ucsd....,/projects/ps-yeolab/seqdata/igm-storage1.ucsd....,/projects/ps-yeolab/seqdata/igm-storage1.ucsd....,2015-07-31,False,C01_374_02_APOBEC3C,D08fixed_374_02_APOBEC3C,/home/gpratt/projects/encode/analysis/encode_s...,/home/gpratt/projects/encode/analysis/encode_s...,/projects/ps-yeolab3/encode/analysis/encode_ma...,/projects/ps-yeolab3/encode/analysis/encode_ma...,/projects/ps-yeolab3/encode/analysis/encode_ma...,/projects/ps-yeolab3/encode/analysis/encode_ma...,/projects/ps-yeolab3/encode/analysis/encode_ma...,/projects/ps-yeolab3/encode/analysis/encode_ma...
541,YS2_WHC_CLIP_fox3_2,YS1_fox3_1,Rbfox3,A03,G07,504,704,L007,/projects/ps-yeolab/seqdata/igm-storage1.ucsd....,,,False,,,5,False,mm9,False,YS1,/projects/ps-yeolab/seqdata/igm-storage1.ucsd....,/projects/ps-yeolab/seqdata/igm-storage1.ucsd....,/projects/ps-yeolab/seqdata/igm-storage1.ucsd....,2015-07-31,False,A03_YS1_fox3_1_Rbfox3,G07_YS1_fox3_1_Rbfox3,/home/gpratt/projects/encode/analysis/encode_s...,/home/gpratt/projects/encode/analysis/encode_s...,/projects/ps-yeolab2/encode/analysis/encode_mo...,/projects/ps-yeolab2/encode/analysis/encode_mo...,/projects/ps-yeolab2/encode/analysis/encode_mo...,/projects/ps-yeolab2/encode/analysis/encode_mo...,/projects/ps-yeolab2/encode/analysis/encode_mo...,/projects/ps-yeolab2/encode/analysis/encode_mo...
542,YS2_WHC_CLIP_fox3_2,YS1_fox3_2,Rbfox3,A04,F05,504,704,L007,/projects/ps-yeolab/seqdata/igm-storage1.ucsd....,,,False,,,5,False,mm9,False,YS1,/projects/ps-yeolab/seqdata/igm-storage1.ucsd....,/projects/ps-yeolab/seqdata/igm-storage1.ucsd....,/projects/ps-yeolab/seqdata/igm-storage1.ucsd....,2015-07-31,False,A04_YS1_fox3_2_Rbfox3,F05_YS1_fox3_2_Rbfox3,/home/gpratt/projects/encode/analysis/encode_s...,/home/gpratt/projects/encode/analysis/encode_s...,/projects/ps-yeolab2/encode/analysis/encode_mo...,/projects/ps-yeolab2/encode/analysis/encode_mo...,/projects/ps-yeolab2/encode/analysis/encode_mo...,/projects/ps-yeolab2/encode/analysis/encode_mo...,/projects/ps-yeolab2/encode/analysis/encode_mo...,/projects/ps-yeolab2/encode/analysis/encode_mo...
607,SF3B4_Cortex2_CLIP,EV104,SF3B4,A01,B06,506,701,L007,/projects/ps-yeolab/seqdata/igm-storage1.ucsd....,,,False,,,10,False,mm9,False,EV104,/projects/ps-yeolab/seqdata/igm-storage1.ucsd....,/projects/ps-yeolab/seqdata/igm-storage1.ucsd....,/projects/ps-yeolab/seqdata/igm-storage1.ucsd....,2015-10-02,False,A01_EV104_SF3B4,B06_EV104_SF3B4,/home/gpratt/projects/encode/analysis/encode_s...,/home/gpratt/projects/encode/analysis/encode_s...,/projects/ps-yeolab2/encode/analysis/encode_mo...,/projects/ps-yeolab2/encode/analysis/encode_mo...,/projects/ps-yeolab2/encode/analysis/encode_mo...,/projects/ps-yeolab2/encode/analysis/encode_mo...,/projects/ps-yeolab2/encode/analysis/encode_mo...,/projects/ps-yeolab2/encode/analysis/encode_mo...
608,SF3B4_Cortex4_CLIP,EV105,SF3B4,C01,D08fixed,506,702,L007,/projects/ps-yeolab/seqdata/igm-storage1.ucsd....,,,False,,,10,False,mm9,False,EV105,/projects/ps-yeolab/seqdata/igm-storage1.ucsd....,/projects/ps-yeolab/seqdata/igm-storage1.ucsd....,/projects/ps-yeolab/seqdata/igm-storage1.ucsd....,2015-10-02,False,C01_EV105_SF3B4,D08fixed_EV105_SF3B4,/home/gpratt/projects/encode/analysis/encode_s...,/home/gpratt/projects/encode/analysis/encode_s...,/projects/ps-yeolab2/encode/analysis/encode_mo...,/projects/ps-yeolab2/encode/analysis/encode_mo...,/projects/ps-yeolab2/encode/analysis/encode_mo...,/projects/ps-yeolab2/encode/analysis/encode_mo...,/projects/ps-yeolab2/encode/analysis/encode_mo...,/projects/ps-yeolab2/encode/analysis/encode_mo...
611,SF3B4_Cerebellum2_CLIP,EV88,SF3B4,A01,B06,506,705,L003,/projects/ps-yeolab/seqdata/igm-storage1.ucsd....,,,False,,,10,False,mm9,False,EV88,/projects/ps-yeolab/seqdata/igm-storage1.ucsd....,/projects/ps-yeolab/seqdata/igm-storage1.ucsd....,/projects/ps-yeolab/seqdata/igm-storage1.ucsd....,2015-10-02,False,A01_EV88_SF3B4,B06_EV88_SF3B4,/home/gpratt/projects/encode/analysis/encode_s...,/home/gpratt/projects/encode/analysis/encode_s...,/projects/ps-yeolab2/encode/analysis/encode_mo...,/projects/ps-yeolab2/encode/analysis/encode_mo...,/projects/ps-yeolab2/encode/analysis/encode_mo...,/projects/ps-yeolab2/encode/analysis/encode_mo...,/projects/ps-yeolab2/encode/analysis/encode_mo...,/projects/ps-yeolab2/encode/analysis/encode_mo...
612,SF3B4_Cerebellum4_CLIP,EV89,SF3B4,C01,D08fixed,506,706,L003,/projects/ps-yeolab/seqdata/igm-storage1.ucsd....,,,False,,,10,False,mm9,False,EV89,/projects/ps-yeolab/seqdata/igm-storage1.ucsd....,/projects/ps-yeolab/seqdata/igm-storage1.ucsd....,/projects/ps-yeolab/seqdata/igm-storage1.ucsd....,2015-10-02,False,C01_EV89_SF3B4,D08fixed_EV89_SF3B4,/home/gpratt/projects/encode/analysis/encode_s...,/home/gpratt/projects/encode/analysis/encode_s...,/projects/ps-yeolab2/encode/analysis/encode_mo...,/projects/ps-yeolab2/encode/analysis/encode_mo...,/projects/ps-yeolab2/encode/analysis/encode_mo...,/projects/ps-yeolab2/encode/analysis/encode_mo...,/projects/ps-yeolab2/encode/analysis/encode_mo...,/projects/ps-yeolab2/encode/analysis/encode_mo...
615,SF3B4_Testis2_CLIP,EV92,SF3B4,A01,B06,502,701,L004,/projects/ps-yeolab/seqdata/igm-storage1.ucsd....,,,False,,,10,False,mm9,False,EV92,/projects/ps-yeolab/seqdata/igm-storage1.ucsd....,/projects/ps-yeolab/seqdata/igm-storage1.ucsd....,/projects/ps-yeolab/seqdata/igm-storage1.ucsd....,2015-10-02,False,A01_EV92_SF3B4,B06_EV92_SF3B4,/home/gpratt/projects/encode/analysis/encode_s...,/home/gpratt/projects/encode/analysis/encode_s...,/projects/ps-yeolab2/encode/analysis/encode_mo...,/projects/ps-yeolab2/encode/analysis/encode_mo...,/projects/ps-yeolab2/encode/analysis/encode_mo...,/projects/ps-yeolab2/encode/analysis/encode_mo...,/projects/ps-yeolab2/encode/analysis/encode_mo...,/projects/ps-yeolab2/encode/analysis/encode_mo...
616,SF3B4_Ovary4_CLIP,EV93,SF3B4,C01,D08fixed,502,702,L004,/projects/ps-yeolab/seqdata/igm-storage1.ucsd....,,,False,,,10,False,mm9,False,EV93,/projects/ps-yeolab/seqdata/igm-storage1.ucsd....,/projects/ps-yeolab/seqdata/igm-storage1.ucsd....,/projects/ps-yeolab/seqdata/igm-storage1.ucsd....,2015-10-02,False,C01_EV93_SF3B4,D08fixed_EV93_SF3B4,/home/gpratt/projects/encode/analysis/encode_s...,/home/gpratt/projects/encode/analysis/encode_s...,/projects/ps-yeolab2/encode/analysis/encode_mo...,/projects/ps-yeolab2/encode/analysis/encode_mo...,/projects/ps-yeolab2/encode/analysis/encode_mo...,/projects/ps-yeolab2/encode/analysis/encode_mo...,/projects/ps-yeolab2/encode/analysis/encode_mo...,/projects/ps-yeolab2/encode/analysis/encode_mo...


# Upload OK'ed files to sauron

In [177]:
# Experiment ids (as strings) that have been approved for upload.
# str.split() with no argument is used on purpose: the literal starts with a
# newline, and splitting on "\n" would turn that into an empty-string id that
# then takes part in every isin(...) filter built from this list.
ids_good_to_go_old = """
204
205
211
215
216
218
220
222
223
224
226
230
236
240
241
242
243
244
245
246
247
249
256
258
260
262
267
278
280
281
283
290
291
203
271
206
209
227
228
237
272
275
279
282
289
291
292
297
298
301
302
312
315
316
321
325
326
331
332
240
338
339
340
341
342
344
345
350
352
353
354
358
366
367
368
384
387
388
393
405
508
497
495
494
477
465
464
452
415
417
484
439
440
441
444
445
447
450
466
478
311
480
481
492
506
383
376
414
425
437
460
461
470
483
507
509
514
516
530
537
539
540
543
544
546
552
553
556
558
560
566
603
235
390
491
501
503
522
529
531
548
550
567
570
577
582
584
589
592
595
596
614
617
624
625
626
628
641
533
538
545
551
629
631
285
493
571
572
575
610
632
649
650
652
654
655
658
668
676
677
678
679
682
684
693
695
696""".split()

# Newly approved ids go here; they are appended to the historical list below.
ids_good_to_go = [] # """285""".split("\n")

# Experiments for which only the "4000" version of the dataset is uploaded
# (consulted by get_4000).
ids_good_to_go_4000 = """235
390
285""".split()

# A real list (not a lazy map object) so it can be searched repeatedly.
ids_good_to_go_4000 = [str(exp_id) for exp_id in ids_good_to_go_4000]
ids_good_to_go = ids_good_to_go_old + ids_good_to_go

In [None]:
merged_fastq_dir = "/home/gpratt/projects/encode/data/combined_fastqs_for_submission"
#analysis_dir = "/projects/ps-yeolab2/encode/analysis/encode_v12"
#analysis_dir = "/projects/ps-yeolab/encode/analysis/encode_v13"
analysis_dir = "/projects/ps-yeolab3/encode/analysis/encode_master/"

analysis_dir_grch38 = "/projects/ps-yeolab3/encode/analysis/encode_GRCh38_v1/"
peaks_dir_grch38 = "/home/elvannostrand/data/clip/CLIPseq_analysis/ENCODE_hg38_20160901/"


def _sample_path(directory, suffix):
    """Return a row function giving <directory>/<ENCODE_ID>_<RBP><suffix>."""
    def build(row):
        return os.path.join(directory, "{}_{}{}".format(row.ENCODE_ID, row.RBP, suffix))
    return build


def _grch38_peak_path(suffix):
    """Return a row function giving <peaks_dir_grch38>/<ENCODE_ID>.basedon_<ENCODE_ID><suffix>."""
    def build(row):
        return os.path.join(peaks_dir_grch38, "{0}.basedon_{0}{1}".format(row.ENCODE_ID, suffix))
    return build


def _input_path(directory, suffix):
    """Return a row function giving <directory>/<basename of base_name><suffix>.

    os.path.join is given the directory and file name separately, so the
    result no longer depends on `directory` ending with a slash.
    """
    def build(row):
        return os.path.join(directory, "{}{}".format(os.path.basename(row.base_name), suffix))
    return build


# (column, file suffix) pairs for the merged fastq files; both manifests get
# these columns, all located in merged_fastq_dir.
fastq_columns = [
    ('out_fastq_r1', ".R1.fastq.gz"),
    ('out_fastq_r2', ".R2.fastq.gz"),
    ('out_trimmed_fastq_r1', ".trimmed.R1.fastq.gz"),
    ('out_trimmed_fastq_r2', ".trimmed.R2.fastq.gz"),
    ('out_trimmed_rmRep_fastq_r1', ".trimmed.rmRep.R1.fastq"),
    ('out_trimmed_rmRep_fastq_r2', ".trimmed.rmRep.R2.fastq"),
]

# (column, row function) pairs specific to each manifest. The list order is
# the order in which the columns are appended to the frame.
clip_columns = [
    ('out_bam', _sample_path(analysis_dir, ".merged.bam")),
    ('out_bam_grch38', _sample_path(analysis_dir_grch38, ".merged.bam")),
    ('out_peaks', _sample_path(analysis_dir, ".merged.r2.peaks.fixed.bb")),
    ('out_peaks_grch38',
     _grch38_peak_path(".peaks.l2inputnormnew.bed.compressed.bed.narrowPeak.bed.bb")),
    ('out_peaks_bed_grch38',
     _grch38_peak_path(".peaks.l2inputnormnew.bed.compressed.bed.narrowPeak.encode.bed.gz")),
    ('out_pos_bw', _sample_path(analysis_dir, ".merged.r2.norm.pos.bw")),
    ('out_neg_bw', _sample_path(analysis_dir, ".merged.r2.norm.neg.bw")),
    ('out_pos_bw_grch38', _sample_path(analysis_dir_grch38, ".merged.r2.norm.pos.bw")),
    ('out_neg_bw_grch38', _sample_path(analysis_dir_grch38, ".merged.r2.norm.neg.bw")),
]

input_columns = [
    ('out_pos_bw',
     _input_path(analysis_dir, ".adapterTrim.round2.rmRep.rmDup.sorted.r2.norm.pos.bw")),
    ('out_neg_bw',
     _input_path(analysis_dir, ".adapterTrim.round2.rmRep.rmDup.sorted.r2.norm.neg.bw")),
    ('out_pos_bw_grch38',
     _input_path(analysis_dir_grch38, ".adapterTrim.round2.rmRep.rmDup.sorted.r2.norm.pos.bw")),
    ('out_neg_bw_grch38',
     _input_path(analysis_dir_grch38, ".adapterTrim.round2.rmRep.rmDup.sorted.r2.norm.neg.bw")),
    ('out_bam',
     _input_path(analysis_dir, ".adapterTrim.round2.rmRep.rmDup.sorted.bam")),
    ('out_bam_grch38',
     _input_path(analysis_dir_grch38, ".adapterTrim.round2.rmRep.rmDup.sorted.bam")),
]

# Add the columns to each manifest in place: fastq columns first, then the
# manifest-specific ones.
for manifest, specific_columns in [(clip_seq_manifest, clip_columns),
                                   (input_manifest, input_columns)]:
    shared_columns = [(column, _sample_path(merged_fastq_dir, suffix))
                      for column, suffix in fastq_columns]
    for column, build_path in shared_columns + specific_columns:
        manifest[column] = manifest.apply(build_path, axis=1)

In [None]:
def get_4000(row):
    """Row filter for experiments flagged as HiSeq 4000 datasets.

    Experiments whose exp_id is listed in ids_good_to_go_4000 keep only the rows
    whose ENCODE_ID contains "4000"; every other row is kept.
    """
    if row.exp_id not in ids_good_to_go_4000:
        return True
    return "4000" in row.ENCODE_ID

In [None]:
# Keep only the experiments approved for submission (and, for 4000-flagged experiments,
# only their _4000 rows). .copy() gives each result its own data, so the exp_id cast
# below modifies these frames rather than a slice of the source manifests.
good_to_go_clip = clip_seq_manifest[
    clip_seq_manifest.exp_id.isin(ids_good_to_go) & clip_seq_manifest.apply(get_4000, axis=1)
].copy()
good_to_go_input = input_manifest[
    input_manifest.exp_id.isin(ids_good_to_go) & input_manifest.apply(get_4000, axis=1)
].copy()

good_to_go_input['exp_id'] = good_to_go_input.exp_id.astype(float)
good_to_go_clip['exp_id'] = good_to_go_clip.exp_id.astype(float)

In [None]:
# CLIP rows whose peak file is not present on disk
good_to_go_clip.loc[~good_to_go_clip["out_peaks"].apply(os.path.exists)]

# Merge fastq files for submission to sauron

In [None]:
# Directory the merge cells below write their combined FASTQ files into.
encode_submission_dir = "/home/gpratt/projects/encode/data/combined_fastqs_for_submission"

In [None]:
for name, row in good_to_go_clip.iterrows():
    r1_out_file = os.path.join(encode_submission_dir, "{}_{}.R1.fastq.gz".format(row.ENCODE_ID, row.RBP))
    r2_out_file = os.path.join(encode_submission_dir, "{}_{}.R2.fastq.gz".format(row.ENCODE_ID, row.RBP)) 

    barcode_1_read_1 = row.processing_name_1.split(";")[0]
    barcode_2_read_1 = row.processing_name_2.split(";")[0]

    barcode_1_read_2 = row.processing_name_1.split(";")[1]
    barcode_2_read_2 = row.processing_name_2.split(";")[1]
    
    if not os.path.exists(barcode_1_read_1):
        print barcode_1_read_1
    if not  os.path.exists(barcode_2_read_1):
        print barcode_2_read_1
    if not  os.path.exists(barcode_1_read_2):
        print barcode_1_read_2
    if not  os.path.exists(barcode_2_read_2):
        print barcode_2_read_2
        
    try:
        with open(r1_out_file, 'wb') as out_file, open(barcode_1_read_1) as fn_1, open(barcode_2_read_1) as fn_2:
            shutil.copyfileobj(fn_1, out_file)
            shutil.copyfileobj(fn_2, out_file)
    except IOError as e:
        print e
    try:
        with open(r2_out_file, 'wb') as out_file, open(barcode_1_read_2) as fn_1, open(barcode_2_read_2) as fn_2:
            shutil.copyfileobj(fn_1, out_file)
            shutil.copyfileobj(fn_2, out_file)
    except IOError as e:
        print e

In [None]:
for name, row in good_to_go_input.iterrows():
    r1_out_file = os.path.join(encode_submission_dir, "{}_{}.R1.fastq.gz".format(row.ENCODE_ID, row.RBP))
    r2_out_file = os.path.join(encode_submission_dir, "{}_{}.R2.fastq.gz".format(row.ENCODE_ID, row.RBP)) 
    barcode_1_read_1 = row.processing_name.split(";")[0]
    barcode_1_read_2 = row.processing_name.split(";")[1]
    
    print r1_out_file, barcode_1_read_1
    print r2_out_file, barcode_1_read_2

    try:
        with open(r1_out_file, 'wb') as out_file, open(barcode_1_read_1) as fn_1:
            shutil.copyfileobj(fn_1, out_file)
    except IOError as e:
        print e
        
    try:
        with open(r2_out_file, 'wb') as out_file, open(barcode_1_read_2) as fn_1:
            shutil.copyfileobj(fn_1, out_file)
    except IOError as e:
        print e

# Merge trimmed fastq files for sharing with collaborators

In [None]:
for name, row in good_to_go_clip.iterrows():
    r1_out_file = os.path.join(encode_submission_dir, "{}_{}.trimmed.rmRep.R1.fastq".format(row.ENCODE_ID, row.RBP))
    r2_out_file = os.path.join(encode_submission_dir, "{}_{}.trimmed.rmRep.R2.fastq".format(row.ENCODE_ID, row.RBP)) 
    #print r1_out_file

    barcode_1_read_1_tmp = row.processing_name_1.split(";")[0]
    barcode_2_read_1_tmp = row.processing_name_2.split(";")[0]
    
    barcode_1_read_1 = os.path.join(analysis_dir, ".".join(os.path.basename(barcode_1_read_1_tmp).split(".")[:-2]) + ".adapterTrim.round2.rep.bamUnmapped.out.sorted.mate1")
    barcode_2_read_1 = os.path.join(analysis_dir, ".".join(os.path.basename(barcode_2_read_1_tmp).split(".")[:-2]) + ".adapterTrim.round2.rep.bamUnmapped.out.sorted.mate1")
    
    barcode_1_read_2_tmp = row.processing_name_1.split(";")[1]
    barcode_2_read_2_tmp = row.processing_name_2.split(";")[1]
    
    barcode_1_read_2 = os.path.join(analysis_dir, ".".join(os.path.basename(barcode_1_read_1_tmp).split(".")[:-2]) + ".adapterTrim.round2.rep.bamUnmapped.out.sorted.mate2")
    barcode_2_read_2 = os.path.join(analysis_dir, ".".join(os.path.basename(barcode_2_read_1_tmp).split(".")[:-2]) + ".adapterTrim.round2.rep.bamUnmapped.out.sorted.mate2")

    
    if not os.path.exists(barcode_1_read_1):
        print barcode_1_read_1
    if not  os.path.exists(barcode_2_read_1):
        print barcode_2_read_1
    if not  os.path.exists(barcode_1_read_2):
        print barcode_1_read_2
    if not  os.path.exists(barcode_2_read_2):
        print barcode_2_read_2
        
    try:
        with open(r1_out_file, 'wb') as out_file, open(barcode_1_read_1) as fn_1, open(barcode_2_read_1) as fn_2:
            shutil.copyfileobj(fn_1, out_file)
            shutil.copyfileobj(fn_2, out_file)
    except IOError as e:
        print e
    try:
        with open(r2_out_file, 'wb') as out_file, open(barcode_1_read_2) as fn_1, open(barcode_2_read_2) as fn_2:
            shutil.copyfileobj(fn_1, out_file)
            shutil.copyfileobj(fn_2, out_file)
    except IOError as e:
        print e

In [None]:
for name, row in good_to_go_input.iterrows():
    r1_out_file = os.path.join(encode_submission_dir, "{}_{}.trimmed.rmRep.R1.fastq".format(row.ENCODE_ID, row.RBP))
    r2_out_file = os.path.join(encode_submission_dir, "{}_{}.trimmed.rmRep.R2.fastq".format(row.ENCODE_ID, row.RBP)) 
    barcode_1_read_1_tmp = row.processing_name.split(";")[0]
    barcode_1_read_2_tmp = row.processing_name.split(";")[1]

    barcode_1_read_1 = os.path.join(analysis_dir, ".".join(os.path.basename(barcode_1_read_1_tmp).split(".")[:-2]) + ".adapterTrim.round2.rep.bamUnmapped.out.sorted.mate1")
    barcode_1_read_2 = os.path.join(analysis_dir, ".".join(os.path.basename(barcode_1_read_1_tmp).split(".")[:-2]) + ".adapterTrim.round2.rep.bamUnmapped.out.sorted.mate2")

    
    print r1_out_file, barcode_1_read_1
    print r2_out_file, barcode_1_read_2

    try:
        with open(r1_out_file, 'wb') as out_file, open(barcode_1_read_1) as fn_1:
            shutil.copyfileobj(fn_1, out_file)
    except IOError as e:
        print e
        
    try:
        with open(r2_out_file, 'wb') as out_file, open(barcode_1_read_2) as fn_1:
            shutil.copyfileobj(fn_1, out_file)
    except IOError as e:
        print e

In [None]:
for name, row in good_to_go_clip.iterrows():
    r1_out_file = os.path.join(encode_submission_dir, "{}_{}.trimmed.R1.fastq.gz".format(row.ENCODE_ID, row.RBP))
    r2_out_file = os.path.join(encode_submission_dir, "{}_{}.trimmed.R2.fastq.gz".format(row.ENCODE_ID, row.RBP)) 
    #print r1_out_file

    barcode_1_read_1 = row.processing_name_1.split(";")[0]
    barcode_2_read_1 = row.processing_name_2.split(";")[0]
    
    barcode_1_read_1 = os.path.join(analysis_dir, ".".join(os.path.basename(barcode_1_read_1).split(".")[:-2]) + ".adapterTrim.round2.fastq.gz")
    barcode_2_read_1 = os.path.join(analysis_dir, ".".join(os.path.basename(barcode_2_read_1).split(".")[:-2]) + ".adapterTrim.round2.fastq.gz")
    
    barcode_1_read_2 = row.processing_name_1.split(";")[1]
    barcode_2_read_2 = row.processing_name_2.split(";")[1]
    
    barcode_1_read_2 = os.path.join(analysis_dir, ".".join(os.path.basename(barcode_1_read_2).split(".")[:-2]) + ".adapterTrim.round2.fastq.gz")
    barcode_2_read_2 = os.path.join(analysis_dir, ".".join(os.path.basename(barcode_2_read_2).split(".")[:-2]) + ".adapterTrim.round2.fastq.gz")

    
    if not os.path.exists(barcode_1_read_1):
        print barcode_1_read_1
    if not  os.path.exists(barcode_2_read_1):
        print barcode_2_read_1
    if not  os.path.exists(barcode_1_read_2):
        print barcode_1_read_2
    if not  os.path.exists(barcode_2_read_2):
        print barcode_2_read_2
        
    try:
        with open(r1_out_file, 'wb') as out_file, open(barcode_1_read_1) as fn_1, open(barcode_2_read_1) as fn_2:
            shutil.copyfileobj(fn_1, out_file)
            shutil.copyfileobj(fn_2, out_file)
    except IOError as e:
        print e
    try:
        with open(r2_out_file, 'wb') as out_file, open(barcode_1_read_2) as fn_1, open(barcode_2_read_2) as fn_2:
            shutil.copyfileobj(fn_1, out_file)
            shutil.copyfileobj(fn_2, out_file)
    except IOError as e:
        print e

In [None]:
for name, row in good_to_go_input.iterrows():
    r1_out_file = os.path.join(encode_submission_dir, "{}_{}.trimmed.R1.fastq.gz".format(row.ENCODE_ID, row.RBP))
    r2_out_file = os.path.join(encode_submission_dir, "{}_{}.trimmed.R2.fastq.gz".format(row.ENCODE_ID, row.RBP)) 
    barcode_1_read_1_tmp = row.processing_name.split(";")[0]
    barcode_1_read_2_tmp = row.processing_name.split(";")[1]

    barcode_1_read_1 = os.path.join(analysis_dir, ".".join(os.path.basename(barcode_1_read_1_tmp).split(".")[:-2]) + ".adapterTrim.round2.fastq.gz")
    barcode_1_read_2 = os.path.join(analysis_dir, ".".join(os.path.basename(barcode_1_read_2_tmp).split(".")[:-2]) + ".adapterTrim.round2.fastq.gz")

    try:
        with open(r1_out_file, 'wb') as out_file, open(barcode_1_read_1) as fn_1:
            shutil.copyfileobj(fn_1, out_file)
    except IOError as e:
        print e
        
    try:
        with open(r2_out_file, 'wb') as out_file, open(barcode_1_read_2) as fn_1:
            shutil.copyfileobj(fn_1, out_file)
    except IOError as e:
        print e

# Upload files to sauron or S3

In [None]:
import math
from filechunkio import FileChunkIO
from boto.s3.connection import S3Connection
# Credentials must not live in the notebook. With no arguments boto resolves them from
# the AWS_ACCESS_KEY_ID / AWS_SECRET_ACCESS_KEY environment variables or a boto config
# file. NOTE: the key pair that used to be embedded here should be treated as exposed
# and revoked.
conn = S3Connection()
b = conn.get_bucket("sauron-yeo")

def upload_file(source_path, dir_path="for_encode"):
    if not os.path.exists(source_path):
        print source_path, "doesn't exist"
        return
    source_size = os.stat(source_path).st_size
    s3_path = os.path.join(dir_path, os.path.basename(source_path))
    mp = b.initiate_multipart_upload(s3_path)
    chunk_size = 52428800
    chunk_count = int(math.ceil(source_size / float(chunk_size)))

    for i in range(chunk_count):
         offset = chunk_size * i
         bytes = min(chunk_size, source_size - offset)
         with FileChunkIO(source_path, 'r', offset=offset,
                             bytes=bytes) as fp:
             mp.upload_part_from_file(fp, part_num=i + 1)
    mp.complete_upload()
    
    possible_key = b.get_key(s3_path)
    possible_key.set_acl('public-read')


In [None]:
# Upload the merged input FASTQs, read 1 first and then read 2
# (this used to be an scp to sauron.ucsd.edu:/rpool/www/for_encode).
for fastq_paths in (good_to_go_input.out_fastq_r1, good_to_go_input.out_fastq_r2):
    for fastq_path in fastq_paths:
        upload_file(fastq_path)

In [116]:
# Upload the merged CLIP FASTQs, read 1 first and then read 2
# (this used to be an scp to sauron.ucsd.edu:/rpool/www/for_encode).
for fastq_paths in (good_to_go_clip.out_fastq_r1, good_to_go_clip.out_fastq_r2):
    for fastq_path in fastq_paths:
        upload_file(fastq_path)

In [121]:
# Spot check: trimmed read-1 path of the input row with index label 2.
good_to_go_input.out_trimmed_fastq_r1[2]

'/home/gpratt/projects/encode/data/combined_fastqs_for_submission/222_INPUT_HNRNPM.trimmed.R1.fastq.gz'

In [148]:
# Same spot check as the previous cell (duplicate).
good_to_go_input.out_trimmed_fastq_r1[2]

'/home/gpratt/projects/encode/data/combined_fastqs_for_submission/222_INPUT_HNRNPM.trimmed.R1.fastq.gz'

In [None]:
# for fn in good_to_go_input.out_trimmed_fastq_r1:
#     upload_file(fn)
# for fn in good_to_go_input.out_trimmed_fastq_r2:
#     upload_file(fn)    
# Upload the adapter-trimmed CLIP FASTQs, read 1 first and then read 2.
for trimmed_paths in (good_to_go_clip.out_trimmed_fastq_r1, good_to_go_clip.out_trimmed_fastq_r2):
    for trimmed_path in trimmed_paths:
        upload_file(trimmed_path)
# for fn in good_to_go_input.out_trimmed_rmRep_fastq_r1:
#     upload_file(fn)
# for fn in good_to_go_input.out_trimmed_rmRep_fastq_r2:
#     upload_file(fn)    
# for fn in good_to_go_clip.out_trimmed_rmRep_fastq_r1:
#     upload_file(fn)
# for fn in good_to_go_clip.out_trimmed_rmRep_fastq_r2:
#     upload_file(fn)

In [113]:
# Upload CLIP BAMs, then input BAMs, then CLIP peak files
# (this used to be an scp to sauron.ucsd.edu:/rpool/www/for_encode).
for result_paths in (good_to_go_clip.out_bam, good_to_go_input.out_bam, good_to_go_clip.out_peaks):
    for result_path in result_paths:
        upload_file(result_path)

In [139]:
# Bucket key prefix passed to upload_file for the GRCh38 files.
grch38_dir = "for_encode_grch38"

In [None]:
# for fn in good_to_go_clip.out_bam_grch38:
#     #!scp $fn sauron.ucsd.edu:/rpool/www/for_encode
#     #upload_file(fn, grch38_dir)
#     possible_key = b.get_key('{}/{}'.format(grch38_dir, os.path.basename(fn)))
#     possible_key.set_acl('public-read')

# Upload the GRCh38 input BAMs under the GRCh38 prefix
# (this used to be an scp to sauron.ucsd.edu:/rpool/www/for_encode).
for bam_path in good_to_go_input.out_bam_grch38:
    upload_file(bam_path, grch38_dir)

# for fn in good_to_go_clip.out_peaks_bed_grch38:
#     #!scp $fn sauron.ucsd.edu:/rpool/www/for_encode
#     upload_file(fn, grch38_dir)
        
# for fn in good_to_go_clip.out_peaks_grch38:
#     #!scp $fn sauron.ucsd.edu:/rpool/www/for_encode
#     upload_file(fn, grch38_dir)

In [140]:
# Display the GRCh38 prefix currently in use.
grch38_dir

'for_encode_grch38'

In [None]:
# Upload the strand-specific bigWig tracks: hg19 tracks go to the default prefix,
# GRCh38 tracks to grch38_dir. Order is input then CLIP, positive strand then negative.
# (This used to be an scp to sauron.ucsd.edu:/rpool/www/for_encode.)
for hg19_tracks in (good_to_go_input.out_pos_bw,
                    good_to_go_input.out_neg_bw,
                    good_to_go_clip.out_pos_bw,
                    good_to_go_clip.out_neg_bw):
    for track_path in hg19_tracks:
        upload_file(track_path)

for grch38_tracks in (good_to_go_input.out_pos_bw_grch38,
                      good_to_go_input.out_neg_bw_grch38,
                      good_to_go_clip.out_pos_bw_grch38,
                      good_to_go_clip.out_neg_bw_grch38):
    for track_path in grch38_tracks:
        upload_file(track_path, grch38_dir)

In [None]:
# Make every key directly under the two submission prefixes publicly readable.
# The bucket is listed once; the earlier version walked the full listing once per prefix.
public_dirs = ("for_encode", "for_encode_grch38")
for key in conn.get_bucket("sauron-yeo"):
    if os.path.dirname(key.key) in public_dirs:
        key.make_public()

# Create report to paste into Xintao's spreadsheet

In [57]:
# Stack CLIP and input rows into one table with a fresh 0..n-1 index; columns present
# in only one of the two manifests are kept (row-wise outer concat is the default).
merged_good_to_go = pd.concat(
    [good_to_go_clip, good_to_go_input],
    ignore_index=True,
)

In [58]:
# Row count before the exclusions applied in the next cell.
len(merged_good_to_go)

566

In [59]:
# Drop the samples that must not appear in the report.
merged_good_to_go = merged_good_to_go[
    ~merged_good_to_go["Hiseq_file_name"].isin([
        '285_CLIP',
        '632_CLIP_S17',
        '285_INPUT',
        '632_INPUT_S18',
        '283_INPUT',
    ])
]

In [60]:
# Public base URLs of the two bucket prefixes the files are uploaded to.
# The old host-based location is kept below for reference.
#sauron_url = "sauron.ucsd.edu/for_encode"
sauron_grch38_url = "https://s3-us-west-1.amazonaws.com/sauron-yeo/for_encode_grch38"
sauron_url = "https://s3-us-west-1.amazonaws.com/sauron-yeo/for_encode"

In [61]:
def to_sauron_url(fn):
    """Map a local file path to its public URL under sauron_url.

    Only the file name is kept. Non-string cells (e.g. NaN for a missing file) yield
    np.nan; os.path.basename signals those with AttributeError or TypeError depending
    on the interpreter, so both are handled.
    """
    try:
        file_name = os.path.basename(fn)
    except (AttributeError, TypeError):
        return np.nan
    # URLs always use "/", independent of the local path separator
    return sauron_url.rstrip("/") + "/" + file_name
    
def to_sauron_grch38_url(fn):
    """Map a local file path to its public URL under sauron_grch38_url.

    Only the file name is kept. Non-string cells (e.g. NaN for a missing file) yield
    np.nan; os.path.basename signals those with AttributeError or TypeError depending
    on the interpreter, so both are handled.
    """
    try:
        file_name = os.path.basename(fn)
    except (AttributeError, TypeError):
        return np.nan
    # URLs always use "/", independent of the local path separator
    return sauron_grch38_url.rstrip("/") + "/" + file_name

In [62]:
# Replace local paths with the public URLs the files were uploaded to. Each column is
# converted from itself; the GRCh38 bigWig columns used to be derived from the hg19
# out_pos_bw / out_neg_bw columns, which only gave the right URL when both builds
# shared file names.
hg19_url_columns = [
    'out_trimmed_fastq_r1',
    'out_trimmed_fastq_r2',
    'out_trimmed_rmRep_fastq_r1',
    'out_trimmed_rmRep_fastq_r2',
    'out_fastq_r1',
    'out_fastq_r2',
    'out_bam',
    'out_pos_bw',
    'out_neg_bw',
]
grch38_url_columns = [
    'out_bam_grch38',
    'out_peaks_grch38',
    'out_peaks_bed_grch38',
    'out_pos_bw_grch38',
    'out_neg_bw_grch38',
]

for column in hg19_url_columns:
    merged_good_to_go[column] = merged_good_to_go[column].apply(to_sauron_url)
for column in grch38_url_columns:
    merged_good_to_go[column] = merged_good_to_go[column].apply(to_sauron_grch38_url)

In [63]:
#xintao_manifest = pd.read_csv("/home/gpratt/Dropbox/encode_integration/for_eric/ENCODE_YEO_submission - 2015_1211.csv")
#xintao_manifest = pd.read_csv("/home/gpratt/Dropbox/encode_integration/for_eric/ENCODE_YEO_submission - 2016_0309.csv")
#xintao_manifest = pd.read_csv("/home/gpratt/Dropbox/encode_integration/for_eric/ENCODE_YEO_submission - 2016_0315.csv")
#xintao_manifest = pd.read_csv("/home/gpratt/Dropbox/encode_integration/for_eric/ENCODE_YEO_submission - ENTEX.csv")
#xintao_manifest = pd.read_csv("/home/gpratt/Dropbox/EricGabe_ENCODE/ENCODE_YEO_submission - 2016_07_25.csv")


In [64]:
# Load the GRCh38 submission sheet, and give the merged table a join key made of the
# GRCh38 BAM file name.
xintao_manifest = pd.read_csv(
    "/home/gpratt/Dropbox/EricGabe_ENCODE/ENCODE_YEO_submission - HG38_20160902.csv"
)
merged_good_to_go["join_bam"] = merged_good_to_go["out_bam_grch38"].apply(os.path.basename)


In [65]:
# Point the sheet's file columns at the uploaded GRCh38 copies.
for url_column in ("out_bam", "out_peaks_bb", "out_peaks_bed"):
    xintao_manifest[url_column] = xintao_manifest[url_column].apply(to_sauron_grch38_url)

In [78]:
# Display the submission sheet.
xintao_manifest

Unnamed: 0,UID,RBP,Cell_Line,ENCODE_ID,Replicate,out_bam,out_peaks_bed,out_peaks_bb,Rep_UID,join_bam
0,203,HNRNPC,HepG2,ENCSR550DVK,Rep1,https://s3-us-west-1.amazonaws.com/sauron-yeo/...,https://s3-us-west-1.amazonaws.com/sauron-yeo/...,https://s3-us-west-1.amazonaws.com/sauron-yeo/...,203_01,271_01_HNRNPC.merged.bam
1,203,HNRNPC,HepG2,ENCSR550DVK,Rep2,https://s3-us-west-1.amazonaws.com/sauron-yeo/...,https://s3-us-west-1.amazonaws.com/sauron-yeo/...,https://s3-us-west-1.amazonaws.com/sauron-yeo/...,203_02,203_02_HNRNPC.merged.bam
2,203,HNRNPC,HepG2,ENCSR550DVK,Input,https://s3-us-west-1.amazonaws.com/sauron-yeo/...,no_file,no_file,203_INPUT,271_INPUT_ATTACTCG-GGCTCTGA_L002_R1.unassigned...
3,204,RBFOX2,HepG2,ENCSR987FTF,Rep1,https://s3-us-west-1.amazonaws.com/sauron-yeo/...,https://s3-us-west-1.amazonaws.com/sauron-yeo/...,https://s3-us-west-1.amazonaws.com/sauron-yeo/...,204_01,204_01_RBFOX2.merged.bam
4,204,RBFOX2,HepG2,ENCSR987FTF,Rep2,https://s3-us-west-1.amazonaws.com/sauron-yeo/...,https://s3-us-west-1.amazonaws.com/sauron-yeo/...,https://s3-us-west-1.amazonaws.com/sauron-yeo/...,204_02,204_02_RBFOX2.merged.bam
5,204,RBFOX2,HepG2,ENCSR987FTF,Input,https://s3-us-west-1.amazonaws.com/sauron-yeo/...,no_file,no_file,204_INPUT,RBFOX2-204-INPUT_S2_R1.unassigned.adapterTrim....
6,205,IGF2BP1,HepG2,ENCSR744GEU,Rep1,https://s3-us-west-1.amazonaws.com/sauron-yeo/...,https://s3-us-west-1.amazonaws.com/sauron-yeo/...,https://s3-us-west-1.amazonaws.com/sauron-yeo/...,205_01,205_01_IGF2BP1.merged.bam
7,205,IGF2BP1,HepG2,ENCSR744GEU,Rep2,https://s3-us-west-1.amazonaws.com/sauron-yeo/...,https://s3-us-west-1.amazonaws.com/sauron-yeo/...,https://s3-us-west-1.amazonaws.com/sauron-yeo/...,205_02,205_02_IGF2BP1.merged.bam
8,205,IGF2BP1,HepG2,ENCSR744GEU,Input,https://s3-us-west-1.amazonaws.com/sauron-yeo/...,no_file,no_file,205_INPUT,IGF2BP1-205-INPUT_S4_R1.unassigned.adapterTrim...
9,206,HNRNPK,HepG2,ENCSR828ZID,Rep1,https://s3-us-west-1.amazonaws.com/sauron-yeo/...,https://s3-us-west-1.amazonaws.com/sauron-yeo/...,https://s3-us-west-1.amazonaws.com/sauron-yeo/...,206_01,206_01_HNRNPK.merged.bam


In [66]:
# Remove the "_4000" and "_bc2rev" markers from the ENCODE_ID values.
merged_good_to_go['ENCODE_ID'] = merged_good_to_go["ENCODE_ID"].apply(
    lambda encode_id: encode_id.replace("_4000", "").replace("_bc2rev", "")
)

In [67]:
def remake_pk(row):
    """Build the "<UID>_<rep>" key for one manifest row.

    row.Replicate must be "Rep1", "Rep2" or "Input", which map to the suffixes
    "01", "02" and "INPUT".

    Raises ValueError for any other Replicate value (previously this printed "error"
    and then failed on an unassigned local variable).
    """
    suffix_by_replicate = {"Rep1": "01", "Rep2": "02", "Input": "INPUT"}
    try:
        rep = suffix_by_replicate[row.Replicate]
    except (KeyError, TypeError):
        # TypeError covers unhashable values; KeyError covers unknown labels and NaN
        raise ValueError("unexpected Replicate value {!r} for UID {}".format(row.Replicate, row.UID))
    return "{}_{}".format(row.UID, rep)

# Recompute the Rep_UID key row by row from the UID and Replicate columns.
xintao_manifest['Rep_UID'] = xintao_manifest.apply(remake_pk, axis=1)

In [68]:
# Join key for the sheet: file name of its BAM entry.
xintao_manifest['join_bam'] = xintao_manifest["out_bam"].apply(os.path.basename)

In [69]:
# Number of rows in the submission sheet.
len(xintao_manifest)

543

In [70]:
# Number of rows in the merged table after the exclusions.
len(merged_good_to_go)

562

In [71]:
# Join for Eric's special GRCh38 manifest: match sheet rows to merged rows on the BAM
# file name, keep unmatched rows from both sides, and mark empty cells as "no_file".
foo = pd.merge(xintao_manifest, merged_good_to_go, on="join_bam", how="outer").fillna("no_file")

# The join for a regular submission was not backed up. The line below is NOT right;
# the correct keys are the UID_Rep column and one other column.
#foo = pd.merge(xintao_manifest, merged_good_to_go, left_on="out_bam", right_on="join_bam", how="outer").fillna("no_file")
#xintao_manifest.to_csv("/home/gpratt/ipython_notebook/xintao_file_locations.csv")
#xintao_manifest.to_csv("/home/gpratt/Dropbox/encode_integration/for_eric/xintao_file_locations.csv")
for report_path in ("/home/gpratt/ipython_notebook/xintao_file_locations.csv",
                    "/home/gpratt/Dropbox/encode_integration/for_eric/xintao_file_locations.csv"):
    foo.to_csv(report_path)

In [82]:
# Display the joined report table.
foo

Unnamed: 0,UID,RBP_x,Cell_Line,ENCODE_ID_x,Replicate,out_bam_x,out_peaks_bed,out_peaks_bb,Rep_UID,join_bam,...,pos_bw,processing_name,processing_name_1,processing_name_2,r1,r2,randomer_length,species,unified_location,unmerged_location
0,203,HNRNPC,HepG2,ENCSR550DVK,Rep1,https://s3-us-west-1.amazonaws.com/sauron-yeo/...,https://s3-us-west-1.amazonaws.com/sauron-yeo/...,https://s3-us-west-1.amazonaws.com/sauron-yeo/...,203_01,271_01_HNRNPC.merged.bam,...,/projects/ps-yeolab3/encode/analysis/encode_ma...,no_file,/home/gpratt/projects/encode/analysis/encode_s...,/home/gpratt/projects/encode/analysis/encode_s...,/home/gpratt/projects/encode/data/encode_merge...,/home/gpratt/projects/encode/data/encode_merge...,5,hg19,/projects/ps-yeolab/seqdata/igm-storage1.ucsd....,/projects/ps-yeolab/seqdata/igm-storage1.ucsd....
1,203,HNRNPC,HepG2,ENCSR550DVK,Rep2,https://s3-us-west-1.amazonaws.com/sauron-yeo/...,https://s3-us-west-1.amazonaws.com/sauron-yeo/...,https://s3-us-west-1.amazonaws.com/sauron-yeo/...,203_02,203_02_HNRNPC.merged.bam,...,/projects/ps-yeolab3/encode/analysis/encode_ma...,no_file,/home/gpratt/projects/encode/analysis/encode_s...,/home/gpratt/projects/encode/analysis/encode_s...,/home/gpratt/projects/encode/data/encode_merge...,/home/gpratt/projects/encode/data/encode_merge...,5,hg19,/projects/ps-yeolab/seqdata/20150307_encode_CL...,/projects/ps-yeolab/seqdata/20150307_encode_CL...
2,203,HNRNPC,HepG2,ENCSR550DVK,Input,https://s3-us-west-1.amazonaws.com/sauron-yeo/...,no_file,no_file,203_INPUT,271_INPUT_ATTACTCG-GGCTCTGA_L002_R1.unassigned...,...,/projects/ps-yeolab3/encode/analysis/encode_ma...,/home/gpratt/projects/encode/analysis/encode_s...,no_file,no_file,/projects/ps-yeolab/seqdata/igm-storage1.ucsd....,/projects/ps-yeolab/seqdata/igm-storage1.ucsd....,5,hg19,/projects/ps-yeolab/seqdata/igm-storage1.ucsd....,
3,204,RBFOX2,HepG2,ENCSR987FTF,Rep1,https://s3-us-west-1.amazonaws.com/sauron-yeo/...,https://s3-us-west-1.amazonaws.com/sauron-yeo/...,https://s3-us-west-1.amazonaws.com/sauron-yeo/...,204_01,204_01_RBFOX2.merged.bam,...,/projects/ps-yeolab3/encode/analysis/encode_ma...,no_file,/home/gpratt/projects/encode/analysis/encode_s...,/home/gpratt/projects/encode/analysis/encode_s...,/projects/ps-yeolab/seqdata/20150224_encode_CL...,/projects/ps-yeolab/seqdata/20150224_encode_CL...,5,hg19,/projects/ps-yeolab/seqdata/20150224_encode_CL...,
4,204,RBFOX2,HepG2,ENCSR987FTF,Rep2,https://s3-us-west-1.amazonaws.com/sauron-yeo/...,https://s3-us-west-1.amazonaws.com/sauron-yeo/...,https://s3-us-west-1.amazonaws.com/sauron-yeo/...,204_02,204_02_RBFOX2.merged.bam,...,/projects/ps-yeolab3/encode/analysis/encode_ma...,no_file,/home/gpratt/projects/encode/analysis/encode_s...,/home/gpratt/projects/encode/analysis/encode_s...,/projects/ps-yeolab/seqdata/20150224_encode_CL...,/projects/ps-yeolab/seqdata/20150224_encode_CL...,5,hg19,/projects/ps-yeolab/seqdata/20150224_encode_CL...,
5,204,RBFOX2,HepG2,ENCSR987FTF,Input,https://s3-us-west-1.amazonaws.com/sauron-yeo/...,no_file,no_file,204_INPUT,RBFOX2-204-INPUT_S2_R1.unassigned.adapterTrim....,...,/projects/ps-yeolab3/encode/analysis/encode_ma...,/home/gpratt/projects/encode/analysis/encode_s...,no_file,no_file,/projects/ps-yeolab/seqdata/20150224_encode_CL...,/projects/ps-yeolab/seqdata/20150224_encode_CL...,5,hg19,/projects/ps-yeolab/seqdata/20150224_encode_CL...,
6,205,IGF2BP1,HepG2,ENCSR744GEU,Rep1,https://s3-us-west-1.amazonaws.com/sauron-yeo/...,https://s3-us-west-1.amazonaws.com/sauron-yeo/...,https://s3-us-west-1.amazonaws.com/sauron-yeo/...,205_01,205_01_IGF2BP1.merged.bam,...,/projects/ps-yeolab3/encode/analysis/encode_ma...,no_file,/home/gpratt/projects/encode/analysis/encode_s...,/home/gpratt/projects/encode/analysis/encode_s...,/projects/ps-yeolab/seqdata/20150224_encode_CL...,/projects/ps-yeolab/seqdata/20150224_encode_CL...,5,hg19,/projects/ps-yeolab/seqdata/20150224_encode_CL...,
7,205,IGF2BP1,HepG2,ENCSR744GEU,Rep2,https://s3-us-west-1.amazonaws.com/sauron-yeo/...,https://s3-us-west-1.amazonaws.com/sauron-yeo/...,https://s3-us-west-1.amazonaws.com/sauron-yeo/...,205_02,205_02_IGF2BP1.merged.bam,...,/projects/ps-yeolab3/encode/analysis/encode_ma...,no_file,/home/gpratt/projects/encode/analysis/encode_s...,/home/gpratt/projects/encode/analysis/encode_s...,/projects/ps-yeolab/seqdata/20150224_encode_CL...,/projects/ps-yeolab/seqdata/20150224_encode_CL...,5,hg19,/projects/ps-yeolab/seqdata/20150224_encode_CL...,
8,205,IGF2BP1,HepG2,ENCSR744GEU,Input,https://s3-us-west-1.amazonaws.com/sauron-yeo/...,no_file,no_file,205_INPUT,IGF2BP1-205-INPUT_S4_R1.unassigned.adapterTrim...,...,/projects/ps-yeolab3/encode/analysis/encode_ma...,/home/gpratt/projects/encode/analysis/encode_s...,no_file,no_file,/projects/ps-yeolab/seqdata/20150224_encode_CL...,/projects/ps-yeolab/seqdata/20150224_encode_CL...,5,hg19,/projects/ps-yeolab/seqdata/20150224_encode_CL...,
9,206,HNRNPK,HepG2,ENCSR828ZID,Rep1,https://s3-us-west-1.amazonaws.com/sauron-yeo/...,https://s3-us-west-1.amazonaws.com/sauron-yeo/...,https://s3-us-west-1.amazonaws.com/sauron-yeo/...,206_01,206_01_HNRNPK.merged.bam,...,/projects/ps-yeolab3/encode/analysis/encode_ma...,no_file,/home/gpratt/projects/encode/analysis/encode_s...,/home/gpratt/projects/encode/analysis/encode_s...,/projects/ps-yeolab/seqdata/20150307_encode_CL...,/projects/ps-yeolab/seqdata/20150307_encode_CL...,5,hg19,/projects/ps-yeolab/seqdata/20150307_encode_CL...,


In [85]:
# Export the trimmed-FASTQ URLs together with the identifying columns.
foo[[
    'UID',
    "RBP_x",
    "Cell_Line",
    "ENCODE_ID_x",
    "Replicate",
    "out_trimmed_fastq_r1",
    "out_trimmed_fastq_r2",
]].to_csv("/home/gpratt/Dropbox/EricGabe_ENCODE/trimmed_fastqs.csv")

In [72]:
def check_exists(url):
    try:
        #urllib2.urlopen('http://{}'.format(url))
        urllib2.urlopen(url)

        return "ok"
    except urllib2.HTTPError, e:
        return "error"
    except urllib2.URLError, e:
        return "error"
    except ValueError:
        print url

In [73]:
# Spot check: trimmed read-1 path of the CLIP row with index label 1.
good_to_go_clip.out_trimmed_fastq_r1[1]

'/home/gpratt/projects/encode/data/combined_fastqs_for_submission/222_02_HNRNPM.trimmed.R1.fastq.gz'

In [74]:
# Spot check of the URLs in the second row of the report.
# NOTE(review): both lines print the r2 column; the first was presumably meant to be
# out_trimmed_fastq_r1 -- confirm.
print foo.iloc[1].out_trimmed_fastq_r2
print foo.iloc[1].out_trimmed_fastq_r2


https://s3-us-west-1.amazonaws.com/sauron-yeo/for_encode/203_02_HNRNPC.trimmed.R2.fastq.gz
https://s3-us-west-1.amazonaws.com/sauron-yeo/for_encode/203_02_HNRNPC.trimmed.R2.fastq.gz


In [75]:
# result = pd.concat([foo.UID,
#                     foo.Replicate,
#                     #foo.out_fastq_r1.apply(check_exists),
#                     #foo.out_fastq_r2.apply(check_exists),
#                     #foo.out_bam.apply(check_exists),
#                     #foo.out_peaks.apply(check_exists),
#                     #foo.out_pos_bw.apply(check_exists),
#                     #foo.out_neg_bw.apply(check_exists),
#                     foo.out_bam_grch38.apply(check_exists),
#                     foo.out_peaks_bed_grch38.apply(check_exists),
#                     foo.out_peaks_grch38.apply(check_exists)], axis=1)

# Build a per-sample table of URL checks: UID and Replicate, followed by one
# "ok"/"error" column per checked file column.
# Each .apply(check_exists) issues one request per row, in the order listed here.
# NOTE(review): columns from xintao_manifest and from foo are combined by index label;
# this assumes row i of foo describes the same sample as row i of xintao_manifest --
# confirm, since foo comes from an outer merge.
xintao_manifest = xintao_manifest.fillna("no_file")
result = pd.concat([xintao_manifest.UID,
                    xintao_manifest.Replicate,
                    #foo.out_fastq_r1.apply(check_exists),
                    #foo.out_fastq_r2.apply(check_exists),
                    #foo.out_bam.apply(check_exists),
                    #foo.out_peaks.apply(check_exists),
                    foo.out_pos_bw.apply(check_exists),
                    foo.out_neg_bw.apply(check_exists),
                    foo.out_pos_bw_grch38.apply(check_exists),
                    foo.out_neg_bw_grch38.apply(check_exists),
                    xintao_manifest.out_bam.apply(check_exists),
                    xintao_manifest.out_peaks_bb.apply(check_exists),
                    xintao_manifest.out_peaks_bed.apply(check_exists),
                    foo.out_trimmed_fastq_r1.apply(check_exists),
                    foo.out_trimmed_fastq_r2.apply(check_exists),
                    foo.out_trimmed_rmRep_fastq_r1.apply(check_exists),
                    foo.out_trimmed_rmRep_fastq_r2.apply(check_exists),
], axis=1)

no_file
no_file
no_file
no_file
no_file
no_file
no_file
no_file
no_file
no_file
no_file
no_file
no_file
no_file
no_file
no_file
no_file
no_file
no_file
no_file
no_file
no_file
no_file
no_file
no_file
no_file
no_file
no_file
no_file
no_file
no_file
no_file
no_file
no_file
no_file
no_file
no_file
no_file
no_file
no_file
no_file
no_file
no_file
no_file
no_file
no_file
no_file
no_file
no_file
no_file
no_file
no_file
no_file
no_file
no_file
no_file
no_file
no_file
no_file
no_file
no_file
no_file
no_file
no_file
no_file
no_file
no_file
no_file
no_file
no_file
no_file
no_file
no_file
no_file
no_file
no_file
no_file
no_file
no_file
no_file
no_file
no_file
no_file
no_file
no_file
no_file
no_file
no_file
no_file
no_file
no_file
no_file
no_file
no_file
no_file
no_file
no_file
no_file
no_file
no_file
no_file
no_file
no_file
no_file
no_file
no_file
no_file
no_file
no_file
no_file
no_file
no_file
no_file
no_file
no_file
no_file
no_file
no_file
no_file
no_file
no_file
no_file
no_file
no_file
no_file


In [77]:
# Render the full check table as HTML.
HTML(result.to_html())

Unnamed: 0,UID,Replicate,out_pos_bw,out_neg_bw,out_pos_bw_grch38,out_neg_bw_grch38,out_bam,out_peaks_bb,out_peaks_bed,out_trimmed_fastq_r1,out_trimmed_fastq_r2,out_trimmed_rmRep_fastq_r1,out_trimmed_rmRep_fastq_r2
0,203,Rep1,ok,ok,ok,ok,ok,ok,ok,ok,ok,ok,ok
1,203,Rep2,ok,ok,ok,ok,ok,ok,ok,ok,ok,ok,ok
2,203,Input,ok,ok,ok,ok,ok,,,ok,ok,ok,ok
3,204,Rep1,ok,ok,ok,ok,ok,ok,ok,ok,ok,ok,ok
4,204,Rep2,ok,ok,ok,ok,ok,ok,ok,ok,ok,ok,ok
5,204,Input,ok,ok,ok,ok,ok,,,ok,ok,ok,ok
6,205,Rep1,ok,ok,ok,ok,ok,ok,ok,ok,ok,ok,ok
7,205,Rep2,ok,ok,ok,ok,ok,ok,ok,ok,ok,ok,ok
8,205,Input,ok,ok,ok,ok,ok,,,ok,ok,ok,ok
9,206,Rep1,ok,ok,ok,ok,ok,ok,ok,ok,ok,ok,ok


In [413]:
# Only the rows whose bigBed peak URL check returned "error".
HTML(result[result.out_peaks_bb == "error"].to_html())

Unnamed: 0,UID,Replicate,out_bam,out_peaks_bb,out_peaks_bed


In [219]:
#For the mass reupload
#combined_data = pd.read_csv("/home/gpratt/Dropbox/encode_integration/ENCODE_DCC_submittedlist_pre2016.csv", header=None, index_col=0)
# result = pd.concat([combined_data[1],
#             combined_data[21].apply(check_exists),
#           combined_data[22].apply(check_exists),
#           combined_data[23].apply(check_exists),
#           combined_data[24].apply(check_exists)], axis=1)

# HTML(result.to_html())

# Make trackhub with just ENCODE data

In [None]:
# Distinct Hiseq_file_name values of the is_encode rows, pooled across the CLIP and input manifests.
only_encode_files = set(
    clip_seq_manifest[clip_seq_manifest.is_encode].Hiseq_file_name.values
).union(
    input_manifest[input_manifest.is_encode].Hiseq_file_name.values
)

In [None]:
# Preview the file names in only_encode_files as one comma-separated string.
",".join(only_encode_files)

In [None]:
# Wrap the comma-separated file names in braces, i.e. "{a,b,c}", for use in the
# shell command assembled further down.
encode_files = "".join(["{", ",".join(only_encode_files), "}"])

In [None]:
# Display the current value of encode_files.
encode_files

In [None]:
!echo make_trackhubs.py --genome hg19 --hub 20150921_encode $encode_files *{merged,unassigned}*.{bb,bw}


In [None]:
# NOTE(review): the line below is a shell-command template, not Python; left as code it
# makes this cell raise a SyntaxError. It is kept here as a comment for reference.
# The tool name differs from the make_trackhubs.py used in the echo cell above and the
# "{}" placeholder is never filled in -- TODO confirm the intended command.
# make_trackhub --genome hg19 --hub 20150921_encode {}merged

In [None]:
import pandas as pd

In [None]:
# Load the ENCODE v9 (20151209) file list into a DataFrame; pd.read_table parses
# tab-separated text by default.
# NOTE(review): `foo` is a placeholder name that the next cell also relies on.
foo = pd.read_table("/home/elvannostrand/data/clip/CLIPseq_analysis/ENCODE_v9_20151209/encode_v9_filelist.ENCODE.20151209.txt")

In [None]:
# Rows of `foo` for which the ".norm.neg.bw" path derived from the listed BAM path does
# not exist on disk: once for the CLIP_rep1 column, once for the INPUT column.
not_in_rep_1 = foo[~foo.CLIP_rep1.apply(
    lambda bam: os.path.exists(bam.replace(".bam", ".norm.neg.bw")))]
not_in_input = foo[~foo.INPUT.apply(
    lambda bam: os.path.exists(bam.replace(".bam", ".norm.neg.bw")))]

# Remake all tracks with the proper strand

In [60]:
# NOTE(review): this cell re-declares the ArrayJob class that the notebook's opening cells
# already define; whichever definition ran last is the one in effect.
class ArrayJob(object):
    """Writes PBS array-job shell scripts from a list of shell commands.

    Each generated script stores the commands in a bash array ``cmd`` and ends
    with a line that evals the entry selected by ``$PBS_ARRAYID``, so every
    array task runs exactly one command.
    """

    def __init__(self):
        # Last line of every script: run the command stored at this task's index.
        self._epilogue = "eval ${cmd[$PBS_ARRAYID]}"

    def _prologue(self, name, count, run_dir, ppn=1, walltime=8):
        """Return the script header (shebang, #PBS directives, ``cd``).

        name     -- script path stem; only its basename is used for the job
                    name and the .out/.err file names
        count    -- number of commands, used as the upper bound of ``-t 1-count``
        run_dir  -- directory the job changes into before running
        ppn      -- value for ``nodes=1:ppn=``
        walltime -- walltime in whole hours
        """
        return """#!/bin/bash
#PBS -N {0}
#PBS -l nodes=1:ppn={3}
#PBS -o {0}.out
#PBS -e {0}.err
#PBS -V
#PBS -q home-yeo
#PBS -W group_list=yeo-group
#PBS -t 1-{1}
#PBS -l walltime={4}:00:00
cd {2}
echo "hello, starting"

""".format(os.path.basename(name), count, run_dir, ppn, walltime)

    def _write_script(self, path_stem, entries, run_dir, ppn, walltime):
        """Write one complete script (header, cmd[] assignments, epilogue) to path_stem + '.sh'."""
        with open("{}.sh".format(path_stem), 'w') as out_file:
            out_file.write(self._prologue(path_stem, len(entries), run_dir, ppn, walltime))
            for line in entries:
                out_file.write(line + "\n\n")
            out_file.write(self._epilogue + "\n")

    def make_script(self, commands, script_name, run_dir, ppn=1, walltime=8, batch_size=500):
        """Write ``commands`` into one or more array-job scripts.

        Commands are split into consecutive batches of at most ``batch_size``;
        batch ``n`` is written to ``<script_name>_<n>.sh`` with array indices
        restarting at 1 in every file.

        commands    -- iterable of shell command strings; each is embedded
                       verbatim inside double quotes, so a command containing
                       an unescaped double quote breaks the generated assignment
        script_name -- path stem of the output scripts
        run_dir     -- directory the jobs ``cd`` into
        ppn         -- processors per node requested
        walltime    -- walltime in whole hours
        batch_size  -- maximum number of commands per script (default 500)

        No file is written for an empty batch, so an input whose length is an
        exact multiple of ``batch_size`` (including zero commands) no longer
        leaves behind a trailing script with the empty range ``-t 1-0``.

        Raises ValueError if ``batch_size`` is smaller than 1.
        """
        if batch_size < 1:
            raise ValueError("batch_size must be at least 1")

        batch = []
        num_out = 0
        for cmd in commands:
            # Array indices are 1-based to match the "-t 1-N" range in the header.
            batch.append('cmd[{}]="{}"'.format(len(batch) + 1, cmd))
            if len(batch) >= batch_size:
                self._write_script("{}_{}".format(script_name, num_out), batch, run_dir, ppn, walltime)
                num_out += 1
                batch = []

        # Flush the final partial batch, if there is one.
        if batch:
            self._write_script("{}_{}".format(script_name, num_out), batch, run_dir, ppn, walltime)

In [56]:
def make_bigwig(bam,
                out_dir="/projects/ps-yeolab3/encode/analysis/remake_bw/",
                chrom_sizes="/projects/ps-yeolab/genomes/hg19/hg19.chrom.sizes"):
    """Build the make_bigwig_files.py command line for one BAM file.

    bam         -- path of the input BAM
    out_dir     -- directory that receives the two bigWig files
    chrom_sizes -- chromosome-sizes file passed as --genome

    The outputs are named after the BAM's basename with a trailing ".bam"
    replaced by ".norm.pos.bw" / ".norm.neg.bw". Only the final extension is
    stripped, so a ".bam" elsewhere in the name is left alone, and an input
    that does not end in ".bam" still gets two distinct output paths instead
    of both pointing at the same file.

    Returns the command as a single string.
    """
    stem = os.path.basename(bam)
    if stem.endswith(".bam"):
        stem = stem[:-len(".bam")]
    out_base = os.path.join(out_dir, stem)
    pos = out_base + ".norm.pos.bw"
    neg = out_base + ".norm.neg.bw"

    return ("make_bigwig_files.py  '--bam' '{}'  '--genome' '{}'  "
            "'--bw_pos' '{}'  '--bw_neg' '{}' --dont_flip").format(bam, chrom_sizes, pos, neg)

In [57]:
# Add an r2_bam column to both manifests: the out_bam path with ".bam" swapped for ".r2.bam".
for manifest in (clip_seq_manifest, input_manifest):
    manifest["r2_bam"] = manifest.out_bam.map(lambda bam_path: bam_path.replace(".bam", ".r2.bam"))

In [58]:
# One make_bigwig command per r2 BAM: all CLIP manifest rows first, then the input manifest rows.
bw_files = [
    make_bigwig(r2_path)
    for manifest in (clip_seq_manifest, input_manifest)
    for r2_path in manifest.r2_bam
]

In [59]:
# Spot check: show the command generated for the first CLIP manifest row with ENCODE_ID "204_02".
make_bigwig(clip_seq_manifest[clip_seq_manifest.ENCODE_ID == "204_02"].iloc[0].r2_bam)

"make_bigwig_files.py  '--bam' '/projects/ps-yeolab3/encode/analysis/encode_master/204_02_RBFOX2.merged.r2.bam'  '--genome' '/projects/ps-yeolab/genomes/hg19/hg19.chrom.sizes'  '--bw_pos' '/projects/ps-yeolab3/encode/analysis/remake_bw/204_02_RBFOX2.merged.r2.norm.pos.bw'  '--bw_neg' '/projects/ps-yeolab3/encode/analysis/remake_bw/204_02_RBFOX2.merged.r2.norm.neg.bw' --dont_flip"

In [121]:
# Write the bigWig commands into PBS array script file(s) named <script_name>_<n>.sh.
# This only creates the script files; nothing is submitted to the queue from this cell.
job_maker = ArrayJob()
job_maker.make_script(bw_files, 
                     script_name="/home/gpratt/projects/encode/scripts/remake_bw", 
                     run_dir="/projects/ps-yeolab3/encode/analysis/remake_bw/",
                     ppn=1)