In [25]:
import gspread
import pandas as pd
import os
from IPython.core.display import HTML
from oauth2client.client import SignedJwtAssertionCredentials
import json
import numpy as np

In [26]:
# Read the Google service-account key. The context manager closes the file
# handle as soon as the JSON has been parsed.
with open("/home/gpratt/ipython_notebook/public clip-588adbc137f3.json") as key_file:
    json_key = json.load(key_file)
scope = ['https://spreadsheets.google.com/feeds']

credentials = SignedJwtAssertionCredentials(json_key['client_email'], json_key['private_key'], scope)
gc = gspread.authorize(credentials)

# Pull the whole worksheet; the first row holds the column names.
sht1 = gc.open_by_url("https://docs.google.com/spreadsheets/d/1ZU2mQh54jentqvhR_oMnviLGWR8Nw_x338gULzKjNDI/edit#gid=0")
ws = sht1.worksheet("public_clip_database")
list_of_lists = ws.get_all_values()
df = pd.DataFrame(list_of_lists[1:], columns=list_of_lists[0])
df.index = df.SRA

# Rows flagged as CLIP-seq whose type is anything but "iclip".
# .copy() makes clip_df an independent frame, so adding a column below does
# not write into a slice of df (which triggers SettingWithCopyWarning).
clip_df = df[(df['CLIP-seq?'] == "Yes") & (df['type'] != "iclip")].copy()
clip_df['file_names'] = clip_df.SRA.apply(lambda x: os.path.join("/home/gpratt/projects/public_clip/data/public_clip_v4", x.strip()) + ".fastq.gz")

In [27]:
# Manifest for the CLIP runs: tab-separated, no header row and no index,
# with the columns file_names, Species, replicate in that order.
clip_df.to_csv(
    "/home/gpratt/projects/public_clip/scripts/public_fastq.txt",
    sep="\t",
    columns=['file_names', 'Species', 'replicate'],
    header=False,
    index=False
)

#Run CLIP-seq demux

In [28]:
# Convert every downloaded .sra archive to fastq.gz with a PBS array job.

# Collect the absolute path of each .sra file under the download tree.
# Sorted so the task-number -> file mapping is the same on every run
# (os.walk gives no ordering guarantee).
result_file = []
for root, subFolders, files in os.walk("/projects/ps-yeolab/clip_public/clip/"):
    for fn in files:
        if fn.endswith(".sra"):
            result_file.append(os.path.abspath(os.path.join(root, fn)))
result_file.sort()

# Record the discovered archives, one path per line.
with open("/home/gpratt/projects/public_clip/scripts/sra_files.txt", 'w') as out_file:
    for sra_path in result_file:
        out_file.write(sra_path + "\n")

# One fastq-dump command per archive.
cmd_list = ['fastq-dump --gzip %s' % sra_path.strip() for sra_path in result_file]

# The array range is derived from the number of commands so that every
# command gets a task; max(..., 1) keeps the directive valid when no
# archive was found (the single task then evaluates an empty command).
pbs_header = "\n".join([
    "#!/bin/bash",
    "#PBS -N fastq_dump_clip",
    "#PBS -o fastq_dump_clip.sh.out",
    "#PBS -e fastq_dump_clip.sh.err",
    "#PBS -V",
    "#PBS -l walltime=4:00:00",
    "#PBS -l nodes=1:ppn=1",
    "#PBS -A yeo-group",
    "#PBS -q home-yeo",
    "#PBS -t 1-{}".format(max(len(cmd_list), 1)),
    "# Go to the directory from which the script was called",
    "cd /home/gpratt/projects/public_clip/data/public_clip_v4",
]) + "\n"

with open("/home/gpratt/projects/public_clip/scripts/fastq_dump_clip.sh", 'w') as out_file:
    out_file.write(pbs_header)

    # cmd[] is 1-based so that it lines up with $PBS_ARRAYID.
    for x, line in enumerate(cmd_list, 1):
        out_file.write("cmd[{}]=\"{}\"\n".format(x, line))

    out_file.write("eval ${cmd[$PBS_ARRAYID]}\n")


#Get iCLIP files from SRA

In [29]:
# Rows flagged as CLIP-seq whose type is "iclip" or "par-iclip".
# .copy() makes iclip_df an independent frame: this cell and later ones add
# columns to it, which on a plain slice of df raises SettingWithCopyWarning.
iclip_df = df[(df['CLIP-seq?'] == "Yes") & df['type'].isin(["iclip", "par-iclip"])].copy()
iclip_df['file_names'] = iclip_df.SRA.apply(lambda x: os.path.join("/home/gpratt/projects/public_clip/data/public_iclip", x.strip()) + ".fastq.gz")

In [30]:
# Write a PBS array job that runs fastq-dump once per iCLIP accession.

# One fastq-dump command per SRA accession listed in iclip_df.
cmd_list = ['fastq-dump --gzip %s' % accession.strip() for accession in iclip_df.SRA]

# Job and log names are specific to this script so its logs do not collide
# with those of the CLIP fastq-dump job. The array range is derived from
# the number of commands so every command gets a task; max(..., 1) keeps
# the directive valid when there is nothing to dump.
pbs_header = "\n".join([
    "#!/bin/bash",
    "#PBS -N fastq_dump_iclip",
    "#PBS -o fastq_dump_iclip.sh.out",
    "#PBS -e fastq_dump_iclip.sh.err",
    "#PBS -V",
    "#PBS -l walltime=24:00:00",
    "#PBS -l nodes=1:ppn=1",
    "#PBS -A yeo-group",
    "#PBS -q home-yeo",
    "#PBS -t 1-{}".format(max(len(cmd_list), 1)),
    "# Go to the directory from which the script was called",
    "cd /home/gpratt/projects/public_clip/data/public_iclip/",
]) + "\n"

with open("/home/gpratt/projects/public_clip/scripts/fastq_dump_iclip.sh", 'w') as out_file:
    out_file.write(pbs_header)

    # cmd[] is 1-based so that it lines up with $PBS_ARRAYID.
    for x, line in enumerate(cmd_list, 1):
        out_file.write("cmd[{}]=\"{}\"\n".format(x, line))

    out_file.write("eval ${cmd[$PBS_ARRAYID]}\n")

In [31]:
# Manifest for the iCLIP runs: tab-separated, no header row and no index,
# with the columns file_names, Species, replicate in that order.
iclip_df.to_csv(
    "/home/gpratt/projects/public_clip/scripts/public_iclip.txt",
    sep="\t",
    columns=['file_names', 'Species', 'replicate'],
    header=False,
    index=False
)

#Generate demuxer

In [32]:
def calculate_barcode(randomers):
    """Extract the barcode part of a barcode/randomer layout string.

    ``randomers`` is a ``;``-separated list of layouts in which ``N`` bases
    surround a run of other characters, e.g. ``"NNNCAATNN"``.

    Returns
    -------
    None
        When the entry is blank or is the word "none" (any case, surrounding
        whitespace ignored), i.e. there is nothing to extract.
    str
        The input unchanged when it consists only of ``N``. Otherwise the
        first non-``N`` run of each layout, joined with ``;``
        (``"NNNCA;NNNGA"`` -> ``"CA;GA"``). Layouts without any non-``N``
        run contribute nothing.
    """
    label = randomers.strip()
    # A blank cell carries no layout; treat it like an explicit "none"
    # instead of letting it through as an empty barcode.
    if not label or label.lower() == "none":
        return None

    if all("N" == base for base in randomers):
        return randomers

    barcodes = []
    for randomer in randomers.split(";"):
        # Splitting on N leaves empty strings for consecutive Ns; the first
        # non-empty piece is the barcode of this layout.
        pieces = [piece for piece in randomer.split("N") if piece]
        if pieces:
            barcodes.append(pieces[0])
    return ";".join(barcodes)

def calculate_n_front(row):
    """Count the bases that precede the barcode in the first layout.

    Parameters
    ----------
    row : mapping with keys ``'barcode_and_randomer'`` (``;``-separated
        layout string) and ``'barcode'`` (``;``-separated barcodes, or None).

    Returns
    -------
    None when the row has no barcode; the full length of the first layout
    when that layout is made only of ``N``; otherwise the number of
    characters before the first occurrence of the first barcode.
    """
    # Only the first layout is measured, mirroring calculate_n_back. Using
    # the whole ';'-joined string would count the separator and the other
    # layouts, and would hit str.split("") for an all-N multi-layout entry.
    randomer = row['barcode_and_randomer'].split(";")[0]
    barcode = row['barcode']

    if barcode is None:
        return None

    barcode = barcode.split(";")[0]

    # Special case if we are just trimming: "front" is the total length
    if all("N" == base for base in randomer):
        return len(randomer)

    return len(randomer.split(barcode)[0])

def calculate_n_back(row):
    """Count the bases that follow the barcode in the first layout.

    ``row`` is indexed by ``'barcode_and_randomer'`` and ``'barcode'``; both
    are ``;``-separated and only their first entries are used.

    Returns None when the row has no barcode, 0 when the first layout is
    made only of ``N``, and otherwise the length of the piece that sits
    between the first and second occurrence of the barcode (or the end of
    the layout).
    """
    first_layout = row['barcode_and_randomer'].split(";")[0]
    all_barcodes = row['barcode']

    if all_barcodes is None:
        return None

    first_barcode = all_barcodes.split(";")[0]

    # Pure trimming layout: nothing is left behind the barcode.
    # Stripping every N leaves "" exactly when the layout holds only Ns.
    if not first_layout.strip("N"):
        return 0

    pieces = first_layout.split(first_barcode)
    return len(pieces[1])

In [33]:
# Derive the barcode string of every run from its barcode/randomer layout.
iclip_df['barcode'] = iclip_df['barcode_and_randomer'].map(calculate_barcode)

In [34]:
# Row-wise counts of the bases in front of / behind the barcode.
# 'front' is filled before 'back', as each is computed from the current frame.
for column_name, counter in (('front', calculate_n_front), ('back', calculate_n_back)):
    iclip_df[column_name] = iclip_df.apply(counter, axis=1)

In [35]:
# Render the annotated iCLIP table as HTML (last expression = cell output).
iclip_table_html = iclip_df.to_html()
HTML(iclip_table_html)

Unnamed: 0_level_0,SRA,Cell Type,Antibody,Modification,Species,Notes,CLIP-seq?,type,gencode_v17_id,PMID,replicate,barcode_and_randomer,hg19_equivalent_gencode,GEO,file_names,barcode,front,back
SRA,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
ERR018283,ERR018283,HeLa,hnRNPC,WT,hg19,iclip,Yes,iclip,ENSG00000092199.13,20601959,rep1,NNNCA,ENSG00000092199.13,,/home/gpratt/projects/public_clip/data/public_...,CA,3.0,0.0
ERR018284,ERR018284,HeLa,hnRNPC,WT,hg19,iclip,Yes,iclip,ENSG00000092199.13,20601959,rep2,NNNGA,ENSG00000092199.13,,/home/gpratt/projects/public_clip/data/public_...,GA,3.0,0.0
SRR901494,SRR901494,HeLa,UPF1,WT,hg19,iclip,Yes,iclip,ENSG00000005007.8,23832275,rep1,GTTNNNN,ENSG00000005007.8,,/home/gpratt/projects/public_clip/data/public_...,GTT,0.0,4.0
SRR901495,SRR901495,HeLa,UPF1,WT,hg19,iclip,Yes,iclip,ENSG00000005007.8,23832275,rep1,GTTNNNN,ENSG00000005007.8,,/home/gpratt/projects/public_clip/data/public_...,GTT,0.0,4.0
SRR901496,SRR901496,HeLa,UPF1,puro,hg19,iclip,Yes,iclip,ENSG00000005007.8,23832275,rep2,GTTNNNN,ENSG00000005007.8,,/home/gpratt/projects/public_clip/data/public_...,GTT,0.0,4.0
ERR102559,ERR102559,Whole Brain,Nova,WT,mm9,iclip,Yes,iclip,ENSMUSG00000021047.6,22863408,rep1,ATCNNNN,ENSG00000139910.15,,/home/gpratt/projects/public_clip/data/public_...,ATC,0.0,4.0
ERR102558,ERR102558,Whole Brain,Nova,WT,mm9,iclip,Yes,iclip,ENSMUSG00000021047.6,22863408,rep2,ACTNNNN,ENSG00000139910.15,,/home/gpratt/projects/public_clip/data/public_...,ACT,0.0,4.0
ERR102560,ERR102560,Whole Brain,Nova,WT,mm9,iclip,Yes,iclip,ENSMUSG00000021047.6,22863408,rep3,AGANNNN,ENSG00000139910.15,,/home/gpratt/projects/public_clip/data/public_...,AGA,0.0,4.0
ERR102557,ERR102557,Whole Brain,Nsun2,WT,mm9,iclip,Yes,iclip,ENSMUSG00000021595.12,22863408,rep3,NNNCAATNN,ENSG00000037474.10,,/home/gpratt/projects/public_clip/data/public_...,CAAT,3.0,2.0
ERR208893,ERR208893,brain,U2AF65,WT,mm9,iclip,Yes,iclip,ENSMUSG00000030435.10,22934129,rep1,GGGNNNN,ENSG00000063244.8,,/home/gpratt/projects/public_clip/data/public_...,GGG,0.0,4.0


In [36]:
#HTML(iclip_df[iclip_df.SRA == "ERR676907"].to_html())

In [44]:
def make_barcode_file(row, barcode_dir="/home/gpratt/projects/public_clip/scripts/barcodes"):
    """Write the barcode file for one run and return its path.

    Parameters
    ----------
    row : object with ``SRA`` and ``barcode`` attributes; ``barcode`` is a
        ``;``-separated string or None.
    barcode_dir : directory that receives ``<SRA>.txt``. Defaults to the
        project's barcode folder; it is created when missing.

    Returns
    -------
    Path of the written file, or None (nothing written) when the row has
    no barcode.

    The file has one tab-separated line per barcode. The first column
    repeats the barcode, except for all-``N`` entries where it is left
    empty; the second column is always the barcode itself.
    """
    if row.barcode is None:
        return None

    # Make sure the target folder exists so open() cannot fail on a fresh
    # checkout (isdir + makedirs keeps this usable on Python 2).
    if not os.path.isdir(barcode_dir):
        os.makedirs(barcode_dir)

    out_name = os.path.join(barcode_dir, "{}.txt".format(row.SRA.strip()))
    with open(out_name, 'w') as out_file:
        for barcode in row.barcode.split(";"):
            first_column = "" if all("N" == base for base in barcode) else barcode
            out_file.write(first_column + "\t" + barcode + "\n")

    return out_name

In [45]:
# Write one barcode file per run and remember where each one went
# (None for runs without a barcode).
barcode_paths = iclip_df.apply(make_barcode_file, axis=1)
iclip_df['barcode_file'] = barcode_paths

In [39]:
def make_output_file(row):
    """Build the demultiplexing command line for one run.

    Parameters
    ----------
    row : object with ``SRA``, ``barcode_file``, ``front`` and ``back``
        attributes.

    Returns
    -------
    The ``demultiplex_barcoded_fastq.py`` command as a string, or None when
    the row has no barcode file or no front/back counts (missing values may
    be None or NaN).
    """
    # Test for missing values explicitly. int(NaN) raises ValueError but
    # int(None) raises TypeError, so relying on the exception only covers
    # one of the two ways a missing count can show up.
    if pd.isnull(row.barcode_file) or pd.isnull(row.front) or pd.isnull(row.back):
        return None

    sra = row.SRA.strip()
    template = ("python ~/gscripts/gscripts/clipseq/demultiplex_barcoded_fastq.py"
                " -f ~/projects/public_clip/data/public_iclip/{sra}.fastq.gz"
                " -o ~/projects/public_clip/analysis/split_fastq/{sra}.fastq.gz"
                " -m ~/projects/public_clip/analysis/split_fastq/{sra}.metrics"
                " -b {barcode_file} --front {front} --back {back}")
    # front/back arrive as floats when the column also holds NaN; the
    # command line gets plain integers.
    return template.format(sra=sra,
                           barcode_file=row.barcode_file,
                           front=int(row.front),
                           back=int(row.back))

In [40]:
# Demultiplexing command for every run (None where none can be built).
demux_commands = iclip_df.apply(make_output_file, axis=1)
iclip_df['final'] = demux_commands

In [41]:
# Write a PBS array job that runs one demultiplexing command per task.

# Runs without a demultiplexing command hold a null in 'final'; drop them
# so the script never contains a cmd[i]="None" entry.
split_cmds = list(iclip_df.final.dropna())

# The array range is derived from the number of commands so every command
# gets a task; max(..., 1) keeps the directive valid when the list is empty.
pbs_header = "\n".join([
    "#!/bin/bash",
    "#PBS -N barcode_split",
    "#PBS -o barcode_split.sh.out",
    "#PBS -e barcode_split.sh.err",
    "#PBS -V",
    "#PBS -l walltime=16:00:00",
    "#PBS -l nodes=1:ppn=1",
    "#PBS -A yeo-group",
    "#PBS -q home-yeo",
    "#PBS -t 1-{}".format(max(len(split_cmds), 1)),
    "# Go to the directory from which the script was called",
    "cd /home/gpratt/projects/public_clip/analysis/split_fastq",
]) + "\n"

with open("/home/gpratt/projects/public_clip/scripts/barcode_split.sh", 'w') as out_file:
    out_file.write(pbs_header)

    # cmd[] is 1-based so that it lines up with $PBS_ARRAYID.
    for x, line in enumerate(split_cmds, 1):
        print(line)
        out_file.write("cmd[{}]=\"{}\"\n".format(x, line))

    out_file.write("eval ${cmd[$PBS_ARRAYID]}\n")


python ~/gscripts/gscripts/clipseq/demultiplex_barcoded_fastq.py -f ~/projects/public_clip/data/public_iclip/ERR018283.fastq.gz -o ~/projects/public_clip/analysis/split_fastq/ERR018283.fastq.gz -m ~/projects/public_clip/analysis/split_fastq/ERR018283.metrics -b /home/gpratt/projects/public_clip/scripts/barcodes/ERR018283.txt --front 3 --back 0
python ~/gscripts/gscripts/clipseq/demultiplex_barcoded_fastq.py -f ~/projects/public_clip/data/public_iclip/ERR018284.fastq.gz -o ~/projects/public_clip/analysis/split_fastq/ERR018284.fastq.gz -m ~/projects/public_clip/analysis/split_fastq/ERR018284.metrics -b /home/gpratt/projects/public_clip/scripts/barcodes/ERR018284.txt --front 3 --back 0
python ~/gscripts/gscripts/clipseq/demultiplex_barcoded_fastq.py -f ~/projects/public_clip/data/public_iclip/SRR901494.fastq.gz -o ~/projects/public_clip/analysis/split_fastq/SRR901494.fastq.gz -m ~/projects/public_clip/analysis/split_fastq/SRR901494.metrics -b /home/gpratt/projects/public_clip/scripts/barc

In [42]:
# Final iCLIP manifest: one "<fastq path>\t<species>" line per input file.
# Runs without a barcode point at the original fastq (its existence is
# printed as True/False); runs with barcodes get one line per barcode
# pointing at the split fastq, and any split file that is missing is printed.
manifest_path = "/home/gpratt/projects/public_clip/scripts/public_iclip_v1.txt"
manifest_line = "{}\t{}\n"

with open(manifest_path, 'w') as out_file:
    for name, row in iclip_df.iterrows():
        accession = row.SRA.strip()

        if row.barcode is not None:
            for barcode in row.barcode.split(";"):
                fastq_file = "/home/gpratt/projects/public_clip/analysis/split_fastq/{}.{}.fastq.gz".format(
                    accession, barcode.strip())
                out_file.write(manifest_line.format(fastq_file, row.Species))
                if not os.path.exists(fastq_file):
                    print(fastq_file)
        else:
            fastq_file = "/home/gpratt/projects/public_clip/data/public_iclip/{}.fastq.gz".format(accession)
            out_file.write(manifest_line.format(fastq_file, row.Species))
            print(os.path.exists(fastq_file))

/home/gpratt/projects/public_clip/analysis/split_fastq/SRR1665044.GGTC.fastq.gz
True
True
True
