In [1]:
import pandas as pd
import os
import glob
import functools

In [2]:

def make_miso_submitter(bam, cell_type):
    """Return the shell command line that runs MISO for one BAM file.

    bam: BAM file name; the text before its first "." becomes the sample id,
        and ".sorted.bam" is appended to locate the input file.
    cell_type: sub-directory of the shared BAM directory holding the file.

    The returned string chains three steps with "&&": generate
    "<id>.miso.sh" via submit_miso_pipeline.py (with --do-not-submit), run
    that script with sh, then touch "<id>.miso" in the miso_out directory.
    """
    sample_id = bam.split(".")[0]
    sorted_bam = os.path.join(
        "/home/elvannostrand/scratch/ENCODE_rnaseq/bam_files_shashank/",
        cell_type,
        bam + ".sorted.bam",
    )
    # {0} is the BAM path, {1} the sample id.
    command_template = (
        "submit_miso_pipeline.py"
        "  '--bam' '{0}'"
        "  '--sample-id' '{1}'"
        "  '--genome' 'hg19'"
        "  '--do-not-submit'"
        "  '--output-sh' '{1}.miso.sh'"
        "  &&  'sh' '{1}.miso.sh'"
        "  &&  'touch' '/home/gpratt/projects/encode/analysis/miso_out/{1}.miso'"
    )
    return command_template.format(sorted_bam, sample_id)

In [3]:
# Master sheet of the ENCODE datasets; later cells read its Cell_Type and
# KD_/Ctrl_ BAM replicate columns.
master_processing_table = pd.read_table(
    "/home/elvannostrand/scratch/ENCODE_rnaseq/bam_files_shashank/Encode_Datasets_Master_File.txt"
)

In [4]:
# One MISO command per BAM: for every dataset row, both knockdown replicates
# followed by both control replicates.
miso_commands = []
for name, row in master_processing_table.iterrows():
    for bam_column in ("KD_Bam_Rep1", "KD_Bam_Rep2", "Ctrl_Bam_Rep1", "Ctrl_Bam_Rep2"):
        miso_commands.append(make_miso_submitter(row[bam_column], row.Cell_Type))

In [83]:
class ArrayJob(object):
    """Writes PBS array-job shell scripts from a list of shell commands.

    Each generated script has a `#PBS` header, one `cmd[i]="..."` line per
    command, and a final `eval` of the entry selected by `$PBS_ARRAYID`.
    """

    def __init__(self):
        # Last line of every script: run the command picked by the array id.
        self._epilogue = "eval ${cmd[$PBS_ARRAYID]}"

    def _prologue(self, name, count, run_dir, ppn=1, walltime=8):
        """Return the script header.

        name: job name; only its basename is used for -N and the .out/.err files.
        count: number of array tasks (emitted as `-t 1-count`).
        run_dir: directory the script changes into before running a command.
        ppn: processors per node requested.
        walltime: wall-clock limit in hours.
        """
        return """#!/bin/bash
#PBS -N {0}
#PBS -l nodes=1:ppn={3}
#PBS -o {0}.out
#PBS -e {0}.err
#PBS -V
#PBS -q home-yeo
#PBS -W group_list=yeo-group
#PBS -t 1-{1}
#PBS -l walltime={4}:00:00
cd {2}
echo "hello, starting"

""".format(os.path.basename(name), count, run_dir, ppn, walltime)

    def _write_script(self, job_name, commands, run_dir, ppn, walltime):
        """Write `<job_name>.sh` holding `commands` as array tasks 1..len(commands)."""
        with open(job_name + ".sh", 'w') as out_file:
            out_file.write(self._prologue(job_name, len(commands), run_dir, ppn, walltime))
            for index, cmd in enumerate(commands, 1):
                # NOTE: cmd is placed inside double quotes without escaping, so a
                # command that itself contains a double quote breaks the assignment.
                out_file.write('cmd[{}]="{}"'.format(index, cmd) + "\n\n")
            out_file.write(self._epilogue + "\n")

    def make_script(self, commands, script_name, run_dir, ppn=1, walltime=8, max_jobs=500):
        """Split `commands` into array-job scripts of at most `max_jobs` tasks.

        Scripts are written to `<script_name>_0.sh`, `<script_name>_1.sh`, ...
        Only non-empty chunks are written: an empty command list writes no
        file, and a count that is an exact multiple of `max_jobs` no longer
        produces a trailing script with `-t 1-0` and no commands.

        commands: iterable of shell command strings.
        script_name: path prefix of the generated scripts.
        run_dir, ppn, walltime: forwarded to the script header.
        max_jobs: maximum number of tasks per script (default 500).

        Raises ValueError if `max_jobs` is smaller than 1.
        """
        if max_jobs < 1:
            raise ValueError("max_jobs must be at least 1")

        commands = list(commands)
        for num_out, start in enumerate(range(0, len(commands), max_jobs)):
            job_name = "{}_{}".format(script_name, num_out)
            self._write_script(job_name, commands[start:start + max_jobs],
                               run_dir, ppn, walltime)

In [84]:
# Shared script writer, reused by the later cells of this notebook.
job_maker = ArrayJob()

# Write the MISO commands out as array-job scripts, 16 processors per task.
job_maker.make_script(
    miso_commands,
    script_name="/home/gpratt/projects/encode/scripts/miso_call_v2",
    run_dir="/home/gpratt/projects/encode/analysis/miso_out/",
    ppn=16,
)

# Get mean and standard deviation for all datasets

In [8]:
# Every BAM of the stress experiment, indexed by the file name up to its first ".".
bam_files = glob.glob("/projects/ps-yeolab/seqdata/20160222_sebas_stress_encode/*.bam")

# One row per BAM, joined on that index with the annotation sheet (indexed by
# its column 4); rows with any missing value are dropped.
cryptic_exons = (
    pd.Series({os.path.basename(fn).split(".")[0]: fn for fn in bam_files}, name="bam_files")
    .to_frame()
    .join(pd.read_excel("/home/gpratt/Dropbox/cryptic_splicing/data/human_data/data_annotations.xlsx", "Sheet1", index_col=4))
    .dropna()
)

#Download all the bam files
cat bams.txt | xargs -n 2 sh -c 'wget -b "$0" -O "$1".bam' 

In [9]:
def collect_insert_size_metrics(bam, out_dir):
    """Return the Picard CollectInsertSizeMetrics command line for one BAM.

    bam: path of the input BAM (passed as I=).
    out_dir: directory receiving "<sample>.txt" (O=) and "<sample>.pdf" (H=),
        where <sample> is the BAM's base name up to its first ".".
    """
    sample = os.path.basename(bam).split(".")[0]
    out_prefix = os.path.join(out_dir, sample)
    picard_command = "java -jar /home/yeo-lab/software/picard-tools-1.141/picard.jar CollectInsertSizeMetrics I={} O={} H={} VALIDATION_STRINGENCY=SILENT"
    return picard_command.format(bam, out_prefix + ".txt", out_prefix + ".pdf")

In [10]:
# One Picard command per stress-experiment BAM.
insert_size_metrics = [
    collect_insert_size_metrics(
        bam_file,
        "/home/gpratt/projects/cryptic_exons/analysis/ad-hoc/size_metrics",
    )
    for bam_file in bam_files
]


In [11]:
# Run each Picard command, one after another, through the IPython shell
# escape (`!`), with `$x` expanded to the command string.
for x in insert_size_metrics:
    !$x

[Wed Mar 02 16:31:13 PST 2016] picard.analysis.CollectInsertSizeMetrics HISTOGRAM_FILE=/home/gpratt/projects/cryptic_exons/analysis/ad-hoc/size_metrics/HepG2_ut3.pdf INPUT=/projects/ps-yeolab/seqdata/20160222_sebas_stress_encode/HepG2_ut3.bam OUTPUT=/home/gpratt/projects/cryptic_exons/analysis/ad-hoc/size_metrics/HepG2_ut3.txt VALIDATION_STRINGENCY=SILENT    DEVIATIONS=10.0 MINIMUM_PCT=0.05 METRIC_ACCUMULATION_LEVEL=[ALL_READS] ASSUME_SORTED=true STOP_AFTER=0 VERBOSITY=INFO QUIET=false COMPRESSION_LEVEL=5 MAX_RECORDS_IN_RAM=500000 CREATE_INDEX=false CREATE_MD5_FILE=false GA4GH_CLIENT_SECRETS=client_secrets.json
[Wed Mar 02 16:31:13 PST 2016] Executing as gpratt@tscc-0-4.sdsc.edu on Linux 2.6.32-504.16.2.el6.x86_64 amd64; OpenJDK 64-Bit Server VM 1.7.0_75-mockbuild_2015_01_20_23_39-b00; Picard version: 1.141(8ece590411350163e7689e9e77aab8efcb622170_1447695087) IntelDeflater
INFO	2016-03-02 16:31:16	SinglePassSamProgram	Processed     1,000,000 records.  Elapsed time: 00:00:02s.  Time for

In [70]:
# Write the same Picard commands out as array-job scripts (2 processors per task).
job_maker.make_script(
    insert_size_metrics,
    script_name="/home/gpratt/projects/cryptic_exons/scripts/get_distributions",
    run_dir="/home/gpratt/projects/cryptic_exons/analysis/ad-hoc/size_metrics/",
    ppn=2,
)

In [184]:
def parse_insert_size_metrics(CollectInsertSizeMetricsFile):
    """Read the metrics row of a CollectInsertSizeMetrics text file.

    The first six lines are skipped; line 7 supplies the column names and
    line 8 the values, both split on whitespace.

    CollectInsertSizeMetricsFile: path of the metrics text file.

    Returns a pandas Series mapping column name to value; values stay strings.

    On any failure (missing file, fewer than eight lines, ...) the file's
    base name is printed and the original exception is re-raised.
    """
    try:
        with open(CollectInsertSizeMetricsFile) as metrics_file:
            # The builtin next() works on Python 2 and 3; file.next() is Python 2 only.
            for _ in range(6):
                next(metrics_file)
            header = next(metrics_file).split()
            data = next(metrics_file).split()
            return pd.Series(dict(zip(header, data)))
    except Exception:
        print(os.path.basename(CollectInsertSizeMetricsFile))
        # A bare raise keeps the original traceback.
        raise

In [15]:
# Path of each sample's Picard metrics file, derived from the index label.
cryptic_exons['metrics'] = [
    os.path.join("/home/gpratt/projects/cryptic_exons/analysis/ad-hoc/size_metrics/", sample_name + ".txt")
    for sample_name in cryptic_exons.index
]
# Parse every metrics file and attach the parsed columns to the table.
# NOTE: this rebinds cryptic_exons to the joined frame, so re-running the cell
# joins the metrics columns a second time.
cryptic_exons = cryptic_exons.join(
    cryptic_exons['metrics'].apply(parse_insert_size_metrics)
)

In [87]:
def run_rmats(b1, b2, b1_mean, b2_mean, b1_std, b2_std, read_len, gtf, out_dir):
    """Return the RNASeq-MATS.py command line comparing two groups of BAMs.

    b1, b2: iterables of BAM paths for group 1 and group 2 (-b1 / -b2).
    b1_mean, b2_mean: per-BAM insert-size means (-r1 / -r2).
    b1_std, b2_std: per-BAM insert-size standard deviations (-sd1 / -sd2).
    read_len: read length passed to -len.
    gtf: annotation file passed to -gtf.
    out_dir: output directory passed to -o.

    Each iterable is joined with commas. Elements may be strings or numbers;
    numbers are converted to text first, so numeric means and deviations no
    longer raise TypeError. "-t paired" and "-a 8" are always emitted.
    """
    def comma_joined(values):
        # "%s" % (v,) leaves text unchanged and stringifies numbers.
        return ",".join("%s" % (value,) for value in values)

    return "python /home/gpratt/software/rMATS.3.2.0.beta/RNASeq-MATS.py -b1 {0} -b2 {1} -r1 {2} -r2 {3} -sd1 {4} -sd2 {5} -gtf {6} -t paired -len {7} -a 8 -o {8}".format(
        comma_joined(b1),
        comma_joined(b2),
        comma_joined(b1_mean),
        comma_joined(b2_mean),
        comma_joined(b1_std),
        comma_joined(b2_std),
        gtf,
        read_len,
        out_dir,
    )

In [88]:
# Every treated sample paired with the 'WT' sample of the same cell line.
interesting_combinations = {
    ((treatment, cell_line), ('WT', cell_line))
    for cell_line in ('K562', 'HepG2')
    for treatment in ('ars1', 'PQ1', 'PQCA1', 'hs1')
}

In [93]:
# Stringent method: one rMATS command per (treated, WT) pair.
# NOTE(review): the first tuple element is matched against the `rep` column and
# the second against `stress`, although the loop variables are named
# genotype/cell_type — confirm against the annotation sheet.
results = []
for (genotype_1, cell_type_1), (genotype_2, cell_type_2) in interesting_combinations:
    kd = cryptic_exons[(cryptic_exons.rep == genotype_1) & (cryptic_exons.stress == cell_type_1)]
    ctrl = cryptic_exons[(cryptic_exons.rep == genotype_2) & (cryptic_exons.stress == cell_type_2)]
    results.append(
        run_rmats(
            b1=kd.bam_files,
            b2=ctrl.bam_files,
            b1_mean=kd.MEAN_INSERT_SIZE,
            b2_mean=ctrl.MEAN_INSERT_SIZE,
            b1_std=kd.STANDARD_DEVIATION,
            b2_std=ctrl.STANDARD_DEVIATION,
            read_len=100,
            gtf="/projects/ps-yeolab/genomes/hg19/gencode/v19/gencode.v19.annotation.gtf",
            out_dir="/home/gpratt/projects/cryptic_exons/analysis/ad-hoc/rMATS/{}_{}_vs_{}_{}".format(
                genotype_1, cell_type_1, genotype_2, cell_type_2
            ),
        )
    )

In [94]:
# Write the rMATS commands as array-job scripts: 1 processor, 48 h walltime.
job_maker.make_script(
    results,
    script_name="/home/gpratt/projects/cryptic_exons/scripts/rMATS",
    run_dir="/home/gpratt/projects/cryptic_exons/analysis/ad-hoc/rMATS/",
    ppn=1,
    walltime=48,
)

# This is for later, after I generate means and standard deviations for all my stressed cells

In [187]:
def make_full_path(row, bam_column):
    """Return the full path of the BAM named in `row[bam_column]`.

    The path is <shared BAM directory>/<row.Cell_Type>/<file name>.
    """
    bam_root = "/home/elvannostrand/scratch/ENCODE_rnaseq/bam_files_shashank/"
    bam_name = row[bam_column]
    return os.path.join(bam_root, row.Cell_Type, bam_name)

def make_size_metrics(row, bam_column):
    """Return the metrics-file path for the BAM named in `row[bam_column]`.

    The file is "<name up to the first '.'>.txt" inside the ENCODE
    size_metrics directory.
    """
    metrics_root = "/home/gpratt/projects/encode/analysis/ad-hoc/size_metrics/"
    sample_id = row[bam_column].split(".")[0]
    return os.path.join(metrics_root, sample_id + ".txt")

In [188]:
# Source columns of the master sheet that hold BAM file names. Each one yields
# two derived columns: the lower-cased name (full BAM path) and the lower-cased
# name + "_size" (path of its insert-size metrics file).
bam_columns = ["KD_Bam_Rep1", "KD_Bam_Rep2", "Ctrl_Bam_Rep1", "Ctrl_Bam_Rep2"]

# Full BAM paths: kd_bam_rep1, kd_bam_rep2, ctrl_bam_rep1, ctrl_bam_rep2.
for bam_column in bam_columns:
    master_processing_table[bam_column.lower()] = master_processing_table.apply(
        functools.partial(make_full_path, bam_column=bam_column), axis=1)

# Metrics-file paths: kd_bam_rep1_size, ..., ctrl_bam_rep2_size.
for bam_column in bam_columns:
    master_processing_table[bam_column.lower() + "_size"] = master_processing_table.apply(
        functools.partial(make_size_metrics, bam_column=bam_column), axis=1)

In [189]:
# One Picard command per ENCODE BAM: all of kd rep1, then kd rep2, then ctrl
# rep1, then ctrl rep2.
insert_size_metrics = [
    collect_insert_size_metrics(bam_file, "/home/gpratt/projects/encode/analysis/ad-hoc/size_metrics")
    for path_column in ('kd_bam_rep1', 'kd_bam_rep2', 'ctrl_bam_rep1', 'ctrl_bam_rep2')
    for bam_file in master_processing_table[path_column]
]

In [190]:
# Number of Picard commands built in the previous cell.
len(insert_size_metrics)

1128

In [191]:
# Write the ENCODE Picard commands as array-job scripts: 1 processor, 1 h walltime.
job_maker.make_script(
    insert_size_metrics,
    script_name="/home/gpratt/projects/encode/scripts/get_distributions",
    run_dir="/home/gpratt/projects/encode/analysis/ad-hoc/size_metrics/",
    ppn=1,
    walltime=1,
)

In [178]:
# Knockdown BAMs to leave out, listed per replicate column.
# No control-replicate exclusion list is defined or applied here.
exclude_kd_bam_rep1 = ["ENCFF979AOI.bam", "ENCFF148JEE.bam"]
exclude_kd_bam_rep2 = ["ENCFF791RLG.bam", "ENCFF099GGP.bam", "ENCFF108HHU.bam"]

# Keep only rows where neither knockdown replicate is on its exclusion list.
master_processing_table_small = master_processing_table[
    ~(
        master_processing_table.KD_Bam_Rep2.isin(exclude_kd_bam_rep2)
        | master_processing_table.KD_Bam_Rep1.isin(exclude_kd_bam_rep1)
    )
]

In [185]:
# Build one rMATS command per remaining dataset (knockdown vs. control, two
# replicates each). A row whose metrics cannot be read or whose command cannot
# be built is skipped, and its accession is printed with the reason.
results = []
for name, row in master_processing_table_small.iterrows():
    try:
        # Rows "rep1"/"rep2", one column per parsed metric.
        kd_stats = pd.concat({"rep1": parse_insert_size_metrics(row.kd_bam_rep1_size),
                              "rep2": parse_insert_size_metrics(row.kd_bam_rep2_size)}).unstack()

        ctrl_stats = pd.concat({"rep1": parse_insert_size_metrics(row.ctrl_bam_rep1_size),
                                "rep2": parse_insert_size_metrics(row.ctrl_bam_rep2_size)}).unstack()

        results.append(run_rmats([row.kd_bam_rep1, row.kd_bam_rep2],
                                 [row.ctrl_bam_rep1, row.ctrl_bam_rep2],
                                 kd_stats.MEAN_INSERT_SIZE,
                                 ctrl_stats.MEAN_INSERT_SIZE,
                                 kd_stats.STANDARD_DEVIATION,
                                 ctrl_stats.STANDARD_DEVIATION,
                                 read_len=100,
                                 gtf="/projects/ps-yeolab/genomes/hg19/gencode/v19/gencode.v19.annotation.gtf",
                                 out_dir="/home/gpratt/projects/encode/analysis/ad-hoc/rMATS/{}_vs_{}".format(row['Unnamed: 0'], row['Associated Control Accession'])))
    except Exception as error:
        # Catching Exception rather than using a bare except lets
        # KeyboardInterrupt stop the loop, and the failure reason is shown
        # instead of being silently discarded.
        print("{}: {}: {}".format(row['Unnamed: 0'], type(error).__name__, error))

ENCFF974BIC.txt
ENCSR030ARO
ENCFF087IGP.txt
ENCSR104OLN
ENCFF597LPT.txt
ENCSR118VQR
ENCFF401NSP.txt
ENCSR147ZBD
ENCFF987GMJ.txt
ENCSR148MQK
ENCFF077NLP.txt
ENCSR185JGT
ENCFF949KOV.txt
ENCSR230ORC
ENCFF066MMV.txt
ENCSR237IWZ
ENCFF983DDC.txt
ENCSR274KWA
ENCFF940KMT.txt
ENCSR300IEW
ENCFF935RCR.txt
ENCSR308IKH
ENCFF292QAQ.txt
ENCSR318HAT
ENCFF943PEC.txt
ENCSR366FFV
ENCFF964BRZ.txt
ENCSR410UHJ
ENCFF159JHT.txt
ENCSR424YSV
ENCFF997JOB.txt
ENCSR459EMR
ENCFF943GWU.txt
ENCSR471GIS
ENCFF139QDN.txt
ENCSR478FJK
ENCFF764OON.txt
ENCSR486AIO
ENCFF132PEP.txt
ENCSR573UBF
ENCFF962XCB.txt
ENCSR610AEI
ENCFF944XRD.txt
ENCSR620OKS
ENCFF992OHI.txt
ENCSR622MCX
ENCFF936OPH.txt
ENCSR624FBY
ENCFF977MZX.txt
ENCSR637JLM
ENCFF945DBT.txt
ENCSR667RIA
ENCFF993GPN.txt
ENCSR674KDQ
ENCFF100NIE.txt
ENCSR708GKW
ENCFF930KJO.txt
ENCSR710NWE
ENCFF121EPB.txt
ENCSR711ZJQ
ENCFF150SWH.txt
ENCSR728BOL
ENCFF974DGK.txt
ENCSR755KOM
ENCFF962VUU.txt
ENCSR771QMJ
ENCFF116TYA.txt
ENCSR815JDY
ENCFF459VBK.txt
ENCSR820ROH
ENCFF374PNQ.txt
ENCS

In [183]:
# Write the ENCODE rMATS commands as array-job scripts: 1 processor, 48 h walltime.
job_maker.make_script(
    results,
    script_name="/home/gpratt/projects/encode/scripts/rMATS",
    run_dir="/home/gpratt/projects/encode/analysis/ad-hoc/rMATS/",
    ppn=1,
    walltime=48,
)