In [3]:
import pandas as pd
import os
import glob
import functools

In [4]:
def make_miso_submitter(bam, cell_type):
    id_name = bam.split(".")[0]
    bam_loc = os.path.join("/home/elvannostrand/scratch/ENCODE_rnaseq/bam_files_shashank/", cell_type, bam + ".sorted.bam")
    return "submit_miso_pipeline.py  '--bam' '{0}'  '--sample-id' '{1}'  '--genome' 'hg19'  '--do-not-submit'  '--output-sh' '{1}.miso.sh'  &&  'sh' '{1}.miso.sh'  &&  'touch' '/home/gpratt/projects/encode/analysis/miso_out/{1}.miso'".format(bam_loc, id_name) 

In [5]:
master_processing_table = pd.read_table("/home/elvannostrand/data/ENCODE/project_stuff/ENCODE_MASTER_ID_LIST_20160301_AllDatasets.txt", 
                                        sep="\t",
                                        skiprows=1,
                                        names=["UID", "RBP_gID", "CellLine", "RBP_ENSG", "Antibody", "Lot", "CLIP_ENCODEAccID", "CLIP_Rep1ENC", "CLIP_Rep2ENC", "CLIP_InputENC", "RNASEQ_ENCODEAccID",  "Duplicate_RNASEQ_ENCODEAccID", "RNASEQ_ControlENC", "RNASEQ_KDRep1Bam", "RNASEQ_KDRep2Bam", "RNASEQ_ControlRep1Bam", "RNASEQ_ControlRep2Bam", "bar"]
)

In [6]:
miso_commands = []
for name, row in master_processing_table.iterrows():
    miso_commands.append(make_miso_submitter(row.KD_Bam_Rep1, row.Cell_Type))
    miso_commands.append(make_miso_submitter(row.KD_Bam_Rep2, row.Cell_Type))
    miso_commands.append(make_miso_submitter(row.Ctrl_Bam_Rep1, row.Cell_Type))
    miso_commands.append(make_miso_submitter(row.Ctrl_Bam_Rep2, row.Cell_Type))

AttributeError: 'Series' object has no attribute 'KD_Bam_Rep1'

In [7]:
class ArrayJob():
    def __init__(self):
        self._epilogue = "eval ${cmd[$PBS_ARRAYID]}"

    def _prologue(self, name, count, run_dir, ppn=1, walltime=8):
        return """#!/bin/bash
#PBS -N {0}
#PBS -l nodes=1:ppn={3}
#PBS -o {0}.out
#PBS -e {0}.err
#PBS -V
#PBS -q home-yeo
#PBS -W group_list=yeo-group
#PBS -t 1-{1}
#PBS -l walltime={4}:00:00
cd {2}
echo "hello, starting"

""".format(os.path.basename(name), count, run_dir, ppn, walltime)

        
    def make_script(self, commands, script_name, run_dir, ppn=1, walltime=8):
        total = 0 
        result = []
        num_out = 0
        for cmd in commands:
            total += 1
            result.append('cmd[{}]="{}"'.format(total, cmd))
            if total >= 500:
                with open("{}_{}.sh".format(script_name, num_out), 'w') as out_file:
                    out_file.write(self._prologue("{}_{}".format(script_name, num_out), total, run_dir, ppn, walltime ))
                    for line in result:
                        out_file.write(line + "\n\n")
                    out_file.write(self._epilogue + "\n")
                total = 0 
                num_out += 1
                result = []

        with open("{}_{}.sh".format(script_name, num_out), 'w') as out_file:
            out_file.write(self._prologue("{}_{}".format(script_name, num_out), total, run_dir, ppn, walltime))
            for line in result:
                out_file.write(line + "\n\n")
            out_file.write(self._epilogue + "\n")

In [8]:
job_maker = ArrayJob()
job_maker.make_script(miso_commands, 
                     script_name="/home/gpratt/projects/encode/scripts/miso_call_v2", 
                     run_dir="/home/gpratt/projects/encode/analysis/miso_out/",
                     ppn=16)

#Get Mean and Standard Deveation for all datasets

In [9]:
bam_files = glob.glob("/projects/ps-yeolab/seqdata/20160222_sebas_stress_encode/*.bam")
cryptic_exons = pd.DataFrame(pd.Series({os.path.basename(fn).split(".")[0]: fn for fn in bam_files}, name="bam_files"))
cryptic_exons = cryptic_exons.join(pd.read_excel("/home/gpratt/Dropbox/cryptic_splicing/data/human_data/data_annotations.xlsx", "Sheet1", index_col=4)).dropna()

#Download all the bam files
cat bams.txt | xargs -n 2 sh -c 'wget -b "$0" -O "$1".bam' 

#Get size stats

In [14]:
def collect_insert_size_metrics(bam, out_dir):
    
    name = os.path.basename(bam).split(".")[0]
    out_txt = os.path.join(out_dir, name) + ".txt"
    out_pdf = os.path.join(out_dir, name) + ".pdf"

    if os.path.exists(out_txt):
       return None

    return "java -jar /home/yeo-lab/software/picard-tools-1.141/picard.jar CollectInsertSizeMetrics I={} O={} H={} VALIDATION_STRINGENCY=SILENT".format(bam, out_txt, out_pdf)

In [15]:
insert_size_metrics = [collect_insert_size_metrics(bam_file, "/home/gpratt/projects/cryptic_exons/analysis/ad-hoc/size_metrics") for bam_file in bam_files]


In [None]:
for x in insert_size_metrics:
    !$x

[Thu Mar 24 12:29:12 PDT 2016] picard.analysis.CollectInsertSizeMetrics HISTOGRAM_FILE=/home/gpratt/projects/cryptic_exons/analysis/ad-hoc/size_metrics/HepG2_ut3.pdf INPUT=/projects/ps-yeolab/seqdata/20160222_sebas_stress_encode/HepG2_ut3.bam OUTPUT=/home/gpratt/projects/cryptic_exons/analysis/ad-hoc/size_metrics/HepG2_ut3.txt VALIDATION_STRINGENCY=SILENT    DEVIATIONS=10.0 MINIMUM_PCT=0.05 METRIC_ACCUMULATION_LEVEL=[ALL_READS] ASSUME_SORTED=true STOP_AFTER=0 VERBOSITY=INFO QUIET=false COMPRESSION_LEVEL=5 MAX_RECORDS_IN_RAM=500000 CREATE_INDEX=false CREATE_MD5_FILE=false GA4GH_CLIENT_SECRETS=client_secrets.json
[Thu Mar 24 12:29:12 PDT 2016] Executing as gpratt@tscc-1-32.sdsc.edu on Linux 2.6.32-504.16.2.el6.x86_64 amd64; OpenJDK 64-Bit Server VM 1.7.0_75-mockbuild_2015_01_20_23_39-b00; Picard version: 1.141(8ece590411350163e7689e9e77aab8efcb622170_1447695087) IntelDeflater
INFO	2016-03-24 12:29:14	SinglePassSamProgram	Processed     1,000,000 records.  Elapsed time: 00:00:02s.  Time fo

In [16]:
job_maker.make_script(insert_size_metrics, 
                     script_name="/home/gpratt/projects/cryptic_exons/scripts/get_distributions", 
                     run_dir="/home/gpratt/projects/cryptic_exons/analysis/ad-hoc/size_metrics/",
                     ppn=1)

In [17]:
def parse_insert_size_metrics(CollectInsertSizeMetricsFile):
    try:
        with open(CollectInsertSizeMetricsFile) as metrics_file:
            metrics_file.next()
            metrics_file.next()
            metrics_file.next()
            metrics_file.next()
            metrics_file.next()
            metrics_file.next()
            header = metrics_file.next().split()
            data = metrics_file.next().split()
            return pd.Series({key: value for key, value in zip(header, data)})
    except Exception as e:
        print "error", os.path.basename(CollectInsertSizeMetricsFile)
        raise e

In [18]:
cryptic_exons['metrics'] = [os.path.join("/home/gpratt/projects/cryptic_exons/analysis/ad-hoc/size_metrics/", item + ".txt") for item in cryptic_exons.index]
cryptic_exons = cryptic_exons.join(cryptic_exons.metrics.apply(parse_insert_size_metrics))

In [19]:
def run_rmats(b1, b2, b1_mean, b2_mean, b1_std, b2_std, read_len, gtf, out_dir):
    b1 = ",".join(b1)
    b2 = ",".join(b2)
    b1_mean = ",".join(b1_mean)
    b2_mean = ",".join(b2_mean)
    b1_std = ",".join(b1_std)
    b2_std = ",".join(b2_std)

    return "python /home/gpratt/software/rMATS.3.2.1.beta/RNASeq-MATS.py -b1 {0} -b2 {1} -r1 {2} -r2 {3} -sd1 {4} -sd2 {5} -gtf {6} -t paired -len {7} -a 8 -o {8}".format(b1, b2, b1_mean, b2_mean, b1_std, b2_std, gtf, read_len, out_dir)

In [20]:
interesting_combinations = {(('ars1', 'K562'), ('WT', 'K562')),
                            (('PQ1', 'K562'), ('WT', 'K562')),
                            (('PQCA1', 'K562'), ('WT', 'K562')),
                            (('hs1', 'K562'), ('WT', 'K562')),
                            (('CA', 'K562'), ('WT', 'K562')),
                            (('ars1', 'HepG2'), ('WT', 'HepG2')),
                            (('PQ1', 'HepG2'), ('WT', 'HepG2')),
                            (('PQCA1', 'HepG2'), ('WT', 'HepG2')),
                            (('hs1', 'HepG2'), ('WT', 'HepG2')),
                            (('CA', 'HepG2'), ('WT', 'HepG2')),
                           }

In [21]:
#Stringent Method
results = []
for (genotype_1, cell_type_1), (genotype_2, cell_type_2) in interesting_combinations:
    kd = cryptic_exons[(cryptic_exons.rep  == genotype_1) & (cryptic_exons.stress  == cell_type_1)]
    ctrl = cryptic_exons[(cryptic_exons.rep == genotype_2) & (cryptic_exons.stress  == cell_type_2)]
    results.append(run_rmats(kd.bam_files, 
              ctrl.bam_files, 
              kd.MEAN_INSERT_SIZE, 
              ctrl.MEAN_INSERT_SIZE, 
              kd.STANDARD_DEVIATION,  
              ctrl.STANDARD_DEVIATION, 
              read_len = 100,
              gtf="/projects/ps-yeolab/genomes/hg19/gencode/v19/gencode.v19.annotation.gtf",
              out_dir="/home/gpratt/projects/cryptic_exons/analysis/ad-hoc/rMATS/{}_{}_vs_{}_{}".format(genotype_1, cell_type_1, genotype_2, cell_type_2)))

In [22]:
job_maker.make_script(results, 
                     script_name="/home/gpratt/projects/cryptic_exons/scripts/rMATS", 
                     run_dir="/home/gpratt/projects/cryptic_exons/analysis/ad-hoc/rMATS/",
                     ppn=8,
                     walltime=48)

#This is for later, after I generate means and standard devations for all my stressed cells

In [23]:
def make_full_path(row, bam_column):
    return os.path.join("/projects/ps-yeolab2/encode/analysis/rnaseq_bams_v2/", row[bam_column] + ".bam")

def make_size_metrics(row, bam_column):
    return os.path.join("/home/gpratt/projects/encode/analysis/ad-hoc/size_metrics/", row[bam_column].split(".")[0] + ".txt")

In [24]:
master_processing_table = pd.read_csv("/projects/ps-yeolab2/encode/analysis/rnaseq_bams_v2/20160325_rnai_manifest.csv", index_col=0)
#master_processing_table = master_processing_table.dropna(subset=["CellLine"], axis=0)

In [25]:
len(master_processing_table)

407

In [26]:
len(set(master_processing_table.kd_2))

405

In [27]:
kd_bam_rep1 = functools.partial(make_full_path, bam_column="ctrl_1")
kd_bam_rep2 = functools.partial(make_full_path, bam_column="ctrl_2")
ctrl_bam_rep1 = functools.partial(make_full_path, bam_column="kd_1")
ctrl_bam_rep2 = functools.partial(make_full_path, bam_column="kd_2")

master_processing_table["kd_bam_rep1"] = master_processing_table.apply(kd_bam_rep1, axis=1)
master_processing_table["kd_bam_rep2"] = master_processing_table.apply(kd_bam_rep2, axis=1)
master_processing_table["ctrl_bam_rep1"] = master_processing_table.apply(ctrl_bam_rep1, axis=1)
master_processing_table["ctrl_bam_rep2"] = master_processing_table.apply(ctrl_bam_rep2, axis=1)

kd_bam_rep1 = functools.partial(make_size_metrics, bam_column="ctrl_1")
kd_bam_rep2 = functools.partial(make_size_metrics, bam_column="ctrl_2")
ctrl_bam_rep1 = functools.partial(make_size_metrics, bam_column="kd_1")
ctrl_bam_rep2 = functools.partial(make_size_metrics, bam_column="kd_2")

master_processing_table["kd_bam_rep1_size"] = master_processing_table.apply(kd_bam_rep1, axis=1)
master_processing_table["kd_bam_rep2_size"] = master_processing_table.apply(kd_bam_rep2, axis=1)
master_processing_table["ctrl_bam_rep1_size"] = master_processing_table.apply(ctrl_bam_rep1, axis=1)
master_processing_table["ctrl_bam_rep2_size"] = master_processing_table.apply(ctrl_bam_rep2, axis=1)

In [28]:
insert_size_metrics = [collect_insert_size_metrics(bam_file, "/home/gpratt/projects/encode/analysis/ad-hoc/size_metrics") for bam_file in master_processing_table['ctrl_bam_rep1']]
insert_size_metrics += [collect_insert_size_metrics(bam_file, "/home/gpratt/projects/encode/analysis/ad-hoc/size_metrics") for bam_file in master_processing_table['ctrl_bam_rep2']]
insert_size_metrics += [collect_insert_size_metrics(bam_file, "/home/gpratt/projects/encode/analysis/ad-hoc/size_metrics") for bam_file in master_processing_table['kd_bam_rep1']]
insert_size_metrics += [collect_insert_size_metrics(bam_file, "/home/gpratt/projects/encode/analysis/ad-hoc/size_metrics") for bam_file in master_processing_table['kd_bam_rep2']]
insert_size_metrics = [cmd for cmd in insert_size_metrics if cmd is not None] 

In [29]:
job_maker.make_script(insert_size_metrics, 
                     script_name="/home/gpratt/projects/encode/scripts/get_distributions", 
                     run_dir="/home/gpratt/projects/encode/analysis/ad-hoc/size_metrics/",
                     ppn=1,
                     walltime=1)

In [30]:
# # exclude_kd_bam_rep1 = ["ENCFF979AOI.bam", "ENCFF148JEE.bam"]
# # exclude_kd_bam_rep2 = ["ENCFF791RLG.bam", "ENCFF099GGP.bam", "ENCFF108HHU.bam"]
# #exclude_ctrl_bam_rep2  
# master_processing_table_small = master_processing_table[~master_processing_table.KD_Bam_Rep2.isin(exclude_kd_bam_rep2)]
# master_processing_table_small = master_processing_table_small[~master_processing_table_small.KD_Bam_Rep1.isin(exclude_kd_bam_rep1)]

In [31]:
results = []
for name, row in master_processing_table.iterrows():
    out_dir="/home/gpratt/projects/encode/analysis/ad-hoc/rMATS/{}_vs_{}".format(row['kd_accession'], row['ctrl_accession'])
    #if os.path.exists(out_dir):
    #    continue
    try:
        kd_stats = pd.concat({"rep1": parse_insert_size_metrics(row.kd_bam_rep1_size), 
                              "rep2": parse_insert_size_metrics(row.kd_bam_rep2_size)}).unstack()

        ctrl_stats = pd.concat({"rep1": parse_insert_size_metrics(row.ctrl_bam_rep1_size), 
                                "rep2": parse_insert_size_metrics(row.ctrl_bam_rep2_size)}).unstack()

        results.append(run_rmats([row.kd_bam_rep1, row.kd_bam_rep2],
                                 [row.ctrl_bam_rep1, row.ctrl_bam_rep2],
                                 kd_stats.MEAN_INSERT_SIZE, 
                                 ctrl_stats.MEAN_INSERT_SIZE, 
                                 kd_stats.STANDARD_DEVIATION,  
                                 ctrl_stats.STANDARD_DEVIATION,
                                 read_len = 100,
                                 gtf="/projects/ps-yeolab/genomes/hg19/gencode/v19/gencode.v19.annotation.gtf",
                                 out_dir=out_dir))
    except:
        pass
        #print row['Unnamed: 0']

In [32]:
job_maker.make_script(results, script_name="/home/gpratt/projects/encode/scripts/rMATS", 
                     run_dir="/home/gpratt/projects/encode/analysis/ad-hoc/rMATS/",
                     ppn=8,
                     walltime=48)

#Run rMATS for human brain

In [33]:
metadata = pd.read_csv("/home/gpratt/Dropbox/cryptic_splicing/data/human_data/SampleInfo.csv")

metadata['bam_file'] = metadata.sample_id.apply(lambda x: os.path.join("/home/gpratt/projects/cryptic_exons/analysis/av_resequencing_v3", x + ".sorted.bam"))
insert_size_metrics = [collect_insert_size_metrics(bam_file, "/home/gpratt/projects/cryptic_exons/analysis/ad-hoc/size_metrics") for bam_file in metadata.bam_file]

In [34]:
def make_size_metrics(row, bam_column):
    return os.path.join("/home/gpratt/projects/cryptic_exons/analysis/ad-hoc/size_metrics", os.path.basename(row[bam_column]).split(".")[0] + ".txt")

mn_metrics = functools.partial(make_size_metrics, bam_column="bam_file")
metadata['bam_size'] = metadata.apply(mn_metrics, axis=1)

In [None]:
for x in insert_size_metrics:
    !$x

[Sun Apr 03 18:50:40 PDT 2016] picard.analysis.CollectInsertSizeMetrics HISTOGRAM_FILE=/home/gpratt/projects/cryptic_exons/analysis/ad-hoc/size_metrics/AV_47d_1_S50_L008_R1_001.pdf INPUT=/home/gpratt/projects/cryptic_exons/analysis/av_resequencing_v3/AV_47d_1_S50_L008_R1_001.sorted.bam OUTPUT=/home/gpratt/projects/cryptic_exons/analysis/ad-hoc/size_metrics/AV_47d_1_S50_L008_R1_001.txt VALIDATION_STRINGENCY=SILENT    DEVIATIONS=10.0 MINIMUM_PCT=0.05 METRIC_ACCUMULATION_LEVEL=[ALL_READS] ASSUME_SORTED=true STOP_AFTER=0 VERBOSITY=INFO QUIET=false COMPRESSION_LEVEL=5 MAX_RECORDS_IN_RAM=500000 CREATE_INDEX=false CREATE_MD5_FILE=false GA4GH_CLIENT_SECRETS=client_secrets.json
[Sun Apr 03 18:50:40 PDT 2016] Executing as gpratt@tscc-1-52.sdsc.edu on Linux 2.6.32-504.16.2.el6.x86_64 amd64; OpenJDK 64-Bit Server VM 1.7.0_75-mockbuild_2015_01_20_23_39-b00; Picard version: 1.141(8ece590411350163e7689e9e77aab8efcb622170_1447695087) IntelDeflater
INFO	2016-04-03 18:50:43	SinglePassSamProgram	Processe

In [35]:
results = []

for name, row in metadata.groupby("genotype"):
    print name
    if name == "KK_MN":
        continue
        
    unstressed_name, stressed_name = row.sample_id
    unstressed, stressed = row.bam_file
    unstressed_size, stressed_size = row.bam_size
    out_dir="/home/gpratt/projects/cryptic_exons/analysis/ad-hoc/rMATS/{}_vs_{}".format(stressed_name, unstressed_name)
    #if os.path.exists(out_dir):
    #    continue
#     try:
    kd_stats = pd.concat({"rep1": parse_insert_size_metrics(stressed_size), 
                         }).unstack()

    ctrl_stats = pd.concat({"rep1": parse_insert_size_metrics(unstressed_size), 
                           }).unstack()

    results.append(run_rmats([stressed],
                             [unstressed],
                             kd_stats.MEAN_INSERT_SIZE, 
                             ctrl_stats.MEAN_INSERT_SIZE, 
                             kd_stats.STANDARD_DEVIATION,  
                             ctrl_stats.STANDARD_DEVIATION,
                             read_len = 100,
                             gtf="/projects/ps-yeolab/genomes/hg19/gencode/v19/gencode.v19.annotation.gtf",
                             out_dir=out_dir))
#     except:
#         pass
#         #print row['Unnamed: 0']

AV_47d
AV_ALS17_5
AV_CVB
AV_GY6_2
AV_Kin1ALS17_3
KK_MN


In [36]:
job_maker.make_script(results, script_name="/home/gpratt/projects/cryptic_exons/scripts/rMATS_MN", 
                     run_dir="/home/gpratt/projects/encode/analysis/ad-hoc/rMATS/",
                     ppn=8,
                     walltime=48)

In [37]:
results = []

row = metadata.groupby("genotype").get_group("KK_MN")
        
fus_name, scr_name, taf15_name, tdp_name = row.sample_id
fus, scr, taf15, tdp = row.bam_file
fus_size, scr_size, taf15_size, tdp_size = row.bam_size

out_dir="/home/gpratt/projects/cryptic_exons/analysis/ad-hoc/rMATS/{}_vs_{}".format(fus_name, scr_name)
kd_stats = pd.concat({"rep1": parse_insert_size_metrics(fus_size), 
                     }).unstack()

ctrl_stats = pd.concat({"rep1": parse_insert_size_metrics(scr_size), 
                       }).unstack()

results.append(run_rmats([fus],
                         [scr],
                         kd_stats.MEAN_INSERT_SIZE, 
                         ctrl_stats.MEAN_INSERT_SIZE, 
                         kd_stats.STANDARD_DEVIATION,  
                         ctrl_stats.STANDARD_DEVIATION,
                         read_len = 100,
                         gtf="/projects/ps-yeolab/genomes/hg19/gencode/v19/gencode.v19.annotation.gtf",
                         out_dir=out_dir))

out_dir="/home/gpratt/projects/cryptic_exons/analysis/ad-hoc/rMATS/{}_vs_{}".format(taf15_name, scr_name)
kd_stats = pd.concat({"rep1": parse_insert_size_metrics(taf15_size), 
                     }).unstack()

ctrl_stats = pd.concat({"rep1": parse_insert_size_metrics(scr_size), 
                       }).unstack()

results.append(run_rmats([taf15],
                         [scr],
                         kd_stats.MEAN_INSERT_SIZE, 
                         ctrl_stats.MEAN_INSERT_SIZE, 
                         kd_stats.STANDARD_DEVIATION,  
                         ctrl_stats.STANDARD_DEVIATION,
                         read_len = 100,
                         gtf="/projects/ps-yeolab/genomes/hg19/gencode/v19/gencode.v19.annotation.gtf",
                         out_dir=out_dir))

out_dir="/home/gpratt/projects/cryptic_exons/analysis/ad-hoc/rMATS/{}_vs_{}".format(tdp_name, scr_name)
kd_stats = pd.concat({"rep1": parse_insert_size_metrics(tdp_size), 
                     }).unstack()

ctrl_stats = pd.concat({"rep1": parse_insert_size_metrics(scr_size), 
                       }).unstack()

results.append(run_rmats([tdp_size],
                         [scr],
                         kd_stats.MEAN_INSERT_SIZE, 
                         ctrl_stats.MEAN_INSERT_SIZE, 
                         kd_stats.STANDARD_DEVIATION,  
                         ctrl_stats.STANDARD_DEVIATION,
                         read_len = 100,
                         gtf="/projects/ps-yeolab/genomes/hg19/gencode/v19/gencode.v19.annotation.gtf",
                         out_dir=out_dir))

#     except:
#         pass
#         #print row['Unnamed: 0']

In [38]:
job_maker.make_script(results, script_name="/home/gpratt/projects/cryptic_exons/scripts/rMATS_MN_KD", 
                     run_dir="/home/gpratt/projects/encode/analysis/ad-hoc/rMATS/",
                     ppn=8,
                     walltime=48)