    Copyright (C) 2022 Allen Buskirk

    This program is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program.  If not, see <https://www.gnu.org/licenses/>.

In [1]:
import sys
# Record the interpreter version this notebook was run with (output shown below).
print (sys.version)

3.9.7 (default, Sep 16 2021, 13:09:58) 
[GCC 7.5.0]


In [None]:
from threading import Thread
from subprocess import Popen
from multiprocessing import Pool 
from time import sleep, time
import signal
import os
import re
import numpy as np
import pickle
import pandas as pd
import pysam
from Bio.Seq import Seq
from collections import OrderedDict
from copy import deepcopy

# NOTE(review): output_notebook -- and figure, ColumnDataSource, CDSView, show and
# push_notebook used further down -- are not imported anywhere in the visible file.
# They look like bokeh names (bokeh.io / bokeh.plotting / bokeh.models); confirm and
# add the import, otherwise this line raises NameError in a fresh kernel.
output_notebook()

# Setup

In [3]:
#set paths/parameters
# All directories are joined to file names with "+" below, so each one must end in "/".
fastq_dir = "/home/allen/data/4suv/ribo_seq/"  # input folder scanned for .fastq / .fastq.gz files
output_dir = "/home/allen/data/4suv/"  # logs/, trimmed/, ncRNA_aligned/ and genome_aligned/ are created in here
available_cores = 48 # must use fewer threads than samples
# (the setup cell divides available_cores by the number of samples to get threads per sample)
star_ncRNA_dir = "/home/allen/annotations/hg38_ensemble/ncRNA_STAR/"  # passed to STAR --genomeDir for the ncRNA alignment
star_genome_dir = "/home/allen/annotations/hg38_ensemble/hg38_STAR/"  # passed to STAR --genomeDir for the genome alignment


In [4]:
#set up folder structure for output
if os.access(output_dir, os.F_OK):
    if os.access(output_dir+"logs", os.F_OK):
        print("Directories already exist")
    else:
        os.chdir(output_dir)
    # exist_ok makes this a no-op for folders that are already there and fills in
    # any that are missing, even when "logs" was left behind by a partial setup.
    # ("deduplicated" is not created here; the deduplicate step is disabled.)
    for subdir in ("logs", "trimmed", "ncRNA_aligned", "genome_aligned"):
        os.makedirs(output_dir+subdir, exist_ok=True)
else:
    print("WARNING:", output_dir, "does not exist.")
# check the input folder itself (the original tested output_dir a second time)
if not os.access(fastq_dir, os.F_OK):
    print("WARNING:", fastq_dir, "does not exist.")

#check file endings for correct data processing.
filenames = os.listdir(fastq_dir)
fastq_files = [
    filename for filename in filenames
    if filename.endswith((".fastq", ".fastq.gz"))
]
if not fastq_files:
    print("WARNING: no .fastq or .fastq.gz files found in", fastq_dir)

#calculate the number of cores by dividing available cores by number of samples.
#max(..., 1) in the divisor avoids a ZeroDivisionError when no samples were found.
numcores = max(available_cores//max(len(fastq_files), 1), 1)

print("Ready to process files:")
for file in fastq_files:
    print("\t\t"+file)
print("\nUsing", str(numcores)+"/"+str(available_cores), "cores for each sample.")
print("Data will be output to", output_dir)

Directories already exist
Ready to process files:
		RIBO_UV_R1.fastq.gz
		RIBO_4SU_R3.fastq.gz
		RIBO_4SU_UV_R2.fastq.gz
		RIBO_UV_R3.fastq.gz
		RIBO_UV_R2.fastq.gz
		RIBO_4SU_R2.fastq.gz
		RIBO_UNT_R2.fastq.gz
		RIBO_4SU_R1.fastq.gz
		RIBO_UNT_R1.fastq.gz
		RIBO_4SU_UV_R3.fastq.gz
		RIBO_4SU_UV_R1.fastq.gz
		RIBO_UNT_R3.fastq.gz

Using 4/48 cores for each sample.
Data will be output to /home/allen/data/4suv/


# Main

In [7]:
#defines process handler for running one sample through the pipeline
class run_one_sample():
    '''Run one FASTQ sample through every step of the pipeline.

    Instantiating the class does all the work, so it is meant to be the target
    of an individual thread (one thread per sample). stdout/stderr of the external
    tools is appended to <output_dir>logs/<prefix>_stdout.txt.

    A simple text log, <output_dir>logs/<prefix>_completion_log.txt, holds one
    finished step per line so that a rerun picks up where the sample left off.
    A step is only written to that log after its process exited with code 0.
    Steps are defined in self.define_steps.

    The self.check method terminates the process group of the running step if any
    thread has set the global kill_pipeline variable to True, and sets
    kill_pipeline itself when the step returns an exit code other than 0. Default
    behavior is therefore that one failing sample halts processing of all samples.

    Every thread adds one to the global samples_done counter when it stops --
    whether it finished, failed or was killed -- so the main thread can tell
    when all of them are done.

    Reads the module-level names star_ncRNA_dir, star_genome_dir, fastq_dir,
    output_dir, numcores, thread_tracker, kill_pipeline and samples_done.
    '''
    def __init__(self, filename):
        global kill_pipeline
        global samples_done
        self.filename = filename
        self.killflag = False
        try:
            self.star_ncRNA_dir = star_ncRNA_dir
            self.star_genome_dir = star_genome_dir
            self.fastq_dir = fastq_dir
            self.output_dir = output_dir
            self.numcores = numcores
            match = re.search(r"(.+)(\.fastq|\.fastq\.gz)$", self.filename)
            if match is None:
                raise ValueError(self.filename+" does not end in .fastq or .fastq.gz")
            self.prefix = match.group(1)
            self.suffix = match.group(2)
            if self.prefix not in thread_tracker:
                thread_tracker[self.prefix] = "initiate"
            self.steps = self.define_steps()

            log_base = self.output_dir+"logs/"+self.prefix
            with open(log_base+"_stdout.txt", "a") as self.stdout, \
                open(log_base+"_completion_log.txt", "a") as self.completion:
                for step_string in list(self.steps.keys()):
                    if self.killflag or kill_pipeline:
                        break
                    function_name, function_args = self.steps.pop(step_string)
                    self.process = function_name(*function_args)
                    # A failed or killed step must not be logged as complete,
                    # otherwise the next run would skip it.
                    if self.check(self.process) != 0:
                        break
                    self.completion.write(step_string+"\n")
                    self.completion.flush()
                    thread_tracker[self.prefix] = step_string
        except BaseException:
            self.killflag = True
            kill_pipeline = True
            raise
        finally:
            # Counted exactly once on every exit path; the main loop waits for
            # samples_done to reach the number of samples.
            samples_done += 1


    def define_steps(self):
        '''Return an OrderedDict {step name: (method, args)} of the steps still to run.

        Steps already listed in this sample's completion log are removed. Log
        lines that do not name a current step (blank lines, duplicates, steps
        that have since been disabled) are ignored.
        '''
        steps = OrderedDict([
            #("deduplicate", (self.deduplicate, (self.fastq_dir, self.output_dir, self.filename, self.numcores))),
            ("trim", (self.trim_reads, (self.fastq_dir, self.output_dir, self.filename, self.numcores))),
            ("align_ncRNA", (self.align_ncRNA, (self.output_dir, self.filename, self.numcores))),
            ("align_genome", (self.align_genome, (self.output_dir, self.filename, self.numcores))),
            ("sort_genome_aligned", (self.samtools_sort, ("genome_aligned", self.numcores))),
            ("index_genome_aligned", (self.samtools_index, ("genome_aligned", self.numcores))),
        ])

        completion_path = self.output_dir+"logs/"+self.prefix+"_completion_log.txt"
        if os.access(completion_path, os.F_OK):
            with open(completion_path, "r") as completed:
                for line in completed:
                    steps.pop(line.strip(), None)
        return steps


    def _log_header(self, title):
        '''Write a step banner (title plus a 20-dash rule) to the sample's stdout log.'''
        self.stdout.write(title+"\n"+"-"*20+"\n\n")
        self.stdout.flush()


    def _aligned_bam(self, input_dir, ending):
        '''Return the path of this sample's STAR BAM in input_dir with the given ending.'''
        return self.output_dir+input_dir+"/"+self.prefix+"/"+self.prefix+"_Aligned.out"+ending


    def deduplicate(self, fastq_dir, output_dir, filename, numcores):
        '''deduplicate ribosome profiling reads using dedupe.sh from
        the BBTools suite. Returns the running Popen object.
        '''
        self._log_header("Deduplicate")
        dedupe_dir = output_dir+"deduplicated/"+self.prefix+"/"
        os.makedirs(dedupe_dir, exist_ok=True)
        return Popen([
            "/home/allen/code/bbmap/dedupe.sh",
            "in="+fastq_dir+filename,
            "out="+dedupe_dir+self.prefix+".deduped"+self.suffix,
            "absorbmatch=t", #absorb exact matches of contigs
            "absorbcontainment=f", #do not absorb full containments of contigs
            "absorbrc=f", #do not absorb reverse-compliments
            "threads="+str(numcores),
            "overwrite=t",
        ], stdout=self.stdout, stderr=self.stdout, preexec_fn=os.setsid)


    def trim_reads(self, fastq_dir, output_dir, filename, numcores):
        '''Trim adapters and low quality regions from reads using bbduk.sh
        from the BBTools suite. Returns the running Popen object.
        '''
        self._log_header("\n\nTrim Reads")
        trimmed_dir = output_dir+"trimmed/"+self.prefix+"/"
        # creates trimmed_dir as well; safe if either folder already exists
        os.makedirs(trimmed_dir+"failedQC", exist_ok=True)
        return Popen([
            "/home/allen/code/bbmap/bbduk.sh",
            "in="+fastq_dir+filename,
            "out="+trimmed_dir+self.prefix+".trimmed.fastq",
            "outm="+trimmed_dir+"failedQC/"+self.prefix+".failedQC"+self.suffix,
            "rpkm="+trimmed_dir+"rpkm.txt",
            "refstats="+trimmed_dir+"trimming_stats.txt",
#             "literal=CTGTAGGCACCATCAATATCTCGTATGCCGTCTTCTGCTTGAAAA",   ### used for israeli FP data
            "literal=NNNNNNCACTCGGGCACCAAGGAC", ### usual boris v 2 linker
            "k=24", # this parameter sets the minimum kmer being trimmed. Longer = more specific, shorter = more sensitive
            "mink=8", #includes truncations of the kmers down to 8
            "mm=f", #do not ignore middle base mismatch of kmer
            "rcomp=f", #do not allow reverse complement kmer matches
            "copyundefined=t",
            "ktrim=r",
            "forcetrimleft=4", #removes random barcode on left of reads.
            "minavgquality=10",
            "minlength=10",
            "threads="+str(numcores),
            "overwrite=t",
        ],
        stdout=self.stdout, stderr=self.stdout, preexec_fn=os.setsid)


    def align_ncRNA(self, output_dir, filename, numcores):
        '''Align reads to ncRNA using STAR. ncRNA fasta sequences from Boris.
        Output unaligned reads. Returns the running Popen object.
        '''
        self._log_header("\n\nAlign to ncRNA")
        trimmed_dir = output_dir+"trimmed/"+self.prefix+"/"
        ncRNA_aligned_dir = output_dir+"ncRNA_aligned/"+self.prefix+"/"
        os.makedirs(ncRNA_aligned_dir, exist_ok=True)
        command = [
            "STAR",
            "--runThreadN", str(numcores),
            "--genomeDir", self.star_ncRNA_dir,
            "--readFilesIn", trimmed_dir+self.prefix+".trimmed.fastq",
            "--outFileNamePrefix", ncRNA_aligned_dir+self.prefix+"_",
            "--outSAMtype", "BAM", "Unsorted",
            "--outReadsUnmapped", "Fastx",
            "--alignSJDBoverhangMin", "1",
            "--alignSJoverhangMin", "8",
            "--outFilterMultimapNmax", "20",
            "--outFilterType", "BySJout",
        ]
        return Popen(command, stderr=self.stdout, stdout=self.stdout,
                     preexec_fn=os.setsid)


    def align_genome(self, output_dir, filename, numcores):
        '''Align the reads left unmapped by the ncRNA step to the genome.
        Returns the running Popen object.
        '''
        self._log_header("\n\nAlign to genome")
        previous_aligned_dir = output_dir+"ncRNA_aligned/"+self.prefix+"/"
        tx_aligned_dir = output_dir+"genome_aligned/"+self.prefix+"/"
        os.makedirs(tx_aligned_dir, exist_ok=True)
        command = [
            "STAR",
            "--runThreadN", str(numcores),
            "--genomeDir", self.star_genome_dir,
            "--readFilesIn", previous_aligned_dir+self.prefix+"_Unmapped.out.mate1",
            "--outFileNamePrefix", tx_aligned_dir+self.prefix+"_",
            "--outSAMtype", "BAM", "Unsorted",
            "--outSAMattributes", "All",
            "--outReadsUnmapped", "Fastx",
            "--alignSJDBoverhangMin", "1",
            "--alignSJoverhangMin", "8",
            "--outFilterMultimapNmax", "1", #how many multimap sites allowed for read
            "--outSAMmultNmax", "1", #how many map sites to write to output for each read
            "--outMultimapperOrder", "Random", #assign read to random alignment if multimapper
            "--outFilterType", "BySJout",
        ]
        return Popen(command, stderr=self.stdout, stdout=self.stdout,
                     preexec_fn=os.setsid)


    def samtools_sort(self, input_dir, numcores):
        '''Sort BAM file from STAR output. Returns the running Popen object.
        '''
        self._log_header("\n\nSort "+input_dir+" BAM file")
        return Popen([
            "samtools",
            "sort",
            "-@", str(numcores),
            self._aligned_bam(input_dir, ".bam"),
            "-o",
            self._aligned_bam(input_dir, ".sorted.bam")
        ], stderr=self.stdout, stdout=self.stdout, preexec_fn=os.setsid)


    def samtools_index(self, input_dir, numcores):
        '''Index the sorted BAM file and delete the unsorted BAM it was made from.
        Returns the running Popen object.
        '''
        self._log_header("\n\nIndex "+input_dir+" BAM file")
        try:
            os.remove(self._aligned_bam(input_dir, ".bam"))
        except FileNotFoundError:
            pass  # already removed by an earlier run of this step
        return Popen([
            "samtools",
            "index",
            "-@", str(numcores),
            self._aligned_bam(input_dir, ".sorted.bam")
        ], stderr=self.stdout, stdout=self.stdout, preexec_fn=os.setsid)


    def check(self, proc):
        '''Wait for the Popen process of one step and return its exit code.

        Polls every 1 second. While waiting, if the global kill_pipeline has been
        set (by any thread) or self.killflag is set, SIGTERM is sent to the
        process group of proc. A nonzero exit code sets self.killflag and the
        global kill_pipeline, signalling the other threads to shut down as well.
        '''
        global kill_pipeline
        exit_code = proc.poll()
        while exit_code is None:
            sleep(1)
            if kill_pipeline:
                self.killflag = True
            if self.killflag:
                try:
                    os.killpg(os.getpgid(proc.pid), signal.SIGTERM)
                except ProcessLookupError:
                    pass  # the group exited between the poll and the kill
            exit_code = proc.poll()
        if exit_code != 0:
            self.killflag = True
            kill_pipeline = True
        return exit_code

In [8]:
start_time = time()

kill_pipeline = False
samples_done = 0
sample_runs = {}
thread_tracker = OrderedDict()

# Register every sample before its thread starts, so the list of prefixes used for
# the plot is complete and does not depend on how far each thread has got.
# Same pattern run_one_sample uses to split a file name into prefix and suffix.
for filename in fastq_files:
    thread_tracker[re.search(r"(.+)(\.fastq|\.fastq\.gz)$", filename).group(1)] = "initiate"
prefixes = list(thread_tracker.keys())

#start threads for running each sample through the pipeline
for filename in fastq_files:
    sample_runs[filename] = Thread(target=run_one_sample, args=(filename,))
    sample_runs[filename].start()

print("Pipeline Running...")

steps_mapper = OrderedDict([ #offset by one because if it's recorded in the log, it's already done
    ("initiate", "Trimming Adapter"),
    ("trim", "Aligning to ncRNA"),
    ("align_ncRNA", "Aligning to Genome"),
    ("align_genome", "Sorting genome aligned reads"),
    ("sort_genome_aligned", "Indexing Genome Aligned Reads"),
    ("index_genome_aligned", "Done"),
])

steps = ["Start"]+[steps_mapper[step] for step in steps_mapper]

p = figure(height=200, width=800, y_range=prefixes, background_fill_color="lightgray", x_range=steps, title="Pipeline Progress", tools=[])
p.xgrid.grid_line_color = "gray"
p.xaxis.major_label_orientation = np.pi/8

p.ygrid.visible = False
source = ColumnDataSource(data={"y":prefixes, "right":["Start"]*len(prefixes), "height":[0.5]*len(prefixes), "left":[0]*len(prefixes)})
x = p.hbar(y="y", right="right", height="height", left="left", source=source)

show(p, notebook_handle=True)


def _push_progress():
    '''Redraw the progress bars from the current contents of thread_tracker.'''
    source.data = {"y":prefixes, "right":[steps_mapper[thread_tracker[sample]] for sample in prefixes], "height":[0.5]*len(prefixes), "left":[0]*len(prefixes)}
    x.view = CDSView(source=x.data_source)
    push_notebook()


#Check every 1 sec whether the pipeline is finished and report whether it's been terminated.
try:
    # Also stop once no worker thread is alive, so the cell cannot wait forever
    # on a thread that died without adding itself to samples_done.
    while samples_done != len(fastq_files) and any(run.is_alive() for run in sample_runs.values()):
        _push_progress()  # one refresh per tick covers all samples
        sleep(1)
    if kill_pipeline == False and samples_done == len(fastq_files):
        _push_progress()
        print("Pipeline finished successfully!")
    else:
        print("Run terminated, check for errors")
except BaseException:
    kill_pipeline = True  #allows KeyboardInterrupt to kill pipeline
    print("Run terminated. Check for errors.")
    raise



runtime = time() - start_time
if runtime > 60:
    mins = round(runtime/60, 2)
    print("Run time:", mins, "minutes")
else:
    secs = round(runtime, 2)
    print("Run time:", secs, "seconds")


Pipeline Running...


Pipeline finished successfully!
Run time: 28.49 minutes


In [11]:
def read_fate(sample_name):
    '''Print how many reads of one sample survived each pipeline step.

    Reads three files under the global output_dir:
      logs/<sample_name>_stdout.txt                      (bbduk "Input:"/"Result:" lines)
      ncRNA_aligned/<sample_name>/<sample_name>_Log.final.out
      genome_aligned/<sample_name>/<sample_name>_Log.final.out

    All percentages except the trimming one are relative to the number of
    trimmed reads; a denominator of 0 is reported as 0.0%.

    Returns a dict with the counts that were printed (input_reads, trimmed_reads,
    ncRNA_mapped_reads, genome_mapped, unmapped_reads).
    Raises ValueError if an expected line is missing from one of the logs.
    '''
    global output_dir
    with open(output_dir+"logs/"+sample_name+"_stdout.txt", "r", errors='ignore') as log:
        # reversed, so that index() finds the most recent run appended to the log
        file = log.readlines()[::-1]
#         dedupe_end = file.index("Trim Reads\n")
#         dedupe_start = file.index("--------------------\n", dedupe_end)
#         for line in file[dedupe_end:dedupe_start]:
#             if line.startswith("Input"):
#                 input_reads = int(line.split("\t")[1].split(" ")[0])
#             elif line.startswith("Result"):
#                 deduplicated_reads = int(line.split("\t")[1].split(" ")[0])
    # In the reversed log the trimming output sits between the "Align to ncRNA"
    # banner and the dashed rule that follows the "Trim Reads" banner.
    trim_end = file.index("Align to ncRNA\n")
    trim_start = file.index("--------------------\n", trim_end)
    input_reads = None
    trimmed_reads = None
    for line in file[trim_end:trim_start]:
        if line.startswith("Input:"):
            input_reads = int(line.split("\t")[1].split(" ")[0])
        if line.startswith("Result:"):
            trimmed_reads = int(line.split("\t")[1].split(" ")[0])
    if input_reads is None or trimmed_reads is None:
        raise ValueError("No 'Input:'/'Result:' trimming lines found in the log of "+sample_name)

    ncRNA_counts = _star_final_counts(
        output_dir+"ncRNA_aligned/"+sample_name+"/"+sample_name+"_Log.final.out")
    genome_counts = _star_final_counts(
        output_dir+"genome_aligned/"+sample_name+"/"+sample_name+"_Log.final.out")
    if "Number of input reads" not in genome_counts:
        raise ValueError("'Number of input reads' missing from genome Log.final.out of "+sample_name)

    ncRNA_mapped_reads = _mapped_total(ncRNA_counts, "ncRNA", sample_name)
    genome_mapped = _mapped_total(genome_counts, "genome", sample_name)
    genome_input_reads = genome_counts["Number of input reads"]
    unmapped_reads = genome_input_reads - genome_mapped
#         print("Library was", str(round(deduplicated_reads/input_reads*100, 2))+"%", "unique.")
#         print("Of those,", str(round(trimmed_reads/deduplicated_reads*100, 2))+"%", "survived trimming.")
    print("Total reads = ", str(input_reads))
    print(str(trimmed_reads)+' or', str(_percent(trimmed_reads, input_reads))+"%", "survived trimming.")
    print(str(ncRNA_mapped_reads)+' or',str(_percent(ncRNA_mapped_reads, trimmed_reads))+"%", "mapped to ncRNA.")
    print(str(genome_mapped)+' or',str(_percent(genome_mapped, trimmed_reads))+"%", "mapped to the genome.")
    print(str(unmapped_reads)+' or',str(_percent(unmapped_reads, trimmed_reads))+"%", "remained unmapped.")
    return {
        "input_reads": input_reads,
        "trimmed_reads": trimmed_reads,
        "ncRNA_mapped_reads": ncRNA_mapped_reads,
        "genome_mapped": genome_mapped,
        "unmapped_reads": unmapped_reads,
    }


def _star_final_counts(path):
    '''Return {label: int} for the read-count lines of a STAR Log.final.out file.'''
    wanted = (
        "Number of input reads",
        "Uniquely mapped reads number",
        "Number of reads mapped to multiple loci",
    )
    counts = {}
    with open(path, "r") as star_log:
        for line in star_log:
            label = line.strip()
            for key in wanted:
                if label.startswith(key):
                    counts[key] = int(line.split("|")[1].strip())
                    break
    return counts


def _mapped_total(counts, step, sample_name):
    '''Return uniquely mapped + multi-mapped reads from a _star_final_counts dict.'''
    if "Uniquely mapped reads number" not in counts:
        raise ValueError("'Uniquely mapped reads number' missing from "+step+" Log.final.out of "+sample_name)
    return counts["Uniquely mapped reads number"] + counts.get("Number of reads mapped to multiple loci", 0)


def _percent(part, whole):
    '''Return part/whole as a percentage rounded to 2 decimals (0.0 if whole is 0).'''
    if whole == 0:
        return 0.0
    return round(part/whole*100, 2)

# Report read fate for every RIBO FASTQ file found in fastq_dir.
# Entries that are not FASTQ files are skipped, and the sample name is obtained by
# removing the real suffix (".fastq" or ".fastq.gz") instead of a fixed 9 characters.
fn = [
    i for i in filenames
    if "RIBO" in i and i.endswith((".fastq", ".fastq.gz"))
]

for fname in fn:
    sample_name = re.sub(r"\.fastq(\.gz)?$", "", fname)
    print(sample_name)
    read_fate(sample_name)
    print("\n\n")

RIBO_UV_R1
Total reads =  47214512
47101731 or 99.76% survived trimming.
36658111 or 77.83% mapped to ncRNA.
4584715 or 9.73% mapped to the genome.
5858905 or 12.44% remained unmapped.



RIBO_4SU_R3
Total reads =  52474177
52359691 or 99.78% survived trimming.
38880650 or 74.26% mapped to ncRNA.
6244789 or 11.93% mapped to the genome.
7234252 or 13.82% remained unmapped.



RIBO_4SU_UV_R2
Total reads =  30129632
29691889 or 98.55% survived trimming.
20758828 or 69.91% mapped to ncRNA.
3663567 or 12.34% mapped to the genome.
5269494 or 17.75% remained unmapped.



RIBO_UV_R3
Total reads =  53916147
53816938 or 99.82% survived trimming.
41202232 or 76.56% mapped to ncRNA.
6164841 or 11.46% mapped to the genome.
6449865 or 11.98% remained unmapped.



RIBO_UV_R2
Total reads =  53331014
53031408 or 99.44% survived trimming.
38651826 or 72.88% mapped to ncRNA.
7269189 or 13.71% mapped to the genome.
7110393 or 13.41% remained unmapped.



RIBO_4SU_R2
Total reads =  49419441
49353643 or 99.

Making density files for each of these samples

In [None]:
### parse GTF
# Builds gtf_df with one row per transcript / CDS / exon feature of the "basic"
# tagged transcripts on the kept chromosomes, then keeps a single protein-coding
# transcript per gene and pickles the result.

# Built once here rather than once per GTF line.
kept_chroms = {str(num) for num in range(1,24)} | {"X", "Y"}
kept_features = {"transcript", "CDS", "exon"}

gtf_dict = {"chrom":[], "feature":[], "start":[], "end":[],
            "gene_id":[], "transcript_id":[], "exon_number":[],
            "strand":[], "annotation_type":[], "support_level":[],
            "biotype":[]
           }

with open("/home/allen/annotations/hg38_ensemble/files/Homo_sapiens.GRCh38.105.gtf", "r") as inGTF:
    for line in inGTF:
        try:
            if line.startswith("#"):
                continue
            splitline = line.split("\t")
            chrom = splitline[0]
            feature = splitline[2]
            if feature == "stop_codon":
                feature = "CDS"  # stop codons are recorded as CDS rows
            if feature not in kept_features or chrom not in kept_chroms:
                continue

            if 'tag "basic"' not in line:
                continue
            attributes = splitline[8]
            start = int(splitline[3])
            end = int(splitline[4])
            gene_id = attributes.split("gene_id ")[1].split('"')[1]
            transcript_v = attributes.split("transcript_version ")[1].split('"')[1]
            transcript_id = attributes.split("transcript_id ")[1].split('"')[1]+'.'+transcript_v
            biotype = attributes.split("transcript_biotype ")[1].split('"')[1]
            try:
                support_level = attributes.split("transcript_support_level")[1].split('"')[1]
            except IndexError:
                support_level = np.nan  # attribute absent on this line
            annotation_type = splitline[1]
            if feature == "exon":
                exon_number = int(attributes.split("exon_number ")[1].split('"')[1])
            else:
                exon_number = np.nan
            strand = splitline[6]
            gtf_dict["chrom"].append(chrom)
            gtf_dict["feature"].append(feature)
            gtf_dict["start"].append(start)
            gtf_dict["end"].append(end)
            gtf_dict["gene_id"].append(gene_id)
            gtf_dict["transcript_id"].append(transcript_id)
            gtf_dict["exon_number"].append(exon_number)
            gtf_dict["strand"].append(strand)
            gtf_dict["annotation_type"].append(annotation_type)
            gtf_dict["support_level"].append(support_level)
            gtf_dict["biotype"].append(biotype)
        except Exception:
            print(line)  # show the offending GTF line before re-raising
            raise
gtf_df = pd.DataFrame(gtf_dict)

# One transcript per gene: the first protein-coding transcript after sorting by
# support level, then transcript id.
sort_columns = ["chrom", "gene_id", "support_level", "transcript_id", "start"]
transcripts_deduped = gtf_df[(gtf_df.feature == "transcript") & (gtf_df.biotype == "protein_coding")
                            ].sort_values(by=sort_columns
                                         ).drop_duplicates(subset=["gene_id"], keep="first"
                                                          ).transcript_id
gtf_df = gtf_df[gtf_df.transcript_id.isin(transcripts_deduped)
               ].sort_values(by=sort_columns)

gtf_df["length"] = gtf_df.end.apply(int) - gtf_df.start.apply(int) + 1
max_gene = max(gtf_df.length)

print("GTF loaded:", len(transcripts_deduped), "transcripts")
print("Filtered gtf_df down to", len(gtf_df[gtf_df.feature == "transcript"]), "matching transcripts.")

filename = "./parsed_gtf.pkl"
with open(filename, "wb") as pickle_out:
    pickle.dump(gtf_df, pickle_out)
print("Pickling gtf_df to",filename)

    

In [5]:
### loading the parsed GTF pickle just created
# NOTE: pickle.load executes whatever the file contains; only load pickles you made yourself.

pickled_gtf_df = '/home/allen/code/4suv/github_4suv/parsed_gtf.pkl'
with open(pickled_gtf_df, "rb") as pickle_in:
    gtf_df = pickle.load(pickle_in)
print("Pickled gtf_df loaded:", len(gtf_df[gtf_df.feature == "transcript"]), "transcripts")


Pickled gtf_df loaded: 19654 transcripts


In [10]:
### load pickled transcript database with information for translation
# NOTE: pickle.load executes whatever the file contains; only load pickles you made yourself.

with open("/home/allen/code/4suv/github_4suv/tx_df_pkl", "rb") as pickle_in:
    tx_df = pickle.load(pickle_in)
print("Pickled tx_df loaded:", len(tx_df), "transcripts")
tx_df.head()

Pickled tx_df loaded: 19580 transcripts


Unnamed: 0,transcript_id,chrom,strand,start,stop,spliced_len,seq
0,ENST00000367770.5,1,-,48,2276,2916,CTGCTTGGCTTTGAGGAAGAGTGGCAGTACTGCCTCACTGCATAAG...
1,ENST00000286031.10,1,+,700,3261,4355,GGCTTTGGCCCTGGAAAGCCTCGCGGACGTGTTCTGACCCAAGGTT...
2,ENST00000374004.5,1,-,146,1735,2350,GGGAGGACCCCAATCTAGGCCCAAGAGGGAAAGCCACGTGCCTGTA...
3,ENST00000359637.2,1,+,62,1219,1454,GAGGAGAACTGGACGTTGTGAACAGAGTTAGCTGGTAAATGTCCTC...
4,ENST00000374409.5,1,-,255,1259,2804,TCGTCACAGCCATGAGTGAGACTTGAAGCCCGTTTTACGTATGAAG...


In [8]:
## final function to assign density, called by wrapping function elsewhere (see below)

pd.options.mode.chained_assignment = None #default = 'warn'

'''
Takes in a transcript line of gtf_df and creates a tx_density object.
These will then be saved to a pkl file for fast parsing of downstream calculations.
'''

def get_density(transcript):
    '''Count read 5' ends at every position of one spliced transcript.

    transcript is an (index, row) tuple as produced by DataFrame.iterrows(); the
    row must provide chrom, strand and transcript_id. The transcript and exon
    coordinates are looked up in the global gtf_df and the reads are fetched from
    the global bamfile.

    Only reads on the transcript's strand whose leftmost and rightmost aligned
    bases both fall inside an exon are counted.

    Returns a numpy array with one count per position of the spliced transcript
    (position 0 is the transcript's 5' end for either strand).
    Raises ValueError if strand is neither "+" nor "-".
    '''
    global gtf_df
    global bamfile

    tx_row = transcript[1] ## comes as a tuple -- take [1]
    chrom = tx_row.chrom
    strand = tx_row.strand
    transcript_id = tx_row.transcript_id
    if strand not in ("+", "-"):
        raise ValueError("Unexpected strand "+repr(strand)+" for transcript "+str(transcript_id))

    tx_cds_exons_df = gtf_df[gtf_df.transcript_id == transcript_id] #transcript, CDS, exon rows of this transcript

    tx = tx_cds_exons_df[tx_cds_exons_df.feature == "transcript"] #transcript line of dataframe
    tx_left = int(tx.start.iloc[0]) #Leftmost genome coordinate of transcript (could be 3' or 5' end depending on strand)
    tx_right = int(tx.end.iloc[0]) #Rightmost genome coordinate of transcript

    exons = tx_cds_exons_df[tx_cds_exons_df.feature == "exon"] #Exon lines of dataframe
    tx_len = int(sum(exons.length)) #length of spliced transcript feature

    #translate 1-based genome coords to 0-based spliced transcript coords,
    #numbering the exonic positions in the order the exon rows appear in gtf_df
    exon_coords = [
        genome_pos
        for exon_start, exon_end in zip(exons.start, exons.end)
        for genome_pos in range(int(exon_start), int(exon_end)+1)
    ]
    exonic_positions = {genome_pos: tx_pos for tx_pos, genome_pos in enumerate(exon_coords)}

    #Only the 5' end of each read is needed for the histogram, so no per-read
    #table (sequence, read id, ...) is built.
    five_prime_ends = []
    for read in bamfile.fetch(chrom, tx_left, tx_right, multiple_iterators=True):
        read_strand = '-' if read.is_reverse else '+'
        if read_strand != strand:
            continue
        #pysam coordinates are 0-based: +1 gives the 1-based leftmost base;
        #reference_end is one past the last base, i.e. the 1-based rightmost base
        left = exonic_positions.get(read.reference_start+1)
        right = exonic_positions.get(read.reference_end)
        if left is None or right is None:
            continue #at least one end of the read is outside the exons
        if strand == "+":
            five_prime_ends.append(left)
        else:
            #for - strand transcripts the numbers are inverted so that everything is 5 to 3
            five_prime_ends.append(tx_len-1-right)

    histo = np.histogram(five_prime_ends, np.arange(0, tx_len+1, 1))[0]

    return(histo)


In [None]:
# Sample names to build density files for. Each name is used below both as the
# folder and as the file prefix of "<name>_Aligned.out.sorted.bam".
file_list = [
 'RIBO_4SU_R1',
 'RIBO_4SU_R2',
 'RIBO_4SU_R3',
 'RIBO_4SU_UV_R1',
 'RIBO_4SU_UV_R2',
 'RIBO_4SU_UV_R3',
 'RIBO_UNT_R1',
 'RIBO_UNT_R2',
 'RIBO_UNT_R3',
 'RIBO_UV_R1',
 'RIBO_UV_R2',
 'RIBO_UV_R3'
    ]

In [19]:

for fn in file_list:

    bam_path = "/home/allen/data/4suv/genome_aligned/"+fn+'/'+fn+"_Aligned.out.sorted.bam"

    # bamfile is bound at module level because get_density reads it as a global.
    # It is opened before the pool is created and closed once the map is done.
    with pysam.AlignmentFile(bam_path, "rb") as bamfile:
        with Pool(48) as p:
            map_out = p.map(get_density, tx_df.iterrows())

    # p.map returns results in input order, so they line up with the rows of tx_df
    density_dict = dict(zip(tx_df.transcript_id, map_out))

    with open("/home/allen/data/4suv/density/"+fn+"_density_pkl", "wb") as density_file:
        pickle.dump(density_dict, density_file)
    print("Pickled density file saved for:", fn)

Pickled density file saved for: RIBO_4SU_UV_R1
Pickled density file saved for: RIBO_4SU_UV_R3
Pickled density file saved for: RIBO_UNT_R1
Pickled density file saved for: RIBO_UV_R3
Pickled density file saved for: RIBO_UNT_R2
Pickled density file saved for: RIBO_UV_R2
Pickled density file saved for: RIBO_4SU_R3
Pickled density file saved for: RIBO_4SU_UV_R2
Pickled density file saved for: RIBO_4SU_R2
Pickled density file saved for: RIBO_UV_R1
Pickled density file saved for: RIBO_UNT_R3
Pickled density file saved for: RIBO_4SU_R1
