In [4]:
# Imports: regex parsing (re), GFF and sequence objects (BCBio, Bio), BAM access (pysam)

import re
from BCBio import GFF
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.SeqFeature import SeqFeature, FeatureLocation
import pysam

## Functions

In [5]:
def extract_genes(run_name):
    '''
    Function for extracting genes corresponding to BUSCO hits.

    Reads the BUSCO full table of the run and keeps every hit flagged
    "Complete" or "Duplicated". For each kept BUSCO id the GFF files
    [busco_id].out.1, [busco_id].out.2, ... are parsed (gene features only)
    until the first file that cannot be opened. Every parsed gene whose
    coordinates coincide with a BUSCO hit is collected.

    Parameters:
        run_name: name of the BUSCO run as a string; used to build the paths
            below "../data/intermediate/run_<run_name>/".

    Returns:
        A SeqRecord with id 'correct_genes' holding one feature per matched
        gene. The id of each feature is overwritten with the id of the GFF
        record it was found in (infer_paired_region/infer_single_region use
        that id as the contig name when fetching reads).

    Raises:
        OSError: if the BUSCO table itself cannot be opened.
        Errors raised while parsing an existing GFF file are propagated
        instead of being mistaken for "no more files".
    '''
    run_dir = "../data/intermediate/run_" + run_name

    # Extract BUSCO IDs, start and end from table of hits into SeqRecord object, each BUSCO as a SeqFeature
    hit_pattern = re.compile(r'(\S*)\s(Complete|Duplicated)\s(\S*)\s(\S*)\s(\S*)\s\S*')
    busco_record = SeqRecord(seq='', id='hits')
    with open(run_dir + "/full_table_" + run_name + ".tsv", 'r') as file_tsv:
        for line in file_tsv:
            hit = hit_pattern.search(line)
            if hit:
                busco_record.features.append(
                    SeqFeature(
                        FeatureLocation(int(hit.group(4)), int(hit.group(5))),
                        id=hit.group(1),
                        type='gene',
                        qualifiers={'contig': hit.group(3)},
                    )
                )

    # Match the BUSCOs to augustus predicted genes in gff file
    gff_records = []
    correct_genes = SeqRecord(seq='', id='correct_genes')
    limit_infos = dict(
            gff_type = ["gene"]) # Only want genes

    gff_dir = run_dir + "/augustus_output/predicted_genes/"
    for busco in busco_record.features:
        # gff filenames are [busco_id].out.[1-999]
        i = 1
        while True:
            try:
                file_gff = open(gff_dir + busco.id + ".out." + str(i))
            except OSError: # Finished reading all files for that BUSCO
                break
            # Each handle is closed as soon as its records have been read
            with file_gff:
                gff_records.extend(GFF.parse(file_gff, limit_info=limit_infos))
            i += 1

    # Find augustus predicted genes from the gff that match BUSCOs
    for rec in gff_records:
        for hit in busco_record.features:
            for feature in rec.features:
                # The table start is 1 larger than the parsed GFF start.
                # NOTE(review): presumably the GFF parser converts the 1-based
                # GFF start to 0-based while the table value is stored as-is -- confirm.
                if hit.location.start-1 == feature.location.start and hit.location.end == feature.location.end:
                    feature.id = rec.id
                    correct_genes.features.append(feature)
                    break # Only the first matching feature of this record per hit

    return correct_genes

In [6]:
def extract_mapped(bam=None):
    '''
    Function used to extract only reads mapped to a reference.
    Really slow, only used to minimize the reads dataset in special use cases

    Every read that is read 1, not secondary, mapped and has a mapped mate is
    written, followed by its mate, to "allpaired2.bam" in the working directory.

    Parameters:
        bam: opened alignment file to read from. Defaults to the global
            `samfile` when omitted.

    Returns:
        List of the read-1 records that were written (mates are not included).
    '''
    source = samfile if bam is None else bam
    mapped_reads = []

    # The context manager closes the output so the BAM file is completely written
    with pysam.AlignmentFile("allpaired2.bam", "wb", template=source) as pairedreads:
        for read in source.fetch():
            # The membership test on a list is linear, which is the main reason this is slow
            if (read.is_read1
                    and not read.is_secondary
                    and not read.is_unmapped
                    and not read.mate_is_unmapped
                    and read not in mapped_reads):
                mapped_reads.append(read)
                pairedreads.write(read)
                print(read)
                # Look the mate up once; mate() is an expensive call
                mate = source.mate(read)
                print(mate)
                pairedreads.write(mate)

    return mapped_reads

In [7]:
def _paired_lib_key(first, second, strand):
    '''
    Build the library-type key for one read pair, or 'undecided' when the
    pair cannot be classified (unknown strand, equal or missing start values).
    '''
    orientation = ('r' if first.is_reverse else 'f') + ('r' if second.is_reverse else 'f')
    if strand not in (1, -1):
        return 'undecided'

    first_start = first.reference_start
    second_start = second.reference_start
    try:
        if first_start > second_start:
            # Flip order of reads
            orientation = orientation[::-1]
            suffix = '_first' if strand == 1 else '_second'
        elif first_start < second_start:
            suffix = '_second' if strand == 1 else '_first'
        else:
            return 'undecided'
    except TypeError: # Some reads missing start or end-values
        return 'undecided'
    return orientation + suffix


def infer_paired_region(genes, bam=None):
    '''
    Function for inferring paired library-type by looking at regions corresponding to genes

    For every gene the reads overlapping the gene are fetched. Only pairs
    where the fetched read is read 1 and its mate is mapped are counted. Each
    pair gets a key built from the orientation of both reads (f/r), their
    relative start positions and the strand of the gene.

    Parameters:
        genes: object with a `features` list; each feature provides `id`
            (used as contig name), `location.start`, `location.end` and `strand`.
        bam: opened, indexed alignment file to read from. Defaults to the
            global `samfile` when omitted.

    Returns:
        Dict mapping each library type to the number of pairs counted for it.
        Pairs that produce a key outside the dict (for example both reads
        reverse) are counted as 'undecided'.
    '''
    alignments = samfile if bam is None else bam

    # Counters for the different lib-types
    libs = {
        'fr_first': 0,
        'fr_second': 0,
        'rf_first': 0,
        'rf_second': 0,
        'ff_first': 0,
        'ff_second': 0,
        'undecided': 0
    }

    for gene in genes.features:
        contig = gene.id
        start = int(gene.location.start)
        stop = int(gene.location.end)
        strand = gene.strand

        # Get reads mapped to a specific contig and in a sequence range
        # TODO: Look into optimizing this step, only take a subset (1000ish) reads?
        # samfile.mate is not made for high throughput
        # NOTE(review): pysam documents that mate() changes the file position, which
        # may interfere with a running fetch() iterator on the same handle; consider
        # fetch(..., multiple_iterators=True) -- confirm against the pysam docs.
        for read in alignments.fetch(contig, start, stop):
            if read.mate_is_unmapped or not read.is_read1:
                continue
            key = _paired_lib_key(read, alignments.mate(read), strand)
            # Keys without a counter are reported as undecided
            libs[key if key in libs else 'undecided'] += 1

    return libs

In [45]:
def infer_single_region(genes, bam=None):
    """
    Function for inferring library type of single-ended library types
    Work in progress

    For every gene the reads overlapping the gene are fetched and each mapped
    read is counted under a key chosen from the strand of the gene and the
    orientation of the read.

    Parameters:
        genes: object with a `features` list; each feature provides `id`
            (used as contig name), `location.start`, `location.end` and `strand`.
        bam: opened, indexed alignment file to read from. Defaults to the
            global `samfile` when omitted.

    Returns:
        Dict mapping each library type to the number of reads counted for it.
        Reads of genes whose strand is neither 1 nor -1 are printed and
        counted as 'undecided'.
    """
    alignments = samfile if bam is None else bam

    libs = {
        'f_first': 0,
        'f_second': 0,
        'r_first': 0,
        'r_second': 0,
        'undecided': 0
    }

    # gene strand -> (key for forward reads, key for reverse reads)
    keys_by_strand = {
        1: ('f_second', 'f_first'),
        -1: ('r_first', 'r_second'),
    }

    for gene in genes.features:
        contig = gene.id
        start = int(gene.location.start)
        stop = int(gene.location.end)
        strand_keys = keys_by_strand.get(gene.strand)

        # Get reads mapped to a specific contig and in a sequence range
        # Slow, only take 1000 reads?
        for read in alignments.fetch(contig, start, stop):
            if read.is_unmapped:
                continue
            # Check lib-type of the read; every key used here exists in libs,
            # so no exception handling is needed around the counter update
            if strand_keys is None:
                print(read)
                libs['undecided'] += 1
            else:
                forward_key, reverse_key = strand_keys
                libs[reverse_key if read.is_reverse else forward_key] += 1

    return libs

In [9]:
def write_result(lib_dict, path='../data/output/result.txt'):
    '''
    Write the read count and share of every library type to a text file.

    Parameters:
        lib_dict: dict mapping library type to its read count.
        path: file to write; an existing file is overwritten.

    The file starts with a two-line header followed by one line per library
    type. When the counts sum to zero every share is written as 0.0% instead
    of failing with a division by zero.
    '''
    total_reads = sum(lib_dict.values())

    # The context manager guarantees the file is flushed and closed
    with open(path, 'w') as output:
        output.write("Results of library inferring: \nLibrary type \t Reads \t Percent \n")
        for lib, count in lib_dict.items():
            share = count / total_reads if total_reads else 0.0
            output.write("%s \t %d \t %s\n" % (lib, count, '{:.1%}'.format(share)))

## Running

In [48]:
# Name of the BUSCO run; extract_genes() uses it to build its input paths
run_name = '4'
# Library layout: 'single' or 'paired' (selects which inference function is run below)
state = 'single'

In [49]:
# bam-file with mapped reads, needs to be sorted and indexed
# The functions above read this handle through the global name `samfile`
samfile = pysam.AlignmentFile("../data/intermediate/4_sorted.bam", "rb")

In [50]:
# Predicted genes whose coordinates match a BUSCO hit of this run
test = extract_genes(run_name)

In [52]:
# Run the inference that matches the library layout chosen in `state`
if state == 'single':
    result = infer_single_region(test)
elif state == 'paired':
    result = infer_paired_region(test)
else:
    # Fail loudly instead of leaving `result` undefined or stale from an earlier run
    raise ValueError("state must be 'single' or 'paired', got %r" % (state,))

In [53]:
# Show the number of reads counted per library type
print(result)

{'f_first': 1, 'f_second': 9, 'r_first': 1, 'r_second': 20, 'undecided': 0}


In [139]:
# Write the counts and percentages to ../data/output/result.txt
write_result(result)