In [1]:
from Bio import SeqIO
from Bio.SeqFeature import FeatureLocation
from Bio.SeqFeature import CompoundLocation

import pandas as pd
import numpy as np

In [10]:
def exon_intron(genbank, gene, _type_="ncRNA"):
    """Takes a genbank file and returns intron and exon sequences"""
    # Read in and extract the sequence
    record = SeqIO.read(genbank, "genbank")

    sequence = record.seq

    # Extract the desired feature
    feature = []
    
    for i in record.features:
        if i.type == _type_ and i.qualifiers["gene"] == gene:
            feature.append(i)

    if len(feature) < 1:
        raise ValueError("Feature not found")

    elif len(feature) > 1:
        raise ValueError("More than one feature of given description")

    else:
        location = feature[0].location

    # Use the exon indices to extract the mature transcript
    mat_rna = location.extract(sequence)
    
    # Make a list of exon indices
    exon_ls = list(location)
    
    # Make a list of intron indices using np.s_
    intron_slices = []
    
    for i, v in enumerate(exon_ls):

        if v != exon_ls[i - 1] + 1 and v > exon_ls[i - 1]:
            intron_slices.append(np.s_[exon_ls[i - 1] : v])

        else:
            pass

    # Extract the intron sequences
    introns = [sequence[i] for i in intron_slices]

    return mat_rna, introns

def rap_probes(seq, gene, probe_length = 90):
    '''Takes a sequence and makes probes of a given length'''
    # Extract indices of the desired probe length
    inds = np.arange(0, len(seq), probe_length)
    
    s_list = []
    
    for i in range(len(inds)-1):
        s_list.append(np.s_[inds[i]:inds[i+1]])
    
    # Use those indices to make probes
    s_seq = [seq[i] for i in s_list]
    
    # If there is more than a quarter probe of gene left uncovered, add one last probe 
    if len(seq) - inds[-1] > probe_length / 4 : 
        s_seq.append(seq[-90:])
    
    else:
        pass
    
    s_seq = [str(i.reverse_complement()) for i in s_seq]
    
    # Name the probes and return a dataframe
    prb_nms = [gene + str(i+1) for i in range(len(s_seq))]
    
    return pd.DataFrame({'Name':prb_nms,
                        'Sequence':s_seq})

In [11]:
mat_tsix, int_tsix = exon_intron('sequence.gb', ['Tsix'])

In [12]:
mat_probes = rap_probes(mat_tsix, 'Tsix_Exon_')

int_ls = [rap_probes(i, 'Tsix_Intron_') for i in int_tsix]

In [13]:
int_pd = pd.concat(int_ls)

int_pd = int_pd.reset_index(drop = True)

int_pd.loc[:,'Name'] = ['Tsix_Intron_' + str(i+1) for i in int_pd.index]

int_pd

Unnamed: 0,Name,Sequence
0,Tsix_Intron_1,AAGCAAACATGTATAGACATAAGTAATAAAGTCAGGTGTAGGCATA...
1,Tsix_Intron_2,TGTCTAGGTAACCCTGTAGCATGTGTGAAACAAAAGACTATTGCCC...
2,Tsix_Intron_3,CTTAGGAGAAAAACAGACGCTTAAACATCCCTACTTGGAGGCTTTT...
3,Tsix_Intron_4,GAATTCTTGATAAGAAATAACCTCCAGAGAAATGGCAATCATTTAC...
4,Tsix_Intron_5,GAAAGAAATGAAGAGAAAGAAAAAAGACTAATAGGTCCCAGAGACT...
...,...,...
541,Tsix_Intron_542,TTTGAATTTAAAACCGAAGTGATTGTTTTCAAAATGTATTTACGAT...
542,Tsix_Intron_543,TCTACCCCATGACTATTGCTGGGGTTGCATTTTGATTTCAATGAAT...
543,Tsix_Intron_544,AGCCTACTGGGTATAAGTGGTGACTTTGGCCAGAGTCATAGTGGAT...
544,Tsix_Intron_545,GCCCGTTCCATTCCTTTGTATTGTTCAGTGGCTAGTCTACTTACAC...


In [14]:
all_tsix = pd.concat([mat_probes, int_pd])

all_tsix = all_tsix.reset_index(drop = True)

all_tsix.to_csv('TsixProbes.csv')

In [15]:
len(all_tsix)

594