In [32]:
import os
from abc import ABC, abstractmethod

from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.SeqUtils import gc_fraction

In [33]:
class BiologicalSequence(ABC):
    """Abstract base for a validated biological sequence held as a string.

    Subclasses supply ``is_bioseq``; the constructor calls it and refuses
    to build an instance whose sequence does not pass.

    Attributes:
        seq: the sequence exactly as it was passed in.
        length: number of characters in ``seq``.
    """

    def __init__(self, seq: str):
        """Store ``seq`` and its length, then validate it.

        Raises:
            ValueError: if ``is_bioseq()`` reports the sequence as invalid.
        """
        self.seq = seq
        self.length = len(self.seq)
        # Validation runs last because is_bioseq() reads self.seq.
        passed = self.is_bioseq()
        if passed:
            return
        raise ValueError("This is not biosequence")

    @abstractmethod
    def is_bioseq(self):
        """Return a truthy value when ``self.seq`` is valid for this type."""

    def seqindex(self, index):
        """Return ``self.seq[index]``; ``index`` may be an int or a slice."""
        item = self.seq[index]
        return item

    def __repr__(self):
        """Return the sequence prefixed with a short human-readable label."""
        return f"Your sequence: {self.seq}"


class NucleicAcidSequence(BiologicalSequence):
    """Nucleic acid sequence with complement and reversal helpers.

    All helpers return plain strings and keep the case of each input
    character.
    """

    # Pairing table used by complement() and reverse_complement().
    # Both T and U pair to A, while A always pairs to T.
    COMPLEMENT = {
        'A': 'T', 'a': 't',
        'C': 'G', 'c': 'g',
        'T': 'A', 't': 'a',
        'G': 'C', 'g': 'c',
        'U': 'A', 'u': 'a',
    }

    def is_bioseq(self):
        """Return True when every character, ignoring case, is a/c/g/t/u."""
        allowed = {'a', 'c', 'g', 't', 'u'}
        return set(self.seq.lower()) <= allowed

    def complement(self):
        """Return the sequence with each base replaced by its pair."""
        pairs = self.COMPLEMENT
        return ''.join([pairs[base] for base in self.seq])

    def reverse(self):
        """Return the sequence read from the last character to the first."""
        backwards = slice(None, None, -1)
        return self.seq[backwards]

    def reverse_complement(self):
        """Return the complement of the reversed sequence."""
        pairs = self.COMPLEMENT
        return ''.join(pairs[base] for base in reversed(self.seq))


class DNASequence(NucleicAcidSequence):
    """DNA sequence limited to the bases A, C, G and T (either case)."""

    # Maps every DNA base to the RNA base that pairs with it, so
    # transcribe() yields the complementary RNA strand. Case is preserved.
    DNA_TO_RNA = {
        'A': 'U', 'T': 'A', 'C': 'G', 'G': 'C',
        'a': 'u', 't': 'a', 'c': 'g', 'g': 'c',
    }

    def is_bioseq(self):
        """Return True when the sequence contains only A, C, G or T.

        The inherited check also lets U through, but ``transcribe`` has no
        entry for U, so such a sequence would be constructed successfully
        and then fail with ``KeyError``. Validating against the keys of the
        transcription table rejects it at construction time instead.
        """
        return set(self.seq).issubset(self.DNA_TO_RNA)

    def transcribe(self):
        """Return the RNA string obtained by pairing each DNA base.

        A becomes U, T becomes A, C becomes G and G becomes C; the sequence
        is not reversed.
        """
        return ''.join(self.DNA_TO_RNA[base] for base in self.seq)


class RNASequence(NucleicAcidSequence):
    """RNA sequence limited to the bases A, C, G and U (either case)."""

    # RNA pairing table. It replaces the inherited one, which pairs A with
    # T and would therefore put thymine into an RNA complement. The
    # inherited complement() and reverse_complement() look the table up
    # through ``self.COMPLEMENT`` and so pick up this override.
    COMPLEMENT = {
        'A': 'U', 'a': 'u',
        'C': 'G', 'c': 'g',
        'G': 'C', 'g': 'c',
        'U': 'A', 'u': 'a',
    }

    def is_bioseq(self):
        """Return True when the sequence contains only A, C, G or U.

        Validating against the keys of ``COMPLEMENT`` keeps the accepted
        alphabet and the pairing table in sync, so complementing a valid
        instance cannot hit a missing key.
        """
        return set(self.seq).issubset(self.COMPLEMENT)

class AminoAcidSequence(BiologicalSequence):
    """Protein sequence written with one-letter amino acid codes."""

    def is_bioseq(self):
        """Return True when every character, ignoring case, is an allowed code.

        The allowed codes are the twenty letters listed below; an empty
        sequence passes.
        """
        residues = {
            'a', 'c', 'd', 'e', 'f',
            'g', 'h', 'i', 'k', 'l',
            'm', 'n', 'p', 'q', 'r',
            's', 't', 'v', 'w', 'y',
        }
        return all(code in residues for code in self.seq.lower())
    

In [25]:
# Demo instance used by the cells below. Mixed-case input is accepted
# because validation lower-cases the sequence before checking the alphabet.
my_dna = DNASequence('ataTTgaC')

In [26]:
my_dna

Your sequence: ataTTgaC

In [27]:
my_dna.reverse()

'CagTTata'

In [28]:
my_dna.transcribe()

'uauAAcuG'

In [29]:
my_dna.complement()

'tatAActG'

In [30]:
my_dna.reverse_complement()

'GtcAAtat'

In [4]:
def _normalize_bounds(bounds):
    """Return ``bounds`` as a ``(lower, upper)`` pair.

    A bare int/float is treated as the upper limit with a lower limit of 0.
    """
    if isinstance(bounds, (int, float)):
        return 0, bounds
    lower, upper = bounds
    return lower, upper


def filter_fastq(input_fastq, gc_bounds=(0, 100),
                 length_bounds=(0, 2**32),
                 quality_threshold=0, output_fastq='filtered_file'):
    """Filter FASTQ reads by GC content, length and mean quality.

    A read is kept when its length and GC percentage fall inside the given
    closed intervals and its mean Phred quality is strictly greater than
    ``quality_threshold``. Kept reads are appended, in FASTQ format, to
    ``filtered/<output_fastq>``. The ``filtered`` directory is created if
    needed, next to this module's file; when ``__file__`` is not defined
    (for example in a notebook) the current working directory is used.

    Args:
        input_fastq: path to the FASTQ file to be filtered.
        gc_bounds(tuple/int/float): interval for the GC % value,
        or the upper limit if int/float.
        length_bounds(tuple/int/float): interval for the length value,
        or the upper limit if int/float.
        quality_threshold(int, float): mean read quality a read
        must exceed to be kept.
        output_fastq: name of the output file inside ``filtered/``.
        Default name is filtered_file.
    """
    gc_min, gc_max = _normalize_bounds(gc_bounds)
    len_min, len_max = _normalize_bounds(length_bounds)

    try:
        base_dir = os.path.dirname(os.path.abspath(__file__))
    except NameError:
        # __file__ does not exist when the code runs interactively.
        base_dir = os.getcwd()
    out_dir = os.path.join(base_dir, 'filtered')
    os.makedirs(out_dir, exist_ok=True)

    def passing_records():
        """Yield the parsed records that satisfy all three filters."""
        for rec in SeqIO.parse(input_fastq, "fastq"):
            quals = rec.letter_annotations["phred_quality"]
            # A zero-length read has no qualities to average; score it 0
            # rather than dividing by zero.
            mean_qual = sum(quals) / len(quals) if quals else 0
            gc_content = gc_fraction(rec.seq) * 100
            if (len_min <= len(rec.seq) <= len_max
                    and mean_qual > quality_threshold
                    and gc_min <= gc_content <= gc_max):
                # Yield the parsed record itself so that everything it
                # carries, including its description, reaches the writer.
                yield rec

    # Append mode is kept from the original: repeated calls with the same
    # output name add to the existing file instead of replacing it.
    with open(os.path.join(out_dir, output_fastq), "a") as output_file:
        # The generator streams records, so the whole input is never held
        # in memory at once.
        SeqIO.write(passing_records(), output_file, "fastq")