In [1]:
import os

from Bio import SeqIO
from Bio.Seq import Seq

In [2]:
class Trim:
    """
    Cut fixed-length windows out of paired FASTQ reads, each window starting at
    the position where a barcode is found, and write the windows to FASTA files.

    Forward reads are searched for the barcodes as given in the barcode FASTA.
    Reverse reads are first reversed (not complemented) and searched for the
    complement of each barcode.
    """

    def __init__(self, forward_fastq, reverse_fastq, barcode_fasta, length, output_dir, output_forward_name, output_reverse_name):
        """
        Store the input paths, the window length and the output locations.

        :param forward_fastq: path of the forward-read FASTQ file
        :param reverse_fastq: path of the reverse-read FASTQ file
        :param barcode_fasta: path of a FASTA file containing the barcodes
        :param length: number of characters to keep from each read, counted from
            the first character of the matched barcode
        :param output_dir: directory in which the output files are created
            (with or without a trailing path separator)
        :param output_forward_name: file name for the trimmed forward reads
        :param output_reverse_name: file name for the trimmed reverse reads
        """
        self.forward_fastq = forward_fastq
        self.reverse_fastq = reverse_fastq
        self.barcode_fasta = barcode_fasta
        self.length = length
        self.output_dir = output_dir
        self.output_forward_name = output_forward_name
        self.output_reverse_name = output_reverse_name

    def forward_fastq_reader(self):
        """
        Parse the forward FASTQ file.

        :return: list with one sequence string per record, in file order
        """
        return [str(record.seq) for record in SeqIO.parse(self.forward_fastq, "fastq")]

    def reverse_fastq_reader(self):
        """
        Parse the reverse FASTQ file and reverse every sequence.

        The sequences are only reversed (character order flipped); they are
        NOT complemented.

        :return: list with one reversed sequence string per record, in file order
        """
        return [str(record.seq)[::-1] for record in SeqIO.parse(self.reverse_fastq, "fastq")]

    def forward_barcode_reader(self):
        """
        Parse the barcode FASTA file.

        :return: list with one barcode string per record, in file order
        """
        return [str(record.seq) for record in SeqIO.parse(self.barcode_fasta, "fasta")]

    def reverse_barcode_reader(self):
        """
        Build the barcodes used to search the reversed reverse reads.

        Each barcode is reverse-complemented and the resulting string is then
        reversed again, so the net result is the complement of the barcode in
        its original left-to-right order.

        :return: list of complemented barcode strings, in file order
        """
        return [
            str(Seq(barcode).reverse_complement())[::-1]
            for barcode in self.forward_barcode_reader()
        ]

    def trimmer(self, complement):
        """
        Extract a window of ``self.length`` characters from every read that
        contains a barcode, starting at the first occurrence of that barcode.

        A read that contains several different barcodes contributes one window
        per matching barcode. A window may be shorter than ``self.length`` when
        the barcode lies close to the end of the read.

        :param complement: if true, work on the reversed reverse reads with the
            complemented barcodes; otherwise on the forward reads with the
            barcodes as given
        :return: list of extracted window strings
        """
        if complement:
            reads = self.reverse_fastq_reader()
            barcodes = self.reverse_barcode_reader()
        else:
            reads = self.forward_fastq_reader()
            barcodes = self.forward_barcode_reader()

        window = self.length
        trimmed = []
        for seq in reads:
            for barcode in barcodes:
                start = seq.find(barcode)
                if start != -1:
                    trimmed.append(seq[start:start + window])
        return trimmed

    def fasta_writer(self, complement):
        """
        Trim the reads and write the windows to a FASTA file.

        Records are named ``line_1``, ``line_2``, ... in output order. The file
        is written without a trailing newline.

        :param complement: if true, process the reverse reads and write to
            ``output_reverse_name``; otherwise process the forward reads and
            write to ``output_forward_name``
        """
        if complement:
            output_name = self.output_reverse_name
        else:
            output_name = self.output_forward_name

        # Trim before touching the file system so a parsing failure does not
        # leave an empty output file behind.
        trimmed = self.trimmer(complement)

        # os.path.join places the file inside output_dir whether or not the
        # directory was given with a trailing separator.
        path_file = os.path.join(self.output_dir, output_name)

        out = "\n".join(
            ">line_{}\n{}".format(number, seq)
            for number, seq in enumerate(trimmed, start=1)
        )
        # The context manager closes the handle even if the write fails.
        with open(path_file, "w") as handle:
            handle.write(out)

    def __call__(self):
        """
        Write both output files: the reverse-read FASTA first, then the
        forward-read FASTA.
        """
        self.fasta_writer(True)
        self.fasta_writer(False)

In [3]:
# Every input and output of this run lives in one directory. Keep the trailing
# slash: the file names below are appended by plain string concatenation.
DATA_DIR = "/mnt/c/Users/pc/Downloads/barcod/qc_processed_barcode_1/"

args = {
    'forward_fastq': DATA_DIR + "N70_r1_qc_processed.fastq",
    'reverse_fastq': DATA_DIR + "N70_r2_qc_processed.fastq",
    'barcode_fasta': DATA_DIR + "b14.fasta",
    'output_dir': DATA_DIR,
    'output_forward_name': 'trimmed_forward.fasta',
    'output_reverse_name': 'trimmed_reverse.fasta',
    # Number of characters kept from each read, counted from the barcode start.
    'length' : 48
    }

result = Trim(**args)

In [4]:
# Run both passes: writes the reverse-read FASTA first, then the forward-read FASTA.
result()

In [5]:
# NOTE(review): hard-coded number with no visible source; presumably the read
# count after Trimmomatic QC — TODO confirm and record where it came from.
trimmonic = 1739877

# NOTE(review): presumably the number of forward reads in which a barcode was found — TODO confirm.
forward_read = 1567482

# NOTE(review): presumably the same count for the reverse reads — TODO confirm.
reverse_read = 1502458

In [6]:
from collections import Counter 

def count_legth_reads(list):
    """
    Tally how many reads have each length.

    :param list: the reads to measure (any iterable of items supporting len())
    :return: A list of (read_length, number_of_reads) tuples, ordered from the most
    frequent length to the least frequent one.
    """
    length_counts = Counter(len(read) for read in list)

    # most_common() without an argument returns every (length, count) pair,
    # highest count first, with ties kept in first-seen order.
    return length_counts.most_common()

# Re-parses the whole forward FASTQ file and tallies read lengths, most frequent first.
count_forwad = count_legth_reads(result.forward_fastq_reader())

# Show the 25 most common read lengths as (length, number_of_reads) pairs.
count_forwad[:25]

[(147, 1570975),
 (146, 99795),
 (151, 20185),
 (145, 12717),
 (148, 11352),
 (150, 5719),
 (143, 2986),
 (149, 2914),
 (144, 2574),
 (54, 1776),
 (141, 1372),
 (139, 799),
 (142, 756),
 (135, 615),
 (133, 560),
 (137, 537),
 (132, 425),
 (131, 351),
 (114, 313),
 (136, 311),
 (134, 296),
 (128, 273),
 (138, 260),
 (124, 237),
 (127, 236)]