In [None]:
from Bio import SeqIO
from Bio.Seq import Seq
import gzip
import matplotlib.pyplot as plt

In [None]:
# 1. Count sequences in a FASTQ file
def count_fastq_sequences(fastq_file):
    """Count the records in a FASTQ file.

    Args:
        fastq_file: Path string of the FASTQ file. When the name ends with
            ``.gz`` the file is opened through ``gzip`` in text mode,
            otherwise it is opened as a regular text file.

    Returns:
        The number of records yielded by ``SeqIO.parse`` for the file.
    """
    # Pick the opener up front so the ``with`` statement stays easy to read.
    if fastq_file.endswith(".gz"):
        source = gzip.open(fastq_file, "rt")
    else:
        source = open(fastq_file, "r")

    with source as handle:
        # Only the number of records matters, so the records are discarded.
        return sum(1 for _ in SeqIO.parse(handle, "fastq"))



In [None]:
# 2. Calculate GC content
def calculate_gc_content(fasta_file):
    """Compute the GC percentage of every record in a FASTA file.

    The sequence is upper-cased before counting, so soft-masked (lower-case)
    bases are included. Only the literal letters ``G`` and ``C`` are counted.

    Args:
        fasta_file: FASTA path or handle, passed straight to ``SeqIO.parse``.

    Returns:
        A list with one float per record, in file order, giving the GC
        content as a percentage (0-100). A zero-length record yields ``0.0``.
    """
    gc_contents = []
    for record in SeqIO.parse(fasta_file, "fasta"):
        sequence = str(record.seq).upper()
        if not sequence:
            # An empty record would divide by zero; report 0% so the result
            # still has exactly one entry per input record.
            gc_contents.append(0.0)
            continue
        gc_count = sequence.count("G") + sequence.count("C")
        gc_contents.append(gc_count / len(sequence) * 100)
    return gc_contents


In [None]:
# 3. Convert FASTQ to FASTA
def fastq_to_fasta(fastq_file, fasta_file):
    """Convert a FASTQ file to FASTA using ``SeqIO.convert``.

    Args:
        fastq_file: Input FASTQ path or handle, passed to ``SeqIO.convert``.
        fasta_file: Path of the FASTA file to write; it is opened in ``"w"``
            mode, so an existing file is overwritten.
    """
    with open(fasta_file, "w") as fasta_out:
        SeqIO.convert(
            in_file=fastq_file,
            in_format="fastq",
            out_file=fasta_out,
            out_format="fasta",
        )

In [None]:
# 4. Find longest sequence in FASTA
def find_longest_sequence(fasta_file):
    """Return the record with the longest sequence in a FASTA file.

    Args:
        fasta_file: FASTA path or handle, passed straight to ``SeqIO.parse``.

    Returns:
        The record whose ``seq`` is longest. When several records share the
        maximum length, the first one encountered is returned.

    Raises:
        ValueError: If the file yields no records (raised by ``max``).
    """
    def _sequence_length(record):
        return len(record.seq)

    records = SeqIO.parse(fasta_file, "fasta")
    return max(records, key=_sequence_length)

In [None]:
# 5. Calculate N50
def calculate_n50(fasta_file):
    """Compute the N50 of the sequences in a FASTA file.

    Sequence lengths are sorted from longest to shortest and accumulated; the
    N50 is the length at which the running total first reaches at least half
    of the summed length of all sequences.

    Args:
        fasta_file: FASTA path or handle, passed straight to ``SeqIO.parse``.

    Returns:
        The N50 length as an int, or ``None`` when the file has no records.
    """
    sorted_lengths = sorted(
        (len(rec.seq) for rec in SeqIO.parse(fasta_file, "fasta")),
        reverse=True,
    )
    half_total = sum(sorted_lengths) / 2

    running_total = 0
    for contig_length in sorted_lengths:
        running_total += contig_length
        if running_total >= half_total:
            return contig_length

    # Reached only when there were no records at all.
    return None

In [None]:
# 6. Filter low-quality reads
def filter_low_quality_reads(input_fastq, output_fastq, quality_threshold):
    """Write only the reads whose every base meets a minimum Phred quality.

    A read is kept when the lowest value in its ``phred_quality`` annotation
    is greater than or equal to ``quality_threshold``. Zero-length reads have
    no quality values to evaluate and are dropped rather than aborting the
    whole run.

    Args:
        input_fastq: Input FASTQ path or handle, passed to ``SeqIO.parse``.
        output_fastq: Path of the FASTQ file to write; opened in ``"w"`` mode,
            so an existing file is overwritten.
        quality_threshold: Minimum per-base Phred score a read must reach at
            every position to be kept.
    """
    with open(output_fastq, "w") as output_handle:
        for record in SeqIO.parse(input_fastq, "fastq"):
            qualities = record.letter_annotations["phred_quality"]
            # min() raises ValueError on an empty sequence, so guard against
            # zero-length reads (e.g. fully trimmed reads) before calling it.
            if len(qualities) > 0 and min(qualities) >= quality_threshold:
                SeqIO.write(record, output_handle, "fastq")


In [None]:
# 7. Compare two FASTA files
def compare_fasta_files(fasta1, fasta2):
    """Find the sequences that occur in both FASTA files.

    Sequences are compared as exact strings (case-sensitive); record
    identifiers and descriptions are not considered.

    Args:
        fasta1: First FASTA path or handle, passed to ``SeqIO.parse``.
        fasta2: Second FASTA path or handle, passed to ``SeqIO.parse``.

    Returns:
        A set of sequence strings present in both files.
    """
    def _unique_sequences(source):
        return {str(rec.seq) for rec in SeqIO.parse(source, "fasta")}

    first = _unique_sequences(fasta1)
    second = _unique_sequences(fasta2)
    return first & second



In [None]:
# 8. Reverse complement
def reverse_complement(sequence):
    """Return the reverse complement of a sequence as a plain string.

    Args:
        sequence: The sequence to convert; it is wrapped in ``Seq``.

    Returns:
        ``str`` of the result of ``Seq(sequence).reverse_complement()``.
    """
    complemented = Seq(sequence).reverse_complement()
    return str(complemented)


In [None]:
# 9. Split FASTA file
def split_fasta(input_fasta, output_prefix, sequences_per_file):
    """Split a FASTA file into numbered files of a fixed number of records.

    Output files are named ``{output_prefix}_1.fasta``,
    ``{output_prefix}_2.fasta``, ... and each holds up to
    ``sequences_per_file`` records in input order; the last file may hold
    fewer. Records are streamed, so only one batch is held in memory at a
    time. No file is created when the input has no records.

    Args:
        input_fasta: Input FASTA path or handle, passed to ``SeqIO.parse``.
        output_prefix: Prefix (may include a directory) for the output files.
        sequences_per_file: Maximum number of records per output file; must
            be at least 1.

    Raises:
        ValueError: If ``sequences_per_file`` is less than 1.
    """
    if sequences_per_file < 1:
        raise ValueError(
            f"sequences_per_file must be at least 1, got {sequences_per_file!r}"
        )

    def _write_batch(batch, file_number):
        # Write one batch to its numbered output file.
        with open(f"{output_prefix}_{file_number}.fasta", "w") as output_handle:
            SeqIO.write(batch, output_handle, "fasta")

    batch = []
    files_written = 0
    for record in SeqIO.parse(input_fasta, "fasta"):
        batch.append(record)
        if len(batch) >= sequences_per_file:
            files_written += 1
            _write_batch(batch, files_written)
            batch = []

    # Flush the final, partially filled batch (if any).
    if batch:
        files_written += 1
        _write_batch(batch, files_written)



In [None]:
# 10. Calculate read length distribution
def read_length_distribution(fastq_file):
    """Plot a histogram of read lengths for a FASTQ file.

    The histogram is drawn on a freshly created figure, so it never lands on
    a figure left over from earlier plotting calls.

    Args:
        fastq_file: FASTQ path or handle, passed straight to ``SeqIO.parse``.
    """
    lengths = [len(record.seq) for record in SeqIO.parse(fastq_file, "fastq")]

    # Explicit Figure/Axes instead of the implicit pyplot "current figure".
    _fig, ax = plt.subplots()
    ax.hist(lengths, bins=50)
    ax.set_title("Read Length Distribution")
    ax.set_xlabel("Read Length")
    ax.set_ylabel("Frequency")
    plt.show()



In [None]:
# Example usage:
# count_fastq_sequences("example.fastq")
# calculate_gc_content("example.fasta")
# fastq_to_fasta("input.fastq", "output.fasta")
# longest_seq = find_longest_sequence("example.fasta")
# n50 = calculate_n50("example.fasta")
# filter_low_quality_reads("input.fastq", "output.fastq", 20)
# common_sequences = compare_fasta_files("file1.fasta", "file2.fasta")
# rev_comp = reverse_complement("ATGCATGC")
# split_fasta("large_file.fasta", "output", 1000)
# read_length_distribution("example.fastq")