In [1]:
from Bio import SeqIO
import pandas as pd
import timeit

In [2]:
def count_kmers_py(fasta_file, k=4):
    """Count every overlapping k-mer across all records of a FASTA file.

    Each record's sequence is scanned with a sliding window of width ``k``
    and step 1, and occurrences are accumulated over all records. k-mers are
    the exact substrings found in the sequences: no case folding or alphabet
    filtering is applied, so e.g. ``"acgt"`` and ``"ACGT"`` are distinct keys.

    Parameters
    ----------
    fasta_file
        Handed unchanged to ``SeqIO.parse(fasta_file, "fasta")`` (for
        example a path string).
    k : int, default 4
        Window width; must be at least 1.

    Returns
    -------
    pandas.DataFrame
        Two columns, ``"kmer"`` and ``"count"``, with one row per distinct
        k-mer in order of first appearance. Records shorter than ``k``
        contribute no rows.

    Raises
    ------
    ValueError
        If ``k`` is smaller than 1. The check runs before the file is read.
    """
    # k == 0 would count the empty string and k < 0 would slice nonsense;
    # reject both up front instead of returning a misleading table.
    if k < 1:
        raise ValueError(f"k must be a positive integer, got {k!r}")

    counts = {}
    # Consume the parser's records one by one rather than materialising
    # them all in a list first.
    for rec in SeqIO.parse(fasta_file, "fasta"):
        seq = str(rec.seq)
        # range() is empty when the sequence is shorter than k.
        for i in range(len(seq) - k + 1):
            kmer = seq[i:i + k]
            counts[kmer] = counts.get(kmer, 0) + 1

    # dict preserves insertion order, so rows follow first appearance.
    return pd.DataFrame(list(counts.items()), columns=["kmer", "count"])

In [3]:
# Benchmarking: time five independent runs, each a single call of the
# pure-Python k-mer counter on the sample FASTA file.
stmt = "count_kmers_py('sequence.fasta', k=4)"
# The timed statement is compiled in timeit's own namespace, so the
# function has to be imported from the notebook's __main__ module.
setup = "from __main__ import count_kmers_py"
times = timeit.Timer(stmt=stmt, setup=setup).repeat(repeat=5, number=1)
print("Python timings (s):", times)

Python timings (s): [0.040463400073349476, 0.00954600004479289, 0.019007799914106727, 0.021897200029343367, 0.023039599880576134]
