In [None]:
import os
import pandas as pd
from Bio import SeqIO
import yaml
import numpy as np

# Predict amplicons and genome coverage for every GPSC listed in the config.
#
# Inputs:
#   config/SP_reps.yaml          -> config['samples'] maps GPSC name -> FASTA path
#   samtools_depth/<gpsc>.depth  -> headerless, tab-separated Ref / Pos / Depth table
# Outputs:
#   output.txt                   -> processing log
#   amplicon_statstab.npy        -> structured array with one row per amplicon
#   amplicon_positions           -> {gpsc: [(p1loc, p2loc), ...]}
#   total_genome_coverages       -> {gpsc: predicted % of genome covered}
with open("config/SP_reps.yaml", 'r') as file:
    config = yaml.safe_load(file)

# extract GPSCs
gpscs = config['samples']

# set params
amplicon_stats = list()
# maximum distance between two primer binding sites that still counts as an amplicon
xlen = 2200

total_genome_coverages = {}
amplicon_positions = {}

with open('output.txt', 'w') as f:

    for gpsc, fasta_file in gpscs.items():
        records = list(SeqIO.parse(fasta_file, "fasta"))

        # genome length is the summed length of every record in the FASTA
        genome_length = sum(len(record.seq) for record in records)
        if genome_length == 0:
            # Fail early with a clear message instead of a ZeroDivisionError
            # after all the work below has been done.
            raise ValueError(
                f"No sequence data found in {fasta_file!r} for {gpsc}; "
                "cannot compute genome coverage."
            )

        # summed amplicon lengths for this GPSC (overlaps counted twice; the
        # reported percentage uses the de-duplicated base count instead)
        total_genome_coverage = 0

        # number of distinct covered bases, accumulated contig by contig
        covered_bases = 0

        # list of (p1loc, p2loc) amplicon coordinates for this GPSC
        amplicon_positions[gpsc] = []

        print(f"Processing {gpsc}...", file=f)

        # load the samtools depth file
        depth_file = os.path.join("samtools_depth", f"{gpsc}.depth")
        df = pd.read_csv(depth_file, sep="\t", names=["Ref", "Pos", "Depth"])

        # rows where the depth is exactly 1 are treated as primer binding sites
        primer_binding_sites = df[df["Depth"] == 1]

        # Positions are only meaningful within their own reference sequence, so
        # sites are paired (and coverage is counted) per contig. Mixing contigs
        # would pair sites that lie on different sequences and would collapse
        # equal coordinates from different contigs into one covered base.
        # dropna=False keeps rows whose Ref was parsed as missing.
        for _, contig_sites in primer_binding_sites.groupby("Ref", sort=False, dropna=False)["Pos"]:
            site_positions = sorted(set(contig_sites.tolist()))
            covered_positions = set()

            # In a sorted list the nearest downstream site is simply the next
            # element, so one pass over adjacent pairs replaces a full rescan
            # of the site list for every site.
            for p1loc, p2loc in zip(site_positions, site_positions[1:]):
                if p2loc - p1loc > xlen:
                    continue

                # only gpsc and the two coordinates are known here; the
                # remaining fields of the table are filled with placeholders
                amplicon_stats.append((gpsc, gpsc, gpsc, gpsc, gpsc, gpsc, 0, 0, 0, p1loc, p2loc))
                covered_positions.update(range(p1loc, p2loc + 1))
                # store amplicon positions
                amplicon_positions[gpsc].append((p1loc, p2loc))
                print(f"Detected amplicon from {p1loc} to {p2loc}.", file=f)

                total_genome_coverage += p2loc - p1loc

            covered_bases += len(covered_positions)

        # calculate predicted % coverage
        coverage_percentage = (covered_bases / genome_length) * 100

        # update dictionary
        total_genome_coverages[gpsc] = coverage_percentage

        print(f"Total genome coverage for {gpsc}: {total_genome_coverages[gpsc]}%", file=f)

# Layout of the structured array written to disk.
# NOTE: "<U30" fields hold at most 30 characters; longer GPSC names are truncated.
colnames = ["pid1", "pid2", "set1", "set2",
            "pseq1", "pseq2",
            "max_hdist", "hdist1", "hdist2",
            "p1loc", "p2loc"]
coltypes = ["<U30", "<U30", "<U30", "<U30",
            "<U30", "<U30",
            float, float, float,
            int, int]

dt = {'names': colnames, 'formats': coltypes}

amplicon_statsnp = np.array(amplicon_stats,
                            dtype=dt)

np.save("amplicon_statstab.npy", amplicon_statsnp)

In [None]:
import pandas as pd

def _merge_contiguous_amplicons(positions, max_size):
    """Merge overlapping or directly adjacent amplicons into longer stretches.

    Parameters
    ----------
    positions : iterable of (start, end) tuples
        Amplicon coordinates; ``None`` entries are ignored.
    max_size : int
        An amplicon is merged into the current stretch only while
        ``end - stretch_start`` stays at or below this value.

    Returns
    -------
    list of (start, end) tuples
        Merged stretches in ascending order; empty if there is nothing to merge.
    """
    ordered = sorted(pos for pos in positions if pos is not None)
    if not ordered:
        return []

    merged = []
    start, end = ordered[0]
    for current_start, current_end in ordered[1:]:
        # extend the stretch only if the amplicon touches/overlaps it and the
        # resulting stretch would not exceed the size limit
        if current_start <= end + 1 and (current_end - start) <= max_size:
            end = max(end, current_end)
        else:
            # otherwise close the current stretch and start a new one
            merged.append((start, end))
            start, end = current_start, current_end

    # the last open stretch is never closed inside the loop
    merged.append((start, end))
    return merged


# max allowed for size of amplicons
max_amplicon_size = 2000

# one record per merged stretch, for every GPSC
amplicon_data = [
    {'Sequence': gpsc, 'Start': start, 'End': end}
    for gpsc, positions in amplicon_positions.items()
    for start, end in _merge_contiguous_amplicons(positions, max_amplicon_size)
]

# Explicit columns keep the CSV header present even when no amplicon was found
# (an empty record list would otherwise produce a frame with no columns).
amplicon_positions_df = pd.DataFrame(amplicon_data, columns=['Sequence', 'Start', 'End'])

# export
amplicon_positions_df.to_csv('amplicon_positions.csv', index=False)

In [None]:
import pandas as pd

# Write the predicted genome coverage percentages to CSV as a single row,
# with one column per sequence in total_genome_coverages.
coverage_rows = [total_genome_coverages]
total_genome_coverages_df = pd.DataFrame(coverage_rows)

total_genome_coverages_df.to_csv(
    'genome_coverage_pc.csv',
    index=False,
)