In [27]:
import os
import pandas as pd
from Bio import SeqIO
import yaml
import numpy as np

# load in config with GPSC paths
with open("config/SP_reps.yaml", 'r') as file:
    config=yaml.safe_load(file)

# extract GPSCs
gpscs = config['samples'] 

# set params
amplicon_stats = list()
xlen = 2200

total_genome_coverages={}
amplicon_positions={}

with open('output.txt', 'w') as f:

    for gpsc, fasta_file in gpscs.items():
        records = list(SeqIO.parse(fasta_file, "fasta"))

        # calculate length of the genome for each GPSC
        genome_length=sum(len(record.seq) for record in records)

        # initialize amplicon genome coverage for each GPSC separately 
        total_genome_coverage=0

        # initialize set of covered positions across each GPSC 
        covered_positions = set()

        # intitialize list of amplicon positions 
        amplicon_positions[gpsc] = []

        print(f"Processing {gpsc}...", file=f)  

        # load the samtools depth file
        depth_file = os.path.join("samtools_depth", f"{gpsc}.depth")
        df = pd.read_csv(depth_file, sep="\t", names=["Ref", "Pos", "Depth"])

        # get the positions where the depth is 1 (primer binding sites)
        primer_binding_sites = df[df["Depth"] == 1]["Pos"].tolist()

        for p1loc in primer_binding_sites:
            # find the next primer binding site within xlen bases
            p2loc = next((pos for pos in primer_binding_sites if p1loc < pos <= p1loc + xlen), None)

            if p2loc is not None:
                # calculate the amplicon stats
                amplicon_stats.append((gpsc, gpsc, gpsc, gpsc, gpsc, gpsc, 0, 0, 0, p1loc, p2loc))
                covered_positions.update(range(p1loc, p2loc+1))
                # store amplicon positions 
                amplicon_positions[gpsc].append((p1loc, p2loc))
                print(f"Detected amplicon from {p1loc} to {p2loc}.", file=f)  

                # calculate the length of the amplicon and add to total genome coverage
                total_genome_coverage += p2loc-p1loc 

        # calculate predicted % coverage 
        coverage_percentage = (len(covered_positions) / genome_length) * 100

        # update dictionary 
        total_genome_coverages[gpsc] = coverage_percentage

        print(f"Total genome coverage for {gpsc}: {total_genome_coverages[gpsc]}%", file=f) 

colnames=["pid1","pid2","set1","set2",
          "pseq1","pseq2",
          "max_hdist","hdist1","hdist2",
          "p1loc","p2loc"]
coltypes=["<U30", "<U30", "<U30", "<U30",
          "<U30", "<U30",
          float, float, float,
          int, int]

dt = {'names':colnames, 'formats':coltypes}

amplicon_statsnp = np.array(amplicon_stats,
                     dtype=dt)

np.save("amplicon_statstab.npy", amplicon_statsnp)

In [28]:
for gpsc, coverage in total_genome_coverages.items():
    print(f"{gpsc}: {coverage}%")

GPSC15: 89.53183762018044%
GPSC17: 85.52519860790558%
GPSC22: 85.2485884074686%
GPSC25: 83.70355584570514%
GPSC26: 83.24475061663397%
GPSC32: 86.56638367738053%
GPSC34: 86.58638733761124%
GPSC37: 82.37399570281472%
GPSC3: 82.58450750499864%
GPSC4: 82.57514313707841%
GPSC5: 82.9434782199731%
GPSC8: 83.89724454176854%
JYGP01: 37.48834324179536%
NC_017592: 99.28831877584545%


In [30]:
from Bio import SeqIO

# Load the config file
with open("config/SP_reps.yaml", 'r') as file:
    config=yaml.safe_load(file)

# Extract the GPSCs
gpscs = config['samples']

# Initialize a dictionary to store the genome lengths
genome_lengths = {}

# Calculate and store the length of each sequence
for gpsc, fasta_file in gpscs.items():
    records = list(SeqIO.parse(fasta_file, "fasta"))
    genome_length = sum(len(record.seq) for record in records)
    genome_lengths[gpsc] = genome_length  # Store the genome length in the dictionary

#Print the genome lengths
for gpsc, length in genome_lengths.items():
    print(f"{gpsc}: {length} bp")

GPSC15: 1985764 bp
GPSC17: 2107041 bp
GPSC22: 2086119 bp
GPSC25: 2129170 bp
GPSC26: 2157682 bp
GPSC32: 2053788 bp
GPSC34: 2052251 bp
GPSC37: 2133955 bp
GPSC3: 2130580 bp
GPSC4: 2121917 bp
GPSC5: 2126260 bp
GPSC8: 2184682 bp
JYGP01: 1915198 bp
NC_017592: 2036867 bp


In [31]:
import pandas as pd

# initialize empty list
amplicon_data = []

# max allowed for size of amplicons
max_amplicon_size = 2000

# iterate over gpsc seqs
for gpsc, positions in amplicon_positions.items():
    # filter out none vals to fix none error
    filtered_positions = [pos for pos in positions if pos is not None]
    
    if filtered_positions:
        # sort start and end positions of amplicons
        filtered_positions.sort()
        start = filtered_positions[0][0]  
        end = filtered_positions[0][1]  

        for current_start, current_end in filtered_positions[1:]:
            # check if the current start is contiguous with the previous end and does not exceed max size
            if current_start <= end + 1 and (current_end - start) <= max_amplicon_size:
                end = max(end, current_end)  
            else:
                # if not contiguous or exceeds max size, save the previous contiguous sequence and start a new one
                amplicon_data.append({'Sequence': gpsc, 'Start': start, 'End': end})
                start, end = current_start, current_end  

        # save last contigious sequence after each loop
        amplicon_data.append({'Sequence': gpsc, 'Start': start, 'End': end})

# convert dict to df
amplicon_positions_df = pd.DataFrame(amplicon_data)

# export
amplicon_positions_df.to_csv('fwdandrev_amplicon_positions.csv', index=False)

In [32]:
import pandas as pd

# create csv file containing predicted genome coverage for each sequence based on predicted amplicon coverage
total_genome_coverages_df = pd.DataFrame([total_genome_coverages])

total_genome_coverages_df.to_csv('genome_coverage_pc.csv', index=False)

In [None]:
            ------------------------------------------------------ separate fwd and rev amplicon predictions --------------------------------------------------------------

In [33]:
import os
import pandas as pd
from Bio import SeqIO
import yaml
import numpy as np

# load in config with GPSC paths
with open("config/SP_reps.yaml", 'r') as file:
    config=yaml.safe_load(file)

# extract GPSCs
gpscs = config['samples'] 

# set params
amplicon_stats = list()
xlen = 2200

total_genome_coverages={}
amplicon_positions={}

with open('output.txt', 'w') as f:

    for gpsc, fasta_file in gpscs.items():
        records = list(SeqIO.parse(fasta_file, "fasta"))

        # calculate length of the genome for each GPSC
        genome_length=sum(len(record.seq) for record in records)

        # initialize amplicon genome coverage for each GPSC separately 
        total_genome_coverage=0

        # initialize set of covered positions across each GPSC 
        covered_positions = set()

        # intitialize list of amplicon positions 
        amplicon_positions[gpsc] = []

        print(f"Processing {gpsc}...", file=f)  

        # load the samtools depth file
        fwd_depth_file = os.path.join("samtools_depth_indiv_primers", f"{gpsc}_fwd.depth")
        rev_depth_file = os.path.join("samtools_depth_indiv_primers", f"{gpsc}_rev.depth")
        
        # load each depth file into a df
        fwd_df = pd.read_csv(fwd_depth_file, sep="\t", names=["Ref", "Pos", "Depth"])
        fwd_df['Primer'] = 'fwd'

        rev_df = pd.read_csv(rev_depth_file, sep="\t", names=["Ref", "Pos", "Depth"])
        rev_df['Primer'] = 'rev'

        # combine the dfs
        df = pd.concat([fwd_df, rev_df])

        # filter for fwd and rev positions separately 
        fwd_primer_binding_sites = df[(df["Depth"] == 1) & (df["Primer"] == 'fwd')]["Pos"].tolist()

        rev_primer_binding_sites = df[(df["Depth"] == 1) & (df["Primer"] == 'rev')]["Pos"].tolist()

        def calculate_amplicons(primer_binding_sites, gpsc, xlen, amplicon_positions, amplicon_stats, covered_positions, total_genome_coverage, f, primer_direction):
            for p1loc in primer_binding_sites:
        # find the next primer binding site within xlen bases
                p2loc = next((pos for pos in primer_binding_sites if p1loc < pos <= p1loc + xlen), None)

                if p2loc is not None:
            # amp stats
                    amplicon_stats.append((gpsc, gpsc, gpsc, gpsc, gpsc, gpsc, 0, 0, 0, p1loc, p2loc, primer_direction))
                    covered_positions.update(range(p1loc, p2loc+1))
            # ID amp positions
                    amplicon_positions[gpsc].append((p1loc, p2loc))
                    print(f"Detected amplicon from {p1loc} to {p2loc}.", file=f)  

            # get total genome coverage
                    total_genome_coverage += p2loc - p1loc

        # calculate predicted % coverage 
            coverage_percentage = (len(covered_positions) / genome_length) * 100

        # update dictionary 
            total_genome_coverages[gpsc] = coverage_percentage

        # summarize fwd amplicons
        calculate_amplicons(fwd_primer_binding_sites, gpsc, xlen, amplicon_positions, amplicon_stats, covered_positions, total_genome_coverage, f, 'fwd')
        # summarize rev amplicons
        calculate_amplicons(rev_primer_binding_sites, gpsc, xlen, amplicon_positions, amplicon_stats, covered_positions, total_genome_coverage, f, 'rev')

colnames = ["pid1", "pid2", "set1", "set2",
            "pseq1", "pseq2",
            "max_hdist", "hdist1", "hdist2",
            "p1loc", "p2loc", "PrimerDirection"]
coltypes = ["<U30", "<U30", "<U30", "<U30",
            "<U30", "<U30",
            float, float, float,
            int, int, "<U3"]

dt = {'names': colnames, 'formats': coltypes}

amplicon_statsnp = np.array(amplicon_stats,
                     dtype=dt)

np.save("amplicon_statstab.npy", amplicon_statsnp)

In [36]:
import csv

# Initialize a dictionary to group amplicons by sequence and primer type
grouped_amplicons = {}

# Define the maximum allowed size for amplicons
max_amplicon_size = 2000

# Iterate over amplicon_stats, assuming the last element is primer_direction, and the first is gpsc
for amplicon in amplicon_stats:
    *_, p1loc, p2loc, primer_direction = amplicon
    gpsc = amplicon[0]
    key = (gpsc, primer_direction)
    if key not in grouped_amplicons:
        grouped_amplicons[key] = []
    grouped_amplicons[key].append((p1loc, p2loc))

# Initialize list for contiguous amplicons
contiguous_amplicons = []

# Process each group for contiguity
for (gpsc, primer_direction), positions in grouped_amplicons.items():
    positions.sort()
    start, end = positions[0]
    
    for current_start, current_end in positions[1:]:
        # Check if contiguous and does not exceed max amplicon size
        if current_start <= end + 1 and (current_end - start) <= max_amplicon_size:
            end = max(end, current_end)
        else:
            contiguous_amplicons.append([gpsc, start, end, primer_direction])
            start, end = current_start, current_end
    # After processing all positions, append the last contiguous segment
    contiguous_amplicons.append([gpsc, start, end, primer_direction])

# Write contiguous amplicons to CSV
with open('fwdorrev_amplicon_positions.csv', 'w', newline='') as csvfile:
    csvwriter = csv.writer(csvfile)
    csvwriter.writerow(["Sequence", "Start", "End", "Primer Type"])
    for amplicon in contiguous_amplicons:
        csvwriter.writerow(amplicon)

In [37]:
import pandas as pd

# create csv file containing predicted genome coverage for each sequence based on predicted amplicon coverage
total_genome_coverages_df = pd.DataFrame([total_genome_coverages])

total_genome_coverages_df.to_csv('fwdorrev_genome_coverage_pc.csv', index=False)