# Try to implement Bayes-based variant calling

- For Parent first
- Check for more mutations later

In [None]:
# Import
import sys
sys.path.append("/home/emre/github_repo/MinION")
from minION.util import IO_processor
from minION import analyser
from minION import consensus

import importlib
importlib.reload(analyser)
importlib.reload(consensus)
importlib.reload(IO_processor)
from pathlib import Path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import glob
import numpy as np
from Bio import SeqIO
import matplotlib.pyplot as plt
import gzip
import math
import re
import pickle
import itertools
import pysam
import subprocess

In [None]:
def get_bases_from_pileup_simulation(bam_file, chrom, positions):
    """Tabulate the base every read shows at the requested positions.

    Quality-free variant of the pileup extraction (no quality table is built).

    Args:
        bam_file: Path of the BAM file, opened through pysam.
        chrom: Name of the reference/contig to pile up.
        positions: Reference positions to report (compared against the
            pileup column position + 1).

    Returns:
        DataFrame indexed by sorted read name with one column per position.
        A deleted base is stored as "-", and cells for which a read has no
        entry are filled with "-" as well.
    """
    per_position = {position: {} for position in positions}

    with pysam.AlignmentFile(bam_file, 'rb') as bam:
        pileup_columns = bam.pileup(chrom, min(positions) - 1, max(positions) + 1,
                                    min_base_quality=0,
                                    min_mapping_quality=0,
                                    truncate=True)
        for column in pileup_columns:
            # Shift the pileup column coordinate by one to match `positions`
            pos = column.pos + 1
            if pos not in positions:
                continue

            seen = per_position[pos]
            for read in column.pileups:
                if read.is_del:
                    base = '-'  # Symbol to represent a deletion
                elif read.is_refskip:
                    continue
                else:
                    base = read.alignment.query_sequence[read.query_position]

                # Only the first entry per read name is kept
                seen.setdefault(read.alignment.query_name, base)

    # Every read seen at any of the positions, in sorted order
    read_names = sorted({name for reads in per_position.values() for name in reads})

    df_bases = pd.DataFrame(index=read_names, columns=positions)
    for pos in positions:
        for name, base in per_position[pos].items():
            df_bases.at[name, pos] = base

    # Cells without an entry are reported like deletions
    return df_bases.fillna("-")



def get_bases_from_pileup(bam_file, chrom, positions):
    """Tabulate base and base quality of every read at the requested positions.

    Args:
        bam_file: Path of the BAM file, opened through pysam.
        chrom: Name of the reference/contig to pile up.
        positions: Reference positions to report (compared against the
            pileup column position + 1).

    Returns:
        Tuple ``(df_bases, df_qualities)``; both are indexed by sorted read
        name with one column per position. A deleted base is stored as "-"
        with quality 0. Cells for which a read has no entry are filled with
        "-" in the base table and with 10 in the quality table.
    """
    bases_by_pos = {position: {} for position in positions}
    quals_by_pos = {position: {} for position in positions}

    with pysam.AlignmentFile(bam_file, 'rb') as bam:
        pileup_columns = bam.pileup(chrom, min(positions) - 1, max(positions) + 1,
                                    min_base_quality=0,
                                    min_mapping_quality=0,
                                    truncate=True)
        for column in pileup_columns:
            # Shift the pileup column coordinate by one to match `positions`
            pos = column.pos + 1
            if pos not in positions:
                continue

            for read in column.pileups:
                if read.is_del:
                    # Deletions get a placeholder base and a default quality
                    base, quality = '-', 0
                elif read.is_refskip:
                    continue
                else:
                    aligned = read.alignment
                    base = aligned.query_sequence[read.query_position]
                    quality = aligned.query_qualities[read.query_position]

                # Only the first entry per read name is kept
                name = read.alignment.query_name
                if name not in bases_by_pos[pos]:
                    bases_by_pos[pos][name] = base
                    quals_by_pos[pos][name] = quality

    # Every read seen at any of the positions, in sorted order
    read_names = sorted({name for reads in bases_by_pos.values() for name in reads})

    df_bases = pd.DataFrame(index=read_names, columns=positions)
    df_qualities = pd.DataFrame(index=read_names, columns=positions)

    for pos in positions:
        for name in bases_by_pos[pos]:
            df_bases.at[name, pos] = bases_by_pos[pos][name]
            df_qualities.at[name, pos] = quals_by_pos[pos][name]

    # Missing bases become "-"; missing qualities become 10
    # (10 is the lowest quality filter we used for filtering)
    df_bases = df_bases.fillna("-")
    df_qualities = df_qualities.fillna(10)

    return df_bases, df_qualities


def get_soft_pop_frequency(bam_file, template, reference, nb_positions, min_depth = 5, min_freq = 0.4):
    """Count how often each population (variant name) occurs among the reads.

    Args:
        bam_file: Path of the BAM file to pile up.
        template: Template sequence forwarded to ``get_variant_name``.
        reference: Reference/contig name used for the pileup.
        nb_positions: Positions to extract from the pileup.
        min_depth: A population is kept only if it has more reads than this.
        min_freq: A population is kept only if its read fraction exceeds this.

    Returns:
        DataFrame with the columns ``Population``, ``N_reads`` and
        ``Frequency``, restricted to populations passing both filters.
    """
    # TODO: choose the minimum depth based on the alphabet size
    # TODO: also check for variants by sampling random positions

    # get_bases_from_pileup returns (bases, qualities); only the bases are used
    bases_df, _ = get_bases_from_pileup(bam_file, reference, nb_positions)

    # NOTE(review): get_variant_name is not defined in the visible part of
    # this file - confirm where it comes from.
    populations = bases_df.apply(get_variant_name, axis=1, args=(template, nb_positions))
    frequency_df = populations.value_counts().reset_index()
    frequency_df.columns = ['Population', 'N_reads']
    frequency_df["Frequency"] = frequency_df["N_reads"] / frequency_df["N_reads"].sum()

    # Keep only populations that are both frequent and deep enough
    keep = (frequency_df["Frequency"] > min_freq) & (frequency_df["N_reads"] > min_depth)
    return frequency_df[keep]


def add_neighbouring_positions(positions, nb_neighbours, max_index):
    """Expand positions by ``nb_neighbours`` on each side, clipped to 1..max_index.

    NOTE: this file defines ``add_neighbouring_positions`` a second time
    further down; when the whole file is executed, that later definition is
    the one left in effect.

    Args:
        positions: A single integer position (Python or numpy), an iterable
            of positions, or a missing value (None / NaN).
        nb_neighbours: Number of neighbours to add on each side.
        max_index: Largest valid (1-based) position.

    Returns:
        Sorted list of unique positions, or the input itself when it is
        None or NaN.
    """
    # Missing input is passed through unchanged
    if positions is None or (isinstance(positions, float) and np.isnan(positions)):
        return positions

    # numpy integer scalars (e.g. from np.random.choice) are scalars too
    if isinstance(positions, (int, np.integer)):
        positions = [positions]

    expanded = set()
    for position in positions:
        first = max(1, position - nb_neighbours)
        last = min(max_index, position + nb_neighbours)
        expanded.update(range(first, last + 1))
    return sorted(expanded)

def calculate_mean_quality_for_reads(bases_df, qual_df, nb_positions, nb_neighbours):
    """Average each read's quality over a window around every position.

    For each position in ``nb_positions`` present in both tables, the
    qualities at the position and its ``nb_neighbours`` neighbours on each
    side are averaged per read. Cells whose base is "-" or whose quality is
    NaN are left out; a read with no usable cell gets no value for that
    position.

    Args:
        bases_df: Bases per read (rows) and position (columns).
        qual_df: Qualities with the same layout.
        nb_positions: One position (int) or a list of positions.
        nb_neighbours: Window half-width.

    Returns:
        Tuple of ``bases_df`` restricted to ``nb_positions`` and a DataFrame
        of mean qualities (reads x positions).
    """
    if isinstance(nb_positions, int):
        nb_positions = [nb_positions]

    per_read_means = {}

    for centre in nb_positions:
        # Positions absent from either table are ignored
        if centre not in bases_df.columns or centre not in qual_df.columns:
            continue

        # Window members that fall outside the base table are dropped
        window = [
            position
            for position in range(centre - nb_neighbours, centre + nb_neighbours + 1)
            if position in bases_df.columns
        ]

        for read in qual_df.index:
            scores = []
            for position in window:
                score = qual_df.at[read, position]
                if bases_df.at[read, position] != "-" and not pd.isna(score):
                    scores.append(score)

            if scores:
                per_read_means.setdefault(read, {})[centre] = sum(scores) / len(scores)

    mean_quality_df = pd.DataFrame.from_dict(per_read_means, orient='index')
    updated_base_df = bases_df[nb_positions]

    return updated_base_df, mean_quality_df

def add_neighbouring_positions(positions, nb_neighbours, max_index):
    """Expand positions by ``nb_neighbours`` on each side, clipped to 1..max_index.

    NOTE: an earlier definition with the same name exists in this file; this
    later one replaces it when the whole file is executed.

    Args:
        positions: A single integer position (Python or numpy), an iterable
            of positions, or a missing value (None / NaN).
        nb_neighbours: Number of neighbours to add on each side.
        max_index: Largest valid (1-based) position.

    Returns:
        Sorted list of unique positions, or the input itself when it is
        None or NaN.
    """
    # Missing input is passed through unchanged
    if positions is None or (isinstance(positions, float) and np.isnan(positions)):
        return positions

    # numpy integer scalars (e.g. from np.random.choice) are scalars too
    if isinstance(positions, (int, np.integer)):
        positions = [positions]

    expanded = set()
    for position in positions:
        first = max(1, position - nb_neighbours)
        last = min(max_index, position + nb_neighbours)
        expanded.update(range(first, last + 1))
    return sorted(expanded)

def calculate_mean_quality_for_reads(bases_df, qual_df, nb_positions, nb_neighbours):
    """Compute, per read, the mean quality in a window around each position.

    A window spans ``nb_neighbours`` columns on either side of a position.
    Only positions present in both tables are processed. Within a window,
    cells with base "-" or a NaN quality do not contribute, and a read
    without any contributing cell receives no entry for that position.

    Args:
        bases_df: Bases per read (rows) and position (columns).
        qual_df: Qualities with the same layout.
        nb_positions: One position (int) or a list of positions.
        nb_neighbours: Window half-width.

    Returns:
        ``(updated_base_df, mean_quality_df)``: the base table restricted to
        ``nb_positions`` and the table of window means (reads x positions).
    """
    if isinstance(nb_positions, int):
        nb_positions = [nb_positions]

    window_means = {}

    for target in nb_positions:
        known_everywhere = target in bases_df.columns and target in qual_df.columns
        if not known_everywhere:
            continue

        # Keep only window columns that exist in the base table
        usable_columns = [
            column
            for column in range(target - nb_neighbours, target + nb_neighbours + 1)
            if column in bases_df.columns
        ]

        for read in qual_df.index:
            running_total = 0
            n_used = 0
            for column in usable_columns:
                called_base = bases_df.at[read, column]
                score = qual_df.at[read, column]
                if called_base == "-" or pd.isna(score):
                    continue
                running_total += score
                n_used += 1

            if n_used:
                window_means.setdefault(read, {})[target] = running_total / n_used

    mean_quality_df = pd.DataFrame.from_dict(window_means, orient='index')
    updated_base_df = bases_df[nb_positions]

    return updated_base_df, mean_quality_df

def get_non_error_prop(quality_score):
    """Return the non-error probability ``1 - 10 ** (-Q / 10)`` for a quality score Q."""
    error_probability = 10 ** (-quality_score / 10)
    return 1 - error_probability

def get_softmax_count_df(bases_df, qual_df, nb_positions):
    """Build quality-weighted, normalised base frequencies per position.

    For each position, every read contributes its non-error probability
    (from ``get_non_error_prop``) to the base it shows. The per-base sums
    are then divided by the column total. Despite the name, this is a plain
    normalisation, not an exponential softmax.

    Args:
        bases_df: Bases per read (rows) and position (columns).
        qual_df: Qualities with the same layout.
        nb_positions: Positions (columns) to evaluate.

    Returns:
        DataFrame indexed by "A", "C", "T", "G", "-" with one column per
        position; each column sums to 1 (NaN if the column total is 0).
    """
    alphabet = "ACTG-"
    soft_counts = {}

    for position in nb_positions:
        # The quality conversion does not depend on the base, so do it once
        non_error = qual_df[position].apply(get_non_error_prop)
        observed = bases_df[position]
        soft_counts[position] = [
            sum((observed == base) * non_error) for base in alphabet
        ]

    softmax_count_df = pd.DataFrame(soft_counts, columns=nb_positions, index=list(alphabet))

    # Normalise each column (position) so the base weights sum to 1
    softmax_count_df = softmax_count_df.apply(lambda x: x / x.sum(), axis=0)

    return softmax_count_df

def get_softmax_count_df_Simulation(bases_df, qual_df, nb_positions):
    """Build normalised base frequencies per position with a fixed read weight.

    Every read contributes a constant weight of 0.99 to the base it shows;
    ``qual_df`` is accepted but not used. The per-base sums are divided by
    the column total.

    Args:
        bases_df: Bases per read (rows) and position (columns).
        qual_df: Unused.
        nb_positions: Positions (columns) to evaluate.

    Returns:
        DataFrame indexed by "A", "C", "T", "G", "-" with one column per
        position.
    """
    alphabet = "ACTG-"

    weighted_counts = {
        position: [sum((bases_df[position] == base) * 0.99) for base in alphabet]
        for position in nb_positions
    }

    count_table = pd.DataFrame(weighted_counts, columns=nb_positions, index=list(alphabet))

    # Scale every column by its own total
    column_totals = count_table.sum(axis=0)
    return count_table.div(column_totals, axis="columns")
    
def get_softmax(soft_count):
    """Normalise a mapping of soft counts so that its values sum to 1.

    Each value is divided by the total of all values; keys are preserved.
    """
    denominator = sum(soft_count.values())
    normalised = {}
    for base, count in soft_count.items():
        normalised[base] = count / denominator
    return normalised

def call_potential_populations(softmax_df, ref_seq, min_secondary_freq=0.1):
    """Enumerate candidate variants from per-position base frequencies.

    For each position (column) the most frequent base is always a
    candidate; the second most frequent base is added when its frequency is
    at least ``min_secondary_freq``. Every combination of candidates across
    positions becomes one variant, scored by the product of the chosen
    base frequencies.

    Args:
        softmax_df: Base frequencies, indexed by base with one column per
            1-based position.
        ref_seq: Reference sequence; ``ref_seq[pos - 1]`` is the reference
            base at position ``pos``.
        min_secondary_freq: Minimum frequency for the second base to be
            considered at a position.

    Returns:
        Dict with the parallel lists ``"Variant"`` (names such as
        ``"A12C_G30DEL"``, or ``"#PARENT#"`` when nothing differs from the
        reference) and ``"Probability"``.
    """
    positions = softmax_df.columns

    # Candidate bases per position: the top base, plus the runner-up if frequent enough
    candidates_per_position = []
    for position in positions:
        top_two = softmax_df[position].nlargest(2)
        if top_two.iloc[1] < min_secondary_freq:
            candidates_per_position.append([top_two.index[0]])
        else:
            candidates_per_position.append(top_two.index.tolist())

    variants = {"Variant": [], "Probability": []}

    # Build the combinations once, after all positions were inspected. With no
    # positions there is exactly one (empty) combination, i.e. the parent.
    for combination in itertools.product(*candidates_per_position):
        changes = []
        base_probabilities = []
        for pos, base in zip(positions, combination):
            base_probabilities.append(softmax_df.at[base, pos])

            ref_base = ref_seq[pos - 1]
            if base == ref_base:
                continue
            called = "DEL" if base == "-" else base
            changes.append(f"{ref_base}{pos}{called}")

        variants["Variant"].append('_'.join(changes) or "#PARENT#")
        variants["Probability"].append(np.prod(base_probabilities))

    return variants

def get_variant_soft(bam_file, template_seq, ref_name, padding = 50):
    """Call the most probable variant for one alignment file.

    Steps, as implemented here:
      1. Count alignments with ``samtools view -c``.
      2. Get candidate positions from ``analyser.get_nb_positions`` (called
         with 0.3) on the non-reference base frequencies inside the
         unpadded region.
      3. Top up with random positions when fewer than two candidates were
         found, or replace them by three random positions when more than 15
         were found.
      4. Pile up the candidates (plus two neighbours each side), build
         normalised base frequencies and take the highest-scoring
         combination from ``call_potential_populations``.

    Args:
        bam_file: Path to the BAM file.
        template_seq: Path of the reference, read via
            ``analyser.get_template_sequence``.
        ref_name: Reference/contig name inside the BAM file.
        padding: Number of positions skipped at both ends of the template.

    Returns:
        Dict with the keys ``"Variant"``, ``"Position"``,
        ``"Alignment Probability"`` and ``"Alignment Count"``.
    """
    # Pass an argument list (no shell) so unusual characters in the path
    # cannot be interpreted by a shell.
    count_cmd = ["samtools", "view", "-c", str(bam_file)]
    alignment_count = int(subprocess.run(count_cmd, capture_output=True).stdout.decode("utf-8").strip())

    # if alignment_count < 5:
    #     print("Not enough alignments")
    #     return None

    template = analyser.get_template_sequence(template_seq)

    # 1-based positions of the template without the padded ends
    range_positions = range(padding + 1, len(template) - padding + 1)

    non_ref_freqs = analyser.get_highest_non_ref_base_freq_2(
        bam_file, ref_name, range_positions, template, qualities=False
    )[0]
    freq_dist = pd.DataFrame(non_ref_freqs).T.rename(columns={0: "Base", 1: "Frequency"})

    nb_positions = analyser.get_nb_positions(freq_dist, 0.3)

    available_positions = [pos for pos in range_positions if pos not in nb_positions]

    if len(nb_positions) == 0:
        # Select random 3 positions
        nb_positions = np.random.choice(available_positions, 3, replace=False)

    elif len(nb_positions) == 1:
        add_pos = np.random.choice(available_positions, 2, replace=False)
        nb_positions = np.append(nb_positions, add_pos)

    elif len(nb_positions) > 15:
        print("Too many positions, either contaminated or sequencing error")
        nb_positions = np.random.choice(range_positions, 3, replace=False)

    # Quality-aware alternative (currently disabled):
    #bases_df, qual_df = get_bases_from_pileup(bam_file, ref_name, add_neighbouring_positions(nb_positions, 2, len(template)))
    #bases_df, qual_df = calculate_mean_quality_for_reads(bases_df, qual_df, nb_positions, 2)
    #qual_df = qual_df.fillna(10)
    #softmax_df = get_softmax_count_df(bases_df, qual_df, nb_positions)
    pileup_positions = add_neighbouring_positions(nb_positions, 2, len(template))
    bases_df = get_bases_from_pileup_simulation(bam_file, ref_name, pileup_positions)
    qual_df = pd.DataFrame()  # Filler: the simulation variant ignores qualities
    softmax_df = get_softmax_count_df_Simulation(bases_df, qual_df, nb_positions)

    variant_df = pd.DataFrame(call_potential_populations(softmax_df, template)).sort_values(by="Probability", ascending=False)

    # Report only the top-ranked variant
    best = variant_df.iloc[0]
    return {
        "Variant": best["Variant"],
        "Position": nb_positions,
        "Alignment Probability": best["Probability"],
        "Alignment Count": alignment_count,
    }


def get_variant_df_soft(demultiplex_folder: Path, ref_seq : Path, ref_name : str, barcode_dicts : dict = None, merge = True, min_depth= 5, padding=50, rowwise = False, alignment_name = "alignment_minimap.bam"):
    """Call variants for every reverse/forward barcode pair of a demultiplexed run.

    Args:
        demultiplex_folder: Folder holding the demultiplexed reads and the
            summary file read by ``analyser.read_summary_file``.
        ref_seq: Path to the reference sequence.
        ref_name: Reference/contig name inside the BAM files.
        barcode_dicts: Mapping of reverse-barcode folder -> iterable of
            forward-barcode folders. When None it is obtained from
            ``analyser.get_barcode_dict(demultiplex_folder, "NB", "RB")``.
        merge: If True, return a DataFrame merged with the plate template;
            otherwise return the raw dict of lists.
        min_depth: Barcode pairs with fewer reads are recorded as NaN.
        padding: Forwarded to ``get_variant_soft`` and
            ``analyser.adjust_variant``.
        rowwise: Forwarded to ``analyser.template_df`` when merging.
        alignment_name: File name of the BAM file inside each barcode folder.

    Returns:
        DataFrame (``merge=True``) or dict with the keys ``RBC``, ``FBC``,
        ``Position``, ``Variant``, ``Alignment Probability`` and
        ``Alignment Count``.
    """
    if barcode_dicts is None:
        # Same call as used elsewhere in this file to build the mapping
        barcode_dicts = analyser.get_barcode_dict(demultiplex_folder, "NB", "RB")

    variants = {"RBC": [], "FBC": [], "Position": [], "Variant": [], "Alignment Probability": [], "Alignment Count": []}

    summary = analyser.read_summary_file(demultiplex_folder)
    n_counts = summary.groupby(["RBC","FBC"])["FBC"].value_counts().reset_index()

    for barcode_id, barcode_dict in barcode_dicts.items():

        rbc = os.path.basename(barcode_id)

        for front_barcode in barcode_dict:

            fbc = os.path.basename(front_barcode)
            print("Processing", rbc, fbc)
            count = n_counts[(n_counts["RBC"] == rbc) & (n_counts["FBC"] == fbc)]["count"].values[0]

            # NOTE(review): the existence check uses the fixed default name,
            # not `alignment_name` - confirm which file
            # run_alignment_and_indexing(site_saturation=True) produces.
            if not os.path.exists(os.path.join(front_barcode, "alignment_minimap.bam")):
                print(f"Alignment file in {front_barcode} does not exist, running alignment and indexing")
                analyser.run_alignment_and_indexing(ref_seq, front_barcode, site_saturation=True)
            else:
                print("Alignment file already exists, skipping alignment and indexing")

            # Path(...) keeps this working when the folder is given as a string
            bam_file = Path(front_barcode) / alignment_name

            if not bam_file.exists() or count < min_depth:
                if not bam_file.exists():
                    print(f"{bam_file} does not exist.")
                else:
                    print(f"{bam_file} has only {count} reads (min_depth={min_depth}).")
                _append_missing_variant(variants, rbc, fbc)
                print(f"Skipping Variant: {fbc}/{rbc}")
                continue

            if padding == 0:
                print("Padding is 0. Implementing soft alignment")
            nn_variants = get_variant_soft(bam_file, ref_seq, ref_name, padding = padding)
            if padding != 0:
                print(nn_variants)

            if nn_variants is None:
                print("Empty variant list")
                _append_missing_variant(variants, rbc, fbc)
                print(f"Skipping Variant: {fbc}/{rbc}")
                continue

            variants["RBC"].append(rbc)
            variants["FBC"].append(fbc)
            variants["Position"].append(nn_variants["Position"])
            variants["Variant"].append(nn_variants["Variant"])

            # Only keep count/probability when both have the expected type
            alignment_count = nn_variants["Alignment Count"]
            alignment_prob = nn_variants["Alignment Probability"]
            if isinstance(alignment_count, int) and (isinstance(alignment_prob, float) or alignment_prob == "-"):
                variants["Alignment Count"].append(alignment_count)
                variants["Alignment Probability"].append(alignment_prob)
            else:
                print(f"Skipping {rbc}/{fbc} due incomplete data")
                variants["Alignment Count"].append(np.nan)
                variants["Alignment Probability"].append(np.nan)

            print(f"Variant: {fbc}/{rbc} {nn_variants['Alignment Count']} {nn_variants['Alignment Probability']}")

    if merge:
        variant_template_df = analyser.template_df(barcode_dicts, rowwise=rowwise)
        variant_df = analyser.rename_barcode(pd.DataFrame(variants).merge(n_counts, on=["RBC","FBC"] , how="left"))
        variant_df["Variant"] = variant_df["Variant"].apply(analyser.format_variant_list)
        variant_df["Variant"] = variant_df["Variant"].apply(lambda x: analyser.adjust_variant(x, padding))

        return variant_df.merge(variant_template_df, on=["Plate", "Well"], how="right")
    else:
        return variants


def _append_missing_variant(variants, rbc, fbc):
    """Record a barcode pair without a call: all result fields are NaN."""
    variants["RBC"].append(rbc)
    variants["FBC"].append(fbc)
    for key in ("Position", "Variant", "Alignment Count", "Alignment Probability"):
        variants[key].append(np.nan)

def call_variant_BF(bam_file, chrom, positions, reference_sequence, qualities = False):
    """Call variants from a BAM file by base frequency.

    A position is reported when its most frequent non-reference base (as
    returned by ``analyser.get_highest_non_ref_base_freq_2``) reaches a
    frequency of at least 0.35.

    Args:
        bam_file (str): Path to the BAM file.
        chrom (str): Chromosome or contig name.
        positions (list): 1-based positions at which to call variants.
        reference_sequence (str): Reference sequence of the chromosome or contig.
        qualities (bool): Accepted for backward compatibility and ignored;
            quality scores are not part of the result.

    Returns:
        dict: Parallel lists under ``"Variant"``, ``"Position"`` and
        ``"Frequency"``. When no position qualifies, a single ``"#PARENT#"``
        entry with NaN position and frequency is returned.
    """
    variants = {"Variant" : [], "Position" : [], "Frequency" : []}

    # The former `qualities=True` branch called an undefined helper and its
    # result was overwritten by this call anyway, so only this call remains.
    bases = analyser.get_highest_non_ref_base_freq_2(bam_file, chrom, positions, reference_sequence, qualities=False)

    for position in positions:
        ref_base = reference_sequence[position - 1].upper()
        non_ref_base, freq = bases[0][position]
        if not (non_ref_base and freq >= 0.35):
            continue

        if non_ref_base == "-":
            non_ref_base = "DEL"

        variants["Variant"].append(f"{ref_base}{position}{non_ref_base}")
        variants["Position"].append(int(position))
        variants["Frequency"].append(freq)

    # Nothing above the threshold: report the parent sequence
    if not variants["Variant"]:
        variants["Variant"].append("#PARENT#")
        variants["Position"].append(np.nan)
        variants["Frequency"].append(np.nan)

    return variants

In [None]:
# Get alignments which were difficult to align
# Loads two previously saved variant tables from pickle files.
# NOTE: unpickling can execute arbitrary code; only load files you trust.
variant_df = pd.read_pickle('/home/emre/github_repo/MinION/results/2_hetcpiii_minion_errorprone/local/variants_SW_1Mio.pkl')
variant_df_guppy = pd.read_pickle('/home/emre/github_repo/MinION/results/2_hetcpiii_minion_errorprone/local/variants_SW_BF_40k.pkl')


In [None]:
# Show the first 29 rows of the table loaded from variants_SW_BF_40k.pkl
variant_df_guppy.head(29)

In [None]:
# Single-well example: call the variant for one alignment file (RB01/NB38)
bam_file = Path("/home/emre/minION_results/MinION_RBC_0902723_sup/Demultiplex_cpp_70_1Mio_reads/RB01/NB38/alignment_minimap.bam")
template_seq = Path("/home/emre/github_repo/MinION/minION/refseq/hetcpiii_padded.fasta")
ref_name = "HetCPIII"



get_variant_soft(bam_file, template_seq, ref_name, padding = 50)




In [None]:
# Worked example of the quality-aware path for three hand-picked positions.
nb_positions = [166,104,339]
# Load the template here so the cell does not depend on a later cell having run
template = analyser.get_template_sequence(template_seq)
bases_df, qual_df = get_bases_from_pileup(bam_file, ref_name, add_neighbouring_positions(nb_positions, 2, len(template)))
bases_df, qual_df = calculate_mean_quality_for_reads(bases_df, qual_df, nb_positions, 2)
# Replace NaN with 10
qual_df = qual_df.fillna(10)
# Keep the result: the following cell reads `softmax_df`
softmax_df = get_softmax_count_df(bases_df, qual_df, nb_positions)
softmax_df


In [None]:

# Relies on `qual_df`, `softmax_df` and `template` already existing in the notebook namespace.
# The bare `qual_df` expression is not the last statement of the cell.
qual_df
# Rank all candidate variants by their joint probability, best first
variant_df = pd.DataFrame(call_potential_populations(softmax_df, template)).sort_values(by="Probability", ascending=False)


In [None]:
# Call variants for every barcode pair of the 40k-read HetCPIII run and merge with the plate template
demultiplex_folder = Path("/home/emre/minION_results/MinION_RBC_0902723_sup/Demultiplex_cpp_70_40k_reads")
template_seq = Path("/home/emre/github_repo/MinION/minION/refseq/hetcpiii_padded.fasta")
ref_name = "HetCPIII"
barcode_dicts = analyser.get_barcode_dict(demultiplex_folder, "NB", "RB")


variant_df_soft = get_variant_df_soft(demultiplex_folder, template_seq, ref_name, barcode_dicts, merge = True, min_depth= 5, padding=50)

In [None]:
# Persist the merged variant table
variant_df_soft.to_pickle("/home/emre/github_repo/MinION/results/2_hetcpiii_minion_errorprone/local/variants_SW_soft_40k.pkl")

In [None]:
# Show the first 29 rows of the merged variant table
variant_df_soft.head(29)

In [None]:
# Run for barcode simulator
# Returns the raw dict of lists (merge=False); padding is 0 for this reference.
demultiplex_folder = Path("/home/emre/minION_results/TamLQV96_sup/Demultiplex_cpp_70")
template_seq = "/home/emre/tam-lqv.fasta"
ref_name = "Tam-LQV"
barcode_dicts = analyser.get_barcode_dict(demultiplex_folder, "NB", "RB")


variants = get_variant_df_soft(demultiplex_folder, template_seq, ref_name, barcode_dicts, merge = False, min_depth= 5, padding=0, rowwise=True)

In [None]:
# Probe the simulated depth folders of every variant folder.
# NOTE(review): `folders` and `ref_seq` are not assigned by any top-level cell
# visible here - confirm they exist in the notebook namespace before running.
from tqdm import tqdm
for var_path in tqdm(folders):
    var_name = os.path.basename(var_path)
    depths = glob.glob(f"{var_path}/depth*")
    for depth in depths:
        
        bam_file = os.path.join(depth, "alignment_minimap.bam")

        # Align first when no BAM file exists yet; no variant is called in that case
        if not os.path.exists(os.path.join(depth, "alignment_minimap.bam")):
            print(f"Alignment file in {depth} does not exist, running alignment and indexing")
            analyser.run_alignment_and_indexing(ref_seq, depth)   
                 
        else:
            print("Variant exists", depth)
            nn_variants = get_variant_soft(bam_file, template_seq, ref_name, padding = 50)
            print(nn_variants)
            # Stop after the first depth folder that already has an alignment
            break
            

In [None]:
# Pile up three randomly chosen positions (plus two neighbours each side) inside the unpadded region
padding_start = 50
padding_end = 50
template = analyser.get_template_sequence(template_seq)
range_positions = range(padding_start + 1, len(template) - padding_end + 1)
nb_positions = np.random.choice(range_positions, 3, replace=False)
bases_df, qual_df = get_bases_from_pileup(bam_file, ref_name, add_neighbouring_positions(nb_positions, 2, len(template)))

In [None]:
def single_plate_annotation(entry):
    """Replace the row letter of a well by the letter matching the plate number.

    ``entry["Plate"]`` (1-8) selects a letter from A-H, which replaces the
    first character of ``entry["Well"]``. The entry is modified in place
    and returned.
    """
    row_letters = "ABCDEFGH"
    plate_index = int(entry["Plate"]) - 1
    entry["Well"] = row_letters[plate_index] + entry["Well"][1:]
    return entry

In [None]:
# Convert the raw call dict into a table and re-label the wells via single_plate_annotation
variant_df = analyser.rename_barcode(pd.DataFrame(variants), rowwise=True)
variant_df = variant_df.apply(single_plate_annotation, axis=1)
#variant_df = variant_df.merge(variant_template_df, on=["Well"], how="left")

In [None]:
# Same annotation as the previous cell, then merge with the well template and save.
# NOTE(review): `variant_template_df` is not assigned by any top-level cell
# visible here (it only appears as a local inside get_variant_df_soft) - confirm
# it exists before running.
variant_df = analyser.rename_barcode(pd.DataFrame(variants), rowwise=True)
variant_df = variant_df.apply(single_plate_annotation, axis=1)
#Drop Plate column
variant_df = variant_df.drop(columns=["Plate"])
variant_df = variant_df.merge(variant_template_df, on=["Well"], how="right")
# NOTE(review): the file name says "Tam-LQF" while the reference is "Tam-LQV" - possibly a typo.
variant_df.to_pickle("../results/4_Tam-LQV/Tam-LQF.pkl")

In [57]:
# Inputs for the ParPgb 5-site run
demultiplexer_path = Path("/home/emre/minION_results/20231130_RL-5sites-8plates_flongle_sup/Demultiplex_cpp_70")
#demultiplexer_path = Path("/home/emre/minION_results/20231130_RL-5sites-8plates_flongle_sup/test_alignment")
ref_name = "ParPgb"
template_seq = "/home/emre/minION_results/ParPgb.fasta"
barcode_dicts = analyser.get_barcode_dict(demultiplexer_path, "NB", "RB")

In [58]:
# Call variants over the whole sequence (padding=0); the result is the raw dict of lists (merge=False)
variant_df = get_variant_df_soft(demultiplexer_path, template_seq, ref_name, barcode_dicts, merge = False, min_depth= 5, padding=0)

Processing RB12 NB87
Alignment file already exists, skipping alignment and indexing
Padding is 0. Implementing soft alignment
Too many positions, either contaminated or sequencing error
Variant: NB87/RB12 174 nan
Processing RB12 NB03
Alignment file already exists, skipping alignment and indexing
Padding is 0. Implementing soft alignment
Too many positions, either contaminated or sequencing error
Variant: NB03/RB12 130 nan
Processing RB12 NB20
Alignment file already exists, skipping alignment and indexing
Padding is 0. Implementing soft alignment
Too many positions, either contaminated or sequencing error
Variant: NB20/RB12 69 0.9708044982698963
Processing RB12 NB48
Alignment file already exists, skipping alignment and indexing
Padding is 0. Implementing soft alignment
Too many positions, either contaminated or sequencing error
Variant: NB48/RB12 107 1.0
Processing RB12 NB70
Alignment file already exists, skipping alignment and indexing
Padding is 0. Implementing soft alignment
Too many

In [59]:
# Persist the calls as pickle and as CSV. The CSV gets its own ".csv" file so
# it no longer overwrites the pickle written on the line before.
pd.DataFrame(variant_df).to_pickle("/home/emre/github_repo/MinION/results/6_5site_ParPgb/ParPgb_variant_df_5site_whole_Sequence_adj_param.pkl")
pd.DataFrame(variant_df).to_csv("/home/emre/github_repo/MinION/results/6_5site_ParPgb/ParPgb_variant_df_5site_whole_Sequence_adj_param.csv")

In [None]:
# Same run as before, but reading alignment_minimap_site_saturation.bam from each barcode folder
variant_df = get_variant_df_soft(demultiplexer_path, template_seq, ref_name, barcode_dicts, merge = False, min_depth= 5, padding=0, alignment_name="alignment_minimap_site_saturation.bam")

In [None]:
# Persist the site-saturation calls as pickle and as CSV
pd.DataFrame(variant_df).to_pickle("/home/emre/github_repo/MinION/results/6_5site_ParPgb/ParPgb_variant_df_5site_whole_Sequence_adj_param_site_saturation.pkl")
pd.DataFrame(variant_df).to_csv("/home/emre/github_repo/MinION/results/6_5site_ParPgb/ParPgb_variant_df_5site_whole_Sequence_adj_param_site_saturation.csv")

In [60]:
# Display the raw call dict as a table
pd.DataFrame(variant_df)

Unnamed: 0,RBC,FBC,Position,Variant,Alignment Probability,Alignment Count
0,RB12,NB87,"[354, 432, 374]",T354A_C432A,,174.0
1,RB12,NB03,"[523, 599, 412]",G523A_G599A,,130.0
2,RB12,NB20,"[367, 132, 278]",A367DEL_C132T_T278G,0.970804,69.0
3,RB12,NB48,"[376, 499, 262]",G376DEL_T499DEL_C262A,1.000000,107.0
4,RB12,NB70,"[342, 376, 370]",T342A_G376A,,152.0
...,...,...,...,...,...,...
732,RB07,NB18,"[135, 217, 171]",A217DEL_C171G,0.882647,126.0
733,RB07,NB05,"[486, 479, 329]",C486G_C479T_A329C,0.957557,117.0
734,RB07,NB39,"[219, 274, 501]",G219T_T274G_G501C,0.904253,81.0
735,RB07,NB69,"[262, 200, 328]",C262G_T200A,0.989011,91.0


### Simulator - Call Variant of Alignments Minimap.bam

In [None]:
# Call the soft variant for every simulated variant folder and depth, and
# collect the predictions next to the known (original) variant name.
import glob
template_seq = Path("/home/emre/github_repo/MinION/minION/refseq/hetcpiii_padded.fasta")
ref_name = "HetCPIII"
Variant_dict = {"Original Variant" : [], "Predicted Variant": [] ,"Depth" : [], "Alignment Probability" : [], "Alignment Count" : []}

sequence_folder = Path("/home/emre/github_repo/MinION/examples/data/min_read_depth/seq")
variant_folder = glob.glob(f"{sequence_folder}/*")

for var in variant_folder:
    variant_name = os.path.basename(var)
    depth_folders = glob.glob(f"{var}/depth*")

    for depth in depth_folders:
        depth_name = os.path.basename(depth)
        bam_file = os.path.join(depth, "alignment_minimap_Q10.bam")

        # Depth folders without an alignment are skipped
        if not os.path.exists(bam_file):
            print(f"Alignment file in {depth} does not exist, skipping")
            continue

        nn_variants = get_variant_soft(bam_file, template_seq, ref_name, padding = 50)
        print(nn_variants)

        # Wild-type folders are recorded under the parent label
        if "wt" in variant_name:
            variant_name = "#PARENT#"

        Variant_dict["Original Variant"].append(variant_name)
        Variant_dict["Depth"].append(depth_name)
        Variant_dict["Predicted Variant"].append(nn_variants["Variant"])
        Variant_dict["Alignment Count"].append(nn_variants["Alignment Count"])
        Variant_dict["Alignment Probability"].append(nn_variants["Alignment Probability"])


        
    

### Call Variant with BF only

In [None]:
# Call variants by base frequency only (call_variant_BF) for every simulated
# variant folder and depth, and collect the predictions.
import glob
template_seq = Path("/home/emre/github_repo/MinION/minION/refseq/hetcpiii_padded.fasta")
ref_name = "HetCPIII"
Variant_dict = {"Original Variant" : [], "Predicted Variant": [] ,"Depth" : [], "Frequency" : []}
template = analyser.get_template_sequence(template_seq)
sequence_folder = Path("/home/emre/github_repo/MinION/examples/data/min_read_depth/seq")
variant_folder = glob.glob(f"{sequence_folder}/*")

padding = 50
# 1-based positions without the padded ends; starts at padding + 1, the same
# range get_variant_soft uses (the previous start of `padding` still included
# the last padded base).
bf_positions = range(padding + 1, len(template) - padding + 1)

for var in variant_folder:
    variant_name = os.path.basename(var)

    depth_folders = glob.glob(f"{var}/depth*")

    for depth in depth_folders:

        depth_name = os.path.basename(depth)

        bam_file = os.path.join(depth, "alignment_minimap_Q10.bam")

        # Depth folders without an alignment are skipped
        if not os.path.exists(bam_file):
            print(f"Alignment file in {depth} does not exist, skipping")
            continue

        nn_variants = call_variant_BF(bam_file, "HetCPIII", bf_positions, template, qualities=False)
        nn_variants["Variant"] = "_".join(nn_variants["Variant"])

        # Wild-type folders are recorded under the parent label
        if "wt" in variant_name:
            variant_name = "#PARENT#"

        Variant_dict["Original Variant"].append(variant_name)
        Variant_dict["Depth"].append(depth_name)
        Variant_dict["Predicted Variant"].append(nn_variants["Variant"])
        Variant_dict["Frequency"].append(nn_variants["Frequency"])
  

In [None]:
# Display the collected simulation results as a table
pd.DataFrame(Variant_dict)

In [None]:
# Persist the collected simulation results
pd.DataFrame(Variant_dict).to_pickle("/home/emre/github_repo/MinION/results/2_hetcpiii_minion_errorprone/Simulation_Q10_BF_results.pkl")

In [None]:
# Load a previously saved result table (note: the Q20 file, not the Q10 BF file written above).
# NOTE: unpickling can execute arbitrary code; only load files you trust.
variant_df = pd.read_pickle("/home/emre/github_repo/MinION/results/2_hetcpiii_minion_errorprone/Simulation_Q20_results.pkl")

In [None]:
# Display the loaded table
variant_df