### Barcode Demultiplexer Simulator 

- Simulate Demultiplexing using local and semi-global alignment

In [1]:
import sys
sys.path.append("/home/emre/github_repo/MinION")
from minION import analyser
from minION import consensus
from minION import demultiplexer
import importlib
from minION.util import IO_processor
from minION.util.globals import BARCODES, MEDAKA_MODELS, DEFAULT_TARGETS
importlib.reload(IO_processor)
importlib.reload(analyser)
importlib.reload(consensus)
importlib.reload(demultiplexer)
from pathlib import Path
import pandas as pd
import numpy as np
import os
from matplotlib import pyplot as plt
import numpy as np
import seaborn as sns
#import plotly.express as px
import random
from Bio import SeqIO
import subprocess
import re
import shutil
import glob

In [2]:
# Plotly version
def barcode_score(summary):
    """
    Build a Plotly histogram (100 bins) of the ``barcode_score`` column of ``summary``.

    Returns the Plotly figure; nothing is shown or saved here.

    NOTE(review): this relies on a module-level ``px`` (plotly.express). The only import of it
    visible in this notebook is commented out, so the call raises NameError unless ``px`` is
    imported elsewhere - confirm before use.
    """
    fig = px.histogram(summary, x="barcode_score", nbins=100, 
                       title="Barcode Score Distribution", labels={'barcode_score': 'Barcode Score', 'count': 'Frequency'})
    # Semi-transparent red bars with a black outline, fixed 600x400 canvas.
    fig.update_traces(marker_color='red', marker_line_color='black', marker_line_width=1.5, opacity=0.5)
    fig.update_layout(width=600, height=400)
    return fig

def barcode_barplot(summary, barcode_id="barcode_arrangement", ylim=None):
    """
    Build a Plotly bar chart with the number of reads per barcode.

    Args:
    summary (pd.DataFrame): Table with one row per read.
    barcode_id (str): Column of ``summary`` holding the assigned barcode.
    ylim (number, optional): Upper limit of the y axis; the axis is left on auto-range
        when this is None (or any other falsy value).

    Returns:
    The Plotly figure.

    NOTE(review): relies on a module-level ``px`` (plotly.express) whose import is
    commented out in this notebook - confirm it is imported before calling.
    """
    # Name both columns explicitly so the frame looks the same regardless of how the
    # installed pandas version names the result of value_counts().reset_index().
    barcodes = summary[barcode_id].value_counts().rename_axis(barcode_id).reset_index(name="count")

    # The labels map column name -> axis title: barcodes on x, read counts on y.
    fig = px.bar(barcodes, x=barcode_id, y="count",
                 title="Barcode Frequency", labels={barcode_id: 'Barcode ID', 'count': 'Frequency'})
    fig.update_traces(marker_color='red', marker_line_color='black', marker_line_width=1.5, opacity=0.5)
    fig.update_layout(width=600, height=400)

    if ylim:
        fig.update_layout(yaxis=dict(range=[0, ylim]))

    return fig

def barcode_arrangement(summary, plot=True):
    """
    Count the reads per value of the ``barcode_arrangement`` column.

    Args:
    summary (pd.DataFrame): Table with a ``barcode_arrangement`` column.
    plot (bool): If True return a Plotly bar chart of the counts, otherwise return
        the counts as a DataFrame (``value_counts().reset_index()``).

    Returns:
    A Plotly figure when ``plot`` is True, else a DataFrame of counts.

    NOTE(review): the plotting branch relies on a module-level ``px`` (plotly.express)
    whose import is commented out in this notebook - confirm it is imported.
    """
    counts = summary["barcode_arrangement"].value_counts()

    if not plot:
        return counts.reset_index()

    # The counts frame only ever holds the barcode and its count; the previous
    # x='barcode_score' referred to a column that does not exist in it.
    frame = counts.rename_axis("barcode_arrangement").reset_index(name="count")
    fig = px.bar(frame, x='barcode_arrangement', y='count',
                 title="Barcode Frequency", labels={'barcode_arrangement': 'Barcode ID', 'count': 'Frequency'})
    fig.update_traces(marker_color='red', marker_line_color='black', marker_line_width=1.5, opacity=0.5)
    fig.update_layout(width=600, height=400)
    return fig

In [3]:

def check_gc_content(sequence, desired_gc_fraction, tolerance=0.05):
    """
    Tell whether the fraction of 'G'/'C' symbols in ``sequence`` lies within
    ``tolerance`` of ``desired_gc_fraction``.

    Only upper-case 'G' and 'C' are counted. An empty sequence raises
    ZeroDivisionError because the fraction is undefined.
    """
    n_gc = 0
    for base in sequence:
        if base == 'G' or base == 'C':
            n_gc += 1

    observed = n_gc / len(sequence)
    deviation = abs(observed - desired_gc_fraction)
    return deviation <= tolerance

def generate_gc_content_sequence(length, desired_gc_fraction):
    """
    Draw uniform random DNA strings of ``length`` bases until one passes
    ``check_gc_content`` for ``desired_gc_fraction`` (default tolerance), and return it.

    This is plain rejection sampling, so it never returns if the target cannot be hit.
    """
    alphabet = 'ATCG'
    while True:
        drawn = [random.choice(alphabet) for _ in range(length)]
        candidate = ''.join(drawn)
        if not check_gc_content(candidate, desired_gc_fraction):
            continue
        return candidate


def introduce_mutations(sequence, mutation_rate, weights=(0.2, 0.4, 0.4)):
    """
    Introduce random substitutions, insertions and deletions into a sequence.

    Each position is mutated with probability ``mutation_rate``. When a mutation
    happens its type is drawn according to ``weights``.

    Args:
    sequence (str): Input sequence; substitutions and insertions use upper-case A/C/G/T.
    mutation_rate (float): Per-step probability of a mutation.
    weights (sequence of 3 numbers): Relative weights of
        (substitution, insertion, deletion).

    Returns:
    str: The mutated sequence.

    Note:
    An insertion adds a random base *before* the current position and does not advance,
    so the same position is rolled again. With ``mutation_rate`` >= 1 and insertion as the
    only possible type the loop therefore never ends.
    """
    nucleotides = ['A', 'C', 'G', 'T']
    mutation_types = ['substitution', 'insertion', 'deletion']
    type_weights = [weights[0], weights[1], weights[2]]

    # Collect the pieces and join once instead of growing a string in the loop.
    pieces = []
    i = 0
    while i < len(sequence):
        if random.random() < mutation_rate:
            mutation_type = random.choices(mutation_types, weights=type_weights)[0]
            if mutation_type == 'substitution':
                # Replace with any base other than the current one.
                pieces.append(random.choice([n for n in nucleotides if n != sequence[i]]))
                i += 1
            elif mutation_type == 'insertion':
                # Extra base, current position is kept for the next roll.
                pieces.append(random.choice(nucleotides))
            else:  # deletion: skip the current base
                i += 1
        else:
            pieces.append(sequence[i])
            i += 1
    return ''.join(pieces)

### Generate the synthetic sequences

- GC content of 60 %


In [4]:

def write_to_fastq(filename, barcodes, sequence_length, barcode_position_start, barcode_position_end, mutation_rate, num_sequences, num_noise):
    """
    Write synthetic barcoded reads, followed by barcode-free noise reads, to a FASTQ file.

    For each of ``num_sequences`` rounds one random surrounding sequence (GC target 0.6)
    is generated and reused for every entry of ``barcodes``: the mutated barcode replaces
    the slice ``[barcode_position_start:barcode_position_end]`` of that sequence.
    Afterwards ``num_noise`` random sequences are appended. All bases get quality "I".
    """
    rate_tag = str(mutation_rate).replace(".", "_")

    with open(filename, 'wt') as out:
        for round_no in range(1, num_sequences + 1):
            # One backbone per round, shared by all barcodes of that round
            backbone = generate_gc_content_sequence(sequence_length, 0.6)
            left_flank = backbone[:barcode_position_start]
            right_flank = backbone[barcode_position_end:]

            for barcode_name, barcode_sequence in barcodes.items():
                read = left_flank + introduce_mutations(barcode_sequence, mutation_rate) + right_flank
                qualities = "I" * len(read)
                header = f"@seq_{rate_tag}_{round_no}_{barcode_name}"
                out.write(f"{header}\n{read}\n+\n{qualities}\n")

        for noise_no in range(1, num_noise + 1):
            # Random sequence without any barcode
            noise_read = generate_gc_content_sequence(sequence_length, 0.6)
            noise_qualities = "I" * sequence_length
            out.write(f"@noise_{noise_no}\n{noise_read}\n+\n{noise_qualities}\n")

def split_read_id(read_id):
    """
    Return the true barcode encoded in a simulated read ID.

    The barcode is the text after the last underscore. A purely numeric suffix
    (as in ``noise_<n>``) is reported as "noise".
    """
    suffix = read_id.rsplit("_", 1)[-1]
    return "noise" if suffix.isnumeric() else suffix



In [None]:
# Load the simulated barcode sequences into a {record id: sequence} dict.
from Bio import SeqIO
barcodes = {}
with open("/home/emre/github_repo/MinION/minION/barcoding/minion_barcodes_sim.fasta", "r") as handle:
    records = list(SeqIO.parse(handle, "fasta"))
    i = 0
    for record in records:
        # Stops once 25 records (i = 0..24) have been stored.
        # NOTE(review): the files written later are named "..._24barcodes" - confirm
        # whether the limit should be 24 records (i.e. `i > 23`).
        if i > 24:
            break
        barcodes[record.id] = str(record.seq)
        print(record.id, str(record.seq))
        i += 1
        

In [None]:


# One FASTQ file per mutation rate; names[i] is the file-name tag for the i-th rate.
names = ["0", "01", "02", "03", "04"]
for i, rate in enumerate([0, 0.1, 0.2, 0.3, 0.4]):
    sequence_length = 150
    # Slice [70:94] of the surrounding sequence is replaced by the (mutated) barcode.
    barcode_position_start = 70
    barcode_position_end = 94
    mutation_rate = rate # mutation rate for this file (0, 0.1, 0.2, 0.3 or 0.4)
    num_sequences = 3000
    num_noise = 0

    # Write sequences to a FASTQ file
    write_to_fastq(f"data/mutated_sequences_{names[i]}_24barcodes.fastq", barcodes, sequence_length, barcode_position_start, barcode_position_end, mutation_rate, num_sequences, num_noise)


In [None]:
#file_names = ["00_0", "00_1", "00_2", "01_0", "01_1", "01_2", "02_0", "02_1", "02_2"]
file_names = ["tpr_fpr_testing"]

for file in file_names:
    path = Path("data") / file / "24_barcodes" 
    demultiplexer.run_demultiplexer_single(path, BARCODES, 15, 15, basecall_folder = path)

    # Calculate TP, FP, FN, TN
    #summary = pd.read_csv(Path("data") / file / "24_barcodes" / "demultiplex_15" / "barcoding_summary.csv")
    #summary["Truth"] = summary["read_id"].apply(split_read_id)
    

### CPP Analyser

In [None]:
### Summary for CPP 

summary = pd.read_csv("/home/emre/github_repo/MinION/examples/data/Demultiplex_cpp/barcoding_summary_8_barcodes.txt", sep="\t")
summary_24 = pd.read_csv("/home/emre/github_repo/MinION/examples/data/Demultiplex_cpp/barcoding_summary_24_barcodes.txt", sep="\t")
summary["Truth"] = summary["ID"].apply(split_read_id)
summary_24["Truth"] = summary_24["ID"].apply(split_read_id)

# summary = pd.read_csv("/home/emre/github_repo/MinION/examples/data/tpr_fpr_testing/8_barcodes/demultiplex_15/barcoding_summary.txt", sep="\t")
# summary_24 = pd.read_csv("/home/emre/github_repo/MinION/examples/data/tpr_fpr_testing/24_barcodes/demultiplex_15/barcoding_summary.txt", sep="\t")
# summary["Truth"] = summary["read_id"].apply(split_read_id)
# summary_24["Truth"] = summary_24["read_id"].apply(split_read_id)



bins = np.arange(15, 105, 5)
# summary["bins"] = pd.cut(summary["barcode_score"], bins)
# summary_24["bins"] = pd.cut(summary_24["barcode_score"], bins)

# summary["bins"] = pd.cut(summary["RBC_Score"], bins)
# summary_24["bins"] = pd.cut(summary_24["RBC_Score"], bins)

# Set the aesthetic style of the plots
#sns.set_theme(style="whitegrid")

# Create figure and axes
fig, ax = plt.subplots(figsize=(12, 7))


# One viridis shade per data set so the two overlaid histograms can be told apart.
color_8 = sns.color_palette("viridis")[3]
color_24 = sns.color_palette("viridis")[5]
# Create the histogram plot
# sns.histplot(data=summary, x="RBC_Score", bins=100, kde=False, ax=ax, color=color_8, alpha=0.5, edgecolor="black", linewidth=1.5)
# sns.histplot(data=summary_24, x="RBC_Score", bins=100, kde=False, ax=ax, color=color_24, alpha=0.3, edgecolor="black", linewidth=1.5)


# `color` was never defined (NameError); the 8-barcode histogram uses color_8.
# NOTE(review): the frames loaded above come from the CPP summary files, whose score column
# appears to be "RBC_Score" (see the commented variant) - confirm "barcode_score" exists.
sns.histplot(data=summary, x="barcode_score", bins=100, kde=False, ax=ax, color=color_8, alpha=0.5, edgecolor="black", linewidth=1.5)
sns.histplot(data=summary_24, x="barcode_score", bins=100, kde=False, ax=ax, color=color_24, alpha=0.3, edgecolor="black", linewidth=1.5)

# Add custom legend with the correct colors
#custom_lines = [plt.Line2D([0], [0], color="navy", lw=4)]

#ax.legend(custom_lines, [''])
# Add labels and title
ax.set_xlabel("Barcode Score", size=22)
ax.set_ylabel("Frequency", size = 22)
ax.tick_params(labelsize=18)
ax.set_ylim(0, 10000)


#ax.set_title("Barcode Score Distribution")
ax.legend(["8 barcodes", "24 barcodes"], fontsize=18)
# Add a red vertical line
#ax.axvline(x=60, color='red', linestyle='--')

plt.tight_layout()
#plt.savefig("barcode_score_distribution_guppy_quality_test_barcodes_guppy.png", dpi=300)
plt.show()

In [None]:
bins = np.arange(30, 105, 5)
summary["bins"] = pd.cut(summary["RBC_Score"], bins) # Guppy
summary_24["bins"] = pd.cut(summary_24["RBC_Score"], bins)

# summary["bins"] = pd.cut(summary["RBC_Score"], bins)
# summary_24["bins"] = pd.cut(summary_24["RBC_Score"], bins)


df = summary_24.copy()

In [None]:

# Per score bin: fraction of reads whose called barcode matches ("TPR") or differs
# from ("FPR") the true barcode taken from the read ID.
TPR_interval = {"Interval" : [], "TPR" : [], "FPR" : []}
barcode_column = "RBC"
for intervals in df["bins"].unique().sort_values():
    bin_df = df[df["bins"] == intervals]
    
    # NOTE(review): tpr / fpr are never filled or read in this cell.
    tpr = []
    fpr = []

    # Reads in this bin with a wrong (fp) or correct (tp) barcode call.
    fp = bin_df[(bin_df[barcode_column] != bin_df["Truth"])]
    tp = bin_df[(bin_df[barcode_column] == bin_df["Truth"])]

    # The denominator is len(bin_df) + 1, so it is never zero, but the two rates
    # do not sum to exactly 1 (presumably a guard for empty bins - confirm).
    print(f"TPR for {intervals}: {len(tp) / (len(bin_df) +1)}")
    print(f"FPR for {intervals}: {len(fp) / (len(bin_df)+1)}")

    TPR_interval["Interval"].append(intervals)
    TPR_interval["TPR"].append(len(tp) / (len(bin_df) +1))
    TPR_interval["FPR"].append(len(fp) / (len(bin_df)+1))

# Persist the table, then plot TPR and FPR against the score interval.
pd.DataFrame(TPR_interval).to_csv("tpr_fpr_24barcodes_local.csv", index=False)
pd.DataFrame(TPR_interval).dropna().plot(x="Interval", y=["TPR", "FPR"], figsize=(10, 5), title="TPR and FPR for different barcode score intervals")

In [None]:
for intervals in summary["bins"].unique().sort_values():
    bin_df = summary[summary["bins"] == intervals]
    
    tpr = []
    fpr = []

    fp = bin_df[(bin_df[barcode_column] != bin_df["Truth"])]
    tp = bin_df[(bin_df[barcode_column] == bin_df["Truth"])]
    print(f"TPR for {intervals}: {len(tp) / (len(bin_df) +1)}")
    break

In [None]:
bin_df

In [None]:
summary = pd.read_csv("/home/emre/github_repo/MinION/examples/data/tpr_fpr_testing/24_barcodes/demultiplex_15/barcoding_summary.txt", sep="\t")
summary

### Guppy Analyser

In [None]:
summary = pd.read_csv("/home/emre/github_repo/MinION/examples/data/tpr_fpr_testing/24_barcodes/demultiplex_15/barcoding_summary.txt", sep="\t")
summary["Truth"] = summary["read_id"].apply(split_read_id)
bins = np.arange(15, 105, 5)
summary["bins"] = pd.cut(summary["barcode_score"], bins)

# Set the aesthetic style of the plots
sns.set_theme(style="whitegrid")

# Create figure and axes
fig, ax = plt.subplots(figsize=(10, 5))

# Create the histogram plot
sns.histplot(data=summary, x="barcode_score", bins=100, kde=False, ax=ax, color="navy", alpha=0.5)


# Add custom legend with the correct colors
custom_lines = [plt.Line2D([0], [0], color="navy", lw=4)]

#ax.legend(custom_lines, [''])
# Add labels and title
ax.set_xlabel("Barcode Score")
ax.set_ylabel("Frequency")
#ax.set_title("Barcode Score Distribution")

# Add a red vertical line
#ax.axvline(x=60, color='red', linestyle='--')

# Show the plot
plt.savefig("barcode_score_distribution_guppy_quality_test_24barcodes.png", dpi=300)
plt.show()



In [None]:
# Per score bin: fraction of reads whose called barcode matches ("TPR") or differs
# from ("FPR") the true barcode taken from the read ID.
TPR_interval = {"Interval" : [], "TPR" : [], "FPR" : []}
barcode_column = "barcode_full_arrangement"
for intervals in summary["bins"].unique().sort_values():
    bin_df = summary[summary["bins"] == intervals]

    # Reads in this bin with a wrong (fp) or correct (tp) barcode call.
    fp = bin_df[(bin_df[barcode_column] != bin_df["Truth"])]
    tp = bin_df[(bin_df[barcode_column] == bin_df["Truth"])]

    # Denominator is len(bin_df) + 1, so it is never zero (rates do not sum to exactly 1).
    denominator = len(bin_df) + 1
    tpr_value = len(tp) / denominator
    fpr_value = len(fp) / denominator

    print(f"TPR for {intervals}: {tpr_value}")
    print(f"FPR for {intervals}: {fpr_value}")

    TPR_interval["Interval"].append(intervals)
    TPR_interval["TPR"].append(tpr_value)
    TPR_interval["FPR"].append(fpr_value)

# Write the table once after the loop instead of rewriting the file for every bin.
# NOTE(review): the summary above is read from the 24_barcodes folder while this file
# name says "8barcodes" - confirm the intended output name.
pd.DataFrame(TPR_interval).to_csv("tpr_fpr_8barcodes_guppy.csv", index=False)


In [None]:
summary.groupby(["Truth", "bins"]).count()["read_id"].reset_index().pivot(index="bins", columns="Truth", values="read_id").plot.bar(stacked=True, figsize=(10, 5))


In [None]:
# Get TP and FP
barcode_column = "barcode_full_arrangement"
barcode = "RB01"
tp = summary[(summary[barcode_column] == barcode) & (summary["Truth"] == barcode)]

### TPR FPR Plotting of Guppy and Local Alignment

In [None]:
barcode_8_cpp = pd.read_csv("/home/emre/github_repo/MinION/examples/tpr_fpr_8barcodes_CPP.csv")
barcode_24_cpp = pd.read_csv("/home/emre/github_repo/MinION/examples/tpr_fpr_24barcodes_CPP.csv")
barcode_24 = pd.read_csv("/home/emre/github_repo/MinION/examples/tpr_fpr_24barcodes.csv")
barcode_8 = pd.read_csv("/home/emre/github_repo/MinION/examples/tpr_fpr_8barcodes.csv")

barcode_8_cpp["Barcode"] = "8 barcodes (CPP)"
barcode_24_cpp["Barcode"] = "24 barcodes (CPP)"
barcode_24["Barcode"] = "24 barcodes (Guppy)"
barcode_8["Barcode"] = "8 barcodes (Guppy)"

summary = pd.concat([barcode_8_cpp, barcode_24_cpp, barcode_24, barcode_8]).reset_index(drop=True)
#summary = summary.drop(summary[(summary["Barcode"] == "8 barcodes (Guppy)") & (summary["Interval"] == "(20, 25]")].index)
summary = summary[summary['Interval'] >= '(30, 35]']


In [None]:
# Dataframes as per your setup
import matplotlib.cm as cm
df = summary
df_8 = df[df['Barcode'] == "8 barcodes (CPP)"]
df_24 = df[df['Barcode'] == "24 barcodes (CPP)"]
df_8_guppy = df[df['Barcode'] == "8 barcodes (Guppy)"]
df_24_guppy = df[df['Barcode'] == "24 barcodes (Guppy)"]

# Define bar width and positions
barWidth = 0.2
# r1 = np.arange(len(df_8['Interval']))
# r2 = [x + barWidth for x in r1]
# r3 = [x + barWidth for x in r2]
# r4 = [x + barWidth for x in r3]

# Define bar width and positions
barWidth = 0.2
r = np.arange(len(df_8['Interval']))

# Create the figure and axes
plt.figure(figsize=(10, 6))

viridis_colors = cm.viridis(np.linspace(0, 1, 4))

# Creating bars for TPR
plt.bar(r, df_8['TPR'], color=viridis_colors[0], width=barWidth, edgecolor='black', label='8 Barcodes - TPR - Local')
plt.bar(r + barWidth, df_24['TPR'], color=viridis_colors[1], width=barWidth, edgecolor='black', label='24 Barcodes - TPR - Local')
plt.bar(r + 2 * barWidth, df_8_guppy['TPR'], color=viridis_colors[2], width=barWidth, edgecolor='black', label='8 Barcodes - TPR - Semi-Global')
plt.bar(r + 3 * barWidth, df_24_guppy['TPR'], color=viridis_colors[3], width=barWidth, edgecolor='black', label='24 Barcodes - TPR - Semi-Global')

# Add xticks on the middle of the group bars
plt.xlabel('Barcode Score Interval', size=20)
plt.ylabel('True Positive Rate (TPR)', size=20)
plt.xticks(r + 1.5 * barWidth, df_8['Interval'], rotation=45, size=16)
plt.yticks(size=16)

# Create legend & Show graphic
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.savefig("TPR_barcode_score_interval.png", dpi=600)
plt.show()

In [None]:
summary = analyser.read_summary_file("/home/emre/github_repo/MinION/examples/data/01_quality/demultiplex_30/")

# Calculate precision, recall, F1 score
summary["Truth"] = summary["read_id"].apply(split_read_id)

barcode_classification = {"True Positive": "TP", "False Positive": "FP", "False Negative": "FN", "True Negative": "TN"}
for barcode in barcodes.keys():
    barcode_column = "barcode_arrangement"
    # barcode = RB01
    tp = summary[(summary[barcode_column] == barcode) & (summary["Truth"] == barcode)]
    fn = summary[(summary[barcode_column] != barcode) & (summary["Truth"] == barcode)]
    fp = summary[(summary[barcode_column] == barcode) & (summary["Truth"] != barcode)]
    tn = summary[(summary[barcode_column] != barcode) & (summary["Truth"] != barcode)]

    tp_percentage = len(tp) / 40000 * 100
    fp_percentage = len(fp) / 40000 * 100

    print(f"True Positive: {len(tp)} ({tp_percentage:.2f}%), False Positive: {len(fp)} ({fp_percentage:.2f}%)")

In [None]:
summary[(summary[barcode_column] == "RB01") & (summary["Truth"] != "RB01")]

In [None]:
demultiplexer.run_demultiplexer_single(Path("data") / "03_0", BARCODES, 50, 50, basecall_folder = Path("data") / "03_0")

In [None]:
summary["barcode_full_arrangement"].value_counts().reset_index()

# SNS
fig, ax = plt.subplots(figsize=(10, 5))
plt.bar(summary["barcode_arrangement"].value_counts().index, summary["barcode_arrangement"].value_counts().values, color="lightgrey", edgecolor="black")
#plt.title("Barcode Arrangement - Guppy (Semi-Global)")
plt.xlabel("Barcode Classification")
plt.ylabel("Frequency")



In [None]:
# Set the aesthetic style of the plots
sns.set_theme(style="whitegrid")

# Create figure and axes
fig, ax = plt.subplots(figsize=(10, 5))

# Create the histogram plot
sns.histplot(data=summary_01, x="barcode_score", bins=100, kde=False, ax=ax, color="navy", alpha=0.5)
sns.histplot(data=summary_02, x="barcode_score", bins=100, kde=False, ax=ax, color="pink", alpha=0.9)
sns.histplot(data=summary_03, x="barcode_score", bins=100, kde=False, ax=ax, color="lightgreen", alpha=0.5)
sns.histplot(data=summary_04, x="barcode_score", bins=100, kde=False, ax=ax, color="red", alpha=0.5)

# Add custom legend with the correct colors
custom_lines = [plt.Line2D([0], [0], color="navy", lw=4),
                plt.Line2D([0], [0], color="pink", lw=4),
                plt.Line2D([0], [0], color="lightgreen", lw=4),
                plt.Line2D([0], [0], color="red", lw=4)]

ax.legend(custom_lines, ['10% ', '20%', "30%", "40%"])
# Add labels and title
ax.set_xlabel("Barcode Score")
ax.set_ylabel("Frequency")
#ax.set_title("Barcode Score Distribution")

# Add a red vertical line
#ax.axvline(x=60, color='red', linestyle='--')

# Show the plot
#plt.savefig("barcode_score_distribution_guppy.png", dpi=300)
plt.show()

In [None]:
summary = pd.concat([summary_01, summary_02, summary_03, summary_04])

In [None]:
bins = np.arange(0, 100, 5)
summary["bins"] = pd.cut(summary["barcode_score"], bins)
summary["Truth"] = summary["read_id"].apply(split_read_id)

In [None]:
summary["barcode_arrangement"].value_counts().reset_index()

In [None]:
def split_read_id(read_id):
    """
    Return the text after the last underscore of ``read_id`` (the barcode ID).

    Unlike the earlier definition in this notebook, a numeric suffix is returned
    as-is and is not mapped to "noise".
    """
    *_, barcode_id = read_id.split("_")
    return barcode_id


In [None]:
summary["Truth"] = summary["read_id"].apply(split_read_id)

In [None]:
correctly_classified_df = get_correctly_classified_barcodes(summary_01, barcode_truth_mapping)


In [None]:
summary_cpp = analyser.read_summary_file(Path('/home/emre/github_repo/MinION/examples/data/Demultiplex_cpp/50'))

In [None]:
summary_cpp["RBC"].value_counts().reset_index()

# SNS
fig, ax = plt.subplots(figsize=(10, 5))
sns.set_theme(style="whitegrid")
sns.barplot(x="RBC", y="count", data=summary_cpp["RBC"].value_counts().reset_index())
plt.title("Barcode Arrangement - Ours")
plt.xlabel("Barcode Classification")
plt.ylabel("Frequency")


In [None]:
# Set the aesthetic style of the plots
sns.set_theme(style="whitegrid")

# Create figure and axes
fig, ax = plt.subplots(figsize=(10, 5))

# Create the histogram plot
sns.histplot(data=summary_cpp, x="RBC_Score", bins=100, kde=True, ax=ax)

# Add labels and title
ax.set_xlabel("Barcode Score")
ax.set_ylabel("Frequency")
ax.set_title("Barcode Score Distribution")

# Add a red vertical line
ax.axvline(x=60, color='red', linestyle='--')

# Show the plot
plt.show()

# Alignment Simulation to predict the accuracy at different depths

- Start with HetCPIII (612 bp)
- Sample random mutations (Parent, Single Point Mutation, 2, 3 up to 8)
- Generate Substitutions & indels for a given site

In [None]:
def generate_variants(template, num_variants, num_mutations, seed):
    """
    Generate random variants of ``template`` with a fixed number of point mutations.

    Mutation positions are drawn without replacement; the replacement base is drawn from
    a per-reference-base row of the transition matrix below. All randomness comes from a
    RandomState seeded with ``seed``, so equal arguments give equal output.

    Args:
    template (str): Parent sequence; every mutated base must be one of A/C/G/T (upper case).
    num_variants (int): Number of variants (rows) to generate.
    num_mutations (int): Mutations per variant; 0 yields copies of the parent.
    seed (int): Seed for the random generator.

    Returns:
    pd.DataFrame: Columns "Variant" (list of labels such as "A12G", 1-based positions,
    or ["#PARENT#"]) and "Sequence" (the mutated sequence).

    Raises:
    ValueError: If a selected base is not in A/C/G/T, or if ``num_mutations`` exceeds
    the template length.
    """
    # Rows: reference base, columns: new base (A, C, G, T, DEL); rows are normalised
    # before use. The DEL column is all zero, so deletions are currently never drawn.
    trans_matrix = np.array([   [0,    0.01,  0.46,  0.18, 0.0], #A
                                [0.02, 0,     0.025, 0.43, 0.0], #C
                                [0.43, 0.025, 0,     0.02, 0.0], #G
                                [0.18, 0.0335,0.1,   0,    0.0], #T
                                [0.0,  0.0,   0.0,   0.0,  0]]) #DEL

    bases = ["A", "C", "G", "T", "DEL"]

    Variants = {"Variant": [], "Sequence": []}

    rng = np.random.RandomState(seed)  # Single source of randomness for this call

    for _ in range(num_variants):
        if num_mutations == 0:
            Variants["Variant"].append(["#PARENT#"])
            Variants["Sequence"].append(template)
            continue

        positions = rng.choice(range(len(template)), num_mutations, replace=False)
        positions.sort()

        variant = []
        mutated_sequence = list(template)
        for pos in positions:
            ref_base = template[pos]
            prob = trans_matrix[bases.index(ref_base)]
            prob = prob / prob.sum()

            # Draw from the seeded generator (the global np.random was used before,
            # which made the result ignore `seed`).
            new_base = str(rng.choice(bases, p=prob))

            variant.append(f"{ref_base}{pos + 1}{new_base}")
            # A deletion removes the base, anything else replaces it.
            mutated_sequence[pos] = "" if new_base == "DEL" else new_base

        Variants["Variant"].append(variant)
        Variants["Sequence"].append("".join(mutated_sequence))

    return pd.DataFrame(Variants)

def select_random_reads(input_file, output_file, num_reads):
    """
    Copy ``num_reads`` randomly chosen FASTA records from ``input_file`` to ``output_file``.

    If more reads are requested than the file contains, a message is printed and all
    records are written. Sampling uses the module-level ``random`` generator.
    """
    records = list(SeqIO.parse(input_file, "fasta"))

    if num_reads <= len(records):
        chosen = random.sample(records, num_reads)
    else:
        print("Requested number of reads is more than available in the file. Selecting all reads.")
        chosen = records

    with open(output_file, "w") as out_handle:
        SeqIO.write(chosen, out_handle, "fasta")

def run_alignment_and_indexing_sim(ref, input_file, output_dir, site_saturation = False, alignment_name = "alignment_minimap_Q10"):
    """
    Aligns reads using minimap2, converts to BAM, sorts and indexes the BAM file.

    Args:
    ref (str): Path to the reference file.
    input_file (Path or str): Path to the file containing the reads.
    output_dir (Path or str): Directory to store output files.
    site_saturation (bool): If True, use the site-saturation scoring (-A 4 -B 2 -O 10,24)
        and write to "alignment_minimap_site_saturation" regardless of ``alignment_name``;
        otherwise use -A 2 -B 4 -O 4,24.
    alignment_name (str): Base name (without extension) of the SAM/BAM output files.

    Returns:
    None

    Raises:
    FileNotFoundError: If ``input_file`` does not exist.

    Note:
    The external tools run through the shell with their output discarded and their exit
    codes ignored, so a failing minimap2/samtools call is silent. Paths are interpolated
    into the command line unquoted and therefore must not contain spaces or shell
    metacharacters.
    """
    output_dir = Path(output_dir)
    reads_file = Path(input_file)

    # A Path object is always truthy, so the existence of the file has to be checked
    # explicitly for this guard to ever fire.
    if not reads_file.exists():
        raise FileNotFoundError(f"Input reads file not found: {reads_file}")

    print(reads_file)

    print("Running minimap2...")
    if site_saturation:
        alignment_name = "alignment_minimap_site_saturation"
        scoring = "-A 4 -B 2 -O 10,24"
    else:
        scoring = "-A 2 -B 4 -O 4,24"

    prefix = f"{output_dir}/{alignment_name}"
    commands = [
        # Align the reads against the reference (SAM output).
        f"minimap2 -ax map-ont {scoring} {ref} {reads_file} > {prefix}.sam",
        # SAM -> BAM
        f"samtools view -bS {prefix}.sam > {prefix}.bam",
        # NOTE(review): sorts the BAM onto its own path; consider a separate output name.
        f"samtools sort {prefix}.bam -o {prefix}.bam",
        f"samtools index {prefix}.bam",
    ]

    for command in commands:
        subprocess.run(command, shell=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)




def adjust_variant(variant, padding_start):
    """
    Shift the positions in a variant string back by ``padding_start``.

    ``variant`` is an underscore-joined list of labels of the form
    <letters><position><letters>. "#PARENT#" and "NA" are passed through.
    Shifted positions are clamped to a minimum of 1; labels that do not match
    the pattern are dropped.
    """
    if "#PARENT#" in variant:
        return "#PARENT#"

    if variant == "NA":
        return "NA"

    shifted = []
    for label in variant.split('_'):
        # <reference letters><position digits><new letters>
        hit = re.search(r'([A-Za-z]+)(\d+)([A-Za-z]+)', label)
        if not hit:
            continue

        ref_part, position, new_part = hit.groups()
        new_position = max(int(position) - padding_start, 1)
        shifted.append(f"{ref_part}{new_position}{new_part}")

    return '_'.join(shifted)



In [None]:
seed = 42

template = IO_processor.read_fasta_file(Path("/home/emre/github_repo/MinION/minION/refseq/hetcpiii.fasta"))["Sequence"][0]

Variants = pd.DataFrame({"Variant" : [], "Sequence" : [], "Num_Mutations" : []})


for n_mut in range(0,11):
    var = generate_variants(template, 100, n_mut, seed)
    var["Num_Mutations"] = n_mut
    Variants = pd.concat([Variants, var]).reset_index(drop=True)

Variants.to_pickle("Variants_100_p_s.pkl")

### Simulate mutation prediction of AF analysis and medaka consensus for different # of mutations

In [None]:
Variants = pd.read_pickle("Variants_100_p_s.pkl")
# Select mutation with 0, 1, 5, 10
# variants = Variants[Variants["Num_Mutations"].isin([0, 1, 5, 10])].reset_index(drop=True)


In [None]:
Variants.shape

### Run Simulation

- Enter the depths to analyze
- Padding length
- Num Mutations

In [None]:
from tqdm import tqdm

# Generate substitution and indels for each variant
template = IO_processor.read_fasta_file(Path("/home/emre/github_repo/MinION/minION/refseq/hetcpiii_padded.fasta"))["Sequence"][0]
padding = 50
os.makedirs(f"data/min_read_depth/seq", exist_ok=True)

ref = "/home/emre/github_repo/MinION/minION/refseq/hetcpiii_padded.fasta"
reference = "HetCPIII"

results = {"Original Variant" : [], "Predicted Variant" : [], "Depth" : [], "Num Mutations" : [], "Correct" : [], "Alignment Frequency" : []}

np.random.seed(43)
random.seed(43)

# For every variant: simulate max_depth noisy reads, then for each depth subsample
# that many reads into their own folder and align them against the padded reference.
for i, variant in tqdm(Variants.iterrows()):

    max_depth = 50
    
    if "#PARENT#" in variant["Variant"]:
        var_name = f"wt_{i}"

    else:
        # Folder name is the mutation labels joined with "_" (no sorting is done here).
        # Variants with identical labels therefore share - and overwrite - one folder.
        var_name = "_".join(variant["Variant"])

    # Create folder for each variant
    os.makedirs(f"data/min_read_depth/seq/{var_name}", exist_ok=True)

    # Lower-case flanks added on both sides of every simulated read.
    # NOTE(review): presumably the same padding as in hetcpiii_padded.fasta - confirm.
    padding_seq1 = "aattcccctctagaaataattttgtttaactttaagaaggagatatacat"
    padding_seq2 = "gatccggctgctaacaaagcccgaaaggaagctgagttggctgctgccac"

    with open(f"data/min_read_depth/seq/{var_name}/{var_name}_Q10.fasta", "w") as handle: # Create fasta file
        for j in range(max_depth):
            # Mutation rate 0.1 is applied to the variant only; the flanks stay unmutated.
            new_seq = introduce_mutations(variant["Sequence"], 0.1)

            new_seq = padding_seq1 + new_seq + padding_seq2

            handle.write(f">{var_name}_{j+1}\n")
            handle.write(f"{new_seq}\n")

    depths = [1,3,5,7,9,11,13,15,20,25,30,35,40,45,50]

    for depth in depths:

        #Create depth folder
        depth_folder_path = f"data/min_read_depth/seq/{var_name}/depth_{depth}"
        os.makedirs(depth_folder_path, exist_ok=True)

        # Draw `depth` random reads from the variant's FASTA file into the depth folder
        # (each depth is sampled independently from the full set of reads).
        select_random_reads(f"data/min_read_depth/seq/{var_name}/{var_name}_Q10.fasta", f"{depth_folder_path}/{var_name}_Q10_reads.fasta", depth)

        # prompt = f'mini_align -r {ref} -i {depth_folder_path}/*.fasta -t 1 -m -p alignment && mv *.bam *.bam.bai {depth_folder_path}'
        # subprocess.run(prompt, shell=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
        print(depth_folder_path)
        # Run alignment and indexing
        run_alignment_and_indexing_sim(ref, f"{depth_folder_path}/{var_name}_Q10_reads.fasta", depth_folder_path, site_saturation = False , alignment_name="alignment_minimap_Q10")
        #bam_file = f"{depth_folder_path}/alignment_minimap.bam"

        # Call Variants with bam file
        # if method == "AF":

        #     variant_pred = analyser.call_variant_pop_frequency(Path(bam_file), template, reference, min_freq=0.1, min_depth= 0, padding_start=padding, padding_end= padding + 1)
            
        #     try:
        #         variant_pred = pd.DataFrame(variant_pred).sort_values("Alignment Frequency", ascending=False).reset_index(drop=True)
        #         variant_pred["Variant"] = variant_pred["Variant"].apply(lambda x: adjust_variant(x, padding))
        #         result = 1 if variant_pred["Variant"][0] == "_".join(variant["Variant"]) else 0

        #         results["Original Variant"].append(variant["Variant"])
        #         results["Predicted Variant"].append(variant_pred["Variant"][0])
        #         results["Depth"].append(depth)
        #         results["Num Mutations"].append(len(variant["Variant"]) if "#PARENT#" not in variant["Variant"] else 0)
        #         results["Correct"].append(result)
        #         results["Alignment Frequency"].append(variant_pred["Alignment Frequency"][0])
        #     except:
        #         result = "NA"
        #         results["Original Variant"].append(variant["Variant"])
        #         results["Predicted Variant"].append("NA")
        #         results["Depth"].append(depth)
        #         results["Num Mutations"].append(len(variant["Variant"]))
        #         results["Correct"].append(result)
        #         results["Alignment Frequency"].append("NA")
        
        # elif method == "guppy":
        
 
            
            
# Delete seq folder

# if os.path.exists("data/min_read_depth/seq"):
#     shutil.rmtree("data/min_read_depth/seq")


#results = pd.DataFrame(results)

#results.to_csv("data/min_read_depth/results_100_p_s_Q10.csv", index=False)
#results.to_pickle("data/min_read_depth/results_100_p_s_Q20.pkl")


In [None]:
# Build a consensus for every simulated read-depth folder of every variant.
import glob
from tqdm import tqdm
ref_seq = Path("/home/emre/github_repo/MinION/minION/refseq/hetcpiii_padded.fasta")
# One entry per item found under data/min_read_depth/seq.
folders = glob.glob("data/min_read_depth/seq/*")

for var_path in tqdm(folders):
    # NOTE(review): `var_namr` (sic) is assigned but never read in this cell.
    var_namr = os.path.basename(var_path)
    depths = glob.glob(f"{var_path}/depth*")
    for depth in depths:
        
        folder_path = Path(depth)
        print("Processing", folder_path)
        # The depth folder is passed both as input and as consensus_folder.
        consensus.get_consensus(folder_path, ref_seq, output_name = "consensus.fastq", qualities = True, consensus_folder = folder_path)


In [None]:
### Get variant df



# Speed analysis
- Generate synthetic sequences and alignments, then call variants with BF, AF and medaka


In [None]:
# Walk every variant/depth folder and print its path.
# NOTE(review): this loop only prints; no processing step is attached yet — confirm whether it is a stub.
import glob
from tqdm import tqdm
ref_seq = Path("/home/emre/github_repo/MinION/minION/refseq/hetcpiii_padded.fasta")
folders = glob.glob("data/min_read_depth/seq/*")

for var_path in tqdm(folders):
    # NOTE(review): `var_namr` (sic) is assigned but never read in this cell.
    var_namr = os.path.basename(var_path)
    depths = glob.glob(f"{var_path}/depth*")
    for depth in depths:
        
        folder_path = Path(depth)
        print("Processing", folder_path)

In [None]:
# Persist the collected results as a pickled DataFrame.
pd.DataFrame(results).to_pickle("data/min_read_depth/results_100_p_s_Q20_local.pkl")

In [None]:
# Turn the collected results into a DataFrame for analysis.
df = pd.DataFrame(results)
#df.to_csv("data/min_read_depth/results_20_p_s_01.csv", index=False)
#df.drop(df[df["Correct"] == "NA"].index, inplace=True)
# Value counts of the last 30 rows; the result is not assigned to anything.
df.tail(30)["Original Variant"].value_counts()
# Remove rows whose "Correct" entry is the string "NA" and renumber the index from 0.
df = df.drop(df[df["Correct"] == "NA"].index).reset_index(drop=True)

In [None]:
import re


def get_DEL_samples(entry):
    """Flag whether any variant label in `entry` contains a deletion.

    Returns 1 if at least one element of `entry` contains the substring
    "DEL", otherwise 0 (including for an empty `entry`).
    """
    return 1 if any("DEL" in label for label in entry) else 0

def check_neg_DEL_position(row):
    """Report whether a deletion was predicted within one position of the truth.

    `row['Original Variant']` is the sequence of true variant labels and
    `row['Predicted Variant']` is a "_"-joined string of predicted labels.
    Labels are compared pairwise by index. Returns 1 as soon as an original
    label containing "DEL" is paired with a predicted label whose position
    differs by at most 1, otherwise 0. Rows for the parent ("#PARENT#")
    always return 0.

    Note that only the position of the paired predicted label is checked,
    not whether that label is itself a deletion.
    """
    truth = row['Original Variant']
    predicted = row['Predicted Variant'].split("_")

    # The parent sequence carries no mutations, so there is nothing to check.
    if "#PARENT#" in truth:
        return 0

    label_pattern = r'([A-Za-z]+)(\d+)([A-Za-z]+)'

    # zip stops at the shorter of the two label lists.
    for true_label, pred_label in zip(truth, predicted):
        if "DEL" not in true_label:
            continue

        true_hit = re.search(label_pattern, true_label)
        pred_hit = re.search(label_pattern, pred_label)
        if not (true_hit and pred_hit):
            continue

        # Accept the same position or an immediate neighbour.
        if abs(int(true_hit.group(2)) - int(pred_hit.group(2))) <= 1:
            return 1

    return 0
        
        # Additional processing can be added here as needed
# Re-score deletions that were called at the true position or one base next to it.
df_wo_del = df.copy()
df_wo_del["DEL"] = df_wo_del["Original Variant"].apply(get_DEL_samples)
df_wo_del["Corr DEL"] = df_wo_del.apply(check_neg_DEL_position, axis=1)
# Rows with a true deletion that was placed within one position but scored as wrong.
near_miss = (df_wo_del["DEL"] == 1) & (df_wo_del["Corr DEL"] == 1) & (df_wo_del["Correct"] == 0)
mask = df_wo_del[near_miss].index
# Address the rows by label and the column by name: `mask` holds index labels,
# and a hard-coded column position silently breaks if the column order changes.
df_wo_del.loc[mask, "Correct"] = 1

In [None]:
# Show the rows still scored as incorrect at depth 50.
df_wo_del[(df_wo_del["Correct"] == 0) & (df_wo_del["Depth"] == 50)]

In [None]:
# Mean and standard deviation of "Correct" per (Num Mutations, Depth) on the re-scored frame.
grouped_stats_wo_del = df_wo_del.groupby(['Num Mutations', 'Depth'])['Correct'].agg(['mean', 'std']).reset_index()
grouped_stats_wo_del

In [None]:
# Mean and standard deviation of "Correct" per (Num Mutations, Depth) on `df`.
grouped_stats = df.groupby(['Num Mutations', 'Depth'])['Correct'].agg(['mean', 'std']).reset_index()
grouped_stats

In [None]:
# Add a 0/1 "DEL" flag for entries whose original variant contains a deletion.
# NOTE(review): other cells wrap `results` in pd.DataFrame(...), which suggests it is a
# dict of lists; `.apply` only exists if `results` is a DataFrame here — confirm.
results["DEL"] = results["Original Variant"].apply(get_DEL_samples)


In [None]:
# Plot accuracy against read depth, one line per number of mutations.
# NOTE(review): `linregress` is imported but not used in this cell.
from scipy.stats import linregress

#df = pd.DataFrame(results)

#df = df_wo_del

# Drop rows whose "Correct" entry is the string "NA" (modifies df in place)
df.drop(df[df["Correct"] == "NA"].index, inplace=True)

df = df.sort_values("Num Mutations")
#df['Num Mutations'] = df['Num Mutations'].astype(str)
# Cast to float so the column can be averaged when plotted.
df['Correct'] = df['Correct'].astype(float)



# Find unique values in 'Num Mutations' and sort them
unique_mutations = sorted(df['Num Mutations'].unique())

# Create a color palette with a color for each unique 'Num Mutations' value
palette = sns.color_palette("viridis", n_colors=len(unique_mutations))

# Map each unique 'Num Mutations' value to a color
color_map = dict(zip(unique_mutations, palette))

# Plotting
plt.figure(figsize=(10, 6))

# Use the lineplot with the custom palette.
# NOTE(review): seaborn presumably aggregates repeated Depth values with its default
# estimator (mean), which would make the y value the fraction of correct calls — confirm.
sns.lineplot(data=df, x="Depth", y="Correct", hue="Num Mutations", style="Num Mutations", 
             markers=True, dashes=False, palette=color_map, linewidth=2, markersize=10)

# Customization
plt.xlabel("Read Depth", size=16)
plt.ylabel("Accuracy", size=16)
plt.xticks(size=14)
plt.yticks(size=14)
# You can uncomment this if you want a legend
# plt.legend(title="# Mutations", title_fontsize=14, fontsize=14)

# Written to the current working directory.
plt.savefig("depth_vs_correct_Q20.png", dpi=300)
plt.show()

In [None]:
grouped_stats

In [None]:
# Plot the mean alignment frequency for each number of mutations.
# The column is selected before aggregating so non-numeric columns are never
# averaged, and the x values come from the grouped index so every bar is
# guaranteed to sit at the mutation count it was computed for.
mean_alignment_freq = df.groupby("Num Mutations")["Alignment Frequency"].mean()
plt.bar(mean_alignment_freq.index, mean_alignment_freq)

In [None]:
# Mean of "Correct" per number of mutations. Selecting the column first avoids
# averaging the non-numeric columns of df.
df.groupby("Num Mutations")["Correct"].mean()

In [None]:
# Build one synthetic variant sequence from the unpadded reference.
template = IO_processor.read_fasta_file(Path("/home/emre/github_repo/MinION/minION/refseq/hetcpiii.fasta"))["Sequence"][0]


Variants = {"Variant" : [], "Sequence" : []}

var = ["A62T", "A224DEL", "A317T", "T348A", "A596G"]
template = list(template)
# Blank out the base at every listed position (index pos - 1, i.e. labels are treated as 1-based).
# NOTE(review): substitutions such as "A62T" are removed as well rather than replaced by
# newAA, so the sequence ends up len(var) bases shorter — confirm this is intended.
for v in var:
    match = re.search(r'([A-Za-z]+)(\d+)([A-Za-z]+)', v)
    if match:
        refAA, pos, newAA = match.groups()
        template[int(pos) - 1] = ""
Variants["Variant"].append(var)
Variants["Sequence"].append("".join(template))
# Length of the edited sequence.
print(len(Variants["Sequence"][0]))

In [None]:
# Inspect one simulated alignment against the padded reference.
template = IO_processor.read_fasta_file(Path("/home/emre/github_repo/MinION/minION/refseq/hetcpiii_padded.fasta"))["Sequence"][0]
padding_start = 50
padding_end = 50
bam_file = "/home/emre/github_repo/MinION/examples/data/min_read_depth/seq/G22A_C54A_C93T_A105G_G178A_A335T_T519C_C607DEL/depth_50/alignment_minimap.bam"
# Alignment count as printed by `samtools view -c`.
alignment_count = int(subprocess.run(f"samtools view -c {bam_file}", shell=True, capture_output=True).stdout.decode("utf-8").strip())
# Positions between the two padded ends.
range_positions = range(padding_start, len(template) - padding_end)
# NOTE(review): `reference` is not assigned in this cell; it has to exist already.
freq_dist = pd.DataFrame(analyser.get_highest_non_ref_base_freq_2(bam_file, reference, range_positions, template, qualities=False)[0]).T.rename(columns={0:"Base", 1:"Frequency"})
nb_positions = analyser.get_nb_positions(freq_dist, 0.4)
freq_df = analyser.get_pop_frequency(bam_file, template, reference, nb_positions, min_freq=0.1, min_depth= 0)
bases_df = analyser.get_bases_from_pileup(bam_file, reference, [655,656,657,658])
bases_df

In [None]:
# Call variants for a single simulated sample (A393G, depth 50).
# NOTE(review): `template` and `reference` must already be defined by an earlier cell.
bam_file = "/home/emre/github_repo/MinION/examples/data/min_read_depth/seq/A393G/depth_50/alignment.bam"
variant_pred = analyser.call_variant_pop_frequency(bam_file, template, reference, min_freq=0.1, min_depth= 1)

In [None]:
variant_pred

In [None]:
# Alignment count as printed by `samtools view -c` for the current bam_file.
alignment_count = int(subprocess.run(f"samtools view -c {bam_file}", shell=True, capture_output=True).stdout.decode("utf-8").strip())
# Table with "Base" and "Frequency" columns, queried over range(1, len(template)).
freq_dist = pd.DataFrame(analyser.get_highest_non_ref_base_freq_2(bam_file, reference, range(1,len(template)), template, qualities=False)[0]).T.rename(columns={0:"Base", 1:"Frequency"})

nb_positions = analyser.get_nb_positions(freq_dist, 0.4)

In [None]:
# Population frequencies at the positions selected in nb_positions.
analyser.get_pop_frequency(bam_file, template, reference, nb_positions, min_freq=0.1, min_depth= 0)

In [None]:
# Predicted variants ordered from highest to lowest alignment frequency.
pd.DataFrame(variant_pred).sort_values("Alignment Frequency", ascending=False).reset_index(drop=True)

In [None]:
# Hand-picked variant combinations to look at individually.
Variant_manual = {"variant" : [["G23A", "C336T", "C587DEL"], ["C283DEL", "C387T", "A478G"], ["G3A", "G123A", "G229A", "C442T"] ]}

In [None]:
# Keep only the hand-picked variants.
# NOTE(review): this needs `Variants` to be a DataFrame; another cell in this notebook
# builds `Variants` as a plain dict — confirm which definition is live here.
Variants_manual = Variants[Variants["Variant"].isin(Variant_manual["variant"])]

In [None]:

# Run `medaka consensus` on one alignment, writing pre_consensus.hdf next to it.
bam_file = "/home/emre/github_repo/MinION/examples/data/min_read_depth/seq/G3A_G123A_G229A_C442T/depth_50/alignment.bam"
prompt = f"medaka consensus {bam_file} /home/emre/github_repo/MinION/examples/data/min_read_depth/seq/G3A_G123A_G229A_C442T/depth_50/pre_consensus.hdf --batch 200 --threads 4"
# stdout/stderr are discarded and the return code is not checked, so failures are silent.
subprocess.run(prompt, shell=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)

In [None]:
# Run `medaka stitch` on pre_consensus.hdf, writing result.fasta.
# NOTE(review): `ref` must already be defined when this cell runs; in this notebook it
# is assigned in a later cell — confirm the intended execution order.
prompt = f"medaka stitch /home/emre/github_repo/MinION/examples/data/min_read_depth/seq/G3A_G123A_G229A_C442T/depth_50/pre_consensus.hdf {ref} /home/emre/github_repo/MinION/examples/data/min_read_depth/seq/G3A_G123A_G229A_C442T/depth_50/result.fasta --threads 4"

# stdout/stderr are discarded and the return code is not checked, so failures are silent.
subprocess.run(prompt, shell=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)


In [None]:
# Align the reference and the stitched consensus with bioaligner, writing a SAM file.

ref = "/home/emre/github_repo/MinION/minION/refseq/hetcpiii.fasta"
result = "/home/emre/github_repo/MinION/examples/data/min_read_depth/seq/G3A_G123A_G229A_C442T/depth_50/result.fasta"

prompt = f"bioaligner align {ref} {result} --outfmt sam --out /home/emre/github_repo/MinION/examples/data/min_read_depth/seq/G3A_G123A_G229A_C442T/depth_50/alignment.sam"

# stdout/stderr are discarded and the return code is not checked, so failures are silent.
subprocess.run(prompt, shell=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)

In [None]:
# Padding the reference sequence with 50 Ns at the start and end
# (kept commented out; only the indexing step below is executed)

# with open("/home/emre/github_repo/MinION/minION/refseq/hetcpiii.fasta", "r") as handle:
#     records = list(SeqIO.parse(handle, "fasta"))
#     template = str(records[0].seq)

# template = "N" * 50 + template + "N" * 50

# with open("/home/emre/github_repo/MinION/minION/refseq/hetcpiii_padded.fasta", "w") as handle:
#     handle.write(f">{records[0].id}\n")
#     handle.write(f"{template}\n")

# Reindexing the reference sequence with `samtools faidx`
prompt = f"samtools faidx /home/emre/github_repo/MinION/minION/refseq/hetcpiii_padded.fasta"
# stdout/stderr are discarded and the return code is not checked, so failures are silent.
subprocess.run(prompt, shell=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)

### Alignment Frequency distribution across different quality scores

In [None]:
# Load the simulation results (Q15 run).
Variant_df = pd.read_csv("/home/emre/github_repo/MinION/examples/data/min_read_depth/results_100_p_s_Q15.csv")
# Drop every row that contains a NaN
Variant_df.dropna(inplace=True) # NA can occur if the pileup analysis fails. For the sake of the simulation, we dropped these rows. (3 Variants out of 1100)


In [None]:
Variant_df

In [None]:
def edit_alignment_freq(entry):
    """Return the row's "Alignment Frequency", mapping the "-" placeholder to 0."""
    frequency = entry["Alignment Frequency"]
    return 0 if frequency == "-" else frequency

# Replace the "-" placeholder in "Alignment Frequency" with 0, row by row.
Variant_df["Alignment Frequency"] = Variant_df.apply(edit_alignment_freq, axis=1)

In [None]:
# Bar plot of the mean alignment frequency (error bars: standard deviation)
# per number of mutations, for variants with at least one mutation.
# Work on an explicit copy: assigning a column on a filtered slice of
# Variant_df is a chained assignment and triggers SettingWithCopyWarning.
df = Variant_df[Variant_df["Num Mutations"] > 0].copy()
df["Alignment Frequency"] = df["Alignment Frequency"].astype(float)

# Sort numerically while "Num Mutations" is still numeric
df = df.sort_values("Num Mutations")

# Group by and calculate mean and standard deviation
group_stats = df.groupby("Num Mutations")["Alignment Frequency"].agg(['mean', 'std'])

# Now convert "Num Mutations" to string for plotting, after grouping and calculations
group_stats.index = group_stats.index.astype(str)

# Plotting
fig, ax = plt.subplots(figsize=(10, 5))
ax.bar(group_stats.index, group_stats["mean"], yerr=group_stats["std"], color = "lightgrey", edgecolor = "black", capsize=5)
plt.tick_params(size=14, labelsize=14)

# Improving the plot aesthetics
plt.xlabel('# of Mutations', size=18)
plt.ylabel('Alignment Frequency', size=18)
plt.tight_layout()  # Adjust layout to fit everything nicely
# NOTE(review): the file name says Q10 while this notebook loads results_100_p_s_Q15.csv
# into Variant_df — confirm the name matches the data being plotted.
plt.savefig("data/min_read_depth/alignment_frequency_vs_num_mutations_Q10.png", dpi=300)
plt.show()

### Speed Test

- Randomly select 50 variant folders from min_read_depth and measure the time needed to process their depth-50 alignments

In [13]:
# Collect the depth-50 alignments of a reproducible random subset of variants.

folder_path = "/home/emre/github_repo/MinION/examples/data/min_read_depth/"

files_to_run = []

# glob returns paths in arbitrary order; sort so the sample below depends only on the seed.
variants = sorted(glob.glob(f"{folder_path}seq/*"))

seed = 42

# Randomly select 50 variants. A dedicated, seeded generator is used so the
# selection is repeatable (the seed was previously assigned but never applied)
# without touching the global random state.
rng = random.Random(seed)
variants = rng.sample(variants, 50)

for variant in variants:
    # Get depth 50 from each variant
    depth_50_bam = f"{variant}/depth_50/alignment_minimap.bam"
    files_to_run.append(depth_50_bam)







### Run Bayes AF analysis

In [15]:
from minION.analyser_bayes_AF import *
import time 

In [16]:
# Accumulates one (method name, elapsed seconds) pair per timed method.
Time_Analysis = {"Method" : [], "Time" : []}

In [17]:
# Time the soft-probability variant caller over all selected alignments.
template_seq = Path("/home/emre/github_repo/MinION/minION/refseq/hetcpiii_padded.fasta")
ref_name = "HetCPIII"


# Measure time with a monotonic, high-resolution clock; time.time() follows the
# wall clock and can jump if the system time is adjusted during the run.
start = time.perf_counter()
for bam_file in files_to_run:
    get_variant_soft(bam_file, template_seq, ref_name, padding = 50)
end = time.perf_counter()

Time_Analysis["Method"].append("Soft Probability")
Time_Analysis["Time"].append(end-start)

### Base Frequency Only

In [19]:
# Time the base-frequency-only variant caller over all selected alignments.
padding = 50
template = analyser.get_template_sequence(template_seq)

# Measure time with a monotonic, high-resolution clock; time.time() follows the
# wall clock and can jump if the system time is adjusted during the run.
start = time.perf_counter()
for bam_file in files_to_run:
    call_variant_BF(bam_file, "HetCPIII", range(padding, len(template) - padding + 1), template, qualities=False)
end = time.perf_counter()

Time_Analysis["Method"].append("Base Frequency Only")
Time_Analysis["Time"].append(end-start)

### Medaka Consensus

In [33]:
# Time the medaka pipeline (consensus -> stitch -> variant call) over all selected alignments.
# Every iteration writes to the same files in cons_folder, overwriting the previous run.
cons_folder = "/home/emre/github_repo/MinION/examples/data/cons_tmp_folder"
ref = template_seq
# Measure time with a monotonic, high-resolution clock; time.time() follows the
# wall clock and can jump if the system time is adjusted during the run.
start = time.perf_counter()
for bam_file in files_to_run:
    # Output is discarded and return codes are not checked, so a failed medaka
    # run is still counted in the measured time.
    prompt = f"medaka consensus {bam_file} {cons_folder}/pre_consensus_Q10.hdf --batch 200 --threads 4"
    subprocess.run(prompt, shell=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)

    prompt = f"medaka stitch {cons_folder}/pre_consensus_Q10.hdf {ref} {cons_folder}/result.fasta --threads 4"
    subprocess.run(prompt, shell=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)

    analyser.call_variant_nn(template, cons_folder, ref_name, padding = 50)
end = time.perf_counter()

Time_Analysis["Method"].append("Medaka Consensus")
Time_Analysis["Time"].append(end-start)

KeyboardInterrupt: 

In [32]:
# Save the collected timings as CSV (without the index column).
pd.DataFrame(Time_Analysis).to_csv("/home/emre/github_repo/MinION/results/3_Simulations/Time_Analysis.csv", index=False)

In [29]:
# Express every method's runtime relative to the base-frequency-only method.
df = pd.DataFrame(Time_Analysis)

# Find the 'Time' value for 'Base Frequency Only'
# (first matching row; .iloc[0] raises IndexError if that method was never timed)
bf_time = df.loc[df['Method'] == 'Base Frequency Only', 'Time'].iloc[0]

# Normalize the 'Time' column relative to 'Base Frequency Only'
df['Normalized Time'] = df['Time'] / bf_time

In [34]:
# Run the medaka pipeline once on a single alignment (G160A, depth 45, Q10).
bam_file = "/home/emre/github_repo/MinION/examples/data/min_read_depth/seq/G160A/depth_45/alignment_minimap_Q10.bam"
prompt = f"medaka consensus {bam_file} {cons_folder}/pre_consensus_Q10.hdf --batch 200 --threads 4"
# stdout/stderr are discarded and the return code is not checked, so failures are silent.
subprocess.run(prompt, shell=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)

prompt = f"medaka stitch {cons_folder}/pre_consensus_Q10.hdf {ref} {cons_folder}/result.fasta --threads 4"
subprocess.run(prompt, shell=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)

# Last expression of the cell, so its return value is displayed.
analyser.call_variant_nn(template, cons_folder, ref_name, padding = 50)

{'Variant': 'NA', 'Position': 'NA', 'Quality-Score': 'NA'}