# Run MinION with Smith-Waterman alignment (ours) 

- This script runs guppy minion step by step. The actual run can be found in

In [2]:
# Import all packages

import sys
sys.path.append("/home/emre/github_repo/MinION")

from minION.util import IO_processor
from minION import analyser
from minION import consensus
from minION import demultiplexer
from minION.util import globals

import pickle

import os
from pathlib import Path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import glob
import numpy as np
from Bio import SeqIO
import matplotlib.pyplot as plt
import gzip
import subprocess

In [None]:
# Run configuration shared by the steps below

# Name of the sequencing experiment (passed to IO_processor.create_folder)
experiment_name = "20231208-RL8plates-MinION"
# Base directory handed to create_folder as target_path
result_path = Path("/home/emre/")

# Guards: set a flag to True to skip that step when the notebook is re-run
skip_variant_calling = False
skip_demultiplex = False
skip_basecalling = True


### Step 1 (Optional): Basecall reads

- Basecalling can usually be done during sequencing (if a GPU is available).
- Otherwise, basecall the reads afterwards.

In [None]:
# Input reads: the fastq_pass directory of the run under the MinKNOW data folder.
# The pod5 directory of an earlier run is kept commented out for reference.
#file_to_pod5 = "/var/lib/minknow/data/20230905_errorprone-3_test/no_sample/20230905_1342_MN41105_flg114_5c170bfa/pod5/"
file_to_fastq = "/var/lib/minknow/data/20231130_RL-5sites-8plates_flongle/no_sample/20231130_1900_MN41105_FLG114_1904f884/fastq_pass/"

# Basecalling model type; also passed to create_folder below
basecall_model_type = "sup"
output_name = experiment_name

# Create (or reuse) the result folder for this experiment / model type
result_folder = IO_processor.create_folder(
    experiment_name,
    basecall_model_type,
    target_path=result_path,
)

# Folder where pod5 files are located
experiment_folder = IO_processor.find_experiment_folder(experiment_name)

# Destination for basecalled reads
basecall_folder = result_folder / "basecalled_filtered"
basecall_folder.mkdir(parents=True, exist_ok=True)

print(basecall_folder)

if not skip_basecalling:
    # Resolve the model only when basecalling actually runs; without this
    # assignment the run_dorado call below fails with a NameError.
    basecall_model = globals.DORADO_MODELS[basecall_model_type]
    pod5_files = IO_processor.find_folder(experiment_folder, "pod5")
    # NOTE(review): run_dorado is not imported by the import cell of this
    # notebook - confirm which minION module provides it and import it there.
    run_dorado(basecall_model, pod5_files, basecall_folder, fastq=True)

### Step 2: Demultiplex with SW
- Demultiplex with SW 

In [None]:
if not skip_demultiplex:
    # Demultiplexer executable
    path_to_code = "/home/emre/tutorials/sequence_tut/source/test_main"
    # Arguments are passed as a list (no shell) so that paths containing
    # spaces or shell metacharacters reach the program unchanged.
    # NOTE(review): the meaning of the two trailing "100" arguments is defined
    # by test_main - confirm against its source.
    command = [path_to_code, str(file_to_fastq), str(result_folder), "100", "100"]
    # check=True raises CalledProcessError on a non-zero exit status instead
    # of silently continuing with missing or partial demultiplex output.
    subprocess.run(command, check=True)

In [None]:
# Demultiplex folder inside the result folder, used by the steps below
demultiplex_folder = result_folder.joinpath("Demultiplex_cpp_70")
print(demultiplex_folder)

### Step 3: Call Variant with PileUP Analysis

- Call variants with a minimum frequency of 0.4 and a minimum depth of 15

In [None]:
# Inputs for variant calling: demultiplex folder and reference sequence
demultiplex_folder = result_folder / "Demultiplex_cpp_70"
ref_name = "PgA9"
ref_seq = Path("/home/emre/PgA9_padded.fasta")

# Barcode lookup built from the demultiplex folder with the "NB" / "RB" prefixes
barcode_dict = IO_processor.get_barcode_dict(demultiplex_folder, "NB", "RB")
print(demultiplex_folder)

if not skip_variant_calling:
    # Alternative callers, kept for reference:
    #variants = analyser.get_variant_df_AF(demultiplex_folder, ref_seq, ref_name, barcode_dict, merge = True, min_freq=0.4, min_depth= 10, padding=0)
    #variants = analyser.get_variant_df_AF_parallel(demultiplex_folder, ref_seq, ref_name, barcode_dict, merge = False, min_freq=0.4, min_depth= 15, num_jobs=16)
    variants = analyser.get_variant_df_custom(
        demultiplex_folder,
        ref_seq,
        barcode_dict,
        merge=True,
        padding=0,
        min_depth=5,
    )


In [None]:
# Keep only the rows whose plate number is above 4
variants = variants[variants["Plate"] > 4]

# Persist the filtered table
pd.DataFrame(variants).to_pickle("/home/emre/github_repo/MinION/results/2_hetcpiii_minion_errorprone/local/variants_SW_1Mio.pkl")

# Preview the last rows. A notebook renders only the final expression of a
# cell, so the preview has to be the last statement to be shown.
variants.tail(30)

In [None]:
# Report wells whose variant record is missing or incomplete.
#
# `variants` may be a dict of lists or a DataFrame. Every column is copied
# into a plain list first so that `i` is always a position: indexing a
# DataFrame column with `[i]` is a label lookup, which raises KeyError (or
# reads the wrong row) once rows have been filtered out.
rbc_values = list(variants["RBC"])
fbc_values = list(variants["FBC"])
detail_columns = [
    list(variants[name])
    for name in ("Position", "Variant", "Alignment Count", "Alignment Frequency")
]

# Iterate over the RBC column, which is expected to be the longest list
for i, rbc in enumerate(rbc_values):
    fbc = fbc_values[i]

    # A detail column shorter than the RBC column means data is missing
    if any(i >= len(column) for column in detail_columns):
        print(f"Missing data for RBC: {rbc}, FBC: {fbc}")
        continue

    # An "NA" entry in any detail column marks an incomplete record
    if any(column[i] == "NA" for column in detail_columns):
        print(f"Incomplete data for RBC: {rbc}, FBC: {fbc}")


In [None]:
# Call variants for a single well (one BAM file) against the HetCPIII template
bam_file = "/home/emre/minION_results/MinION_RBC_0902723_sup/Demultiplex_cpp_70_200k_reads/RB02/NB70/alignment_minimap.bam"
template = Path("/home/emre/github_repo/MinION/minION/refseq/hetcpiii_padded.fasta")
template_seq = analyser.get_template_sequence(template)
ref_name = "HetCPIII"

# Thresholds passed to the caller (also reused by the next cell)
min_freq = 0.4
min_depth = 15

nn_variants = analyser.call_variant_pop_frequency(
    bam_file,
    template_seq,
    ref_name,
    min_freq,
    min_depth,
    padding_start=50,
    padding_end=51,
)

# Print each called variant with its index, rather than dumping the whole
# result object once per entry.
for i, variant in enumerate(nn_variants["Variant"]):
    print(i, variant)

In [None]:
# Number of alignments in the BAM file, as reported by `samtools view -c`.
# The command is passed as a list (no shell) and check=True makes a samtools
# failure raise CalledProcessError instead of an opaque int('') ValueError.
count_result = subprocess.run(
    ["samtools", "view", "-c", bam_file],
    capture_output=True,
    check=True,
)
alignment_count = int(count_result.stdout.decode("utf-8").strip())

# Positions to scan: skip the first and last 50 positions of the template
range_positions = range(50, len(template_seq) - 50)

# First element of the helper's result, transposed into a Base / Frequency table
base_frequencies = analyser.get_highest_non_ref_base_freq_2(
    bam_file,
    ref_name,
    range_positions,
    template_seq,
    qualities=False,
)[0]
freq_dist = pd.DataFrame(base_frequencies).T.rename(columns={0: "Base", 1: "Frequency"})

nb_positions = analyser.get_nb_positions(freq_dist, min_freq)

# Population frequency at the single position 245, using the thresholds
# defined together with bam_file.
# NOTE(review): position 245 is hard-coded - confirm it is still the intended site.
nn_variants = analyser.get_pop_frequency(
    bam_file,
    template_seq,
    ref_name,
    [245],
    min_freq=min_freq,
    min_depth=min_depth,
)



In [None]:
# Per-barcode read counts from the demultiplexing summary file
summary = analyser.read_summary_file(demultiplex_folder)
# NOTE(review): grouping by FBC and then counting FBC produces two index
# levels named "FBC"; confirm reset_index() behaves as expected with the
# installed pandas version.
n_counts = summary.groupby(["RBC","FBC"])["FBC"].value_counts().reset_index()

# Attach the counts to the variant table and rename the barcodes.
# variant_df is created here, so nothing may read it before this line; the
# "Variant" column is normalised by the statements that follow.
variant_df = analyser.rename_barcode(pd.DataFrame(variants).merge(n_counts, on=["RBC","FBC"], how="left"))
def format_variant_list(variant_list):
    """Join the items of a list into one underscore-separated string.

    Each item is converted with ``str``. Any value that is not a ``list``
    is returned unchanged.
    """
    if not isinstance(variant_list, list):
        return variant_list
    return "_".join(map(str, variant_list))

# Flatten list-valued entries of the 'Variant' column into strings first ...
flattened_variants = variant_df["Variant"].apply(format_variant_list)

# ... then pass every entry through analyser.adjust_variant with an offset of 50
# NOTE(review): 50 presumably matches the reference padding - confirm.
variant_df["Variant"] = flattened_variants.apply(
    lambda entry: analyser.adjust_variant(entry, 50)
)


# Index Finder

In [5]:
# Output folder for the indexed reads and input folder with the basecalled reads
demultiplex_folder = Path("/home/emre/minION_results/20230905_errorprone-3_test_sup/basecalled_indexed")
path_to_index = "/home/emre/minION_results/20230905_errorprone-3_test_sup/basecalled_filtered"
ref_seq = Path("/home/emre/github_repo/MinION/minION/refseq/hetcpiii.fasta")
index_file = "output_index.txt"

# Same executable as the demultiplexing step
path_to_code = "/home/emre/tutorials/sequence_tut/source/test_main"

# Arguments are passed as a list (no shell) so that paths containing spaces
# or shell metacharacters reach the program unchanged.
# NOTE(review): the meaning of the two trailing "100" arguments is defined by
# test_main - confirm against its source.
command = [path_to_code, path_to_index, str(demultiplex_folder), "100", "100"]

# check=True raises CalledProcessError on a non-zero exit status; on success
# the CompletedProcess is still the cell's final expression and is displayed.
subprocess.run(command, check=True)

Number of files: 1
Processing file: /home/emre/minION_results/20230905_errorprone-3_test_sup/basecalled_filtered/basecalled_filtered.fastq.gz
Time taken by function: 354731 milli seconds


CompletedProcess(args='/home/emre/tutorials/sequence_tut/source/test_main /home/emre/minION_results/20230905_errorprone-3_test_sup/basecalled_filtered /home/emre/minION_results/20230905_errorprone-3_test_sup/basecalled_indexed 100 100', returncode=0)