# Run MinION with Smith-Waterman alignment (ours) 

- This script runs guppy minion step by step. The actual run can be found in (TODO: add the location of the actual run).

In [None]:
# Import all packages

import sys
sys.path.append("/home/emre/github_repo/MinION")

from minION.util import IO_processor
from minION import analyser
from minION import consensus
from minION import demultiplexer
from minION.util import globals



import os
from pathlib import Path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import glob
import numpy as np
from Bio import SeqIO
import matplotlib.pyplot as plt
import gzip
import subprocess

In [None]:
# General settings

# Root directory under which the result folder for this run is created.
result_path = Path("/home/emre/")
# Experiment identifier; used to name the result folder and to look up the run's data.
experiment_name = "20230905_errorprone-3_test"

# Safety switches: set a flag to True to skip that pipeline step, so that
# re-running the notebook does not accidentally repeat an expensive step.
skip_basecalling = True        # Step 1
skip_demultiplex = False       # Step 2
skip_variant_calling = False   # Step 3


### Step 1 (Optional): Basecall reads

- Basecalling can usually be done during sequencing (if a GPU is available).
- Otherwise, basecall the reads afterwards with this step.

In [None]:
# Step 1: prepare the output folders and, unless skipped, basecall the pod5 files.

# Path to pod5 files
# NOTE(review): file_to_pod5 is not referenced again in the visible cells; the
# pod5 folder actually passed to the basecaller is looked up further below.
file_to_pod5 = "/var/lib/minknow/data/20230905_errorprone-3_test/no_sample/20230905_1342_MN41105_flg114_5c170bfa/pod5/"
# Key into globals.DORADO_MODELS ("sup" is presumably Dorado's super-accuracy model; confirm).
basecall_model_type = "sup"
basecall_model = globals.DORADO_MODELS[basecall_model_type]
# NOTE(review): output_name is not used in the visible cells.
output_name = experiment_name


# Result folder for this experiment/model combination. Judging by the recorded
# Step 2 output it resolves to <result_path>/minION_results/<experiment_name>_<model_type>
# (TODO confirm against IO_processor.create_folder).
result_folder = IO_processor.create_folder( experiment_name,
                                            basecall_model_type, 
                                            target_path=result_path)

# NOTE(review): this lookup runs even when skip_basecalling is True, although its
# result is only used inside the guarded branch below.
experiment_folder = IO_processor.find_experiment_folder(experiment_name) # Folder where pod5 files are located

# Folder for the basecalled reads; created if missing, no error if it already exists.
basecall_folder = result_folder / "basecalled_filtered"
basecall_folder.mkdir(parents=True, exist_ok=True)

print(basecall_folder)
# Basecalling only runs when it has not been disabled in the settings cell.
if not skip_basecalling:
    pod5_files = IO_processor.find_folder(experiment_folder, "pod5")
    # NOTE(review): run_dorado is neither imported nor defined in the visible cells,
    # so this line raises NameError on a fresh kernel when skip_basecalling is False.
    # TODO: import it from the module that provides it.
    run_dorado(basecall_model, pod5_files, basecall_folder, fastq = True)

### Step 2: Demultiplex with SW
- Demultiplex the basecalled reads with the Smith-Waterman (SW) demultiplexer.

In [6]:
# Step 2: run the external Smith-Waterman demultiplexer on the basecalled reads.
if not skip_demultiplex:
    # Compiled demultiplexer binary (external to this notebook).
    path_to_code = "/home/emre/tutorials/sequence_tut/source/test_main"

    # Pass the command as an argument list instead of a shell string: paths that
    # contain spaces or shell metacharacters are then handed over verbatim and
    # no shell is involved. basecall_folder is converted only for the call and
    # is deliberately NOT rebound, so it stays a Path for the rest of the notebook.
    demultiplex_cmd = [
        path_to_code,
        str(basecall_folder),   # input: folder with the basecalled reads
        str(result_folder),     # output: result folder of this run
        "150",                  # NOTE(review): meaning of these two numeric arguments is
        "150",                  # defined by the binary and not visible here; TODO document
    ]

    # check=True raises subprocess.CalledProcessError on a non-zero exit status,
    # so a failed demultiplexing run stops the notebook instead of silently
    # feeding incomplete data into the variant-calling step.
    subprocess.run(demultiplex_cmd, check=True)

Number of files: 1
Processing file: /home/emre/minION_results/20230905_errorprone-3_test_sup/basecalled_filtered/basecalled_filtered.fastq.gz
Time taken by function: 503786 milli seconds
Number of files: 1
Processing file: /home/emre/minION_results/20230905_errorprone-3_test_sup/basecalled_filtered/basecalled_filtered.fastq.gz
Time taken by function: 499798 milli seconds
Number of files: 1
Processing file: /home/emre/minION_results/20230905_errorprone-3_test_sup/basecalled_filtered/basecalled_filtered.fastq.gz
Time taken by function: 500548 milli seconds


### Step 3: Call variants with pileup analysis

In [7]:
# Step 3 inputs: demultiplexed reads, reference sequence and barcode layout.

# Sub-folder of the result folder holding the demultiplexed reads
# (presumably written by the Step 2 binary; confirm the folder name).
demultiplex_folder = result_folder / "Demultiplex_cpp_70"

# Reference sequence (FASTA file) and the name passed along with it.
ref_seq = "/home/emre/github_repo/MinION/minION/refseq/hetcpiii_padded.fasta"
ref_name = "HetCPIII"

# Barcode lookup built from the demultiplex folder using the "NB" and "RB" prefixes.
barcode_dict = IO_processor.get_barcode_dict(demultiplex_folder, "NB", "RB")

# NOTE(review): when skip_variant_calling is True, `variants` is never assigned,
# so the cells below that read it fail on a fresh kernel.
if not skip_variant_calling:
    variants = analyser.get_variant_df_AF(
        demultiplex_folder,
        ref_seq,
        ref_name,
        barcode_dict,
        merge=False,
        min_freq=0.4,
        min_depth=15,
    )

Error in calling variant
Skipping RB01/NB80
Error in calling variant
Skipping RB01/NB79
Error in calling variant
Skipping RB02/NB38


In [17]:
# List the fields available in the variant-calling result, one per line.
for field_name in variants.keys():
    print(field_name)

RBC
FBC
Position
Variant
Alignment Count
Alignment Frequency


In [23]:
# Collect the barcode, position and variant fields of the result in a DataFrame.
selected_fields = ['RBC', 'FBC', "Position", "Variant"]

df = pd.DataFrame({field: variants[field] for field in selected_fields})

In [31]:
# Select every row whose (RBC, FBC) barcode pair occurs more than once.
# keep=False marks all members of a duplicated group, not only the repeats.
is_repeated_pair = df.duplicated(subset=['RBC', 'FBC'], keep=False)
duplicate_rows = df[is_repeated_pair]



In [32]:
# Display the rows whose (RBC, FBC) barcode pair occurs more than once.
duplicate_rows

Unnamed: 0,RBC,FBC,Position,Variant
0,RB03,NB87,"[91, 93, 95, 219, 235, 238]",G93DEL_G95DEL_G219DEL_G235DEL_C238T
1,RB03,NB87,"[91, 93, 95, 219, 235, 238]",T91C_G93C
16,RB03,NB13,"[134, 135, 154, 238, 239, 338, 342, 465, 470, ...",G134A_A135T_G338T_C342T_G480A_A484DEL_T485DEL_...
17,RB03,NB13,"[134, 135, 154, 238, 239, 338, 342, 465, 470, ...",A154DEL_C238DEL_G239T_C465T_T470C_A484DEL_T485...
21,RB03,NB10,"[94, 119, 120, 154, 179, 181, 215, 226, 344, 3...",A94DEL_G344C
...,...,...,...,...
321,RB02,NB27,"[52, 53, 54, 55, 56, 57, 58, 61, 62, 63, 66, 6...",T166C_A195DEL_C201DEL_T205DEL_A220G_T586DEL
322,RB02,NB08,[270],T270C
323,RB02,NB08,[270],
333,RB02,NB18,"[66, 154, 162, 222]",A154G_G162A
