# Run MinION with Smith-Waterman alignment (ours) 

- This script runs guppy minion step by step. The actual run can be found in *(location missing — TODO: add the path or link)*.

In [1]:
# Import all packages

import sys
# Add a local MinION checkout to the module search path; presumably this is what
# lets the `minION` imports below resolve. The path is absolute and specific to
# one machine, so it must be adjusted when the notebook runs elsewhere.
sys.path.append("/home/emre/github_repo/MinION")

from minION.util import IO_processor
from minION import analyser
from minION import consensus
from minION import demultiplexer
from minION.util import globals



import os
from pathlib import Path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import glob
import numpy as np
from Bio import SeqIO
import matplotlib.pyplot as plt
import gzip
import subprocess

In [2]:
# General settings

# Name of the sequencing experiment and the root directory for its results
experiment_name = "20230905_errorprone-3_test"
result_path = Path("/home/emre/")

# Guards that keep individual steps from running by accident:
# a step is executed only when its flag is False.
skip_basecalling = True
skip_demultiplex = True
skip_variant_calling = False


### Step 1 (Optional): Basecall reads

- Basecalling can usually be done during sequencing (presumably only if a GPU is available).
- Otherwise, basecall after the sequencing run has finished.

In [3]:
# Location of the raw pod5 files and the basecalling model to use
file_to_pod5 = "/var/lib/minknow/data/20230905_errorprone-3_test/no_sample/20230905_1342_MN41105_flg114_5c170bfa/pod5/"
basecall_model_type = "sup"
basecall_model = globals.DORADO_MODELS[basecall_model_type]
output_name = experiment_name

# Result folder for this experiment / model-type combination
result_folder = IO_processor.create_folder(
    experiment_name,
    basecall_model_type,
    target_path=result_path,
)

# Folder where pod5 files are located
experiment_folder = IO_processor.find_experiment_folder(experiment_name)

# Basecalled reads live in their own sub-folder of the result folder
basecall_folder = result_folder / "basecalled_filtered"
basecall_folder.mkdir(parents=True, exist_ok=True)
print(basecall_folder)

# Basecalling is executed only when it has not been switched off in the settings
if not skip_basecalling:
    pod5_files = IO_processor.find_folder(experiment_folder, "pod5")
    # NOTE(review): run_dorado is neither imported nor defined in the cells
    # visible in this notebook — confirm where it comes from before enabling
    # this branch.
    run_dorado(basecall_model, pod5_files, basecall_folder, fastq=True)

/home/emre/minION_results/20230905_errorprone-3_test_sup/basecalled_filtered


### Step 2: Demultiplex with SW
- Demultiplex the basecalled reads with the Smith-Waterman (SW) aligner.

In [6]:
# Run the external demultiplexer on the basecalled reads, unless this step has
# been switched off in the settings.
if not skip_demultiplex:
    # External demultiplexer executable invoked by this step
    path_to_code = "/home/emre/tutorials/sequence_tut/source/test_main"

    # Build the command as an argument list instead of a shell string, so that
    # paths containing spaces or shell metacharacters are passed through intact.
    # basecall_folder is converted only locally and stays a Path for other
    # cells, which keeps this cell safe to re-run.
    # NOTE(review): the two trailing "150" values are positional arguments of
    # test_main whose meaning is not visible here — confirm and name them.
    demultiplex_cmd = [
        path_to_code,
        str(basecall_folder),
        str(result_folder),
        "150",
        "150",
    ]

    # check=True raises CalledProcessError when the demultiplexer exits with a
    # non-zero status, so a failed run does not go unnoticed.
    subprocess.run(demultiplex_cmd, check=True)

Number of files: 1
Processing file: /home/emre/minION_results/20230905_errorprone-3_test_sup/basecalled_filtered/basecalled_filtered.fastq.gz
Time taken by function: 503786 milli seconds
Number of files: 1
Processing file: /home/emre/minION_results/20230905_errorprone-3_test_sup/basecalled_filtered/basecalled_filtered.fastq.gz
Time taken by function: 499798 milli seconds
Number of files: 1
Processing file: /home/emre/minION_results/20230905_errorprone-3_test_sup/basecalled_filtered/basecalled_filtered.fastq.gz
Time taken by function: 500548 milli seconds


### Step 3: Call Variants with Pileup Analysis

In [4]:
# Reference sequence against which variants are called
ref_name = "HetCPIII"
ref_seq = "/home/emre/github_repo/MinION/minION/refseq/hetcpiii_padded.fasta"

# Demultiplexed reads and the barcode dictionary derived from that folder
demultiplex_folder = result_folder / "Demultiplex_cpp_70"
barcode_dict = IO_processor.get_barcode_dict(demultiplex_folder, "NB", "RB")

# Variant calling is executed only when it has not been switched off in the settings
if not skip_variant_calling:
    variants = analyser.get_variant_df_AF(
        demultiplex_folder,
        ref_seq,
        ref_name,
        barcode_dict,
        merge=True,
        min_freq=0.4,
        min_depth=15,
    )

8
149
191
660
357
689
336
738
500
440
592
394
71
406
80
6
527
710
538
4
569
248
729
261
545
672
1091
197
120
559
253
105
619
272
4
385
314
677
437
117
636
740
409
793
280
475
539
218
1017
583
3
4
563
617
423
23
279
406
892
594
749
563
280
40
110
669
230
3
473
172
1492
6
438
340
1090
683
206
548
478
565
6
347
424
368
57
7
4
17
6
6
5
81
4
5
57
24
64
103
15
Error in calling variant
Skipping RB01/NB80 - -
6
5
6
33
20
15
Error in calling variant
Skipping RB01/NB79 - -
93
95
6
4
20
8
148
72
219
123
15
4
210
198
3
3
3
61
12
29
3
95
24
3
18
4
5
72
33
14
76
15
15
175
4
4
132
34
13
33
4
17
26
10
4
16
4
4
86
79
139
85
47
9
9
6
91
9
769
70
157
200
1267
5
542
423
549
701
801
1533
453
143
141
1319
386
1011
568
979
279
4
1423
38
72
1119
632
776
948
39
295
283
883
1216
16
1058
294
1044
376
571
349
590
790
1137
764
532
23
571
893
1010
661
21
532
1371
596
594
3
806
1050
105
2147
1254
1019
609
767
5
15
Error in calling variant
Skipping RB02/NB38 - -
795
1252
629
819
3
144
614
4
17
6
4
974
651
3
183
1801


In [5]:
# Display the variant table produced by the variant-calling cell
# (the name is only assigned when skip_variant_calling is False).
variants

Unnamed: 0,Plate,Well,Position,Variant,Alignment Count,Alignment Frequency,count
0,1,A1,,,,,
1,1,A2,,,,,
2,1,A3,"[76, 144, 257]",A76G_A144T,3,0.428571,7.0
3,1,A3,"[76, 144, 257]",A76G_A144T_G257C,3,0.428571,7.0
4,1,A3,"[76, 144, 257]",,1,0.142857,7.0
...,...,...,...,...,...,...,...
367,3,H10,-,#PARENT#,110,-,110.0
368,3,H11,-,#PARENT#,409,-,409.0
369,3,H12,"[183, 226, 227, 229, 237, 356, 396, 483, 484, ...",C183T_G226A_G227T_A229G_G237C_G356C_A483T_A484...,2,0.5,41.0
370,3,H12,"[183, 226, 227, 229, 237, 356, 396, 483, 484, ...",C183T_G226A_G227T_A229G_G237C_G356C_G396T_A483...,1,0.25,41.0


In [12]:
# Build a DataFrame with one column per key of `variants`
# (the original note mentions the RBC and FBC keys).
df = pd.DataFrame({column: variants[column] for column in variants})

In [16]:
# Preview the first 20 rows of df.
# NOTE(review): the original comment announced "duplicates based on column
# Variant", but no duplicate filtering happens in this cell.


df.head(20)

Unnamed: 0,RBC,FBC,Position,Variant,Alignment Count,Alignment Frequency
0,RB03,NB87,"[91, 93, 95, 219, 235, 238]",G93DEL_G95DEL_G219DEL_G235DEL_C238T,1.0,0.5
1,RB03,NB87,"[91, 93, 95, 219, 235, 238]",T91C_G93C,1.0,0.5
2,RB03,NB03,-,#PARENT#,,
3,RB03,NB20,-,#PARENT#,,
4,RB03,NB70,-,#PARENT#,,
5,RB03,NB04,-,#PARENT#,,
6,RB03,NB37,169,A169T,689.0,0.902616
7,RB03,NB88,-,#PARENT#,,
8,RB03,NB14,-,#PARENT#,,
9,RB03,NB82,-,#PARENT#,,


In [32]:
# NOTE(review): duplicate_rows is not assigned in any cell visible in this
# notebook; on a fresh kernel this cell would presumably raise NameError.
# Confirm where it is defined and restore that cell.
duplicate_rows

Unnamed: 0,RBC,FBC,Position,Variant
0,RB03,NB87,"[91, 93, 95, 219, 235, 238]",G93DEL_G95DEL_G219DEL_G235DEL_C238T
1,RB03,NB87,"[91, 93, 95, 219, 235, 238]",T91C_G93C
16,RB03,NB13,"[134, 135, 154, 238, 239, 338, 342, 465, 470, ...",G134A_A135T_G338T_C342T_G480A_A484DEL_T485DEL_...
17,RB03,NB13,"[134, 135, 154, 238, 239, 338, 342, 465, 470, ...",A154DEL_C238DEL_G239T_C465T_T470C_A484DEL_T485...
21,RB03,NB10,"[94, 119, 120, 154, 179, 181, 215, 226, 344, 3...",A94DEL_G344C
...,...,...,...,...
321,RB02,NB27,"[52, 53, 54, 55, 56, 57, 58, 61, 62, 63, 66, 6...",T166C_A195DEL_C201DEL_T205DEL_A220G_T586DEL
322,RB02,NB08,[270],T270C
323,RB02,NB08,[270],
333,RB02,NB18,"[66, 154, 162, 222]",A154G_G162A
