# Every Variant Sequencing with Oxford Nanopore Technologies

This script is being used after sequencing. The raw pod5 files can be basecalled or the already basecalled files can be used directly (fastq.gz)

## Workflow

### 1. Basecalling (Optional)

- The raw reads are stored in the main folder of ONT (e.g /var/lib/minknow/data). Enter the experiment name as input. 
- Sequences are basecalled based on the model of choice. If enough computational power is available, we recommend "sup" method

### 2. Demultiplexing 
- Each reead is assigned to a well/plate combination. 

### 3. Variant Calling
- Minimap2 for creating Multiple Sequence Alignment (MSA)
- Base Frequency Caller is being used for variant calling



### Packages 

In [1]:
# Import all packages

import sys
sys.path.append("/home/emre/github_repo/MinION")

from minION.util import IO_processor
from minION import analyser
from minION import consensus
from minION import demultiplexer
from minION import basecaller
from minION.util import globals
from minION import analyser_bayes_AF

import pickle

import os
from pathlib import Path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import glob
import numpy as np
from Bio import SeqIO
import matplotlib.pyplot as plt
import gzip
import subprocess

### Meta Data 

- Provide the following arguments:

- Result Path: Path where the minion result folder will be created. All experiment results are then stored within the folder
- Experiment Name: The experiment name is assigned when running the sequencer. Use the same name for identification


In [4]:
# General settings

result_path = Path("/home/emre/")
experiment_name = "20231214-RL-8-49"
file_to_experiment= f"/var/lib/minknow/data/{experiment_name}"
basecall_model_type = "sup"
result_folder = IO_processor.create_folder( experiment_name,
                                            basecall_model_type, 
                                            target_path=result_path)
basecall_folder = result_folder / "basecalled"
basecall_folder.mkdir(parents=True, exist_ok=True)
experiment_folder = IO_processor.find_experiment_folder(experiment_name) # Folder where pod5 files are located

# Add conditions to avoid running the script accidentally
skip_basecalling = True
skip_demultiplex = False
skip_variant_calling = False


print(basecall_folder)
print(experiment_folder)

/home/emre/minION_results/20231214-RL-8-49_sup/basecalled
/var/lib/minknow/data/20231214-RL-8-49


### Step 1 (Optional): Basecall reads

- Basecall can usually be done while sequencing (if GPU available?)
- Otherwise, basecall afterwards

In [4]:
if not skip_basecalling:
    pod5_files = IO_processor.find_folder(experiment_folder, "pod5")
    basecaller.run_dorado(basecall_model_type, pod5_files, basecall_folder, fastq = True)

/home/emre/minION_results/20231214-RL-8-49_sup/basecalled_filtered


In [5]:
# Find fastq files
file_to_fastq = IO_processor.find_folder(experiment_folder, "fastq_pass")

### Step 2: Demultiplex with SW
- Demultiplex with SW 

In [None]:

# Barcode file path
barcode_file = "/home/emre/github_repo/MinION/minION/barcoding/minion_barcodes_pga9.fasta"
if not skip_demultiplex:
    path_to_code = "/home/emre/tutorials/sequence_tut/source/test_main"
    prompt = f"{path_to_code} {file_to_fastq} {result_folder} {barcode_file} {100} {100}"
    subprocess.run(prompt, shell=True)

Usage: /home/emre/tutorials/sequence_tut/source/test_mainPlease provide all arguments required <folder_name> <demultiplexer_folder_path> <barcode fasta> <front_window_size> <rear_window_size>



KeyboardInterrupt

Error: Invalid base found in sequence in file: /var/lib/minknow/data/20231214-RL-8-49/no_sample/20231214_1201_MN41105_flg114_30d6c2a8/fastq_pass/flg114_pass_30d6c2a8_fc6ee4cc_6.fastq.gz
Error: Invalid base found in sequence in file: /var/lib/minknow/data/20231214-RL-8-49/no_sample/20231214_1201_MN41105_flg114_30d6c2a8/fastq_pass/flg114_pass_30d6c2a8_fc6ee4cc_6.fastq.gz
Error: Invalid base found in sequence in file: /var/lib/minknow/data/20231214-RL-8-49/no_sample/20231214_1201_MN41105_flg114_30d6c2a8/fastq_pass/flg114_pass_30d6c2a8_fc6ee4cc_6.fastq.gz
Error: Invalid base found in sequence in file: /var/lib/minknow/data/20231214-RL-8-49/no_sample/20231214_1201_MN41105_flg114_30d6c2a8/fastq_pass/flg114_pass_30d6c2a8_fc6ee4cc_6.fastq.gz
Error: Invalid base found in sequence in file: /var/lib/minknow/data/20231214-RL-8-49/no_sample/20231214_1201_MN41105_flg114_30d6c2a8/fastq_pass/flg114_pass_30d6c2a8_fc6ee4cc_6.fastq.gz
Error: Invalid base found in sequence in file: /va

Number of files: 206
Processing file: /var/lib/minknow/data/20231214-RL-8-49/no_sample/20231214_1201_MN41105_flg114_30d6c2a8/fastq_pass/flg114_pass_30d6c2a8_fc6ee4cc_0.fastq.gz
Processing file: /var/lib/minknow/data/20231214-RL-8-49/no_sample/20231214_1201_MN41105_flg114_30d6c2a8/fastq_pass/flg114_pass_30d6c2a8_fc6ee4cc_1.fastq.gz
Processing file: /var/lib/minknow/data/20231214-RL-8-49/no_sample/20231214_1201_MN41105_flg114_30d6c2a8/fastq_pass/flg114_pass_30d6c2a8_fc6ee4cc_2.fastq.gz
Processing file: /var/lib/minknow/data/20231214-RL-8-49/no_sample/20231214_1201_MN41105_flg114_30d6c2a8/fastq_pass/flg114_pass_30d6c2a8_fc6ee4cc_3.fastq.gz
Processing file: /var/lib/minknow/data/20231214-RL-8-49/no_sample/20231214_1201_MN41105_flg114_30d6c2a8/fastq_pass/flg114_pass_30d6c2a8_fc6ee4cc_4.fastq.gz
Processing file: /var/lib/minknow/data/20231214-RL-8-49/no_sample/20231214_1201_MN41105_flg114_30d6c2a8/fastq_pass/flg114_pass_30d6c2a8_fc6ee4cc_5.fastq.gz
Processing file: /var/lib/minknow/data/2023

Error: Invalid base found in sequence in file: /var/lib/minknow/data/20231214-RL-8-49/no_sample/20231214_1201_MN41105_flg114_30d6c2a8/fastq_pass/flg114_pass_30d6c2a8_fc6ee4cc_16.fastq.gz
Error: Invalid base found in sequence in file: /var/lib/minknow/data/20231214-RL-8-49/no_sample/20231214_1201_MN41105_flg114_30d6c2a8/fastq_pass/flg114_pass_30d6c2a8_fc6ee4cc_16.fastq.gz
Error: Invalid base found in sequence in file: /var/lib/minknow/data/20231214-RL-8-49/no_sample/20231214_1201_MN41105_flg114_30d6c2a8/fastq_pass/flg114_pass_30d6c2a8_fc6ee4cc_16.fastq.gz
Error: Invalid base found in sequence in file: /var/lib/minknow/data/20231214-RL-8-49/no_sample/20231214_1201_MN41105_flg114_30d6c2a8/fastq_pass/flg114_pass_30d6c2a8_fc6ee4cc_16.fastq.gz
Error: Invalid base found in sequence in file: /var/lib/minknow/data/20231214-RL-8-49/no_sample/20231214_1201_MN41105_flg114_30d6c2a8/fastq_pass/flg114_pass_30d6c2a8_fc6ee4cc_16.fastq.gz
Error: Invalid base found in sequence in file: /var/lib/minknow/d

Processing file: /var/lib/minknow/data/20231214-RL-8-49/no_sample/20231214_1201_MN41105_flg114_30d6c2a8/fastq_pass/flg114_pass_30d6c2a8_fc6ee4cc_7.fastq.gz
Processing file: /var/lib/minknow/data/20231214-RL-8-49/no_sample/20231214_1201_MN41105_flg114_30d6c2a8/fastq_pass/flg114_pass_30d6c2a8_fc6ee4cc_8.fastq.gz
Processing file: /var/lib/minknow/data/20231214-RL-8-49/no_sample/20231214_1201_MN41105_flg114_30d6c2a8/fastq_pass/flg114_pass_30d6c2a8_fc6ee4cc_9.fastq.gz
Processing file: /var/lib/minknow/data/20231214-RL-8-49/no_sample/20231214_1201_MN41105_flg114_30d6c2a8/fastq_pass/flg114_pass_30d6c2a8_fc6ee4cc_10.fastq.gz
Processing file: /var/lib/minknow/data/20231214-RL-8-49/no_sample/20231214_1201_MN41105_flg114_30d6c2a8/fastq_pass/flg114_pass_30d6c2a8_fc6ee4cc_11.fastq.gz
Processing file: /var/lib/minknow/data/20231214-RL-8-49/no_sample/20231214_1201_MN41105_flg114_30d6c2a8/fastq_pass/flg114_pass_30d6c2a8_fc6ee4cc_12.fastq.gz
Processing file: /var/lib/minknow/data/20231214-RL-8-49/no_sa

Error: Invalid base found in sequence in file: /var/lib/minknow/data/20231214-RL-8-49/no_sample/20231214_1201_MN41105_flg114_30d6c2a8/fastq_pass/flg114_pass_30d6c2a8_fc6ee4cc_27.fastq.gz
Error: Invalid base found in sequence in file: /var/lib/minknow/data/20231214-RL-8-49/no_sample/20231214_1201_MN41105_flg114_30d6c2a8/fastq_pass/flg114_pass_30d6c2a8_fc6ee4cc_27.fastq.gz
Error: Invalid base found in sequence in file: /var/lib/minknow/data/20231214-RL-8-49/no_sample/20231214_1201_MN41105_flg114_30d6c2a8/fastq_pass/flg114_pass_30d6c2a8_fc6ee4cc_27.fastq.gz
Error: Invalid base found in sequence in file: /var/lib/minknow/data/20231214-RL-8-49/no_sample/20231214_1201_MN41105_flg114_30d6c2a8/fastq_pass/flg114_pass_30d6c2a8_fc6ee4cc_27.fastq.gz
Error: Invalid base found in sequence in file: /var/lib/minknow/data/20231214-RL-8-49/no_sample/20231214_1201_MN41105_flg114_30d6c2a8/fastq_pass/flg114_pass_30d6c2a8_fc6ee4cc_27.fastq.gz
Error: Invalid base found in sequence in file: /var/lib/minknow/d

Processing file: /var/lib/minknow/data/20231214-RL-8-49/no_sample/20231214_1201_MN41105_flg114_30d6c2a8/fastq_pass/flg114_pass_30d6c2a8_fc6ee4cc_17.fastq.gz
Processing file: /var/lib/minknow/data/20231214-RL-8-49/no_sample/20231214_1201_MN41105_flg114_30d6c2a8/fastq_pass/flg114_pass_30d6c2a8_fc6ee4cc_18.fastq.gz
Processing file: /var/lib/minknow/data/20231214-RL-8-49/no_sample/20231214_1201_MN41105_flg114_30d6c2a8/fastq_pass/flg114_pass_30d6c2a8_fc6ee4cc_19.fastq.gz
Processing file: /var/lib/minknow/data/20231214-RL-8-49/no_sample/20231214_1201_MN41105_flg114_30d6c2a8/fastq_pass/flg114_pass_30d6c2a8_fc6ee4cc_20.fastq.gz
Processing file: /var/lib/minknow/data/20231214-RL-8-49/no_sample/20231214_1201_MN41105_flg114_30d6c2a8/fastq_pass/flg114_pass_30d6c2a8_fc6ee4cc_21.fastq.gz
Processing file: /var/lib/minknow/data/20231214-RL-8-49/no_sample/20231214_1201_MN41105_flg114_30d6c2a8/fastq_pass/flg114_pass_30d6c2a8_fc6ee4cc_22.fastq.gz
Processing file: /var/lib/minknow/data/20231214-RL-8-49/no

Error: Invalid base found in sequence in file: /var/lib/minknow/data/20231214-RL-8-49/no_sample/20231214_1201_MN41105_flg114_30d6c2a8/fastq_pass/flg114_pass_30d6c2a8_fc6ee4cc_30.fastq.gz
Error: Invalid base found in sequence in file: /var/lib/minknow/data/20231214-RL-8-49/no_sample/20231214_1201_MN41105_flg114_30d6c2a8/fastq_pass/flg114_pass_30d6c2a8_fc6ee4cc_30.fastq.gz
Error: Invalid base found in sequence in file: /var/lib/minknow/data/20231214-RL-8-49/no_sample/20231214_1201_MN41105_flg114_30d6c2a8/fastq_pass/flg114_pass_30d6c2a8_fc6ee4cc_30.fastq.gz
Error: Invalid base found in sequence in file: /var/lib/minknow/data/20231214-RL-8-49/no_sample/20231214_1201_MN41105_flg114_30d6c2a8/fastq_pass/flg114_pass_30d6c2a8_fc6ee4cc_30.fastq.gz
Error: Invalid base found in sequence in file: /var/lib/minknow/data/20231214-RL-8-49/no_sample/20231214_1201_MN41105_flg114_30d6c2a8/fastq_pass/flg114_pass_30d6c2a8_fc6ee4cc_30.fastq.gz
Error: Invalid base found in sequence in file: /var/lib/minknow/d

Processing file: /var/lib/minknow/data/20231214-RL-8-49/no_sample/20231214_1201_MN41105_flg114_30d6c2a8/fastq_pass/flg114_pass_30d6c2a8_fc6ee4cc_28.fastq.gz
Processing file: /var/lib/minknow/data/20231214-RL-8-49/no_sample/20231214_1201_MN41105_flg114_30d6c2a8/fastq_pass/flg114_pass_30d6c2a8_fc6ee4cc_29.fastq.gz
Processing file: /var/lib/minknow/data/20231214-RL-8-49/no_sample/20231214_1201_MN41105_flg114_30d6c2a8/fastq_pass/flg114_pass_30d6c2a8_fc6ee4cc_30.fastq.gz


Error: Invalid base found in sequence in file: /var/lib/minknow/data/20231214-RL-8-49/no_sample/20231214_1201_MN41105_flg114_30d6c2a8/fastq_pass/flg114_pass_30d6c2a8_fc6ee4cc_34.fastq.gz
Error: Invalid base found in sequence in file: /var/lib/minknow/data/20231214-RL-8-49/no_sample/20231214_1201_MN41105_flg114_30d6c2a8/fastq_pass/flg114_pass_30d6c2a8_fc6ee4cc_34.fastq.gz
Error: Invalid base found in sequence in file: /var/lib/minknow/data/20231214-RL-8-49/no_sample/20231214_1201_MN41105_flg114_30d6c2a8/fastq_pass/flg114_pass_30d6c2a8_fc6ee4cc_34.fastq.gz
Error: Invalid base found in sequence in file: /var/lib/minknow/data/20231214-RL-8-49/no_sample/20231214_1201_MN41105_flg114_30d6c2a8/fastq_pass/flg114_pass_30d6c2a8_fc6ee4cc_34.fastq.gz
Error: Invalid base found in sequence in file: /var/lib/minknow/data/20231214-RL-8-49/no_sample/20231214_1201_MN41105_flg114_30d6c2a8/fastq_pass/flg114_pass_30d6c2a8_fc6ee4cc_34.fastq.gz
Error: Invalid base found in sequence in file: /var/lib/minknow/d

Processing file: /var/lib/minknow/data/20231214-RL-8-49/no_sample/20231214_1201_MN41105_flg114_30d6c2a8/fastq_pass/flg114_pass_30d6c2a8_fc6ee4cc_31.fastq.gz
Processing file: /var/lib/minknow/data/20231214-RL-8-49/no_sample/20231214_1201_MN41105_flg114_30d6c2a8/fastq_pass/flg114_pass_30d6c2a8_fc6ee4cc_32.fastq.gz
Processing file: /var/lib/minknow/data/20231214-RL-8-49/no_sample/20231214_1201_MN41105_flg114_30d6c2a8/fastq_pass/flg114_pass_30d6c2a8_fc6ee4cc_33.fastq.gz
Processing file: /var/lib/minknow/data/20231214-RL-8-49/no_sample/20231214_1201_MN41105_flg114_30d6c2a8/fastq_pass/flg114_pass_30d6c2a8_fc6ee4cc_34.fastq.gz


Error: Invalid base found in sequence in file: /var/lib/minknow/data/20231214-RL-8-49/no_sample/20231214_1201_MN41105_flg114_30d6c2a8/fastq_pass/flg114_pass_30d6c2a8_fc6ee4cc_50.fastq.gz
Error: Invalid base found in sequence in file: /var/lib/minknow/data/20231214-RL-8-49/no_sample/20231214_1201_MN41105_flg114_30d6c2a8/fastq_pass/flg114_pass_30d6c2a8_fc6ee4cc_50.fastq.gz
Error: Invalid base found in sequence in file: /var/lib/minknow/data/20231214-RL-8-49/no_sample/20231214_1201_MN41105_flg114_30d6c2a8/fastq_pass/flg114_pass_30d6c2a8_fc6ee4cc_50.fastq.gz
Error: Invalid base found in sequence in file: /var/lib/minknow/data/20231214-RL-8-49/no_sample/20231214_1201_MN41105_flg114_30d6c2a8/fastq_pass/flg114_pass_30d6c2a8_fc6ee4cc_50.fastq.gz
Error: Invalid base found in sequence in file: /var/lib/minknow/data/20231214-RL-8-49/no_sample/20231214_1201_MN41105_flg114_30d6c2a8/fastq_pass/flg114_pass_30d6c2a8_fc6ee4cc_50.fastq.gz
Error: Invalid base found in sequence in file: /var/lib/minknow/d

Processing file: /var/lib/minknow/data/20231214-RL-8-49/no_sample/20231214_1201_MN41105_flg114_30d6c2a8/fastq_pass/flg114_pass_30d6c2a8_fc6ee4cc_35.fastq.gz
Processing file: /var/lib/minknow/data/20231214-RL-8-49/no_sample/20231214_1201_MN41105_flg114_30d6c2a8/fastq_pass/flg114_pass_30d6c2a8_fc6ee4cc_36.fastq.gz
Processing file: /var/lib/minknow/data/20231214-RL-8-49/no_sample/20231214_1201_MN41105_flg114_30d6c2a8/fastq_pass/flg114_pass_30d6c2a8_fc6ee4cc_37.fastq.gz
Processing file: /var/lib/minknow/data/20231214-RL-8-49/no_sample/20231214_1201_MN41105_flg114_30d6c2a8/fastq_pass/flg114_pass_30d6c2a8_fc6ee4cc_38.fastq.gz
Processing file: /var/lib/minknow/data/20231214-RL-8-49/no_sample/20231214_1201_MN41105_flg114_30d6c2a8/fastq_pass/flg114_pass_30d6c2a8_fc6ee4cc_39.fastq.gz
Processing file: /var/lib/minknow/data/20231214-RL-8-49/no_sample/20231214_1201_MN41105_flg114_30d6c2a8/fastq_pass/flg114_pass_30d6c2a8_fc6ee4cc_40.fastq.gz
Processing file: /var/lib/minknow/data/20231214-RL-8-49/no

Error: Invalid base found in sequence in file: /var/lib/minknow/data/20231214-RL-8-49/no_sample/20231214_1201_MN41105_flg114_30d6c2a8/fastq_pass/flg114_pass_30d6c2a8_fc6ee4cc_58.fastq.gz
Error: Invalid base found in sequence in file: /var/lib/minknow/data/20231214-RL-8-49/no_sample/20231214_1201_MN41105_flg114_30d6c2a8/fastq_pass/flg114_pass_30d6c2a8_fc6ee4cc_58.fastq.gz
Error: Invalid base found in sequence in file: /var/lib/minknow/data/20231214-RL-8-49/no_sample/20231214_1201_MN41105_flg114_30d6c2a8/fastq_pass/flg114_pass_30d6c2a8_fc6ee4cc_58.fastq.gz
Error: Invalid base found in sequence in file: /var/lib/minknow/data/20231214-RL-8-49/no_sample/20231214_1201_MN41105_flg114_30d6c2a8/fastq_pass/flg114_pass_30d6c2a8_fc6ee4cc_58.fastq.gz
Error: Invalid base found in sequence in file: /var/lib/minknow/data/20231214-RL-8-49/no_sample/20231214_1201_MN41105_flg114_30d6c2a8/fastq_pass/flg114_pass_30d6c2a8_fc6ee4cc_58.fastq.gz
Error: Invalid base found in sequence in file: /var/lib/minknow/d

Processing file: /var/lib/minknow/data/20231214-RL-8-49/no_sample/20231214_1201_MN41105_flg114_30d6c2a8/fastq_pass/flg114_pass_30d6c2a8_fc6ee4cc_51.fastq.gz
Processing file: /var/lib/minknow/data/20231214-RL-8-49/no_sample/20231214_1201_MN41105_flg114_30d6c2a8/fastq_pass/flg114_pass_30d6c2a8_fc6ee4cc_52.fastq.gz
Processing file: /var/lib/minknow/data/20231214-RL-8-49/no_sample/20231214_1201_MN41105_flg114_30d6c2a8/fastq_pass/flg114_pass_30d6c2a8_fc6ee4cc_53.fastq.gz
Processing file: /var/lib/minknow/data/20231214-RL-8-49/no_sample/20231214_1201_MN41105_flg114_30d6c2a8/fastq_pass/flg114_pass_30d6c2a8_fc6ee4cc_54.fastq.gz
Processing file: /var/lib/minknow/data/20231214-RL-8-49/no_sample/20231214_1201_MN41105_flg114_30d6c2a8/fastq_pass/flg114_pass_30d6c2a8_fc6ee4cc_55.fastq.gz
Processing file: /var/lib/minknow/data/20231214-RL-8-49/no_sample/20231214_1201_MN41105_flg114_30d6c2a8/fastq_pass/flg114_pass_30d6c2a8_fc6ee4cc_56.fastq.gz
Processing file: /var/lib/minknow/data/20231214-RL-8-49/no

Error: Invalid base found in sequence in file: /var/lib/minknow/data/20231214-RL-8-49/no_sample/20231214_1201_MN41105_flg114_30d6c2a8/fastq_pass/flg114_pass_30d6c2a8_fc6ee4cc_82.fastq.gz
Error: Invalid base found in sequence in file: /var/lib/minknow/data/20231214-RL-8-49/no_sample/20231214_1201_MN41105_flg114_30d6c2a8/fastq_pass/flg114_pass_30d6c2a8_fc6ee4cc_82.fastq.gz
Error: Invalid base found in sequence in file: /var/lib/minknow/data/20231214-RL-8-49/no_sample/20231214_1201_MN41105_flg114_30d6c2a8/fastq_pass/flg114_pass_30d6c2a8_fc6ee4cc_82.fastq.gz
Error: Invalid base found in sequence in file: /var/lib/minknow/data/20231214-RL-8-49/no_sample/20231214_1201_MN41105_flg114_30d6c2a8/fastq_pass/flg114_pass_30d6c2a8_fc6ee4cc_82.fastq.gz
Error: Invalid base found in sequence in file: /var/lib/minknow/data/20231214-RL-8-49/no_sample/20231214_1201_MN41105_flg114_30d6c2a8/fastq_pass/flg114_pass_30d6c2a8_fc6ee4cc_82.fastq.gz
Error: Invalid base found in sequence in file: /var/lib/minknow/d

Processing file: /var/lib/minknow/data/20231214-RL-8-49/no_sample/20231214_1201_MN41105_flg114_30d6c2a8/fastq_pass/flg114_pass_30d6c2a8_fc6ee4cc_83.fastq.gz
Processing file: /var/lib/minknow/data/20231214-RL-8-49/no_sample/20231214_1201_MN41105_flg114_30d6c2a8/fastq_pass/flg114_pass_30d6c2a8_fc6ee4cc_84.fastq.gz
Processing file: /var/lib/minknow/data/20231214-RL-8-49/no_sample/20231214_1201_MN41105_flg114_30d6c2a8/fastq_pass/flg114_pass_30d6c2a8_fc6ee4cc_85.fastq.gz


Error: Invalid base found in sequence in file: /var/lib/minknow/data/20231214-RL-8-49/no_sample/20231214_1201_MN41105_flg114_30d6c2a8/fastq_pass/flg114_pass_30d6c2a8_fc6ee4cc_85.fastq.gz
Error: Invalid base found in sequence in file: /var/lib/minknow/data/20231214-RL-8-49/no_sample/20231214_1201_MN41105_flg114_30d6c2a8/fastq_pass/flg114_pass_30d6c2a8_fc6ee4cc_85.fastq.gz
Error: Invalid base found in sequence in file: /var/lib/minknow/data/20231214-RL-8-49/no_sample/20231214_1201_MN41105_flg114_30d6c2a8/fastq_pass/flg114_pass_30d6c2a8_fc6ee4cc_85.fastq.gz
Error: Invalid base found in sequence in file: /var/lib/minknow/data/20231214-RL-8-49/no_sample/20231214_1201_MN41105_flg114_30d6c2a8/fastq_pass/flg114_pass_30d6c2a8_fc6ee4cc_85.fastq.gz
Error: Invalid base found in sequence in file: /var/lib/minknow/data/20231214-RL-8-49/no_sample/20231214_1201_MN41105_flg114_30d6c2a8/fastq_pass/flg114_pass_30d6c2a8_fc6ee4cc_85.fastq.gz
Error: Invalid base found in sequence in file: /var/lib/minknow/d

Processing file: /var/lib/minknow/data/20231214-RL-8-49/no_sample/20231214_1201_MN41105_flg114_30d6c2a8/fastq_pass/flg114_pass_30d6c2a8_fc6ee4cc_86.fastq.gz
Processing file: /var/lib/minknow/data/20231214-RL-8-49/no_sample/20231214_1201_MN41105_flg114_30d6c2a8/fastq_pass/flg114_pass_30d6c2a8_fc6ee4cc_87.fastq.gz
Processing file: /var/lib/minknow/data/20231214-RL-8-49/no_sample/20231214_1201_MN41105_flg114_30d6c2a8/fastq_pass/flg114_pass_30d6c2a8_fc6ee4cc_88.fastq.gz
Processing file: /var/lib/minknow/data/20231214-RL-8-49/no_sample/20231214_1201_MN41105_flg114_30d6c2a8/fastq_pass/flg114_pass_30d6c2a8_fc6ee4cc_89.fastq.gz


Error: Invalid base found in sequence in file: /var/lib/minknow/data/20231214-RL-8-49/no_sample/20231214_1201_MN41105_flg114_30d6c2a8/fastq_pass/flg114_pass_30d6c2a8_fc6ee4cc_89.fastq.gz
Error: Invalid base found in sequence in file: /var/lib/minknow/data/20231214-RL-8-49/no_sample/20231214_1201_MN41105_flg114_30d6c2a8/fastq_pass/flg114_pass_30d6c2a8_fc6ee4cc_89.fastq.gz
Error: Invalid base found in sequence in file: /var/lib/minknow/data/20231214-RL-8-49/no_sample/20231214_1201_MN41105_flg114_30d6c2a8/fastq_pass/flg114_pass_30d6c2a8_fc6ee4cc_89.fastq.gz
Error: Invalid base found in sequence in file: /var/lib/minknow/data/20231214-RL-8-49/no_sample/20231214_1201_MN41105_flg114_30d6c2a8/fastq_pass/flg114_pass_30d6c2a8_fc6ee4cc_89.fastq.gz
Error: Invalid base found in sequence in file: /var/lib/minknow/data/20231214-RL-8-49/no_sample/20231214_1201_MN41105_flg114_30d6c2a8/fastq_pass/flg114_pass_30d6c2a8_fc6ee4cc_89.fastq.gz
Error: Invalid base found in sequence in file: /var/lib/minknow/d

Processing file: /var/lib/minknow/data/20231214-RL-8-49/no_sample/20231214_1201_MN41105_flg114_30d6c2a8/fastq_pass/flg114_pass_30d6c2a8_fc6ee4cc_90.fastq.gz
Processing file: /var/lib/minknow/data/20231214-RL-8-49/no_sample/20231214_1201_MN41105_flg114_30d6c2a8/fastq_pass/flg114_pass_30d6c2a8_fc6ee4cc_91.fastq.gz
Processing file: /var/lib/minknow/data/20231214-RL-8-49/no_sample/20231214_1201_MN41105_flg114_30d6c2a8/fastq_pass/flg114_pass_30d6c2a8_fc6ee4cc_92.fastq.gz
Processing file: /var/lib/minknow/data/20231214-RL-8-49/no_sample/20231214_1201_MN41105_flg114_30d6c2a8/fastq_pass/flg114_pass_30d6c2a8_fc6ee4cc_93.fastq.gz
Processing file: /var/lib/minknow/data/20231214-RL-8-49/no_sample/20231214_1201_MN41105_flg114_30d6c2a8/fastq_pass/flg114_pass_30d6c2a8_fc6ee4cc_94.fastq.gz
Processing file: /var/lib/minknow/data/20231214-RL-8-49/no_sample/20231214_1201_MN41105_flg114_30d6c2a8/fastq_pass/flg114_pass_30d6c2a8_fc6ee4cc_95.fastq.gz


Error: Invalid base found in sequence in file: /var/lib/minknow/data/20231214-RL-8-49/no_sample/20231214_1201_MN41105_flg114_30d6c2a8/fastq_pass/flg114_pass_30d6c2a8_fc6ee4cc_95.fastq.gz
Error: Invalid base found in sequence in file: /var/lib/minknow/data/20231214-RL-8-49/no_sample/20231214_1201_MN41105_flg114_30d6c2a8/fastq_pass/flg114_pass_30d6c2a8_fc6ee4cc_95.fastq.gz
Error: Invalid base found in sequence in file: /var/lib/minknow/data/20231214-RL-8-49/no_sample/20231214_1201_MN41105_flg114_30d6c2a8/fastq_pass/flg114_pass_30d6c2a8_fc6ee4cc_95.fastq.gz
Error: Invalid base found in sequence in file: /var/lib/minknow/data/20231214-RL-8-49/no_sample/20231214_1201_MN41105_flg114_30d6c2a8/fastq_pass/flg114_pass_30d6c2a8_fc6ee4cc_95.fastq.gz
Error: Invalid base found in sequence in file: /var/lib/minknow/data/20231214-RL-8-49/no_sample/20231214_1201_MN41105_flg114_30d6c2a8/fastq_pass/flg114_pass_30d6c2a8_fc6ee4cc_95.fastq.gz
Error: Invalid base found in sequence in file: /var/lib/minknow/d

Processing file: /var/lib/minknow/data/20231214-RL-8-49/no_sample/20231214_1201_MN41105_flg114_30d6c2a8/fastq_pass/flg114_pass_30d6c2a8_fc6ee4cc_96.fastq.gz
Processing file: /var/lib/minknow/data/20231214-RL-8-49/no_sample/20231214_1201_MN41105_flg114_30d6c2a8/fastq_pass/flg114_pass_30d6c2a8_fc6ee4cc_97.fastq.gz
Processing file: /var/lib/minknow/data/20231214-RL-8-49/no_sample/20231214_1201_MN41105_flg114_30d6c2a8/fastq_pass/flg114_pass_30d6c2a8_fc6ee4cc_98.fastq.gz
Processing file: /var/lib/minknow/data/20231214-RL-8-49/no_sample/20231214_1201_MN41105_flg114_30d6c2a8/fastq_pass/flg114_pass_30d6c2a8_fc6ee4cc_99.fastq.gz
Processing file: /var/lib/minknow/data/20231214-RL-8-49/no_sample/20231214_1201_MN41105_flg114_30d6c2a8/fastq_pass/flg114_pass_30d6c2a8_fc6ee4cc_100.fastq.gz
Time taken by function: 118549 milli seconds


In [7]:
demultiplex_folder = result_folder 
print(demultiplex_folder)

/home/emre/minION_results/20231214-RL-8-49_sup


### Step 3: Call Variant with PileUP Analysis

- Call Variant with min freq of 0.4 & depth min 15

Read Summary file (Optional):


In [8]:
summary_file = analyser.read_summary_file(result_folder)
summary_file

Unnamed: 0,RBC,RBC_Score,FBC,FBC_Score
0,RB07,100.0000,unclassified,43.3531
1,RB05,100.0000,NB67,85.7023
2,RB07,100.0000,unclassified,37.9544
3,RB05,100.0000,unclassified,39.9320
4,RB08,100.0000,NB73,71.0997
...,...,...,...,...
61858,RB08,100.0000,NB93,100.0000
61859,unclassified,41.5825,,0.0000
61860,RB08,93.2489,NB38,95.9009
61861,RB05,100.0000,NB69,100.0000


In [16]:
ref_seq = Path("/home/emre/PgA9.fasta")
ref_name = "PgA9"
demultiplex_folder = result_folder 
barcode_dict = IO_processor.get_barcode_dict(demultiplex_folder, "NB", "RB")
print(demultiplex_folder)
if not skip_variant_calling:
    #variants = analyser.get_variant_df_AF(demultiplex_folder, ref_seq, ref_name, barcode_dict, merge = True, min_freq=0.4, min_depth= 10, padding=0)
    #variants = analyser.get_variant_df_custom(demultiplex_folder, ref_seq, barcode_dict, merge = True, padding = 0, min_depth=5)
    #variants = analyser.get_variant_df_AF_parallel(demultiplex_folder, ref_seq, ref_name, barcode_dict, merge = False, min_freq=0.4, min_depth= 15, num_jobs=16)
    variants = analyser_bayes_AF.get_variant_df_soft(demultiplex_folder, ref_seq, ref_name, barcode_dict, merge = False, min_depth= 5, padding=0, rowwise = False, alignment_name = "alignment_minimap_site_saturation.bam")

/home/emre/minION_results/20231214-RL-8-49_sup
Processing RB06 NB87
Alignment file in /home/emre/minION_results/20231214-RL-8-49_sup/RB06/NB87 does not exist, running alignment and indexing
Running minimap2...
Padding is 0. Implementing soft alignment
Variant: NB87/RB06 114 0.8902199327188391
Processing RB06 NB03
Alignment file in /home/emre/minION_results/20231214-RL-8-49_sup/RB06/NB03 does not exist, running alignment and indexing
Running minimap2...
Padding is 0. Implementing soft alignment
Variant: NB03/RB06 104 0.8887314377560309
Processing RB06 NB20
Alignment file in /home/emre/minION_results/20231214-RL-8-49_sup/RB06/NB20 does not exist, running alignment and indexing
Running minimap2...
Padding is 0. Implementing soft alignment
Variant: NB20/RB06 34 0.8313148788927337
Processing RB06 NB48
Alignment file in /home/emre/minION_results/20231214-RL-8-49_sup/RB06/NB48 does not exist, running alignment and indexing
Running minimap2...
Padding is 0. Implementing soft alignment
Variant:

In [11]:
variants = variants[variants["Plate"] > 4]
variants.tail(30)

KeyError: 'Plate'

In [18]:
pd.DataFrame(variants).head(180)

Unnamed: 0,RBC,FBC,Position,Variant,Alignment Probability,Alignment Count
0,RB06,NB87,"[175, 176, 177]",G175C_C176G_G177T,0.890220,114.0
1,RB06,NB03,"[308, 326, 120]",#PARENT#,0.888731,104.0
2,RB06,NB20,"[175, 176]",G175T_C176G,0.831315,34.0
3,RB06,NB48,"[175, 176, 177]",G175T_C176T_G177T,0.911719,99.0
4,RB06,NB70,"[176, 594]",C176T_C594DEL,0.862041,70.0
...,...,...,...,...,...,...
175,RB08,NB39,"[541, 542, 543]",C541A_A542G_A543G,0.764333,59.0
176,RB08,NB69,"[541, 543]",C541T_A543T,0.915556,105.0
177,RB08,NB25,"[541, 542, 543]",C541G_A542C_A543T,0.929652,125.0
178,RB05,NB87,"[55, 57]",A55G_C57G,0.884938,135.0


In [34]:
pd.DataFrame(variants).to_csv(result_folder / "variant_df.csv")

pd.DataFrame(variants).to_pickle("/home/emre/github_repo/MinION/results/2_hetcpiii_minion_errorprone/local/variants_SW_1Mio.pkl")

In [None]:
# Assuming `variants` is your dictionary

# Iterate over the range of the longest list
for i in range(len(variants['RBC'])):
    rbc = variants['RBC'][i]
    fbc = variants['FBC'][i]

    # Check if this index is beyond the length of other lists
    if i >= len(variants['Position']) or i >= len(variants['Variant']) or i >= len(variants['Alignment Count']) or i >= len(variants['Alignment Frequency']):
        print(f"Missing data for RBC: {rbc}, FBC: {fbc}")
    else:
        # Check if the data at this index matches the RBC and FBC
        position = variants['Position'][i]
        variant = variants['Variant'][i]
        alignment_count = variants['Alignment Count'][i]
        alignment_frequency = variants['Alignment Frequency'][i]

        # If any of these are 'NA', it indicates missing data
        if position == "NA" or variant == "NA" or alignment_count == "NA" or alignment_frequency == "NA":
            print(f"Incomplete data for RBC: {rbc}, FBC: {fbc}")


In [None]:
bam_file = "/home/emre/minION_results/MinION_RBC_0902723_sup/Demultiplex_cpp_70_200k_reads/RB02/NB70/alignment_minimap.bam"
template = Path("/home/emre/github_repo/MinION/minION/refseq/hetcpiii_padded.fasta")
template_seq = analyser.get_template_sequence(template)
ref_name = "HetCPIII"
min_freq = 0.4
min_depth = 15
nn_variants = analyser.call_variant_pop_frequency(bam_file, template_seq, ref_name, min_freq, min_depth, padding_start=50, padding_end=51)
for i, variant in enumerate(nn_variants["Variant"]):
    print(nn_variants)

In [None]:
alignment_count = int(subprocess.run(f"samtools view -c {bam_file}", shell=True, capture_output=True).stdout.decode("utf-8").strip())
range_positions = range(50, len(template_seq) - 50)
freq_dist = pd.DataFrame(analyser.get_highest_non_ref_base_freq_2(bam_file, ref_name, range_positions, template_seq, qualities=False)[0]).T.rename(columns={0:"Base", 1:"Frequency"})

nb_positions = analyser.get_nb_positions(freq_dist, min_freq)


nn_variants = analyser.get_pop_frequency(bam_file, template_seq, ref_name, [245], min_freq=min_freq, min_depth= 15)



In [None]:
summary = analyser.read_summary_file(demultiplex_folder)
n_counts = summary.groupby(["RBC","FBC"])["FBC"].value_counts().reset_index() 
variant_df["Variant"] = variant_df["Variant"].astype(str)
variant_df = analyser.rename_barcode(pd.DataFrame(variants).merge(n_counts, on=["RBC","FBC"] , how="left"))
def format_variant_list(variant_list):
    """ Convert a list of integer variants to a string format. """
    if isinstance(variant_list, list):
        return '_'.join(str(v) for v in variant_list)
    return variant_list

# Apply this conversion to the 'Variant' column
variant_df["Variant"] = variant_df["Variant"].apply(format_variant_list)

# Now apply the 'adjust_variant' function
variant_df["Variant"] = variant_df["Variant"].apply(lambda x: analyser.adjust_variant(x, 50))


# Index Finder

In [None]:
demultiplex_folder = Path("/home/emre/minION_results/20230905_errorprone-3_test_sup/basecalled_indexed")
path_to_index = "/home/emre/minION_results/20230905_errorprone-3_test_sup/basecalled_filtered"
ref_seq = Path("/home/emre/github_repo/MinION/minION/refseq/hetcpiii.fasta")
index_file = "output_index.txt"
path_to_code = "/home/emre/tutorials/sequence_tut/source/test_main"
prompt = f"{path_to_code} {path_to_index} {demultiplex_folder} {100} {100}"
subprocess.run(prompt, shell=True)