# Every Variant Sequencing with Oxford Nanopore Technologies

This script is run after sequencing. It can either basecall the raw pod5 files or work directly with reads that have already been basecalled (fastq.gz).

## Workflow

### 1. Basecalling (Optional)

- The raw reads are stored in the main ONT data folder (e.g., /var/lib/minknow/data). Enter the experiment name as input.
- Sequences are basecalled with the model of choice. If enough computational power is available, we recommend the "sup" model.

### 2. Demultiplexing 
- Each read is assigned to a well/plate combination.

### 3. Variant Calling
- Minimap2 is used to create the Multiple Sequence Alignment (MSA)
- A base-frequency caller is used for variant calling



### Packages 

In [1]:
# Import all packages

import sys
sys.path.append("/home/emre/github_repo/MinION")

from minION.util import IO_processor
from minION import analyser
from minION import consensus
from minION import demultiplexer
from minION import basecaller
from minION.util import globals
from minION import analyser_bayes_AF
from minION.variantcaller import *

from pathlib import Path
import pandas as pd

import matplotlib.pyplot as plt
import subprocess

In [3]:
import glob

# Folder to inspect for fastq files.
output_dir = Path("/home/emre/minION_results/test_minion/NB23")

# Show every *.fastq file located directly inside that folder.
[fastq_path for fastq_path in output_dir.glob("*.fastq")]

[PosixPath('/home/emre/minION_results/test_minion/NB23/demultiplexed_RB06_NB23_000.fastq')]

### Meta Data 

- Provide the following arguments:

- Result Path: Path where the minion result folder will be created. All experiment results are then stored within the folder
- Experiment Name: The experiment name is assigned when running the sequencer. Use the same name for identification


In [2]:
# General settings

# Target location for results and the sequencing run to process.
result_path = Path("/home/emre/")
experiment_name = "20240112-RL-8Plates-FLongle-2"
file_to_experiment = f"/var/lib/minknow/data/{experiment_name}"
basecall_model_type = "sup"

# Result folder for this experiment / basecall model combination.
result_folder = IO_processor.create_folder(
    experiment_name, basecall_model_type, target_path=result_path
)

barcode_file = "/home/emre/github_repo/MinION/minION/barcoding/minion_barcodes_pga9.fasta"

# Make sure the folder for basecalled reads exists inside the result folder.
basecall_folder = result_folder / "basecalled"
basecall_folder.mkdir(parents=True, exist_ok=True)

# Folder where pod5 files are located
experiment_folder = IO_processor.find_experiment_folder(experiment_name)

# Switches for the individual steps, so that a step is not re-run accidentally.
skip_basecalling = True
skip_demultiplex = False
skip_variant_calling = False

# One path per line: basecall folder first, then the experiment folder.
print(basecall_folder, experiment_folder, sep="\n")

/home/emre/minION_results/20240112-RL-8Plates-FLongle-2_sup/basecalled
/var/lib/minknow/data/20240112-RL-8Plates-FLongle-2


### Step 1 (Optional): Basecall reads

- Basecalling can usually be done while sequencing (if a GPU is available)
- Otherwise, basecall afterwards

In [3]:
# Basecall the raw reads only when basecalling has not been switched off in the settings.
if not skip_basecalling:
    # Look up the "pod5" folder inside the experiment folder.
    pod5_files = IO_processor.find_folder(experiment_folder, "pod5")
    # Basecall with the configured model type; basecall_folder is passed as the third argument.
    # NOTE(review): `fastq = True` presumably requests fastq output — confirm in basecaller.run_dorado.
    basecaller.run_dorado(basecall_model_type, pod5_files, basecall_folder, fastq = True)

In [4]:
# Find fastq files
# Looks up the "fastq_pass" folder inside the experiment folder; the result is
# handed to the demultiplexer as its -f argument in the next step.
file_to_fastq = IO_processor.find_folder(experiment_folder, "fastq_pass")
print(file_to_fastq)

/var/lib/minknow/data/20240112-RL-8Plates-FLongle-2/no_sample/20240112_1646_MN45017_flg114_9ac1102b/fastq_pass


### Step 2: Demultiplex with SW
- Demultiplex with SW 

In [5]:
# Run the external demultiplexer only when demultiplexing has not been switched off.
if not skip_demultiplex:
    path_to_code = "/home/emre/github_repo/MinION/source/source/demultiplex"
    # Argument list instead of a shell string: every value reaches the program
    # verbatim, so paths containing spaces or shell metacharacters cannot break
    # or alter the command.
    prompt = [
        path_to_code,
        "-f", str(file_to_fastq),
        "-d", str(result_folder),
        "-b", str(barcode_file),
        "-w", "100",
        "-r", "100",
    ]
    completed = subprocess.run(prompt)
    # Make a failed run visible instead of silently continuing with the next steps.
    if completed.returncode != 0:
        print(f"Demultiplexing exited with return code {completed.returncode}")

Processed argument: -f with value: /var/lib/minknow/data/20240112-RL-8Plates-FLongle-2/no_sample/20240112_1646_MN45017_flg114_9ac1102b/fastq_pass
Processed argument: -d with value: /home/emre/minION_results/20240112-RL-8Plates-FLongle-2_sup
Processed argument: -b with value: /home/emre/github_repo/MinION/minION/barcoding/minion_barcodes_pga9.fasta
Processed argument: -w with value: 100
Processed argument: -r with value: 100
Number of files: 305
Processing files: [##################################################] 100%


In [None]:
# result_folder was passed to the demultiplexer as its -d argument, so the same
# folder is used as the demultiplex folder in the following steps.
demultiplex_folder = result_folder 
print(demultiplex_folder)

### Step 3: Call Variant with PileUP Analysis

- Call variants with a minimum frequency of 0.4 and a minimum depth of 15

Read Summary file (Optional):


In [None]:
# Read the summary for this run from result_folder and display it as the cell output.
summary_file = analyser.read_summary_file(result_folder)
summary_file

In [None]:
# Reference sequence file and the reference name handed to the variant caller.
ref_seq = Path("/home/emre/PgA9.fasta")
ref_name = "PgA9"

# Variant calling works on the demultiplexed reads in the result folder.
demultiplex_folder = result_folder
barcode_dict = IO_processor.get_barcode_dict(demultiplex_folder, "NB", "RB")
print(demultiplex_folder)

# Call variants unless this step has been switched off in the settings.
if not skip_variant_calling:
    variants = analyser_bayes_AF.get_variant_df_soft(
        demultiplex_folder,
        ref_seq,
        ref_name,
        barcode_dict,
        merge=False,
        min_depth=5,
        padding=0,
        rowwise=False,
        alignment_name="alignment_minimap_site_saturation.bam",
    )

In [None]:
# Keep only the rows whose "Plate" value is greater than 4, then display the
# last 30 of the remaining rows.
variants = variants.loc[variants["Plate"].gt(4)]
variants.tail(30)

In [None]:
# Preview the first 180 rows of the variant table.
pd.DataFrame(variants).head(180)

In [None]:
# Build the variant table once and persist it in two formats: a CSV inside the
# result folder and a pickle at a fixed path in the repository's results folder.
variant_table = pd.DataFrame(variants)
variant_table.to_csv(result_folder / "variant_df.csv")
variant_table.to_pickle("/home/emre/github_repo/MinION/results/2_hetcpiii_minion_errorprone/local/variants_SW_1Mio.pkl")

In [None]:
# Sanity check: report RBC/FBC pairs whose variant record is missing or incomplete.
#
# `variants` may be a dictionary of lists or a DataFrame. Each column is copied
# into a plain list first so that `[i]` below is always positional. On a
# DataFrame whose rows have been filtered, `variants[column][i]` would look up
# the index *label* `i` and raise KeyError for labels that were dropped.
required_columns = ("Position", "Variant", "Alignment Count", "Alignment Frequency")
rbc_column = list(variants["RBC"])
fbc_column = list(variants["FBC"])
data_columns = [list(variants[name]) for name in required_columns]

# Iterate over the RBC column, which is taken to be the longest one.
for i, rbc in enumerate(rbc_column):
    fbc = fbc_column[i]

    # The row index lies beyond the end of at least one data column.
    if any(i >= len(column) for column in data_columns):
        print(f"Missing data for RBC: {rbc}, FBC: {fbc}")
        continue

    # A value of "NA" in any of the fields marks an incomplete record.
    if any(column[i] == "NA" for column in data_columns):
        print(f"Incomplete data for RBC: {rbc}, FBC: {fbc}")


In [None]:
# Call variants for a single alignment file against the HetCPIII template.
bam_file = "/home/emre/minION_results/MinION_RBC_0902723_sup/Demultiplex_cpp_70_200k_reads/RB02/NB70/alignment_minimap.bam"
template = Path("/home/emre/github_repo/MinION/minION/refseq/hetcpiii_padded.fasta")
template_seq = analyser.get_template_sequence(template)
ref_name = "HetCPIII"

# Thresholds handed to the variant caller.
min_freq = 0.4
min_depth = 15
nn_variants = analyser.call_variant_pop_frequency(bam_file, template_seq, ref_name, min_freq, min_depth, padding_start=50, padding_end=51)

# Print every called variant once. The previous loop ignored its loop variables
# and printed the complete `nn_variants` object once per variant.
for variant in nn_variants["Variant"]:
    print(variant)

In [None]:
# Count the alignments in the BAM file with `samtools view -c`.
# The command is passed as an argument list (no shell), and check=True makes a
# failing samtools call raise CalledProcessError instead of surfacing later as
# an unrelated ValueError from int('').
samtools_count = subprocess.run(
    ["samtools", "view", "-c", str(bam_file)],
    capture_output=True,
    text=True,
    check=True,
)
alignment_count = int(samtools_count.stdout.strip())

# Positions to inspect: the template without its first and last 50 positions.
range_positions = range(50, len(template_seq) - 50)

# First element of the helper's result, transposed into a two-column table.
base_frequencies = analyser.get_highest_non_ref_base_freq_2(bam_file, ref_name, range_positions, template_seq, qualities=False)[0]
freq_dist = pd.DataFrame(base_frequencies).T.rename(columns={0: "Base", 1: "Frequency"})

nb_positions = analyser.get_nb_positions(freq_dist, min_freq)

# Population frequency at the single position 245.
nn_variants = analyser.get_pop_frequency(bam_file, template_seq, ref_name, [245], min_freq=min_freq, min_depth=15)



In [None]:
def format_variant_list(variant_list):
    """Join the entries of a variant list into one underscore-separated string.

    Values that are not a list are returned unchanged.
    """
    if isinstance(variant_list, list):
        return '_'.join(str(v) for v in variant_list)
    return variant_list


summary = analyser.read_summary_file(demultiplex_folder)
# NOTE(review): grouping by "FBC" and then counting "FBC" yields two index
# levels with the same name; depending on the pandas version reset_index() may
# raise "cannot insert FBC, already exists" — confirm the intended count column.
n_counts = summary.groupby(["RBC","FBC"])["FBC"].value_counts().reset_index()

# Create variant_df before touching its columns. Previously the "Variant"
# column was accessed one line before variant_df was assigned, which raises
# NameError on a fresh kernel.
variant_df = analyser.rename_barcode(pd.DataFrame(variants).merge(n_counts, on=["RBC","FBC"], how="left"))

# Collapse list-valued variants first, then make sure every entry is a string.
# Converting to str before this step would turn lists into their "[...]"
# representation and make format_variant_list a no-op.
variant_df["Variant"] = variant_df["Variant"].apply(format_variant_list)
variant_df["Variant"] = variant_df["Variant"].astype(str)

# Now apply the 'adjust_variant' function
variant_df["Variant"] = variant_df["Variant"].apply(lambda x: analyser.adjust_variant(x, 50))


# Index Finder

In [None]:
# Input/output locations and reference for the index finder run.
demultiplex_folder = Path("/home/emre/minION_results/20230905_errorprone-3_test_sup/basecalled_indexed")
path_to_index = "/home/emre/minION_results/20230905_errorprone-3_test_sup/basecalled_filtered"
ref_seq = Path("/home/emre/github_repo/MinION/minION/refseq/hetcpiii.fasta")
index_file = "output_index.txt"
path_to_code = "/home/emre/tutorials/sequence_tut/source/test_main"

# Argument list instead of a shell string: every value reaches the program
# verbatim, so paths containing spaces or shell metacharacters cannot break or
# alter the command.
prompt = [path_to_code, path_to_index, str(demultiplex_folder), "100", "100"]
completed = subprocess.run(prompt)
# Make a failed run visible instead of silently ignoring the exit status.
if completed.returncode != 0:
    print(f"Index finder exited with return code {completed.returncode}")