# Command line interface

In [None]:
#| hide
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [None]:
#| default_exp cli

In [None]:
#| hide
from nbdev.showdoc import *


In [None]:
#| export
from fastcore.basics import *
from Bio import SeqIO
import gzip as gz
import os
from collections import defaultdict, Counter
import numpy as np
import itertools
import click
import csv
from dgrec.utils import get_mutations, mut_to_str
from dgrec.genotypes import get_genotypes
from dgrec.genotypes_paired import get_genotypes_paired

In [None]:
#| hide
from dgrec.example_data import get_example_data_dir

In [None]:
#| hide
# Directory holding the example data shipped in dgrec.example_data;
# the CLI test cells below build their input paths from it.
data_path=get_example_data_dir()
# Display the available example files.
os.listdir(data_path)

['paired_example1_R2.fastq.gz',
 'sacB_genotypes.csv',
 'sacB_ref.fasta',
 '__pycache__',
 'model_mms_2024_02_14.pickle',
 'sacB_example.fastq.gz',
 'example1_ref.fasta',
 'paired_example1_R1.fastq.gz',
 '__init__.py']

In [None]:
#| hide
import subprocess

In [None]:
#| export
# Command line interface: root group that the sub-commands attach to.
@click.group()
def dgrec():
    """dgrec command line interface: compute genotypes from sequencing reads."""

In [None]:
#| export

@dgrec.command('genotypes')
@click.argument('fastq', type=click.Path(exists=True))
@click.argument('ref', type=click.Path(exists=True))
@click.option('--umi_size', '-u', default=10, help="Number of nucleotides at the beginning of the read that will be used as the UMI")
@click.option('--quality_threshold', '-q', default=10, help="threshold value used to filter out reads of poor average quality")
@click.option('--ignore_pos', '-i', default=[], multiple=True, type=int, help="position that is ignored in the genotype. Repeat the option to ignore several positions, e.g. -i 0 -i 1 -i 149 -i 150")
@click.option('--reads_per_umi_thr', '-r', default=0, help="minimum number of reads required to take a UMI into account. Using a number >2 enables to perform error correction for UMIs with multiple reads")
@click.option('--save_umi_data','-s', default=None, help="path to a csv file to save the details of the genotypes reads for each UMI. If None the data isn't saved.")
@click.option('--output', '-o', default="genotypes.csv", help="output file path")
def genotypes(fastq, ref, umi_size, quality_threshold, ignore_pos, reads_per_umi_thr, save_umi_data, output):
    """Compute genotypes from single-end reads with dgrec.genotypes.get_genotypes.

    FASTQ is the reads file and REF a fasta file whose first record is used
    as the reference sequence. The result is written to the output file as
    one tab-separated "genotype, count" line per genotype.
    """
    # Only the first record of the fasta file is used as reference.
    ref_record = next(SeqIO.parse(ref, "fasta"))
    ref_seq = str(ref_record.seq)

    gen_list = get_genotypes(fastq, ref_seq,
                             umi_size=umi_size,
                             quality_threshold=quality_threshold,
                             # click yields a tuple of ints (type=int above);
                             # hand over a plain list of integer positions.
                             ignore_pos=list(ignore_pos),
                             reads_per_umi_thr=reads_per_umi_thr,
                             save_umi_data=save_umi_data)

    with open(output, "w") as handle:
        for genotype, count in gen_list:
            handle.write(f"{genotype}\t{count}\n")

In [None]:
#| hide
#Testing the cli

fastq_file="sacB_example.fastq.gz"
read_ref_file="sacB_ref.fasta"
# Write to a dedicated file so the cleanup only ever touches what this test
# created (never unrelated .csv files of the working directory).
output_file="test_genotypes_cli.csv"

try:
    # capture_output is required for result.stdout/stderr to be populated;
    # without it they are None.
    result = subprocess.run(["dgrec","genotypes",
                             os.path.join(data_path,fastq_file),
                             os.path.join(data_path,read_ref_file),
                             "-o", output_file,
                             ],
                            capture_output=True, text=True)
    print(result.stdout)  # Print the standard output of the command
    print(result.returncode)  # Get the exit code of the command
    # Fail the test (showing stderr) if the command did not succeed
    assert result.returncode == 0, result.stderr
finally:
    # Remove the test output file, even if the command or assertion failed
    try:
        os.remove(output_file)
    except PermissionError:
        print(f"Permission denied to delete file '{output_file}'.")
    except FileNotFoundError:
        pass  # Nothing to clean up if the command did not write the file



n reads:	1000
n_reads pass filter:	955
n_reads aligned:	912
Number of UMIs: 902
Median number of reads per UMI: 1.0
Number of genotypes: 185
None
0


In [None]:
#| export

@dgrec.command('genotypes_paired')
@click.argument('fastq_fwd', type=click.Path(exists=True))
@click.argument('fastq_rev', type=click.Path(exists=True))
@click.argument('ref', type=click.Path(exists=True))
@click.option('--fwd_span', nargs=2, type=(int, int), required = True, 
              help ="Span of the reference sequence read in the forward orientation format: (start, end)")
@click.option('--rev_span', nargs=2, type=(int, int), required = True, 
              help ="Span of the reference sequence read in the reverse orientation format: (start, end)")
@click.option('--require_perfect_pair_agreement/--no_require_perfect_pair_agreement', '-p', default=True,
              help="Require perfect pair agreement for genotype calling (default: True). "
                   "With --no_require_perfect_pair_agreement, the forward sequence will be used in case of disagreement.")
@click.option('--umi_size_fwd', '-u1', default=10,
              help="Number of nucleotides at the beginning of the fwd read that will be used as the UMI (default: 10)")
@click.option('--umi_size_rev', '-u2', default=0,
              help="Number of nucleotides at the beginning of the rev read that will be used as the UMI (default: 0)")
@click.option('--quality_threshold', '-q', default=30,
              help="Threshold value used to filter out reads of poor average quality (default: 30)")
@click.option('--ignore_pos', '-i', default=[], multiple=True, type=int,
              help="Position that is ignored in the genotype. Repeat the option to ignore several positions, e.g. -i 0 -i 1 (default: none)")
@click.option('--reads_per_umi_thr', '-r', default=0,
              help="Minimum number of reads required to take a UMI into account (default: 0). "
                   "Using a number >2 enables to perform error correction for UMIs with multiple reads")
@click.option('--save_umi_data','-s', default=None,
              help="Path to a csv file to save the details of the genotypes reads for each UMI. If None the data isn't saved (default: None)")
@click.option('-n', default=None, type=int, help="Number of reads to use. If None all the reads are used (default: None)")
@click.option('--output', '-o', default="genotypes.csv", help="Output file path")
def genotypes_paired(fastq_fwd, fastq_rev, ref, fwd_span, rev_span, require_perfect_pair_agreement, umi_size_fwd, umi_size_rev, quality_threshold, ignore_pos, reads_per_umi_thr, save_umi_data, n, output):
    """Compute genotypes from paired-end reads with dgrec.genotypes_paired.get_genotypes_paired.

    FASTQ_FWD and FASTQ_REV are the two read files and REF a fasta file whose
    first record is used as the reference sequence. The result is written to
    the output file as one tab-separated "genotype, count" line per genotype.
    """
    # Only the first record of the fasta file is used as reference.
    ref_record = next(SeqIO.parse(ref, "fasta"))
    ref_seq = str(ref_record.seq)

    gen_list = get_genotypes_paired(fastq_fwd, fastq_rev, ref_seq,
                                    fwd_span=fwd_span,
                                    rev_span=rev_span,
                                    require_perfect_pair_agreement=require_perfect_pair_agreement,
                                    umi_size_fwd=umi_size_fwd,
                                    umi_size_rev=umi_size_rev,
                                    quality_threshold=quality_threshold,
                                    # click yields a tuple of ints (type=int above);
                                    # hand over a plain list of integer positions.
                                    ignore_pos=list(ignore_pos),
                                    reads_per_umi_thr=reads_per_umi_thr,
                                    save_umi_data=save_umi_data,
                                    # click already converted -n to int; None
                                    # (option omitted) is passed through as is
                                    # instead of crashing in int(None).
                                    N=n)

    with open(output, "w") as handle:
        # Distinct loop names so the `n` option is not shadowed.
        for genotype, count in gen_list:
            handle.write(f"{genotype}\t{count}\n")


In [None]:
#| hide
# Paths to the example paired-end reads and their reference
# NOTE(review): the R2 file is passed as the forward read and R1 as the
# reverse read — confirm this is intended for this example.
fastq_fwd = os.path.join(data_path,"paired_example1_R2.fastq.gz")
fastq_rev = os.path.join(data_path,"paired_example1_R1.fastq.gz")
ref_file = os.path.join(data_path,"example1_ref.fasta")
# Define output file path
output_file = "test_genotypes.csv"


try:
    # Test the command with arguments.
    # capture_output is required for result.stdout/stderr to be populated;
    # without it they are None.
    result = subprocess.run(["dgrec","genotypes_paired",
                             fastq_fwd, fastq_rev, ref_file,
                             "--fwd_span", "0", "150",
                             "--rev_span", "0", "0",
                             "-u1", "0", "-u2", "10", "-n", "10",
                             "-o", output_file],
                            capture_output=True, text=True)
    print(result.stdout)  # Print the standard output of the command
    print(result.returncode)  # Get the exit code of the command
    # Assert successful execution - exit code 0 (stderr shown on failure)
    assert result.returncode == 0, result.stderr
finally:
    # Cleanup - remove the output file, even if the command or assertion failed
    try:
        os.remove(output_file)
    except FileNotFoundError:
        pass  # Ignore if file not found




n reads:	10
n_reads pass filter:	10
n_reads aligned:	10
n_pairs agree:	0
Number of UMIs: 10
Median number of reads per UMI: 1.0
Number of genotypes: 5
None
0


In [None]:
#| hide
import nbdev; nbdev.nbdev_export()