# genotypes

> Functions to go from a fastq file to a list of genotypes

In [None]:
#| hide
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [None]:
#| default_exp genotypes

In [None]:
#| hide
from nbdev.showdoc import *


In [None]:
#| export
from fastcore.basics import *
from Bio import SeqIO
import gzip as gz
import os
from collections import defaultdict, Counter
import numpy as np
import itertools
import click
import csv
from dgrec.utils import get_mutations, mut_to_str

In [None]:
#| hide
from dgrec.example_data import get_example_data_dir

In [None]:
#| hide
# Locate the example data directory and list its content.
# data_path is reused by the example cells further down.
data_path=get_example_data_dir()
os.listdir(data_path)

['paired_example1_R2.fastq.gz',
 'sacB_genotypes.csv',
 'sacB_ref.fasta',
 '__pycache__',
 'model_mms_2024_02_14.pickle',
 'sacB_example.fastq.gz',
 'example1_ref.fasta',
 'paired_example1_R1.fastq.gz',
 '__init__.py']

In [None]:
#| export

def get_UMI_genotype(fastq_path: str, #path to the input fastq file
                     ref_seq: str, #sequence of the reference amplicon
                     umi_size: int = 10, #number of nucleotides at the beginning of the read that will be used as the UMI
                     quality_threshold: int = 30, #threshold value used to filter out reads of poor average quality
                     ignore_pos: list = [], #list of positions that are ignored in the genotype
                     max_mutations: int = 14, #reads with more than max_mutations mutations (after removing ignored positions) are discarded
                     **kwargs #alignment parameters can be passed here (match, mismatch, gap_open, gap_extend)
                     ) -> dict:
    
    """Takes as input a gzipped fastq file of single read amplicon sequencing, and a reference amplicon sequence.
       Returns a dictionary containing as keys UMIs and as values a Counter of all genotype strings read for that UMI.

       A read is kept only if its mean phred quality is strictly above `quality_threshold` and if it
       carries at most `max_mutations` mutations once the positions in `ignore_pos` have been removed.
       A short summary (read counts, number of UMIs, median reads per UMI) is printed.
    """
    # Default alignment parameters. Only these four keys can be overridden through kwargs;
    # any other keyword argument is silently ignored.
    align_param={"match":2,
                 "mismatch":-1, 
                 "gap_open":-1, 
                 "gap_extend":-.5,
                 }
    
    for arg in kwargs:
        if arg in align_param:
            align_param[arg]=kwargs[arg]

    n_reads=0
    n_reads_pass_Qfilter=0
    n_reads_aligned=0
    UMI_dict=defaultdict(list) #UMI -> list of genotype strings, one per retained read

    with gz.open(fastq_path,'rt') as handle:
        for r in SeqIO.parse(handle,"fastq"):
            n_reads+=1
            meanScore=np.mean(r.letter_annotations['phred_quality'])

            if meanScore>quality_threshold:
                n_reads_pass_Qfilter+=1
                # The first umi_size nucleotides are the UMI, the rest of the read is aligned to the reference.
                umi=str(r.seq[:umi_size])
                mutations=get_mutations(ref_seq,r.seq[umi_size:], **align_param)
                if ignore_pos:
                    # m[1] is compared to the ignored positions (presumably the position field of a mutation - confirm in dgrec.utils)
                    mutations = [m for m in mutations if m[1] not in ignore_pos]
                # Reads with too many mutations are most likely misaligned or off-target and are dropped.
                if len(mutations)<=max_mutations:
                    n_reads_aligned+=1
                    UMI_dict[umi].append(mut_to_str(mutations))
    
    log='n reads:\t{}\nn_reads pass filter:\t{}\nn_reads aligned:\t{}\n'.format(n_reads,n_reads_pass_Qfilter,n_reads_aligned)
    log+=f"Number of UMIs: {len(UMI_dict)}\n"
    
    umi_readcounts=[len(genotypes) for genotypes in UMI_dict.values()]
    UMI_gencounter={umi: Counter(genotypes) for umi, genotypes in UMI_dict.items()}

    # np.median of an empty list returns nan and emits a RuntimeWarning: report 0.0 when no UMI was retained.
    median_reads=np.median(umi_readcounts) if umi_readcounts else 0.0
    log+=f"Median number of reads per UMI: {median_reads}"
    print(log)
    return UMI_gencounter

In [None]:
# Example input: a gzipped fastq file taken from the example data directory.
fastq_file="sacB_example.fastq.gz"
fastq_path=os.path.join(data_path,fastq_file)

# The reference amplicon is the first record of the fasta file.
read_ref_file="sacB_ref.fasta"
ref=next(SeqIO.parse(os.path.join(data_path,read_ref_file),"fasta"))
ref_seq=str(ref.seq)

# Positions listed in ignore_pos are removed from the genotypes; gap_open overrides the default alignment value (-1).
UMI_gencounter = get_UMI_genotype(fastq_path, ref_seq, ignore_pos=[0,1,2,138,139,140,141], gap_open=-5)

# Show the first 20 UMIs with their (genotype string, read count) pairs.
for umi in itertools.islice(UMI_gencounter,20):
    print(umi, list(UMI_gencounter[umi].items()))

n reads:	1000
n_reads pass filter:	847
n_reads aligned:	824
Number of UMIs: 814
Median number of reads per UMI: 1.0
GCATANCTCA [('A61G,-63T,A76T,A91T', 1)]
CGCATNTATA [('', 1)]
CCTTGNAGTA [('', 1)]
GGCGCNAGAA [('', 1)]
TCTCTTGTGA [('', 1)]
ATTACAGAAT [('', 1)]
CTTTTACTAT [('', 1)]
TCAAAGTTTT [('A79T,A91G', 1)]
TTAGCTCATA [('', 1)]
TCATAATGTA [('', 1)]
ATGTGCGGAT [('', 1)]
TGTGTTTATA [('', 1)]
CCATACATCC [('', 1)]
AGGGACGTTT [('A61G,A72G,A76G,A79T', 1)]
GTGTAATAGC [('', 1)]
ATGTCTTTTA [('', 1)]
TATCGGTAGT [('', 1)]
GTCGGGGGGG [('', 1)]
AAGTGGCACA [('', 1)]
AATAGAACCT [('T108A,G127T,G132T', 1)]


In [None]:
#| export

def correct_UMI_genotypes(UMI_gencounter: dict, #the output of the get_UMI_genotype function
                          reads_per_umi_thr: int = 2 #only assign a genotype to a UMI if we have reads_per_umi_thr reads for that genotype or more
                          ) -> dict:
    """Keeps only the genotype with the most reads for each UMI.
    Returns a dictionary with UMIs as keys and the retained genotype string as value.
    UMIs whose most frequent genotype has fewer than reads_per_umi_thr reads, or that have
    no read at all, are left out of the result.
    """
    UMI_gen_dict={}
    for umi, gen_counter in UMI_gencounter.items():
        top=gen_counter.most_common(1)
        if not top: #empty Counter: there is no read to call a genotype from
            continue
        gen, n = top[0]
        if n>=reads_per_umi_thr: #only assign a genotype to a UMI if we have reads_per_umi_thr reads for that genotype or more
            UMI_gen_dict[umi]=gen

    return UMI_gen_dict

In [None]:
# With the default reads_per_umi_thr=2, only UMIs whose most frequent genotype has at least 2 reads are kept.
correct_UMI_genotypes(UMI_gencounter)

{'CTCCGGGGAG': '',
 'TGCTTGAGTG': 'A79T',
 'AGGGCGGGCT': '',
 'ATTTCTGTTT': '',
 'TGGGGGGGCT': '',
 'GATTGGTAGA': '',
 'GAACTCTAGT': '',
 'TAACTAATCG': 'A79G,A86G,A91G'}

In [None]:
#| export

def genotype_UMI_counter(UMI_gen_dict):
    """Counts how many UMIs carry each genotype in the output of correct_UMI_genotypes().
    Returns a list of (genotype, number of UMIs) tuples sorted from the most to the least frequent genotype."""
    # most_common() without argument returns every item, sorted by decreasing count
    return Counter(UMI_gen_dict.values()).most_common()


In [None]:
# reads_per_umi_thr=0 keeps every UMI, including those seen in a single read.
UMI_gen_dict=correct_UMI_genotypes(UMI_gencounter, reads_per_umi_thr=0)
gen_list = genotype_UMI_counter(UMI_gen_dict)
# Print the 20 most frequent genotypes as "<number of UMIs>\t<genotype string>".
for g in gen_list[:20]:
    print(f"{g[1]}\t{g[0]}")

675	
3	C56A
3	A76G
3	A91G
3	A91T
2	C69T
2	T122A
2	A91C
2	A105G
2	C116A
2	T60A
2	T59A
2	A68G
2	T134A
1	A61G,-63T,A76T,A91T
1	A79T,A91G
1	A61G,A72G,A76G,A79T
1	T108A,G127T,G132T
1	A48T,A86G
1	A61T,A68T,A72G,A79C,A91G


In [None]:
#| export

def get_genotypes(fastq_path: str, #path to the input fastq file
                    ref_seq: str, #sequence of the reference amplicon
                    umi_size: int = 10, #number of nucleotides at the begining of the read that will be used as the UMI
                    quality_threshold: int = 30, #threshold value used to filter out reads of poor average quality
                    ignore_pos: list = [], #list of positions that are ignored in the genotype
                    reads_per_umi_thr: int = 0, #minimum number of reads required to take a UMI into account. Using a number >2 enables to perform error correction for UMIs with multiple reads.
                    save_umi_data: str = None, #path to the csv file where to save the details of the genotypes reads for each UMI. If None the data isn't saved.
                    **kwargs, #alignment parameters can be passed here (match, mismatch, gap_open, gap_extend)
                    ):
    """Putting things together in a single wrapper function that takes the fastq as input and returns the list of genotypes.

    The result is a list of (genotype string, number of UMIs) tuples sorted by decreasing number of UMIs.
    If save_umi_data is given, one tab-separated row per UMI is written to that file:
    the UMI followed by the list of (genotype string, number of reads) pairs observed for it.
    """
    UMI_dict = get_UMI_genotype(fastq_path, ref_seq, umi_size, quality_threshold, ignore_pos, **kwargs)
    if save_umi_data:
        with open(save_umi_data,"w", newline='') as handle: 
            csv_writer = csv.writer(handle,delimiter="\t",doublequote=False)
            # Write every UMI (not only the first 20) so that the file holds the details of all UMIs.
            for umi, gen_counter in UMI_dict.items():
                csv_writer.writerow([umi,list(gen_counter.items())])

    UMI_gen_dict=correct_UMI_genotypes(UMI_dict, reads_per_umi_thr)
    gen_list = genotype_UMI_counter(UMI_gen_dict)
    print("Number of genotypes:", len(gen_list))
    return gen_list
    

In [None]:
# Same example fastq and reference as above, this time run through the single wrapper function.
fastq_file="sacB_example.fastq.gz"
fastq_path=os.path.join(data_path,fastq_file)
read_ref_file="sacB_ref.fasta"
ref=next(SeqIO.parse(os.path.join(data_path,read_ref_file),"fasta"))
ref_seq=str(ref.seq)
# save_umi_data="test.csv" is a relative path, so the per-UMI details are written relative to the current working directory.
gen_list = get_genotypes(fastq_path, ref_seq, 
                         ignore_pos=[0,1,2,138,139,140,141],
                         gap_open=-4, 
                         save_umi_data="test.csv")
# Print the 20 most frequent genotypes as "<number of UMIs>\t<genotype string>".
for g in gen_list[:20]:
    print(f"{g[1]}\t{g[0]}")

n reads:	1000
n_reads pass filter:	847
n_reads aligned:	824
Number of UMIs: 814
Median number of reads per UMI: 1.0
Number of genotypes: 123
675	
3	C56A
3	A76G
3	A91G
3	A91T
2	C69T
2	T122A
2	A91C
2	A105G
2	C116A
2	T60A
2	T59A
2	A68G
2	T134A
1	A61G,-63T,A76T,A91T
1	A79T,A91G
1	A61G,A72G,A76G,A79T
1	T108A,G127T,G132T
1	A48T,A86G
1	A61T,A68T,A72G,A79C,A91G


In [None]:
#| hide
# Run nbdev's export step (kept as the last cell of the notebook).
import nbdev; nbdev.nbdev_export()