# genotypes_paired

> Functions to go from a pair of fastq files to a list of genotypes

In [None]:
#| hide
%load_ext autoreload
%autoreload 2

In [None]:
#| default_exp genotypes_paired

In [None]:
#| hide
from nbdev.showdoc import *


In [None]:
#| export
from fastcore.basics import *
from Bio import SeqIO
import gzip as gz
import os
from collections import defaultdict, Counter
import numpy as np
import itertools
import click
import csv
from dgrec.utils import get_mutations, mut_to_str
from dgrec.genotypes import get_mutations, correct_UMI_genotypes, genotype_UMI_counter



In [None]:
#| hide
from dgrec.example_data import get_example_data_dir

In [None]:
#| hide
# Directory holding the example files used by the demo cells below; the listing is
# displayed as the cell output so the available file names are visible.
data_path=get_example_data_dir()
os.listdir(data_path)

['paired_example1_R2.fastq.gz',
 'sacB_genotypes.csv',
 'sacB_ref.fasta',
 '__pycache__',
 'model_mms_2024_02_14.pickle',
 'sacB_example.fastq.gz',
 'example1_ref.fasta',
 'paired_example1_R1.fastq.gz',
 '__init__.py']

In [None]:
#| export

def get_UMI_genotype_paired(fastq_path_fwd: str, #path to the input fastq file reading the ref_seq in the forward orientation
                            fastq_path_rev: str, #path to the input fastq file reading the ref_seq in the reverse orientation
                            ref_seq: str, #sequence of the reference amplicon
                            fwd_span: tuple, #span of the ref_seq that is read in the forward orientation format: (start, end)
                            rev_span: tuple, #span of the ref_seq that is read in the reverse orientation format: (start, end)
                            require_perfect_pair_agreement: bool = True, #if True only pairs of reads that perfectly agree on the sequence within the common span will be used. If False the fwd sequence will be used. Will be set to False by default if there is no overlap.
                            umi_size_fwd: int = 10, #number of nucleotides at the beginning of the fwd read that will be used as the UMI
                            umi_size_rev: int = 0, #number of nucleotides at the beginning of the rev read that will be used as the UMI (if both are provided the umi will be the concatenation of both)
                            quality_threshold: int = 30, #threshold value used to filter out reads of poor average quality. Both reads have to pass the threshold.
                            ignore_pos: list = [], #list of positions that are ignored in the genotype
                            N = None, #number of reads to consider (useful to get a quick view of the data without going through the whole fastq files). If None the whole data will be used.
                            **kwargs, #alignment parameters can be passed here (match, mismatch, gap_open, gap_extend)
                            ) -> dict:
    """Collect, for each UMI, the genotypes observed in a pair of gzipped fastq files.

    The two files are read in lockstep (pairs are formed in file order). A pair is kept
    when the mean phred quality of both reads is strictly above `quality_threshold`.
    The UMI is the concatenation of the first `umi_size_fwd` nucleotides of the forward
    read and the first `umi_size_rev` nucleotides of the reverse read; the remainder of
    the reverse read is reverse-complemented.

    When the forward span ends after the start of a non-empty reverse span the two reads
    overlap: the consensus is the forward read followed by the part of the reverse read
    located after the overlap, and (if `require_perfect_pair_agreement`) pairs that
    disagree within the overlap are dropped. Otherwise the reads whose span is non-empty
    are simply concatenated. The consensus is compared to the matching part of `ref_seq`
    with `get_mutations`, and genotypes with 15 mutations or more are discarded.

    A summary of the read counts is printed. Returns a dict that maps each UMI to a
    `Counter` of the genotypes (as formatted by `mut_to_str`) seen with that UMI.
    """
    # Default alignment scores; only these keys may be overridden through kwargs.
    align_param = {"match": 2,
                   "mismatch": -1,
                   "gap_open": -1,
                   "gap_extend": -.5,
                   }
    align_param.update({key: value for key, value in kwargs.items() if key in align_param})

    fwd_start, fwd_end = sorted(fwd_span)
    rev_start, rev_end = sorted(rev_span)
    fwd_len = fwd_end - fwd_start
    rev_len = rev_end - rev_start

    overlap = fwd_end > rev_start and rev_len > 0
    if overlap:
        overlap_size = fwd_end - rev_start
        # The forward read starts at fwd_start on the reference, so the shared region has
        # to be located relative to that start (not with absolute reference coordinates).
        fwd_common = slice(rev_start - fwd_start, fwd_end - fwd_start)
        ref_target = ref_seq[fwd_start:rev_end]
    else:
        # Nothing to compare when the reads do not overlap.
        require_perfect_pair_agreement = False
        ref_target = ref_seq[fwd_start:fwd_end] + ref_seq[rev_start:rev_end]

    # Genotypes with this many mutations or more are treated as alignment failures.
    max_mutations = 15

    n_reads = 0
    n_reads_pass_Qfilter = 0
    n_reads_aligned = 0
    n_reads_agree = 0
    UMI_dict = defaultdict(list)

    with gz.open(fastq_path_fwd, 'rt') as handle_fwd, gz.open(fastq_path_rev, 'rt') as handle_rev:
        paired_iter = zip(SeqIO.parse(handle_fwd, "fastq"), SeqIO.parse(handle_rev, "fastq"))
        if N is not None:
            paired_iter = itertools.islice(paired_iter, 0, N)

        for r1, r2 in paired_iter:
            n_reads += 1
            mean_score_r1 = np.mean(r1.letter_annotations['phred_quality'])
            mean_score_r2 = np.mean(r2.letter_annotations['phred_quality'])
            if not (mean_score_r1 > quality_threshold and mean_score_r2 > quality_threshold):
                continue
            n_reads_pass_Qfilter += 1

            umi = str(r1.seq[:umi_size_fwd]) + str(r2.seq[:umi_size_rev])
            fwd_seq = r1.seq[umi_size_fwd:]
            rev_seq = r2.seq[umi_size_rev:].reverse_complement()

            if overlap:
                if require_perfect_pair_agreement:
                    if str(fwd_seq[fwd_common]) != str(rev_seq[:overlap_size]):
                        continue
                    n_reads_agree += 1
                # The forward read is trusted within the overlap.
                consensus = fwd_seq + rev_seq[overlap_size:]
            else:
                consensus = ""
                if fwd_len > 0:
                    consensus += fwd_seq
                if rev_len > 0:
                    consensus += rev_seq

            # The same alignment parameters are used whether or not the reads overlap.
            mutations = get_mutations(ref_target, consensus, **align_param)

            if ignore_pos:
                mutations = [m for m in mutations if m[1] not in ignore_pos]

            if len(mutations) < max_mutations:
                n_reads_aligned += 1
                UMI_dict[umi].append(mut_to_str(mutations))

    log='n reads:\t{}\nn_reads pass filter:\t{}\nn_reads aligned:\t{}\nn_pairs agree:\t{}\n'.format(n_reads,n_reads_pass_Qfilter,n_reads_aligned,n_reads_agree)
    log+=f"Number of UMIs: {len(UMI_dict)}\n"

    UMI_gencounter = {umi: Counter(genotypes) for umi, genotypes in UMI_dict.items()}
    umi_readcounts = [len(genotypes) for genotypes in UMI_dict.values()]

    # np.median warns on an empty list; report nan directly when no UMI was retained.
    median_reads = np.median(umi_readcounts) if umi_readcounts else float("nan")
    log+=f"Median number of reads per UMI: {median_reads}"
    print(log)
    return UMI_gencounter

In [None]:
#|hide
from Bio.Seq import Seq

In [None]:
#|hide
# R1 is used as the forward read file and R2 as the reverse read file.
fastq_path_fwd = os.path.join(data_path, "paired_example1_R1.fastq.gz")
fastq_path_rev = os.path.join(data_path, "paired_example1_R2.fastq.gz")

# NOTE(review): `ref` is parsed here but the call below relies on the hard-coded
# amplicon instead -- confirm whether this lookup is still needed.
read_ref_file = "sacB_ref.fasta"
ref = next(SeqIO.parse(os.path.join(data_path, read_ref_file), "fasta"))

# The amplicon is reverse-complemented before being used as the reference.
ref_seq = str(
    Seq("GAGGAGACGGTGACCTGGGTCCCCTGGCCCCAGTTGTTGAAGTCATACCCTTAGGCACCATAGTATCCAGACGCACAGTAGTACAAGGCAGTGTCCTCAGGTTTCAGGCTGTTCATTTGCAGATACACCGTGTTCTTGGCGTTGTCTTGGG").reverse_complement()
)

# Empty forward span: only the reverse read contributes to the genotype, while the
# first 10 nucleotides of the forward read provide the UMI.
UMI_gencounter = get_UMI_genotype_paired(
    fastq_path_fwd,
    fastq_path_rev,
    ref_seq,
    fwd_span=(0, 0),
    rev_span=(0, 150),
    umi_size_fwd=10,
    umi_size_rev=0,
    N=10,
    ignore_pos=[0, 1, 2, 150, 151],
)

for umi, genotype_counts in itertools.islice(UMI_gencounter.items(), 20):
    print(umi, list(genotype_counts.items()))

n reads:	10
n_reads pass filter:	10
n_reads aligned:	10
n_pairs agree:	0
Number of UMIs: 10
Median number of reads per UMI: 1.0
GGACGCGATA [('A99C,A100T', 1)]
GCTGTATGTT [('A99C,A100T', 1)]
GGGAAGGCGT [('', 1)]
GGCGACAGTG [('', 1)]
CCATGTCGGG [('', 1)]
CNGATGTTTG [('A99C,A116T,A117C', 1)]
GTCACACCAC [('A100T,A114G,A116G,A117G', 1)]
CCAGACTGTT [('', 1)]
CAGGGTTTAT [('A99C,A100T', 1)]
ATGGGTACGG [('A20T', 1)]


In [None]:
# Same example data with the roles swapped: R2 is now the forward read file and R1
# the reverse read file.
fastq_path_fwd = os.path.join(data_path, "paired_example1_R2.fastq.gz")
fastq_path_rev = os.path.join(data_path, "paired_example1_R1.fastq.gz")

# NOTE(review): `ref` is parsed here but the call below relies on the hard-coded
# amplicon instead -- confirm whether this lookup is still needed.
read_ref_file = "sacB_ref.fasta"
ref = next(SeqIO.parse(os.path.join(data_path, read_ref_file), "fasta"))
ref_seq = "GAGGAGACGGTGACCTGGGTCCCCTGGCCCCAGTTGTTGAAGTCATACCCTTAGGCACCATAGTATCCAGACGCACAGTAGTACAAGGCAGTGTCCTCAGGTTTCAGGCTGTTCATTTGCAGATACACCGTGTTCTTGGCGTTGTCTTGGG"

# Empty reverse span: only the forward read contributes to the genotype, while the
# first 10 nucleotides of the reverse read provide the UMI.
UMI_gencounter = get_UMI_genotype_paired(
    fastq_path_fwd,
    fastq_path_rev,
    ref_seq,
    fwd_span=(0, 150),
    rev_span=(0, 0),
    umi_size_fwd=0,
    umi_size_rev=10,
    N=10,
    ignore_pos=[0, 1, 2, 150, 151],
)

for umi, genotype_counts in itertools.islice(UMI_gencounter.items(), 20):
    print(umi, list(genotype_counts.items()))

n reads:	10
n_reads pass filter:	10
n_reads aligned:	10
n_pairs agree:	0
Number of UMIs: 10
Median number of reads per UMI: 1.0
GGACGCGATA [('T50A,T51G', 1)]
GCTGTATGTT [('T50A,T51G', 1)]
GGGAAGGCGT [('', 1)]
GGCGACAGTG [('', 1)]
CCATGTCGGG [('', 1)]
CNGATGTTTG [('T33G,T34A,T51G', 1)]
GTCACACCAC [('T33C,T34C,T36C,T50A', 1)]
CCAGACTGTT [('', 1)]
CAGGGTTTAT [('T50A,T51G', 1)]
ATGGGTACGG [('T130A', 1)]


In [None]:
#| export

def get_genotypes_paired(fastq_path_fwd: str, #path to the input fastq file reading the ref_seq in the forward orientation
                        fastq_path_rev: str, #path to the input fastq file reading the ref_seq in the reverse orientation
                        ref_seq: str, #sequence of the reference amplicon
                        fwd_span: tuple, #span of the ref_seq that is read in the forward orientation format: (start, end)
                        rev_span: tuple, #span of the ref_seq that is read in the reverse orientation format: (start, end)
                        require_perfect_pair_agreement: bool = True, #if True only pairs of reads that perfectly agree on the sequence within the common span will be used. If False the fwd sequence will be used. Will be set to False by default if there is no overlap.
                        umi_size_fwd: int = 10, #number of nucleotides at the beginning of the fwd read that will be used as the UMI
                        umi_size_rev: int = 0, #number of nucleotides at the beginning of the rev read that will be used as the UMI (if both are provided the umi will be the concatenation of both)
                        quality_threshold: int = 30, #threshold value used to filter out reads of poor average quality
                        ignore_pos: list = [], #list of positions that are ignored in the genotype
                        reads_per_umi_thr: int = 0, #minimum number of reads required to take a UMI into account. Using a number >2 enables error correction for UMIs with multiple reads.
                        save_umi_data: str = None, #path to the csv file where to save the details of the genotypes reads for each UMI. If None the data isn't saved.
                        N = None, #number of reads to consider (useful to get a quick view of the data without going through the whole fastq files). If None the whole data will be used.
                        **kwargs, #alignment parameters can be passed here (match, mismatch, gap_open, gap_extend)
                        ):
    """Wrapper that goes from a pair of fastq files to the list of genotypes.

    The per-UMI genotype counters are computed with `get_UMI_genotype_paired`, optionally
    dumped to `save_umi_data` (tab-delimited, one row per UMI: the UMI followed by its
    list of (genotype, read count) items), then passed to `correct_UMI_genotypes` with
    `reads_per_umi_thr` and summarised with `genotype_UMI_counter`. The number of
    genotypes is printed and the output of `genotype_UMI_counter` is returned.
    """
    umi_counters = get_UMI_genotype_paired(fastq_path_fwd,
                                           fastq_path_rev,
                                           ref_seq,
                                           fwd_span=fwd_span,
                                           rev_span=rev_span,
                                           require_perfect_pair_agreement=require_perfect_pair_agreement,
                                           umi_size_fwd=umi_size_fwd,
                                           umi_size_rev=umi_size_rev,
                                           quality_threshold=quality_threshold,
                                           ignore_pos=ignore_pos,
                                           N=N,
                                           **kwargs
                                           )
    if save_umi_data:
        with open(save_umi_data, "w", newline='') as handle:
            csv_writer = csv.writer(handle, delimiter="\t", doublequote=False)
            # Every UMI is written out, as documented for save_umi_data (the file used
            # to be truncated to the first 20 UMIs).
            for umi, genotype_counts in umi_counters.items():
                csv_writer.writerow([umi, list(genotype_counts.items())])

    UMI_gen_dict = correct_UMI_genotypes(umi_counters, reads_per_umi_thr)
    gen_list = genotype_UMI_counter(UMI_gen_dict)
    print("Number of genotypes:", len(gen_list))
    return gen_list

In [None]:
# Full pipeline on the first 100 read pairs, reusing the paths and reference defined in
# the previous cell; the per-UMI details are also written to test.csv.
gen_list = get_genotypes_paired(
    fastq_path_fwd,
    fastq_path_rev,
    ref_seq,
    fwd_span=(0, 150),
    rev_span=(0, 0),
    umi_size_fwd=0,
    umi_size_rev=10,
    ignore_pos=[0, 1, 150, 151],
    N=100,
    save_umi_data="test.csv",
)

# Show the first 20 entries: element 1 first, then element 0, separated by a tab.
for genotype_entry in gen_list[:20]:
    print(genotype_entry[1], genotype_entry[0], sep="\t")

n reads:	100
n_reads pass filter:	96
n_reads aligned:	94
n_pairs agree:	0
Number of UMIs: 85
Median number of reads per UMI: 1.0
Number of genotypes: 12
49	
23	T50A,T51G
2	T33G,T34A,T51G
2	T33C,T34C,T36C,T50A
2	T37C,T42C,T45A,T50C,T51C
1	T130A
1	A52C
1	G8A
1	T51C
1	A52C,G131C
1	G69A
1	T37C,T42C,T45A,T50C,T51C,T146A


In [None]:
#| hide
import nbdev; nbdev.nbdev_export()