In [None]:
import numpy as np
import pysam, os, pickle, sys
from Bio.Seq import Seq
from Bio import SeqIO
from fuzzysearch import find_near_matches
from dataclasses import dataclass
from copy import deepcopy

@dataclass
class read_part:
    """One segment of a read together with its (optional) alignment.

    The three required fields describe the segment on the read itself; the
    remaining fields describe where it aligned and default to "unset"
    placeholders ('' for strings, -1 for integers).

    Attributes:
        cigar: CIGAR string for this segment.
        read_start: start coordinate of the segment on the read.
        read_end: end coordinate of the segment on the read.
        chrom: name of the reference sequence the segment aligned to.
        strand: alignment strand (presumably '+' / '-'; confirm against
            process_reads).
        ref_start: start coordinate on the reference.
        ref_end: end coordinate on the reference.
        mapq: mapping quality of the alignment.
    """
    cigar: str
    read_start: int
    read_end: int
    chrom: str = ''
    # Previously annotated as `chr`, which is the builtin function, not a type.
    strand: str = ''
    ref_start: int = -1
    ref_end: int = -1
    mapq: int = -1

    def get_length(self) -> int:
        """Return the extent of the segment on the read (read_end - read_start)."""
        return self.read_end - self.read_start
    
sys.path.append('../scripts')
from process_reads import *

In [None]:
# FASTA containing the transgene template with a flank on either side.
transgene_ref = 'references/220416_template+rDNAflanks.fa'
# Length of each of the two flanks in the template reference.
flank_len = 840

# Parse the template FASTA once and reuse its first record, instead of
# re-reading the same file for the length, the name and the sequence.
transgene_record = list(SeqIO.parse(transgene_ref, format='fasta'))[0]

# Transgene length = full reference length minus both flanks.
transgene_len = len(transgene_record.seq) - 2*flank_len
# Coordinate on the template reference where the transgene ends.
tg_end = flank_len + transgene_len
transgene_refname = transgene_record.name

# Upper-cased sequence of the first record in the rDNA repeat reference.
rDNA_repeat = list(SeqIO.parse('references/rDNA_fullrepeat.fa', format='fasta'))[0].seq
rDNA_repeat = str(rDNA_repeat).upper()

# Upper-cased full template sequence (flanks + transgene).
template_flanks = str(transgene_record.seq).upper()

In [None]:
# Read the SAM file and convert it to the read_dict representation.
input_sam = 'KCXZ0001A_transgeneflanks_mappedmates.sam'
# Use a context manager so the alignment file handle is closed as soon as
# populate_read_dict has finished with it (it was previously left open).
# NOTE(review): assumes populate_read_dict reads the file eagerly and that no
# later cell iterates `transgenereads` again — confirm.
with pysam.AlignmentFile(input_sam, 'r') as transgenereads:
    read_dict = populate_read_dict(transgenereads)

In [4]:
# Realign the reads against each reference in turn; the order is significant
# because every pass receives the read_dict produced by the previous one.
realign_references = (
    'references/rDNA_fullrepeat.fa',
    transgene_ref,
    'references/chm13v2.0.fa.gz',
)
for reference_path in realign_references:
    read_dict = realign_reads(read_dict, reference_path)

# Plasmid filtering is currently disabled:
# read_dict = discard_plasmid_reads(read_dict, '../../../templates/CBh_plasmid.fa')
# Drop reads matching the blacklist reference.
read_dict = discard_contaminating_reads(read_dict, 'references/blacklist.fa')

[M::bwa_idx_load_from_disk] read 0 ALT contigs
[M::process] read 77056 sequences (10000190 bp)...
[M::process] read 33146 sequences (4650619 bp)...
[M::mem_process_seqs] Processed 77056 reads in 4.571 CPU sec, 4.565 real sec
[M::mem_process_seqs] Processed 33146 reads in 0.715 CPU sec, 0.695 real sec
[main] Version: 0.7.17-r1188
[main] CMD: bwa mem -B 6 ../../../templates/rDNA_fullrepeat.fa temp.fa
[main] Real time: 5.286 sec; CPU: 5.314 sec
[M::bwa_idx_load_from_disk] read 0 ALT contigs
[M::process] read 73942 sequences (10000118 bp)...
[M::process] read 1256 sequences (171983 bp)...
[M::mem_process_seqs] Processed 73942 reads in 1.635 CPU sec, 1.635 real sec
[M::mem_process_seqs] Processed 1256 reads in 0.051 CPU sec, 0.029 real sec
[main] Version: 0.7.17-r1188
[main] CMD: bwa mem -B 6 ../../templates/TCARZ_CBh_GFP_SV40PA_GeFo3+rDNAflanks.fa temp.fa
[main] Real time: 1.679 sec; CPU: 1.702 sec
[M::bwa_idx_load_from_disk] read 0 ALT contigs
[M::process] read 73942 sequences (10000118 b

In [5]:
# Run align_clips against each reference in sequence (rDNA repeat, genome,
# then the transgene template); each pass builds on the previous read_dict.
# NOTE(review): presumably this aligns the clipped parts of reads — confirm
# in process_reads.
clip_references = [
    'references/rDNA_fullrepeat.fa',
    'references/chm13v2.0.fa.gz',
    transgene_ref,
]
for clip_reference in clip_references:
    read_dict = align_clips(read_dict, clip_reference, transgene_refname)

[M::bwa_idx_load_from_disk] read 0 ALT contigs
[M::process] read 17112 sequences (670483 bp)...
[M::mem_process_seqs] Processed 17112 reads in 0.103 CPU sec, 0.103 real sec
[main] Version: 0.7.17-r1188
[main] CMD: bwa mem -B 6 ../../../templates/rDNA_fullrepeat.fa temp.fa
[main] Real time: 0.112 sec; CPU: 0.112 sec
[M::bwa_idx_load_from_disk] read 0 ALT contigs
[M::process] read 16789 sequences (651293 bp)...
[M::mem_process_seqs] Processed 16789 reads in 0.400 CPU sec, 0.400 real sec
[main] Version: 0.7.17-r1188
[main] CMD: bwa mem -B 6 ../../../templates/chm13v2.0.fa.gz temp.fa
[main] Real time: 1.330 sec; CPU: 1.277 sec
[M::bwa_idx_load_from_disk] read 0 ALT contigs
[M::process] read 16644 sequences (643604 bp)...
[M::mem_process_seqs] Processed 16644 reads in 0.078 CPU sec, 0.078 real sec
[main] Version: 0.7.17-r1188
[main] CMD: bwa mem -B 6 ../../templates/TCARZ_CBh_GFP_SV40PA_GeFo3+rDNAflanks.fa temp.fa
[main] Real time: 0.085 sec; CPU: 0.087 sec


In [6]:
# Post-processing pipeline: each step takes the current read_dict and returns
# an updated one, so the statement order below must be preserved.
# First filtering pass using the transgene boundaries (flank_len .. tg_end).
read_dict = filter_transgenemapping(read_dict, flank_len, tg_end, transgene_refname)
# Realignment step that uses the rDNA repeat sequence.
read_dict = realign_rDNA_reads(read_dict, transgene_refname, flank_len, tg_end, rDNA_repeat)
# Remapping step that uses the template and rDNA sequences
# (presumably a more sensitive alignment, going by the name — confirm).
read_dict = remap_sensitive(read_dict, template_flanks, rDNA_repeat, transgene_refname)
# Same filter applied again after the realignment/remapping steps.
read_dict = filter_transgenemapping(read_dict, flank_len, tg_end, transgene_refname)

# Persist the processed reads so later analysis can reload them without
# re-running the alignment steps.
with open('KCXZ0001A_reads.pkl', 'wb') as f:
    pickle.dump(read_dict, f, protocol=pickle.HIGHEST_PROTOCOL)

# Report how many entries remain in read_dict.
print(len(read_dict))

4783
