In [2]:
import numpy as np
from Bio import SeqIO
from Bio import Seq
import csv
import os
from collections import OrderedDict
import gzip
import matplotlib.pyplot as plt
import re
import pandas as pd
from copy import deepcopy

In [3]:
def load_index (file,read_len = 151, trim = "TCTGGTGGATCTGGAGGTCTCGA", Fedit = "", Redit = "", rev = False):
    
    """ Loads in csv file in format of 1) ID 2) sequence as a sequnce cross check. Output is a refrence dictionary that is of form
    {"Seq": (ID,count)} where ID is uniprot ID and goup assingment and count is defaulted to start at 0 reads mapped"""
    
    index = OrderedDict()
    indexfull = []
    with open(file,'r') as f:
        lines = f.readlines()
        header,seqs = lines[0],lines[1:]
        seqs_clean = [s.strip().split(',') for s in seqs]
        
        # For matching with reverse NGS reads, index is reformated with reverse compliment data. 
        if rev:
            for r in seqs_clean:
                ID = r[0]
                tempseq = r[1]
                cutoff = tempseq.find(trim)# For matching with reverse NGS reads, index is reformated with reverse compliment data. 
                param = 151 - len(Fedit) - len(Redit) #adust max index length parameter to account for NGS read length and with forward and reverse trims
                
                if len(tempseq) <= param:
                    fseq = Seq.Seq(Fedit + tempseq[cutoff:-1]) #Apply trims
                    rseq = fseq.reverse_complement() #Take rev complement
                
                    index[Redit + str(rseq)] = [ID,0] #Create index entry
                    indexfull.append(str(Redit + Seq.Seq(tempseq).reverse_complement())) #append full index in a list of sequence strings with no read trim
                else:
                    fseq = Seq.Seq(tempseq[-151 + len(Redit):-1]) #If read is longer than NGS read length, trim from the given read direction to match read length for exact seq match
                    rseq = fseq.reverse_complement()
                                   
                    index[Redit + str(rseq)] = [ID,0]
                    indexfull.append(Redit + str(Seq.Seq(tempseq).reverse_complement()))
                    
        else:
            Redit2 = str(Seq.Seq(Redit).reverse_complement())
            for r in seqs_clean:
                ID = r[0]
                fseq = r[1][:-trim]
                index[fseq] = [ID,0]
        
    return index,indexfull

def process_fastq (fastq_file,fwd,rev,QCavg = 35, QCbase = 20):

    f_ind = len(fwd) ## Defines forward trim length
    qc_seq = [] ## Initialize list of accetable sequences at trimmed sequence
    
    try:
        with gzip.open(fastq_file,'rt') as f:
            records = SeqIO.parse(f,'fastq') # Load in fastq files
            for r in records:
                r_ind = r.seq.find(rev) # find reverse handle in sequence, if not there assign -1
                qual = r.letter_annotations['phred_quality'][f_ind:r_ind] # Call in quality scores for trimmed region
                if np.mean(qual) >= QCavg: # Check the average quality is above threshold
                    for q in qual:
                        if q <= QCbase: # Check no outlier base is bad quality in mapped region, if so don't append
                            er = 'False'
                            break
                        else:
                            er = 'True'
                    if er:
                        qc_seq.append(str(r.seq[:r_ind]))
                else:
                    continue ## Or jump to next sequnce if QC condition is not met
    except: 
        with open(fastq_file,'rt') as f:
            records = SeqIO.parse(f,'fastq') # Load in fastq files
            for r in records:
                r_ind = r.seq.find(rev) # find reverse handle in sequence, if not there assign -1
                qual = r.letter_annotations['phred_quality'][f_ind:r_ind] # Call in quality scores for trimmed region
                if np.mean(qual) >= QCavg: # Check the average quality is above threshold
                    for q in qual:
                        if q <= QCbase: # Check no outlier base is bad quality in mapped region, if so don't append
                            er = 'False'
                            break
                        else:
                            er = 'True'
                    if er:
                        if r_ind > 0:
                            qc_seq.append(str(r.seq[:r_ind + len(rev)])) ## Append good sequences to cleaned list
                        else:
                            qc_seq.append(str(r.seq[:r_ind]))
                else:
                    continue ## Or jump to next sequnce if QC condition is not met
    return qc_seq,records
        
def map_seqs (qc_seqs,index,fullindex,fwd):
    
    num_reads = 0
    num_mapped = 0
    num_unmapped = 0
    no_fwd_handle = 0
    mapped,unmapped,nohand = [],[],[]
    
    for s in qc_seqs:
        num_reads += 1
        
        f_ind = s.find(fwd)
        
        if f_ind >= 0:
            
            if s in index:
                num_mapped += 1
                mapped.append(s)
                index[s][1] += 1
                
            else:
                for i,k in zip(index.keys(),fullindex):
                    loc = re.search(s,k)
                    if loc:
                        num_mapped += 1
                        index[i][1] += 1
                        mapped.append(s)
                        break
                        
                if not loc:
                    num_unmapped += 1
                    unmapped.append(s)
        else:
            no_fwd_handle += 1
            nohand.append(s)
        
    return index,num_reads,num_mapped,mapped,num_unmapped,unmapped,no_fwd_handle,nohand

def final_return(fileout,index1,index2 = {},num_reads = []):
    """Writes final read data out to file
    """
    
    if len(num_reads) < 1:
        raise Exception("Please pass the number of reads as a list of [Rep1,Rep2]")
    if len(list(index2.keys())) > 0 and len(num_reads) < 2:
        raise Exception("Please pass number of reads for BOTH replicates in a list")
        
    index_temp = deepcopy(index1)
    for k in index_temp.keys():
        index_temp[k].append(index_temp[k][1]/num_reads[0])
    
    
    output_df = pd.DataFrame.from_dict(index_temp,'index',columns = ['ID1','Read Count Rep 1','Read Norm Rep 1'])
    
    if len(list(index2.keys())) > 0:
        rep_index = deepcopy(index2)
        for k in rep_index.keys():
            rep_index[k].append(rep_index[k][1]/num_reads[1])
    
        rep_df = pd.DataFrame.from_dict(rep_index,'index',columns = ['ID2','Read Count Rep 2','Read Norm Rep 2'])
        rep_df_trim = rep_df.loc[:,'ID':'Read Norm Rep 2']
        
        output_df = output_df.join(rep_df_trim)
    with open(fileout,'w') as f:
        output_df.to_csv(f,index=True,index_label = "Sequence")
    return 

def load_and_run(data_file, index, fullindex, fwd_handle = "TCGAGACCTCCAGATCCACCAGA", rev_handle = "GCCCAGATCCTCTTCTGAGATGAG",fwd_tag = "", rev_tag = ""):
    
    # Add in stagger to handles in correct order
    fwd_handle_new = fwd_tag + fwd_handle
    rev_handle_new = rev_handle + str(Seq.Seq(rev_tag).reverse_complement())
    
    # Run QC with new handles
    qc_seq,recs = process_fastq(data_file,fwd_handle_new,rev_handle_new)
    index_updated,read_count,map_count,mapped,unmap_count,unmap,notfound_count,notfound = map_seqs(qc_seq,index, fullindex = fullindex, fwd = fwd_handle_new)
    
    print(f'Mapped % : {map_count/read_count}')
    print(f'Unmapped % : {unmap_count/read_count}')
    print(f'No Handle % : {notfound_count/read_count}')
    
    
    return index_updated,read_count,map_count,mapped,unmap_count,unmap,notfound_count,notfound


In [4]:

index,full = load_index("DNA_ALL_SEQUENCES_INDEX_cleaned_ext.csv",trim = "CTCATCTCAGAAGAGGATCTGGGC",Fedit = "",Redit = "C", rev = True)

inddf = pd.DataFrame.from_dict(index,'index')
inddf.to_csv('crosscheck2.csv',index = True)

In [7]:

fastq_file = "/Volumes/Connor 1TB/20221229_AEV_Screen_GZ/AEV_6/AEV_6_S86_L003_R2_001.fastq.gz"

In [22]:
fastq_file = "/Volumes/Connor 1TB/20221229_AEV_Screen_GZ/AEV_6/AEV_6_S86_L003_R2_001.fastq.gz"

records = SeqIO.parse(gzip.open(fastq_file,'rt'),'fastq')
with open('tempdata_6.fastq','w') as fq:
    count = 0
    newrecs = []
    for r in records:
        newrecs.append(r)
        count += 1
        
        if count > 1e4:
            break
    SeqIO.write(newrecs,fq,'fastq')

In [77]:
rev = str(Seq.Seq("CTCATCTCAGAAGAGGATCTGGGC").reverse_complement())

qc_seq,recs = process_fastq('tempdata.fastq',"CTCGAGACCTCCAGATCCACCAGA",rev)

GCCCAGATCCTCTTCTGAGATGAG


In [80]:
index_update,r,m,mapped,u,unmap,nf,nope = map_seqs(qc_seq,index,fullindex = full,fwd = "CTCGAGACCTCCAGATCCACCAGA")

In [65]:
print(f'Mapped % : {m/r}')
print(f'Unmapped % : {u/r}')
print(f'No Handle % : {nf/r}')
print(unmap[0:10])

Mapped % : 0.8211077844311377
Unmapped % : 0.1528193612774451
No Handle % : 0.026072854291417164
['ACTCGAGACCTCCAGATCCACCAGACATCTGCAGGTGCTTCTTCCTCAGGATGATCAGGGTCACCAGCAGCAGGCCGATCAGCAGGGTGCTCAAGATAACTAACACAGAAATGCTGCCGCTGCCGCCGCTGCCGCTGCCGCCGCCCAGAT', 'ACTCGAGACCTCCAGATCCACCAGAGTACAGATATGTGCTCAGGCCGGCGGTGCCCATGATCACGGCAGCGGCCACAACTGTAATAATCACGATGCTGCCGCTGCCGCCGCTGCCGCTGCCGCCGCCCAGATCCTCTTCTGAGATGAGAC', 'ACTCGAGACCTCCAGATCCACCAGACAGCTTCTCCATCCTCTGGTGGCTGATCAGCAGGGTGCCCAGCAGCAGGATGCTGCTGCTGATGATCACGGGCACGATAACATATAATCCAGCAGAGCCGCTGCCGCCGCTGCCGCTGCCGACGC', 'ACTCGAGACCTCCAGATCCACCAGACCTCCACTGGAAGGTCAGGATCAGCAGGCTGATGCACAGGATCAGGCCCACGGCGATGTTGAACTTGAAGCCGAAAGGAAAAATAATGCTGCCGCTGCCGCCGCTGCCGCTGCCGCCGCCCAGAT', 'ACTCGAGACCTCCAGATCCACCAGAAACGAAAAAGGCGCTCACCACCAGGGCCAGGGTGCAGCCCAGCACCCAAGAAACTGTTAAAACAGAGCCGCTGCCGCCGCTGCCGCTGCCGCCGCCCAGATCCTATTCTGAGATGAGACGAAGAT', 'ACTCGAGACCTCCAGATCCACCAGACTTGTGCCACCTCTGCAGCCTCAGAGCAAGCAGCACGATCACGAAGGCCAGGAACACGCAGCTCACGGCAGCAACAGCAACAACTAAAGATCCAGAGCCGCCGCTG

In [84]:
f = "tempdata.fastq"
index1,full = load_index("DNA_ALL_SEQUENCES_INDEX_cleaned_ext.csv",trim = "CTCATCTCAGAAGAGGATCTGGGC",Fedit = "",Redit = "C", rev = True)
update_index1,read1,map_c1,map1,unmap_c1,unmap1,nf_c1,notfound1 = load_and_run(f, index1, fullindex = full, fwd_handle = "TCGAGACCTCCAGATCCACCAGA", rev_handle = "GCCCAGATCCTCTTCTGAGATGAG",fwd_tag = "C")

Mapped % : 0.8381585192216422
Unmapped % : 0.1330090175605126
No Handle % : 0.02883246321784528


In [24]:
os.chdir("/Users/connorcall/Library/CloudStorage/GoogleDrive-cccall@stanford.edu/Shared drives/Gao Lab/Connor Call/Lab Python")

In [10]:
## 1 and 2 Unind, Unb
## 3 and 4 Unind, Bound
## 5 and 6 Ind, Unb
## 7 and 8 Ind, Bound

## Load in data file names

files = [["AEV_1_S81_L003_R2_001.fastq.gz","AEV_2_S82_L003_R2_001.fastq.gz"],["AEV_3_S83_L003_R2_001.fastq.gz","AEV_4_S84_L003_R2_001.fastq.gz"]
         ,["AEV_5_S85_L003_R2_001.fastq.gz","AEV_6_S86_L003_R2_001.fastq.gz"],["AEV_7_S87_L003_R2_001.fastq.gz","AEV_8_S88_L003_R2_001.fastq.gz"]]
outfiles = ["AEV_1_2_data.csv","AEV_3_4_data.csv","AEV_5_6_data.csv","AEV_7_8_data.csv"]
for f,out in zip([files[0]],[outfiles[0]]):
    ## Load in Index and make a trimmed and full sequence index
    index1,full = load_index("DNA_ALL_SEQUENCES_INDEX_cleaned_ext.csv",trim = "CTCATCTCAGAAGAGGATCTGGGC",rev = True)
    index2 = deepcopy(index1)
    os.chdir("/Volumes/Connor 1TB/20221229_AEV_Screen_GZ/AEV_1")
    update_index1,read1,map_c1,map1,unmap_c1,unmap1,nf_c1,notfound1 = load_and_run(f[0], index1, fullindex = full, fwd_handle = "CTCGAGACCTCCAGATCCACCAGA", rev_handle = "GCCCAGATCCTCTTCTGAGATGAG")
    os.chdir("/Volumes/Connor 1TB/20221229_AEV_Screen_GZ/AEV_2")
    update_index2,read2,map_c2,map2,unmap_c2,unmap2,nf_c2,notfound2 = load_and_run(f[1], index2, fullindex = full, fwd_handle = "CTCGAGACCTCCAGATCCACCAGA", rev_handle = "GCCCAGATCCTCTTCTGAGATGAG")
    
    final_return(out,index1 = update_index1, index2 = update_index2, num_reads = [read1,read2])


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


Mapped % : 0.8076971964908833
Unmapped % : 0.14241784793516873
No Handle % : 0.04988495557394797
Mapped % : 0.011071331441531032
Unmapped % : 0.0021132109163767383
No Handle % : 0.9868154576420922


In [13]:
f = "tempdata.fastq"
out = "testfile_out.csv"

## Load in Index and make a trimmed and full sequence index
index1,full = load_index("DNA_ALL_SEQUENCES_INDEX_cleaned_ext.csv",trim = "CTCATCTCAGAAGAGGATCTGGGC",Fedit = "",Redit = "C", rev = True)
index2,full2 = load_index("DNA_ALL_SEQUENCES_INDEX_cleaned_ext.csv",trim = "CTCATCTCAGAAGAGGATCTGGGC",Fedit = "",Redit = "GAAT", rev = True)
update_index1,read1,map_c1,map1,unmap_c1,unmap1,nf_c1,notfound1 = load_and_run(f, index1, fullindex = full, fwd_handle = "TCGAGACCTCCAGATCCACCAGA", rev_handle = "GCCCAGATCCTCTTCTGAGATGAG")
update_index2,read2,map_c2,map2,unmap_c2,unmap2,nf_c2,notfound2 = load_and_run(f, index2, fullindex = full, fwd_handle = "TCGAGACCTCCAGATCCACCAGA", rev_handle = "GCCCAGATCCTCTTCTGAGATGAG")
    
final_return(out,index1 = update_index1, index2 = update_index2, num_reads = [read1,read2])

Mapped % : 0.8426672994779307
Unmapped % : 0.12850023730422402
No Handle % : 0.02883246321784528
Mapped % : 0.8426672994779307
Unmapped % : 0.12850023730422402
No Handle % : 0.02883246321784528


ValueError: columns overlap but no suffix specified: Index(['ID'], dtype='object')

In [15]:
final_return(out,index1 = update_index1, index2 = update_index2, num_reads = [read1,read2])

In [14]:
final_return(outfiles[0],index1 = update_index1, index2 = update_index2, num_reads = [read1,read2])