In [1]:
import pandas as pd
import re
import numpy as np
import matplotlib.pyplot as plt
import os
import csv

import duckdb

conn = duckdb.connect('mydb.db') # create/connect to the database stored in the file 'mydb.db' (a file path was passed, so this is not an in-memory database)

In [2]:
# List every table in the connected database, with its column names and types.
conn.sql("SHOW ALL TABLES")

┌──────────┬─────────┬───────────┬───────────────────────────┬──────────────────────────────────────┬───────────┐
│ database │ schema  │   name    │       column_names        │             column_types             │ temporary │
│ varchar  │ varchar │  varchar  │         varchar[]         │              varchar[]               │  boolean  │
├──────────┼─────────┼───────────┼───────────────────────────┼──────────────────────────────────────┼───────────┤
│ mydb     │ main    │ TBB       │ [Tiles, ADbc, RPTRbc]     │ [VARCHAR, VARCHAR, VARCHAR]          │ false     │
│ mydb     │ main    │ TBB_UNION │ [Tile, ADbc, RPTRbc, TBB] │ [VARCHAR, VARCHAR, VARCHAR, VARCHAR] │ false     │
└──────────┴─────────┴───────────┴───────────────────────────┴──────────────────────────────────────┴───────────┘

In [1]:
pwd

'/global/scratch/users/empchase/CiberVI/Analysis'

In [4]:
# Directory that holds the sequencing reads.
path = '/global/scratch/users/empchase/CiberVI/Analysis/CZB_EC'

directory = os.fsencode(path)
AD1files = []
RPTR1files = []

# Sort every .fastq file name into the AD or RPTR list, depending on which
# of those tokens appears among its underscore-separated parts.
for file in os.listdir(directory):
    filename = os.fsdecode(file)
    if not filename.endswith('.fastq'):
        continue
    x = filename.split('_')
    if 'AD' in x:
        AD1files.append(filename)
    elif 'RPTR' in x:
        RPTR1files.append(filename)

print(AD1files)
print(RPTR1files)

['EC_1_AD_S205_L004_R1_001.fastq', 'EC_3_AD_S207_L004_R1_001.fastq', 'EC_3_AD_S207_L003_R1_001.fastq', 'EC_1_AD_S205_L003_R1_001.fastq']
['EC_4_RPTR_S208_L003_R1_001.fastq', 'EC_4_RPTR_S208_L004_R1_001.fastq', 'EC_2_RPTR_S206_L003_R1_001.fastq', 'EC_2_RPTR_S206_L004_R1_001.fastq']


In [5]:
def complement(seq):
    """Return the base-by-base complement of a DNA sequence.

    A<->T and C<->G are swapped, while 'N' and 'X' map to themselves.
    Any other character (lowercase bases included) raises KeyError.
    """
    pairing = {'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A', 'N':'N', 'X':'X'}
    return ''.join(pairing[base] for base in seq)
def reverse_complement(s):
    """Return the reverse complement of `s`: reverse it, then complement each base."""
    reversed_seq = s[::-1]
    return complement(reversed_seq)

In [6]:
def getmid(seq, pre, post, bclen):
    """Extract the piece of a read that lies between two flanking sites.

    Parameters
    ----------
    seq : str
        The sequence to parse.
    pre : str
        Substring that precedes the piece of interest. It is placed into a
        regular expression unescaped.
    post : str
        Substring that follows the piece of interest (also unescaped).
    bclen : int
        Expected barcode length. Only used by the fallback searches that run
        when `pre` ... `post` is not found in that order.

    Returns
    -------
    tuple (poi, a, z)
        poi -- the barcode, or "X" when no barcode could be called.
        a   -- what is reported for the upstream site.
        z   -- what is reported for the downstream site.

    Outcomes
    --------
    * `pre` ... `post` found in order: (text between them, pre, post). The
      match is greedy, so it runs from the first `pre` to the last `post`;
      the returned text is not guaranteed to be `bclen` long.
    * only `pre` found: (the `bclen` characters after `pre`, pre, the 7
      characters that follow the barcode).
    * only `post` found: (the `bclen` characters before `post`, the 7 word
      characters that precede the barcode, post).
    * neither found: ("X", "X", "X").
    * both found, but never as `pre` ... `post` (for example `post` occurs
      before `pre`): ("X", pre, post). The two fallback searches would give
      conflicting barcodes, so none is called. Previously this case raised
      UnboundLocalError.
    """
    poi_search = re.search(pre + "(.*)" + post, seq)
    if poi_search is not None:
        return poi_search.group(1), pre, post

    # The ordered search failed, so look for each site on its own to find out
    # which one carries the error. `w` captures the barcode-sized window; the
    # 7-character groups capture the text where the other site should be.
    w = "(.{" + str(bclen) + "})"
    pre_search = re.search(pre + w + "(.{7})", seq)
    # Raw string: "\w" in a normal string literal is an invalid escape sequence.
    post_search = re.search(r"(\w{7})" + w + post, seq)

    if pre_search is None and post_search is None:
        return "X", "X", "X"
    if pre_search is None:
        return post_search.group(2), post_search.group(1), post
    if post_search is None:
        return pre_search.group(1), pre, pre_search.group(2)

    # Both sites are present but not in the order pre ... post.
    return "X", pre, post


# Putative consensus flanking sequences, written as the ***reverse complement of the SnapGene sequences***.
# Each one is 7 bp long.
adp = 'CGGGCCC'  # 7 bp; before the AD barcode in read 1
adf = 'GGCGCGC'  # 7 bp; after the AD barcode in read 1

rpp = 'AGCGGCC'  # 7 bp; before the RPTR barcode in read 1
rpf = 'CTCGAGT'  # 7 bp; after the RPTR barcode in read 1


# function that just looks for bc
def bc_finder(readfile, bc_pre, bc_post, bc_len):
    """Call a barcode from every read in a FASTQ file.

    Parameters
    ----------
    readfile : str
        Path of the FASTQ file to read.
    bc_pre : str
        Sequence expected immediately before the barcode (passed to `getmid`).
    bc_post : str
        Sequence expected immediately after the barcode (passed to `getmid`).
    bc_len : int
        Expected barcode length (passed to `getmid`).

    Returns
    -------
    pandas.DataFrame
        One row per read, with columns:
        "BCs"    -- reverse complement of the barcode returned by `getmid`
        "Length" -- length of that barcode
        "prex"   -- upstream site reported by `getmid` (not reverse complemented)
        "postx"  -- downstream site reported by `getmid` (not reverse complemented)

    Notes
    -----
    The file is assumed to hold standard 4-line FASTQ records (header,
    sequence, '+', quality) with no wrapped sequences and no blank lines
    between records. The sequence is picked by its position in the record
    rather than by looking for a line that starts with '@': '@' is also a
    valid quality character, so a quality line can begin with it. Keying on
    '@' would then parse the next record's header as a read and skip that
    record's real sequence.
    """
    bc_list = []
    bc_lens = []
    prex_list = []
    postx_list = []

    with open(readfile, 'r') as fin:
        for line_index, line in enumerate(fin):
            # The sequence is the second line of every 4-line record.
            if line_index % 4 != 1:
                continue

            read = line.strip()
            bc, prex, postx = getmid(read, bc_pre, bc_post, bc_len)
            bc = reverse_complement(bc)  # report the barcode as its reverse complement
            bc_list.append(bc)
            bc_lens.append(len(bc))

            # restriction sites: not doing reverse complement
            prex_list.append(prex)
            postx_list.append(postx)

    BC_dict = {"BCs": bc_list, "Length": bc_lens,
               "prex": prex_list,
               'postx': postx_list,
              }
    return pd.DataFrame.from_dict(BC_dict)

In [7]:
#read in as df and create dictionary
# Read the whole TBB_UNION table into a pandas DataFrame and preview its first rows.
tbbdf = conn.sql("""SELECT * FROM TBB_UNION""").df()
tbbdf.head()

Unnamed: 0,Tile,ADbc,RPTRbc,TBB
0,GATTTGCAAGGTAAATTCTTGGCTGCTCCATTGGAAGAGAATCCAA...,TTTATCCTCGG,AAAAAATTACTTGG,GATTTGCAAGGTAAATTCTTGGCTGCTCCATTGGAAGAGAATCCAA...
1,GACCCATATATGTCTGCTCCAAATTCTACTGCATTTACTGCTTTGA...,TGAAATGGTAA,CAAAAAGCTATGGT,GACCCATATATGTCTGCTCCAAATTCTACTGCATTTACTGCTTTGA...
2,GCTTTGGCTGGTGATCCATGGTATCCATTGTTTCCACAAGATGATC...,TGGTAATGAGA,TTTAAATGGCTAAT,GCTTTGGCTGGTGATCCATGGTATCCATTGTTTCCACAAGATGATC...
3,GATTATCATTCTCAAGCTACTGCTGACTTTGTCTTGTATCCACAAG...,AAGGTCCGAGG,AACAAATACAGAGT,GATTATCATTCTCAAGCTACTGCTGACTTTGTCTTGTATCCACAAG...
4,GATATTGAAGCTAGATCATTCTATGTCTTTGGTTCTCCAGTTTCTC...,CCCCGCCGTCG,TCTAGGTTTCACCC,GATATTGAAGCTAGATCATTCTATGTCTTTGGTTCTCCAGTTTCTC...


In [8]:
# Lookup dictionary ~ {RPTRbc: TBB}, pairing each row's RPTRbc value with that row's TBB value.
# NOTE: dict keys are unique, so if an RPTRbc value occurs in more than one row only the last row's TBB is kept.

RPTRdict = dict(zip(tbbdf.RPTRbc, tbbdf.TBB))