In [1]:
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt
from pylab import rcParams
import numpy as np
import glob, os
%matplotlib inline  
plt.rcParams['svg.fonttype'] = 'none'
from re import sub


In [2]:
def nextBase(prevBase, wantedTrit):
    """Return the base that encodes ``wantedTrit`` when it follows ``prevBase``.

    The lookup key is ``prevBase + str(wantedTrit)``, so the trit may be given
    as an int (0, 1, 2) or as the matching one-character string. In the table
    every base maps to the three *other* bases, so the returned base never
    equals ``prevBase``.

    Raises KeyError when the (base, trit) combination is not in the table.
    """
    # For each previous base, the successor bases for trits 0, 1 and 2 in order.
    successors_by_base = (('A', 'GCT'), ('C', 'TGA'), ('G', 'ATC'), ('T', 'CAG'))
    lookup = {
        base + str(trit): successor
        for base, successors in successors_by_base
        for trit, successor in enumerate(successors)
    }
    key = prevBase + str(wantedTrit)
    return lookup[key]

In [3]:
def encodeStr(inputInfo_string):
    """Encode a string of trit digits as a list of bases.

    Each character of ``inputInfo_string`` is converted to an int and passed,
    together with the previously emitted base, to ``nextBase``. The first trit
    is encoded relative to a fixed starting base of 'G'.

    Returns a list of single-character base strings, one per input character.
    """
    # Convert everything up front so a non-digit character fails before any encoding.
    trits = list(map(int, inputInfo_string))

    encoded = []
    last_base = 'G'  # base assumed to precede the first encoded position
    for trit in trits:
        last_base = nextBase(last_base, trit)
        encoded.append(last_base)
    return encoded

# Example call: as the last expression in the cell, the returned list of bases is displayed.
encodeStr("00010212")

['A', 'G', 'A', 'C', 'T', 'G', 'T', 'G']

In [4]:
# Trit strings to encode, one 8-digit string per entry.
hello_world = [
    "00010212", "00110202", "00211000", "01011000",
    "01111010", "01201012", "02011102", "02111010",
    "02211020", "10011000", "10110201", "10201020",
]

# Print each trit string next to its encoded base sequence.
for string in hello_world:
    encoded_bases = ''.join(encodeStr(string))
    print(string, encoded_bases)

00010212 AGACTGTG
00110202 AGTAGCTG
00211000 AGCGTCTC
01011000 ACTACTCT
01111010 ACGTAGTC
01201012 ACAGTCGC
02011102 ATCGTAGC
02111010 ATACGACT
02211020 ATGTAGCT
10011000 TCTACTCT
10110201 TCGTCAGT
10201020 TCAGTCAG


In [5]:

####
#### Decoding by 2-step filter rule: 
#### 1. Template architecture (length, terminal base)
#### 2. Most frequent
####

In [6]:
# Explicit column types for the data table: sequences and IDs as strings, counts and lengths as ints.
column_dtypes = {
    "template_ID": str,
    "match": int,
    "template": str,
    "strandC": str,
    "strandR": str,
    "strandR_len": int,
    "strandC_len": int,
    "template_align": str,
    "strand_align": str,
}

data = pd.read_csv(
    "H01-H12_dataTable.csv",
    sep=",",
    header=0,
    dtype=column_dtypes,
)

In [7]:
def findpossiblehits(row):
    """Flag a read as a possible hit for its template.

    A read passes the filter when its ``strandC`` sequence is non-empty, has
    the same length as ``template``, and ends with the same base as
    ``template``.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row)
        Must provide the keys 'strandC' and 'template'.

    Returns
    -------
    int
        1 if the read passes the filter, otherwise 0.
    """
    curr_seq = row['strandC']
    desired_seq = row['template']

    # Missing values (e.g. NaN for an empty CSV field) are not strings and
    # have no length; treat such rows as non-hits instead of raising TypeError.
    if not isinstance(curr_seq, str) or not isinstance(desired_seq, str):
        return 0

    # filter rule - look for strands of the template's length with the template's terminal base.
    # Compare the terminal characters by value (==); `is` tests object identity
    # and only appears to work because of interpreter-level string caching.
    if curr_seq and len(curr_seq) == len(desired_seq) and curr_seq[-1] == desired_seq[-1]:
        return 1
    return 0


In [8]:
# Add a 0/1 "possiblehit" column by applying findpossiblehits to every row (axis=1).
data["possiblehit"] = data.apply(findpossiblehits,axis=1)

In [9]:
def decoding(df, templateID, top_n=5):
    """Print the most frequent candidate sequences for one template.

    Selects the rows of ``df`` whose 'template_ID' equals ``templateID``,
    keeps those flagged with ``possiblehit == 1``, and prints the template ID
    and template sequence followed by the ``top_n`` most frequent 'strandC'
    values with their counts.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'template_ID', 'template', 'strandC' and
        'possiblehit'.
    templateID : str
        Template identifier to report on.
    top_n : int, optional
        Number of most frequent sequences to show (default 5).

    Returns
    -------
    None
        Results are printed. If no row matches ``templateID`` a notice is
        printed instead.
    """
    template_reads = df[df["template_ID"] == templateID]

    # Without this guard, .iloc[0] below raises an opaque IndexError for an
    # unknown template ID.
    if template_reads.empty:
        print(templateID, "no reads found for this template ID", "\n\n")
        return

    candidate_reads = template_reads[template_reads["possiblehit"] == 1]
    print(templateID, template_reads.iloc[0]["template"])
    # value_counts sorts by descending frequency, so head(top_n) is the top_n most common.
    print(candidate_reads["strandC"].value_counts(ascending=False).head(top_n).to_frame(), "\n\n")

In [10]:
# Template IDs 'H01' through 'H12', zero-padded to two digits.
templates_to_decode = ["H{:02d}".format(number) for number in range(1, 13)]

# Report the most frequent candidate sequences for each template in turn.
for template in templates_to_decode:
    decoding(data, template)



H01 AGACTGTGC
           strandC
AGACTGTGC    42337
AGAGACTGC     2242
AGACTGCGC      331
GAGCTGTGC      254
AGACTGAGC      247 


H02 AGTAGCTGC
           strandC
AGTAGCTGC    62243
AGAGTCTGC      458
AGAGAGCTC      434
AGCAGCTGC      413
AGAGAGTGC      369 


H03 AGCGTCTC
          strandC
AGCGTCTC    89302
AGAGCGTC     4926
AGAGTCTC     1268
GAGCGCTC      570
AGCGAGTC      520 


H04 ACTACTCTC
           strandC
ACTACTCTC   200154
AGACTACTC     2108
ACTCTACTC     1589
GACTCTCTC     1157
ACTAGACTC     1106 


H05 ACGTAGTC
          strandC
ACGTAGTC   115345
ACGCAGTC      806
AGACGTGC      680
GACGTGTC      677
AGCGTGTC      627 


H06 ACAGTCGC
          strandC
ACAGTCGC   126849
ACAGAGTC     8139
AGACAGTC     1493
ACAGACGC      717
ACATAGTC      552 


H07 ATCGTAGC
          strandC
ATCGTAGC   169767
ATCGCAGC     1486
GATCGAGC      706
AGTCGAGC      586
GATCGTGC      542 


H08 ATACGACTC
           strandC
ATACGACTC   113567
AGATACGAC     1653
ATACGAGAC     1560
ATAGACGAC     1559
AT