In [1]:
#!/usr/bin/env python3
import os, glob
import numpy as np
import pandas as pd
import Bio
from Bio.Seq import MutableSeq, Seq
from Bio import SeqIO
from Bio.SeqUtils import GC
import fnmatch
import matplotlib.pyplot as plt


In [2]:
os.chdir('/home/dave/Documents/Bacteria/libs')
import primer #From DaveBio
import Bacterial #From DaveBio
import PyClustalW #From DaveBio
import probe #From DaveBio
primer

<module 'primer' from '/home/dave/Documents/Bacteria/libs/primer.py'>

In [3]:
#Define Common Primers

#Forward Primers 
F8 = 'AGAGTTTGATCCTGGCTCAG'
F27 = 'AGAGTTTGATCMTGGCTCAG'
CC_F = 'CCAGACTCCTACGGGAGGCAGC'
F357F = 'CTCCTACGGGAGGCAGCAG'
F515F = 'GTGCCAGCMGCCGCGGTAA'
F533F = 'GTGCCAGCAGCCGCGGTAA'
F16S = 'CAACGAGCGCAACCCT'
F1237F = 'GGGCTACACACGYGCWAC'
V1F = 'GYGGCGNACGGGTGAGTAA'

#Reverse Primers
R519 = 'GWATTACCGCGGCKGCTG'
CDR = 'CTTGTGCGGGCCCCCGTCAATTC'
R907 = 'CCGTCAATTCMTTTRAGTTT'
R1100 = 'AGGGTTGCGCTCGTTG'
R1391 = 'GACGGGCGGTGTGTRCA'
R1492 = 'GGTTACCTTGTTACGACTT'
R1492 = 'ACCTTGTTACGACTT'
V6R = 'AGCTGACGACANCCATGCA'


In [4]:
#Generate Amplicons of Bacterial Species from Primers
core_data = Bacterial.get_species()

forward_primer = MutableSeq(V1F)
reverse_primer = MutableSeq(V6R)

fwd_tup = primer.degenerate_primer(forward_primer)
rev_tup = primer.degenerate_primer(reverse_primer)
calculated_data = primer.create_PCR_amplicon(core_data, rev_tup, fwd_tup)
#calculated_data.to_csv('Bacterial_Amplicons.csv', sep='\t')
calculated_data


Unnamed: 0,Species,Record id,Forward Primer,forward_primer_position,Reverse Primer,reverse_primer_position,GC Content,Length of Amplicon,Amplicon
0,Micrococcus.luteus,NR_037113.1,GTGGCGAACGGGTGAGTAA,75,TGCATGGTTGTCGTCAGCT,1009,57.3,953,GTGGCGAACGGGTGAGTAACACGTGAGTAACCTGCCCTTAACTCTG...
1,Escherichia.coli,AJ605115.1,GTGGCGGACGGGTGAGTAA,30,TGCATGGCTGTCGTCAGCT,981,55.4,970,GTGGCGGACGGGTGAGTAATGTCTGGGAAACTGCCTGATGGAGGGG...
2,Streptococcus.pyogenes,NR_028598.1,N/a,N/a,TGCATGGTTGTCGTCAGCT,1013,0.0,0,
3,Streptococcus.pneumoniae,NR_028665.1,N/a,N/a,TGCATGGTTGTCGTCAGCT,1046,0.0,0,
4,Streptococcus.agalactiae,NR_040821.1,N/a,N/a,TGCATGGTTGTCGTCAGCT,1031,0.0,0,
5,Escherichia.coli,XX000001.1,GTGGCGGACGGGTGAGTAA,100,TGCATGGCTGTCGTCAGCT,1050,55.4,969,GTGGCGGACGGGTGAGTAATGTCTGGGAAACTGCCTGATGGAGGGG...
6,Haemophilus.influenzae,NR_044682.2,GTGGCGGACGGGTGAGTAA,100,TGCATGGCTGTCGTCAGCT,1048,52.5,967,GTGGCGGACGGGTGAGTAATGCTTGGGAATCTGGCTTATGGAGGGG...
7,Moraxella.osloensis,NR_104936.1,GTGGCGGACGGGTGAGTAA,91,TGCATGGCTGTCGTCAGCT,1036,51.1,964,GTGGCGGACGGGTGAGTAACATTTAGGAATCTACCTAGTAGTGGGG...
8,Serratia.marcescens,NR_036886.1,GCGGCGGACGGGTGAGTAA,95,TGCATGGCTGTCGTCAGCT,1045,54.9,969,GCGGCGGACGGGTGAGTAATGTCTGGGAAACTGCCTGATGGAGGGG...
9,Flavonifractor.plautii,NR_029356.1,GTGGCGGACGGGTGAGTAA,92,TGCATGGTTGTCGTCAGCT,1027,55.0,954,GTGGCGGACGGGTGAGTAACGCGTGAGGAACCTGCCTTGGAGAGGG...


In [5]:
#Generate Dendro of Bacterial Species from 16S Segments
file_name = "16S"
writen_alignment = PyClustalW.create_alignment_file(core_data, '16S Sequence', file_name)
writen_alignment
PyClustalW.ClustalW_alignment(str(file_name + '.faa'))
#ClustalW_alignment
PyClustalW.Phylo_tree(file_name)

                                            ____________ NR_037113.1
                                     ______|
                           _________|      |_____________ NR_026215.1
                          |         |
                          |         |____________________ NR_074802.2
                          |
                          |                       ______ NR_028598.1
                          |                     ,|
                          |                    _||______ NR_040821.1
                          |                   | |
                          |                   | |_____ NR_027517.1
                          |         __________|
                          |        |          |      , NR_028665.1
                         ,|        |          |   ___|
                         ||        |          |__|   | NR_028664.1
                         ||        |             |
                         ||        |             |____ NR_024842.1
                   

In [6]:
'''This code imports aligned sequneces into a pandas dataframe'''
file_name = "16S"
aligned_sequnces = PyClustalW.Read_alignment_file(file_name)
aligned_sequnces['offset'].max()
aligned_sequnces
aligned_frame = pd.merge(core_data, aligned_sequnces, 
         how='inner', on=None, left_on=None, right_on=None,
         left_index=False, right_index=False, sort=True,
         suffixes=('_x', '_y'), copy=True, indicator=False,
         validate=None)
aligned_frame
aligned_sequnces.loc[(aligned_frame['Record id'] == 'XX000001.1')]['Aligned 16S Sequence'].item()


'----TGAGAATTTGATCTTGGTTCAGATTGAACGCTGGCGGCGTG----GATGAGGCATGC--AAGTCGAACGGA--------------GCAATTGTTTCGACG-----ATTGTTTAGTGGCGGAAGGGTTAGTAATGCATAGATAATTTGTCCTTAACTTGGGAATAACGGTTGGAAACGGCCGCTAATACCGAATGTGGCGATATTTGGGCATCCGAGTAACGTTAAAGAAGGGGATCTTAGGACCTTTCGGTTAAGGGAGAGTCTATGTGATATCAGCTAGTTGGTGGGGTAAAGGCCTACCAAGGCTATGACGTCTAGGCGGATTGAGAGATTGGCCGCCAACACTGGGACTGAGACACTGCCCAGACTCCTACGGGAGGCTGCAGTCGAGAATCTTTCGCAATGGAC-GGAAGTCTGACGAAGCGACGCCGCGTGTGTGATGAAGGCTCTAGGGTTGTAAAGCACTTTCGCTTGGGAATAAGAGAAGACGGTTAATACCCGCTGG-ATTTGAGCGTACCA-GGTAAAGAAGCACCGGCTAACTCCGTGCCAGCAGCTGCGGTAAT-ACGGAGGGTGCTAGCGTTAATCGGATTTATTGGGCGTAAAGGGCGTGTAGGCGGAAAGGTAAGTTAGTTGTCAAAGATCGGGGCTCAACCCCGAGTCGGCATCTAATACTATTT-TTCTAGAGGATAGATGGAGAAAAGGGAATTTCACGTG-TAGCGGTGAAATGCGTAGATATGTGGAAGAACACCAGTGGCGAAGGCGCTTTTCTAATTTATACCTGACGCTAA-GGCGCGAAAGCAAGGGGAGCAAACAGGATTAGATACCCTGGTAGTCCTTGCCGTAAACGATGCATACTT--GATGTGGATGGTCTCAACCCC-ATCCGTGTCGGAG-CTAACGCGTTAAGTATGCCGCCTGAGGAGTACACTCGCAAGGGTGAAACTCAAAAGAATTGACGGGGGCCCGCACAAGCAGTGGAGCATGTGG

In [7]:
#Generate Dendro of Bacterial Species from 16S Segments
file_name = "amplicons"
writen_alignment = PyClustalW.create_alignment_file(calculated_data, 'Amplicon', file_name)
writen_alignment
PyClustalW.ClustalW_alignment(str(file_name + '.faa'))
#ClustalW_alignment
PyClustalW.Phylo_tree(file_name)

                                                  _____________ NR_037113.1
                                 ________________|
                                |                |____________ NR_026215.1
                               _|
                              | |  ______________________ NR_029356.1
                              | |_|
                              |   |_____________________________ NR_042365.1
                              |
                   ___________|                          _ JX131632.1
                  |           |                        ,|
                  |           |                        ||_ NR_118997.2
                  |           |              __________|
                  |           |             |          |___ NR_114596.1
                  |           |             |          |
                 ,|           |_____________|          |___ LC145699.1
                 ||                         |
                 ||                         |  

In [8]:
amplicon_alignments = PyClustalW.Read_alignment_file('amplicons')
amplicon_alignments['Aligned 16S Sequence']
conserved_regions = PyClustalW.conserved_regions(amplicon_alignments, 'Aligned 16S Sequence')

In [9]:
conserved_regions

runs_documentation = []
runs_list = []
global_flag = True
local_flag = True
warning_flag = True
global_items = 0
local_items = 0
length = 0
end = len(conserved_regions)
              
while global_flag is True:
    runs_list = []
    if global_items + 2 < end:
        global_flag = True
    else:
        global_flag = False
        runs_list.append(conserved_regions[global_items + 1])
        print('ending')
        
    local_flag = True
    warning_flag = True
        
    while local_flag is True:
        runs_list.append(conserved_regions[global_items])
        global_items = global_items + 1

        if global_items == end:
            local_flag = False 
        
        elif (conserved_regions[global_items-1] == '*'
            and conserved_regions[global_items] == '*'
            and warning_flag is True):
            warning_flag = True
        
        elif (conserved_regions[global_items-1] == '*'
            or conserved_regions[global_items] == '*'
            and warning_flag is True):
            warning_flag = False
        
        else:
            length = length + (len(runs_list))
            runs_documentation.append(str(runs_list))
            local_flag = False       

#runs_documentation

ending


In [17]:
ROIs = probe.splice_variable_regions(core_data, '16S Sequence', 1, 7)
print(ROIs.loc[(ROIs['Record id'] == 'XX000001.1')]['Variable Regions'].item()[0])
print(ROIs.loc[(ROIs['Record id'] == 'NR_037113.1')]['Variable Regions'].item()[0])
print(ROIs.loc[(ROIs['Record id'] == 'NR_036886.1')]['Variable Regions'].item()[0])
print(ROIs.loc[(ROIs['Record id'] == 'XX000001.1')]['Variable Regions'].item()[1])
print(ROIs.loc[(ROIs['Record id'] == 'NR_037113.1')]['Variable Regions'].item()[1])
print(ROIs.loc[(ROIs['Record id'] == 'NR_036886.1')]['Variable Regions'].item()[1])


TAACAGGAAGAAGCTTGCTCTTTGCTGACG
GGATTAGTGGCGAACGGGTGAGTAACACGT
AGGGGAGCTTGCTCCCTGGGTGACGAGCGG
ATGGAGGGGGATAACTACTGGAAACGGTAGCTAATACCGCATAACGTCGCAAGACCAAAGAGGGGGACCTTCGGGCCTCTTGCCATCGGATGTGCCCAGATGGGA
CTGGGTCTAATACCGGATAGGAGCGTCCACCGCATGGTGGGTGTTGGAAAGATTTATCGGTTTTGGATGGACTCGCGGCCTATCAGCTTGTTGGTGAGGTAATGG
GGGGGATAACTACTGGAAACGGTAGCTAATACCGCATAACGTCGCAAGACCAAAGAGGGGGACCTTCGGGCCTCTTGCCATCAGATGTGCCCAGATGGGATTAGC


In [28]:
PyClustalW.create_alignment_file_segment(ROIs, 'Variable Regions', 'Section_1', 0)

30

In [17]:
ROI = aligned_frame.loc[(aligned_frame['Record id'] == 'XX000001.1')]['Aligned 16S Sequence'].item()
offset = 0
variable_regions = ((0,0), (69-offset,99-offset), (137-offset,242-offset), (433-offset,497-offset), 
                    (576-offset,682-offset), (822-offset,879-offset), (986-offset,1043-offset),
                     (1117-offset,1173-offset),(1243-offset,1294-offset), (1435-offset,1465-offset))
count_accu = 0
updated_variable_regions = []
updated_conserved_regions = []

for region in range(0, len(variable_regions)-1):
    count = sum(map(lambda x : 1 if '-' in x else 0, ROI[variable_regions[region][1]:variable_regions[region+1][0]]))
    
    updated_conserved_regions.append(tuple((variable_regions[region][1] + count_accu, 
                                            variable_regions[region+1][0] + count_accu + count)))
    count_accu = count_accu + count
    
    count = sum(map(lambda x : 1 if '-' in x else 0, 
                    ROI[variable_regions[region + 1][0] + 1:variable_regions[region + 1][1]]))
    
    updated_variable_regions.append(tuple((variable_regions[region + 1][0] + count_accu, 
                                           variable_regions[region + 1][1] + count_accu + count)))
    count_accu = count_accu + count + 1

'GTGGCGAACGGGTGAGTAA'
updated_conserved_regions, updated_variable_regions

([(0, 75),
  (112, 156),
  (270, 471),
  (536, 618),
  (725, 868),
  (928, 1038),
  (1096, 1176),
  (1233, 1305),
  (1358, 1501)],
 [(75, 111),
  (156, 269),
  (471, 535),
  (618, 724),
  (868, 927),
  (1038, 1095),
  (1176, 1232),
  (1305, 1357),
  (1501, 1546)])

In [21]:
'''This code selects the V1-V9 regions of the aligned sequences relative to E.Coli'''

offset = 0
E_Coli = amplicon_alignments.loc[(amplicon_alignments['Record id'] == 'NR_024570.1')]['Aligned 16S Sequence'].item()

variable_regions = ((69-offset,99-offset), (137-offset,242-offset), (433-offset,497-offset), 
                    (576-offset,682-offset), (822-offset,879-offset), (986-offset,1043-offset),
                     (1117-offset,1173-offset),(1243-offset,1294-offset), (1435-offset,1465-offset))

variable_regions_breakdown = [E_Coli[variable_regions[0][0]:variable_regions[0][1]],
E_Coli[variable_regions[1][0]:variable_regions[1][1]],
E_Coli[variable_regions[2][0]:variable_regions[2][1]],
E_Coli[variable_regions[3][0]:variable_regions[3][1]],
E_Coli[variable_regions[4][0]:variable_regions[4][1]],
E_Coli[variable_regions[5][0]:variable_regions[5][1]]]
variable_regions_breakdown

['TGTCTGGG-AAACTGCCTGATGGAGGGGGA',
 'CAAGCA-CAAAGAGGGGGACCTTAGGGCCTCT---TGC-------CATCGGATGTGCCCAGATGGGATTAGCTAGTAGGTGGGGTAACGGCTCACCTAGGCGACG',
 'TTGCT--CATTGACGTTACCC-GCAGAAGAAGCACCGGCTAACTCCGTGCCAGCAGCCGCGGTA',
 'CCCCGGGCTCAACCTGGGAACTGCATCTGATACTGGCAA-GCTTGAGTCTCGTAGAGGGGGGTAGAATTCCAGGTG-TAGCGGTGAAATGCGTAGAGATCTGGAGG',
 'TGGCTTCCGGAN-NTAACGCGTTAAGTCGACCGCCTGGGGAGTACGGCCGCAAGGTT',
 'AGAGATGAG--AATGTGCCTTCGGGA--ACCGTGAGACAGGTGCT------------']

In [37]:
'''This code selects the C1-C9 regions of the aligned sequences relative to E.Coli'''

offset = 69-18

E_Coli = amplicon_alignments.loc[(amplicon_alignments['Record id'] == 'NR_024570.1')]['Aligned 16S Sequence'].item()

variable_regions = ((69-offset,99-offset), (137-offset,242-offset), (433-offset,497-offset), 
                    (576-offset,682-offset), (822-offset,879-offset), (986-offset,1043-offset),
                     (1117-offset,1173-offset),(1243-offset,1294-offset), (1435-offset,1465-offset))

conserved_regions_breakdown = [E_Coli[0:variable_regions[0][0]],
E_Coli[variable_regions[0][1]:variable_regions[1][0]],
E_Coli[variable_regions[1][1]:variable_regions[2][0]],
E_Coli[variable_regions[2][1]:variable_regions[3][0]],
E_Coli[variable_regions[3][1]:variable_regions[4][0]],
E_Coli[variable_regions[4][1]:variable_regions[5][0]]]
conserved_regions_breakdown

['TGGCGGACGGGTGAGTAA',
 'TAACTACTGGAAACGGTAGCTAATACCGCATAACGTCG',
 'ATCCCTAGCTGGTCTGAGAGGATGACCAGCAACACTGGAACTGAGACACGGTCCAGACTCCTACGGGAGGCAGCAGTGGGGAATATTGCACAATGGGC-GCAAGCCTGATGCAGCCATGCNGCGTGTATGAAGAAGGCCTTCGGGTTGTAAAGTACTTTCAGCGGGGAGGAAGGGAGTAAAGTTAATACCT',
 'AT-ACGGAGGGTGCAAGCGTTAATCGGAATTACTGGGCGTAAAGCGCACGCAGGCGGTTTGTTAAGTCAGATGTGAAAT',
 'AATACCGGTGGCGAAGGCGGCCCCCTGGACGAAGACTGACGCTCA-GGTGCGAAAGCGTGGGGAGCAAACAGGATTAGATACCCTGGTAGTCCACGCCGTAAACGATGTCGACTT--GGAGGTTGTGCCCTTGAGGC--G',
 'AAAACTCAAATGAATTGACGGGGGCC-GCACAAGCGGTGGAGCATGTGGTTTAATTCGATGCAACGCGAAGAACCTTACCTGGTCTTGACAT-CCACGGAAGTTTTC']

In [16]:
offset = 69-18
E_Coli = conserved_regions

variable_regions = ((69-offset,99-offset), (137-offset,242-offset), (433-offset,497-offset), 
                    (576-offset,682-offset), (822-offset,879-offset), (986-offset,1043-offset),
                     (1117-offset,1173-offset),(1243-offset,1294-offset), (1435-offset,1465-offset))

unconserved_regions_breakdown = [str(E_Coli[variable_regions[0][0]:variable_regions[0][1]]),
str(E_Coli[variable_regions[1][0]:variable_regions[1][1]]),
str(E_Coli[variable_regions[2][0]:variable_regions[2][1]]),
str(E_Coli[variable_regions[3][0]:variable_regions[3][1]]),
str(E_Coli[variable_regions[4][0]:variable_regions[4][1]]),
str(E_Coli[variable_regions[5][0]:variable_regions[5][1]])]

for i in range(0,len(unconserved_regions_breakdown)):
    count = sum(map(lambda x : 1 if '*' in x else 0, unconserved_regions_breakdown[i])) 
    print(-count/(variable_regions[i][0]-variable_regions[i][1]))

0.23333333333333334
0.26666666666666666
0.578125
0.41509433962264153
0.49122807017543857
0.12280701754385964


In [51]:
offset = 69-18
E_Coli = conserved_regions

variable_regions = ((69-offset,99-offset), (137-offset,242-offset), (433-offset,497-offset), 
                    (576-offset,682-offset), (822-offset,879-offset), (986-offset,1043-offset),
                     (1117-offset,1173-offset),(1243-offset,1294-offset), (1435-offset,1465-offset))

conserved_regions_breakdown = [str(E_Coli[0:variable_regions[0][0]]),
str(E_Coli[variable_regions[0][1]:variable_regions[1][0]]),
str(E_Coli[variable_regions[1][1]:variable_regions[2][0]]),
str(E_Coli[variable_regions[2][1]:variable_regions[3][0]]),
str(E_Coli[variable_regions[3][1]:variable_regions[4][0]]),
str(E_Coli[variable_regions[4][1]:variable_regions[5][0]])]
conserved_regions_breakdown

count = sum(map(lambda x : 1 if '*' in x else 0, conserved_regions_breakdown[0]))
print(-count/(-variable_regions[0][0]))

for i in range(1,len(conserved_regions_breakdown)):
    count = sum(map(lambda x : 1 if '*' in x else 0, conserved_regions_breakdown[i])) 
    print(-count/(variable_regions[i-1][1]-variable_regions[i][0]))
    

0.8888888888888888
0.34210526315789475
0.5445026178010471
0.46835443037974683
0.5285714285714286
0.6822429906542056


In [55]:
unconserved_regions_breakdown[2]

"[' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '*', '*', '*', '*', ' ', '*', '*', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '*', ' ', ' ', '*', '*', '*', '*', ' ', ' ', '*', '*', '*', '*', '*', '*', '*', ' ', '*', ' ', ' ', '*', '*', '*', '*', '*', '*', '*', '*', '*', '*', '*', '*', '*', '*', '*', '*', '*', '*']"

In [56]:
conserved_regions_breakdown[2]

"['*', ' ', ' ', ' ', ' ', '*', '*', ' ', '*', ' ', '*', ' ', ' ', '*', '*', '*', '*', '*', '*', '*', '*', ' ', '*', '*', ' ', ' ', '*', ' ', '*', ' ', ' ', '*', '*', '*', ' ', ' ', '*', '*', ' ', '*', '*', '*', '*', '*', '*', '*', '*', '*', '*', '*', ' ', ' ', '*', '*', ' ', ' ', '*', '*', '*', '*', ' ', '*', '*', '*', '*', '*', '*', '*', '*', '*', '*', '*', '*', '*', '*', '*', '*', ' ', '*', '*', '*', '*', '*', '*', ' ', '*', '*', ' ', ' ', ' ', '*', ' ', '*', '*', '*', '*', ' ', ' ', ' ', ' ', ' ', '*', '*', ' ', ' ', '*', '*', '*', '*', ' ', ' ', ' ', '*', '*', '*', ' ', '*', ' ', ' ', '*', ' ', '*', ' ', '*', '*', '*', ' ', ' ', ' ', '*', '*', ' ', '*', '*', ' ', '*', ' ', ' ', ' ', '*', '*', ' ', ' ', '*', ' ', ' ', ' ', '*', '*', '*', '*', '*', ' ', ' ', ' ', '*', '*', ' ', '*', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '*', '*', ' ', '*', '*', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '*', ' ', ' ', ' ', ' ']"

In [None]:
#Bacterial.compare_letters(calculated_data.loc[(core_data['Species'] == 'Klebsiella.pneumoniae')]['Amplicon'].item(), calculated_data.loc[(core_data['Species'] == 'Enterococcus.faecalis')]['Amplicon'].item(), 0)

In [None]:
R = 1.38
T = np.arange(65+295, 90+295, 0.1)
H = 10
S = 1

Stability_Factor = np.exp(-(H-S**T))/(R**T)*10**23

row = ['AT-TA', 'TA-AT', 'AT-AT', 'GC-AT', 'CG-AT', 'AT-GC', 'AT-CG', 'GC-GC', 'CG-GC', 'GC-CG']
col = ['dTij/dlogNA+', 'Tij 1.0M-Na+ C', 'Tij 1.0M-Na+ K', 'Tij0.0745M-Na+ C', 'Delta Hij', 'Delta Sij']

Constants = [(21.00, 81.85, 355.01, 58.23, 8.00, 22.53),
(20.11, 86.72, 359.88, 64.10, 8.31, 24.64),
(19.78, 89.08, 362.24, 66.77, 8.45, 24.86),
(17.76, 99.49, 372.65, 79.51, 9.13, 24.50),
(17.10, 103.18, 376.34, 83.94, 9.36, 24.87),
(16.87, 104.43, 377.59, 85.45, 9.44, 25.00),
(16.21, 107.96, 381.12, 89.72, 9.67, 25.37),
(14.18, 118.49, 391.65, 102.50, 10.34, 27.52),
(13.20, 124.54, 397.70, 109.69, 10.72, 26.95),
(13.20, 124.61, 397.77, 109.76, 10.72, 26.95)]

Blake_1999 = pd.DataFrame(Constants, columns=col, index=row)


In [12]:
TA = str('AAAUUGAAGAGUUUGAUCAUGGCUCAGAUUGAACGCUGGCGGCAGGCCUAACACAUGCAAGUCGAACGGUAACAGGAAGAAGCUUGCUCUUUGCUGACGAGUGGCGGACGGGUGAGUAAUGUCUGGGAAACUGCCUGAUGGAGGGGGAUAACUACUGGAAACGGUAGCUAAUACCGCAUAACGUCGCAAGACCAAAGAGGGGGACCUUCGGGCCUCUUGCCAUCGGAUGUGCCCAGAUGGGAUUAGCUAGUAGGUGGGGUAACGGCUCACCUAGGCGACGAUCCCUAGCUGGUCUGAGAGGAUGACCAGCCACACUGGAACUGAGACACGGUCCAGACUCCUACGGGAGGCAGCAGUGGGGAAUAUUGCACAAUGGGCGCAAGCCUGAUGCAGCCAUGCCGCGUGUAUGAAGAAGGCCUUCGGGUUGUAAAGUACUUUCAGCGGGGAGGAAGGGAGUAAAGUUAAUACCUUUGCUCAUUGACGUUACCCGCAGAAGAAGCACCGGCUAACUCCGUGCCAGCAGCCGCGGUAAUACGGAGGGUGCAAGCGUUAAUCGGAAUUACUGGGCGUAAAGCGCACGCAGGCGGUUUGUUAAGUCAGAUGUGAAAUCCCCGGGCUCAACCUGGGAACUGCAUCUGAUACUGGCAAGCUUGAGUCUCGUAGAGGGGGGUAGAAUUCCAGGUGUAGCGGUGAAAUGCGUAGAGAUCUGGAGGAAUACCGGUGGCGAAGGCGGCCCCCUGGACGAAGACUCACGCUCAGGUGCGAAAGCGUGGGGAGCAAACAGGAUUAGAUACCCUGGUAGUCCACGCCGUAAACGAUGUCGACUUGGAGGUUGUGCCCUUGAGGCGUGGCUUCCGGAGCUAACGCGUUAAGUCGACCGCCUGGGGAGUACGGCCGCAAGGUUAAAACUCAAAUGAAUUGACGGGGGCCCGCACAAGCGGUGGAGCAUGUGGUUUAAUUCGAUGCAACGCGAAGAACCUUACCUGGUCUUGACAUCCACGGAAGUUUUCAGAGAUGAGAAUGUGCCUUCGGGAACCGUGAGACAGGUGCUGCAUGGCUGUCGUCAGCUCGUGUUGUGAAAUGUUGGGUUAAGUCCCGCAACGAGCGCAACCCUUAUCCUUUGUUGCCAGCGGUCCGGCCGGGAACUCAA AGGAGACUGCCAGUGAUAAACUGGAGGAAGGUGGGGAUGACGUCAAGUCAUCAUGGCCCUUACGACCAGGGCUACACACGUGCUACAAUGGCGCAUACAAAGAGAAGCGACCUCGCGAGAGCAAGCGGACCUCAUAAAGUGCGUCGUAGUCCGGAUUGGAGUCUGCAACUCGACUCCAUGAAGUCGGAAUCGCUAGUAAUCGUGGAUCAGAAUGCCACGGUGAAUACGUUCCCGGGCCUUGUACACACCGCCCGUCACACCAUGGGAGUGGGUUGCAAAAGAAGUAGGUAGCUUAACCUUCGGGAGGGCGCUUACCACUUUGUGAUUCAUGACUGGGGUGAAGUCGUAACAAGGUAACCGUAGGGGAACCUGCGGUUGGAUCACCUCCUUA')

In [14]:
def changeWord(word):
    for letter in word:
        if letter == "U":
            word = word.replace(letter,"T")
    return word
TA = changeWord(TA)
TA

'AAATTGAAGAGTTTGATCATGGCTCAGATTGAACGCTGGCGGCAGGCCTAACACATGCAAGTCGAACGGTAACAGGAAGAAGCTTGCTCTTTGCTGACGAGTGGCGGACGGGTGAGTAATGTCTGGGAAACTGCCTGATGGAGGGGGATAACTACTGGAAACGGTAGCTAATACCGCATAACGTCGCAAGACCAAAGAGGGGGACCTTCGGGCCTCTTGCCATCGGATGTGCCCAGATGGGATTAGCTAGTAGGTGGGGTAACGGCTCACCTAGGCGACGATCCCTAGCTGGTCTGAGAGGATGACCAGCCACACTGGAACTGAGACACGGTCCAGACTCCTACGGGAGGCAGCAGTGGGGAATATTGCACAATGGGCGCAAGCCTGATGCAGCCATGCCGCGTGTATGAAGAAGGCCTTCGGGTTGTAAAGTACTTTCAGCGGGGAGGAAGGGAGTAAAGTTAATACCTTTGCTCATTGACGTTACCCGCAGAAGAAGCACCGGCTAACTCCGTGCCAGCAGCCGCGGTAATACGGAGGGTGCAAGCGTTAATCGGAATTACTGGGCGTAAAGCGCACGCAGGCGGTTTGTTAAGTCAGATGTGAAATCCCCGGGCTCAACCTGGGAACTGCATCTGATACTGGCAAGCTTGAGTCTCGTAGAGGGGGGTAGAATTCCAGGTGTAGCGGTGAAATGCGTAGAGATCTGGAGGAATACCGGTGGCGAAGGCGGCCCCCTGGACGAAGACTCACGCTCAGGTGCGAAAGCGTGGGGAGCAAACAGGATTAGATACCCTGGTAGTCCACGCCGTAAACGATGTCGACTTGGAGGTTGTGCCCTTGAGGCGTGGCTTCCGGAGCTAACGCGTTAAGTCGACCGCCTGGGGAGTACGGCCGCAAGGTTAAAACTCAAATGAATTGACGGGGGCCCGCACAAGCGGTGGAGCATGTGGTTTAATTCGATGCAACGCGAAGAACCTTACCTGGTCTTGACATCCA

In [54]:
region = 0
count_accu = 0
count = sum(map(lambda x : 1 if '-' in x else 0, ROI[variable_regions[region][1]:variable_regions[region+1][0]]))
count_accu = count_accu + count
updated_conserved_regions = ()
alist = (variable_regions[region][1], variable_regions[region+1][0]-1+count_accu)
updated_conserved_regions = updated_conserved_regions + alist
#count = sum(map(lambda x : 1 if '-' in x else 0, ROI[variable_regions[region+1][0]+1:variable_regions[region+1][1]]))
#updated_variable_regions.append[(variable_regions[region+1][0]+count_accu, variable_regions[region+1][1]+count_accu+count)]
#count_accu = count_accu + count
updated_conserved_regions

(0, 68)

In [40]:
conserved_region = []
variable_region = []
pd_frame = aligned_frame
column = 'Aligned 16S Sequence'
end_reg = 6
start_reg = 1

offset = 0
variable_regions = ((0,0), (69-offset,99-offset), (137-offset,242-offset), (433-offset,497-offset), 
                    (576-offset,682-offset), (822-offset,879-offset), (986-offset,1043-offset),
                     (1117-offset,1173-offset),(1243-offset,1294-offset), (1435-offset,1465-offset))

for item in pd_frame['Record id']:
    sequence = pd_frame.loc[(pd_frame['Record id'] == item)][column].item()
    the_tuple = ()
    for region in range(0, end_reg-start_reg+1):
        the_tuple = the_tuple + (sequence[variable_regions[region + 1][0]:variable_regions[region + 1][1]], )
    variable_region.append([core_data.loc[(core_data['Record id'] == item)]['Species'].item(),
                           item,
                           the_tuple])
columns = ['Species', 'Record id', 'Variable Regions',]
returned_pd_frame = pd.DataFrame(variable_region, columns=columns)
returned_pd_frame

Unnamed: 0,Species,Record id,Variable Regions
0,Escherichia.coli,AJ605115.1,"(--------C-----AGGAAGCAGCTTGCTG, GTCTGGG-AAACT..."
1,Staphylococcus.epidermidis,JX131632.1,"(AGCGAA-----CA-GACGAGGAGCTTGCTC, ACGTGGATAACCT..."
2,Staphylococcus.carnosus,LC145699.1,"(GCCGAA-----CA-GACGAGGAGCTTGCTC, ACGTGGGTAACCT..."
3,Escherichia.coli,NR_024570.1,"(AACGGTAAC-----AGGAAGCAGCTTGCTG, GTCTGGG-AAACT..."
4,Streptococcus.parasanguinis,NR_024842.1,"(AACGCT-----GA-AGCTTGGTGCTTGCAC, GCGTAGGTAACCT..."
5,Chlamydia.trachomatis,NR_025888.1,"(AACGGA--------------GCAATTGTTT, GCATAGATAATTT..."
6,Pseudomonas.aeruginosa,NR_026078.1,"(AGCTTATGA-----AGG---GAGCTTGCC-, GCCTAGG-AATCT..."
7,Neisseria.gonorrhoeae,NR_026079.2,"(GACGGCAGC-----ACAGGGAAGCTTGCTT, ATATCGG-AACGT..."
8,Acinetobacter.baumannii,NR_026206.1,"(AGCGGGGGA-----AGG---TAGCTTGCT-, GCTTAGG-AATCT..."
9,Corynebacterium.amycolatum,NR_026215.1,"(AACGGTAA-------GGCTCCAGCTTGCTG, ACGTGGGTGACCT..."


In [4]:
conserved_regions_adjusted= [(0, 75), 
                             (112, 156),
                             (270, 471),
                             (536, 618), 
                             (725, 868), 
                             (928, 1038), 
                             (1096, 1176), 
                             (1233, 1305),
                             (1358, 1501)]
variable_regions_adjusted = [(75, 111), 
                             (156, 269), 
                             (471, 535), 
                             (618, 724), 
                             (868, 927), 
                             (1038, 1095), 
                             (1176, 1232), 
                             (1305, 1357), 
                             (1501, 1546)]

111