In [113]:
import numpy
import MDAnalysis as mda
from MDAnalysis.tests.datafiles import PSF, DCD, GRO, XTC
from MDAnalysis.analysis.distances import distance_array
import MDAnalysisTests
import Bio
import Bio.PDB
import pytest
import pandas as pd
import biopandas
from biopandas.pdb import PandasPdb
import freesasa
import os.path
from MDAnalysis import AtomGroup

In [2]:
KatG = mda.Universe("7ag8.pdb")
print(KatG)

<Universe with 10922 atoms>




In [3]:
#Find Ser315 centre of mass in protomer A and B
Ser315_A = KatG.select_atoms('resid 315 and segid A')
print(Ser315_A)
Ser315_B = KatG.select_atoms('resid 315 and segid B')
print(Ser315_B)
Ser315_A_CoM = Ser315_A.center_of_mass()
Ser315_B_CoM = Ser315_B.center_of_mass()
print('Centre of mass for Ser315 on protomer A:', Ser315_A_CoM)
print('Centre of mass for Ser315 on protomer B:', Ser315_B_CoM)

<AtomGroup [<Atom 2109: N of type N of resname SER, resid 315 and segid A and altLoc >, <Atom 2110: CA of type C of resname SER, resid 315 and segid A and altLoc >, <Atom 2111: C of type C of resname SER, resid 315 and segid A and altLoc >, <Atom 2112: O of type O of resname SER, resid 315 and segid A and altLoc >, <Atom 2113: CB of type C of resname SER, resid 315 and segid A and altLoc >, <Atom 2114: OG of type O of resname SER, resid 315 and segid A and altLoc >]>
<AtomGroup [<Atom 7579: N of type N of resname SER, resid 315 and segid B and altLoc >, <Atom 7580: CA of type C of resname SER, resid 315 and segid B and altLoc >, <Atom 7581: C of type C of resname SER, resid 315 and segid B and altLoc >, <Atom 7582: O of type O of resname SER, resid 315 and segid B and altLoc >, <Atom 7583: CB of type C of resname SER, resid 315 and segid B and altLoc >, <Atom 7584: OG of type O of resname SER, resid 315 and segid B and altLoc >]>
Centre of mass for Ser315 on protomer A: [137.71549016 1

In [4]:
#Find Asp137 centre of mass in protomer A and B
Asp137_A = KatG.select_atoms('resid 137 and segid A')
print(Asp137_A)
Asp137_B = KatG.select_atoms('resid 137 and segid B')
print(Asp137_B)
Asp137_A_CoM = Asp137_A.center_of_mass()
Asp137_B_CoM = Asp137_B.center_of_mass()
print('Centre of mass for Asp137 on protomer A:', Asp137_A_CoM)
print('Centre of mass for Asp137 on protomer B:', Asp137_B_CoM)

<AtomGroup [<Atom 886: N of type N of resname ASP, resid 137 and segid A and altLoc >, <Atom 887: CA of type C of resname ASP, resid 137 and segid A and altLoc >, <Atom 888: C of type C of resname ASP, resid 137 and segid A and altLoc >, <Atom 889: O of type O of resname ASP, resid 137 and segid A and altLoc >, <Atom 890: CB of type C of resname ASP, resid 137 and segid A and altLoc >, <Atom 891: CG of type C of resname ASP, resid 137 and segid A and altLoc >, <Atom 892: OD1 of type O of resname ASP, resid 137 and segid A and altLoc >, <Atom 893: OD2 of type O of resname ASP, resid 137 and segid A and altLoc >]>
<AtomGroup [<Atom 6232: N of type N of resname ASP, resid 137 and segid B and altLoc >, <Atom 6233: CA of type C of resname ASP, resid 137 and segid B and altLoc >, <Atom 6234: C of type C of resname ASP, resid 137 and segid B and altLoc >, <Atom 6235: O of type O of resname ASP, resid 137 and segid B and altLoc >, <Atom 6236: CB of type C of resname ASP, resid 137 and segid B 

In [5]:
#Identify Hem atoms in protomer A and B
Hem_A = KatG.select_atoms('resid 801 and segid A')
print(Hem_A)
Hem_B = KatG.select_atoms('resid 801 and segid B')
print(Hem_B)

<AtomGroup [<Atom 10837: CHA of type C of resname HEM, resid 801 and segid A and altLoc >, <Atom 10838: CHB of type C of resname HEM, resid 801 and segid A and altLoc >, <Atom 10839: CHC of type C of resname HEM, resid 801 and segid A and altLoc >, ..., <Atom 10877: NC of type N of resname HEM, resid 801 and segid A and altLoc >, <Atom 10878: ND of type N of resname HEM, resid 801 and segid A and altLoc >, <Atom 10879: FE of type FE of resname HEM, resid 801 and segid A and altLoc >]>
<AtomGroup [<Atom 10880: CHA of type C of resname HEM, resid 801 and segid B and altLoc >, <Atom 10881: CHB of type C of resname HEM, resid 801 and segid B and altLoc >, <Atom 10882: CHC of type C of resname HEM, resid 801 and segid B and altLoc >, ..., <Atom 10920: NC of type N of resname HEM, resid 801 and segid B and altLoc >, <Atom 10921: ND of type N of resname HEM, resid 801 and segid B and altLoc >, <Atom 10922: FE of type FE of resname HEM, resid 801 and segid B and altLoc >]>


In [6]:
#'Site1_A' is an initial centre of mass in protomer A used to identify the Hem atoms which are closest
#to the suspected Isoniazid binding site
Site1_A = KatG.select_atoms('resid 315 and segid A', 'resid 137 and segid A')
Site1_A_CoM = Site1_A.center_of_mass()
haem_distances_A = []
haem_atom_number_A = []

#Identifies distance of each Hem atoms from suspected binding site
for i in Hem_A:
    atom_position = i.position
    dist = numpy.linalg.norm(Site1_A_CoM-atom_position)
    haem_distances_A.append(dist)
    haem_atom_number_A.append(str(i)[6:11])
    
#Creates dataframe to show atom number and distance
data = {'Atom number': haem_atom_number_A, 'Distance from Ser': haem_distances_A}
distance_site1_A_df = pd.DataFrame (data, columns = ['Atom number', 'Distance from Ser'])
distance_site1_A_df = distance_site1_A_df.set_index('Atom number')
print(distance_site1_A_df)

#Uses a carbon from each of Hem, Ser315 and Asp137, close to the suspected binding site
#to find a new, more accurate, binding site location
Site1_A_true = KatG.select_atoms('bynum 10844','bynum 2112','bynum 890')
Site1_A_true_CoM = Site1_A_true.center_of_mass()
print('\n','New site A:''\n', Site1_A_true_CoM)

             Distance from Ser
Atom number                   
10837                 8.124812
10838                 6.996807
10839                10.965349
10840                11.538067
10841                 7.232062
10842                 6.167269
10843                 5.783919
10844                 6.675063
10845                 4.980348
10846                 5.944127
10847                 5.700443
10848                 5.917667
10849                 6.626282
10850                 5.762992
10851                 8.072774
10852                 8.848015
10853                 9.906793
10854                 9.896144
10855                 8.693248
10856                11.165976
10857                11.495721
10858                11.239168
10859                12.364128
10860                12.508451
10861                11.501371
10862                13.351467
10863                13.674286
10864                14.755831
10865                10.712170
10866                11.142325
10867   

In [7]:
#'Site1_B' is an initial centre of mass in protomer A used to identify the Hem atoms which are closest
#to the suspected Isoniazid binding site
Site1_B = KatG.select_atoms('resid 315 and segid B', 'resid 137 and segid B')
Site1_B_CoM = Site1_B.center_of_mass()
haem_distances_B = []
haem_atom_number_B = []

#Identifies distance of each Hem atoms from suspected binding site
for i in Hem_B:
    atom_position = i.position
    dist = numpy.linalg.norm(Site1_B_CoM-atom_position)
    haem_distances_B.append(dist)
    haem_atom_number_B.append(str(i)[6:11])
    
#Creates dataframe to show atom number and distance
data = {'Atom number': haem_atom_number_B, 'Distance from Ser': haem_distances_B}
distance_site1_B_df = pd.DataFrame (data, columns = ['Atom number', 'Distance from Ser'])
distance_site1_B_df = distance_site1_B_df.set_index('Atom number')
print(distance_site1_B_df)

#Uses a carbon from each of Hem, Ser315 and Asp137, close to the suspected binding site
#to find a new, more accurate, binding site location
Site1_B_true = KatG.select_atoms('bynum 10911','bynum 7582','bynum 6236')
Site1_B_true_CoM = Site1_B_true.center_of_mass()
print('\n','New site B:''\n', Site1_B_true_CoM)

             Distance from Ser
Atom number                   
10880                 8.107010
10881                11.718632
10882                11.099186
10883                 7.093021
10884                 9.197226
10885                10.317572
10886                11.228584
10887                10.841160
10888                12.617138
10889                10.638367
10890                 9.983967
10891                10.696844
10892                11.581066
10893                10.555311
10894                11.669039
10895                12.640522
10896                12.483518
10897                11.379090
10898                13.814301
10899                13.414285
10900                14.341164
10901                10.018767
10902                10.003643
10903                 8.951995
10904                 8.175599
10905                11.202010
10906                 8.927318
10907                 8.057576
10908                 6.731700
10909                 5.800159
10910   

In [8]:
#AA_VOLUME (A^3) ('http://www.imgt.org/IMGTeducation/Aide-memoire/_UK/aminoacids/abbreviation.html')
#AA_MW (g/mol) ('https://www.thermofisher.com/uk/en/home/references/ambion-tech-support/rna-tools-and-calculators/proteins-and-amino-acids.html')
#AA_hydropathy index ('https://doi.org/10.1016/0022-2836(82)90515-0')
#AA_Pi ('https://www.sigmaaldrich.com/life-science/metabolomics/learning-center/amino-acid-reference-chart.html')

def aa_volume():
    aa_volumes = {'A': 88.6, 'R': 173.4, 'N': 114.1, 'D': 111.1, 'C': 108.5,
                 'Q': 143.8, 'E': 138.4, 'G': 60.1, 'H': 153.2, 'I': 166.7,
                 'L': 166.7, 'K': 168.6, 'M': 162.9, 'F': 189.9, 'P': 112.7,
                 'S': 89.0, 'T': 116.1, 'W': 227.8, 'Y': 193.6, 'V': 140.0}
    return aa_volumes


def MW():
    aa_MW = {'A': 89.1, 'R': 174.2, 'N': 132.1, 'D': 133.1, 'C': 121.1,
             'E': 147.1, 'Q': 146.2, 'G':75.1, 'H': 155.2, 'I': 131.2,
             'L': 131.2, 'K': 146.2, 'M': 149.2, 'F': 165.2, 'P': 115.1,
             'S': 105.1, 'T': 119.1, 'W': 204.2, 'Y': 181.2, 'V': 117.1}
    return aa_MW


def hydropathy():
    aa_hydropathy_index = {'A': 1.8, 'R': -4.5, 'N': -3.5, 'D': -3.5,
                           'C': 2.5, 'E': -3.5, 'Q': -3.5, 'G': -0.4,
                           'H': -3.2, 'I': 4.5, 'L': 3.8, 'K': -3.9,
                           'M': 1.9, 'F': 2.8, 'P': -1.6, 'S': -0.8, 
                           'T': -0.7, 'W': -0.9, 'Y': -1.3, 'V': 4.2}
    return aa_hydropathy_index


def Pi():
    
    aa_Pi = {'A': 6, 'R': 10.76, 'N': 5.41, 'D': 2.77, 'C': 5.07, 'E': 3.22,
             'Q': 5.65, 'G': 5.97, 'H': 7.59, 'I': 6.02, 'L': 5.98, 'K': 9.74,
             'M': 5.74, 'F': 5.48, 'P': 6.3, 'S': 5.68, 'T': 5.6, 'W': 5.89,
             'Y': 5.66, 'V': 5.96}
    return aa_Pi

In [9]:
def d_chem_features():
    '''calculates change in biochemical feature for every possible mutation'''
    
    #loads in aa attributes
    volume_dict = aa_volume()
    MW_dict = MW()
    hydropathy_dict = hydropathy()
    Pi_dict = Pi()
    
    #creates df from attribute function data
    df = pd.DataFrame.from_dict([volume_dict, MW_dict, hydropathy_dict, Pi_dict]).T.reset_index(0)
    df.rename(columns = {'index':'Amino_acid', 0:'volume', 1:'MW', 2:'hydropathy',
                        3:'Pi'}, inplace=True)
    
    #creates dicts for delta attributes from df
    d_volume_dict, d_Pi_dict, d_MW_dict, d_hydropathy_dict = {}, {}, {}, {}
    for i in df.index:
        for j in df.index:
            mutation = df['Amino_acid'][i] + df['Amino_acid'][j]
            d_volume_dict[mutation] = df['volume'][i] - df['volume'][j]
            d_MW_dict[mutation] = df['MW'][i] - df['MW'][j]
            d_hydropathy_dict[mutation] = df['hydropathy'][i] - df['hydropathy'][j]
            d_Pi_dict[mutation] = df['Pi'][i] - df['Pi'][j]        
    
    #creates df from delta dictionaries
    d_chem_df = pd.DataFrame.from_dict([d_volume_dict, d_MW_dict, d_hydropathy_dict, d_Pi_dict]).T.reset_index(0)
    d_chem_df.rename(columns = {'index':'dAA', 0:'d_volume', 1:'d_MW', 2:'d_hydropathy',
                        3:'d_Pi'}, inplace=True)
    return d_chem_df

In [10]:
d_chem_features()

Unnamed: 0,dAA,d_volume,d_MW,d_hydropathy,d_Pi
0,AA,0.0,0.0,0.0,0.00
1,AR,-84.8,-85.1,6.3,-4.76
2,AN,-25.5,-43.0,5.3,0.59
3,AD,-22.5,-44.0,5.3,3.23
4,AC,-19.9,-32.0,-0.7,0.93
...,...,...,...,...,...
395,VS,51.0,12.0,5.0,0.28
396,VT,23.9,-2.0,4.9,0.36
397,VW,-87.8,-87.1,5.1,0.07
398,VY,-53.6,-64.1,5.5,0.30


In [11]:
#extracts KatG seq
from Bio import SeqIO
for record in SeqIO.parse("rcsb_pdb_7AG8.fasta","fasta"):
    print(record.id)
records = SeqIO.parse("rcsb_pdb_7AG8.fasta","fasta")

7AG8_1|Chains


In [12]:
KatG_seq = str(record.seq)
aa_list = ['A','C','D','E','F','G','H','I','K','L','M','N','P','Q','R','S','T','V','W','Y']

In [13]:
#loop to find identify all KatG mutations 
resnum = 0
KatG_mutations = []

for i in KatG_seq:
    print(i)
    resnum = resnum + 1
    print(resnum)
    for aa in aa_list:
        if aa == i:
            print('no mutation')
        else:
            print(i, resnum, aa)
            ia, resnuma, aaa = str(i), str(resnum), str(aa)
            df_add = (ia+resnuma+aaa)
            KatG_mutations.append(df_add)

M
1
M 1 A
M 1 C
M 1 D
M 1 E
M 1 F
M 1 G
M 1 H
M 1 I
M 1 K
M 1 L
no mutation
M 1 N
M 1 P
M 1 Q
M 1 R
M 1 S
M 1 T
M 1 V
M 1 W
M 1 Y
P
2
P 2 A
P 2 C
P 2 D
P 2 E
P 2 F
P 2 G
P 2 H
P 2 I
P 2 K
P 2 L
P 2 M
P 2 N
no mutation
P 2 Q
P 2 R
P 2 S
P 2 T
P 2 V
P 2 W
P 2 Y
E
3
E 3 A
E 3 C
E 3 D
no mutation
E 3 F
E 3 G
E 3 H
E 3 I
E 3 K
E 3 L
E 3 M
E 3 N
E 3 P
E 3 Q
E 3 R
E 3 S
E 3 T
E 3 V
E 3 W
E 3 Y
Q
4
Q 4 A
Q 4 C
Q 4 D
Q 4 E
Q 4 F
Q 4 G
Q 4 H
Q 4 I
Q 4 K
Q 4 L
Q 4 M
Q 4 N
Q 4 P
no mutation
Q 4 R
Q 4 S
Q 4 T
Q 4 V
Q 4 W
Q 4 Y
H
5
H 5 A
H 5 C
H 5 D
H 5 E
H 5 F
H 5 G
no mutation
H 5 I
H 5 K
H 5 L
H 5 M
H 5 N
H 5 P
H 5 Q
H 5 R
H 5 S
H 5 T
H 5 V
H 5 W
H 5 Y
P
6
P 6 A
P 6 C
P 6 D
P 6 E
P 6 F
P 6 G
P 6 H
P 6 I
P 6 K
P 6 L
P 6 M
P 6 N
no mutation
P 6 Q
P 6 R
P 6 S
P 6 T
P 6 V
P 6 W
P 6 Y
P
7
P 7 A
P 7 C
P 7 D
P 7 E
P 7 F
P 7 G
P 7 H
P 7 I
P 7 K
P 7 L
P 7 M
P 7 N
no mutation
P 7 Q
P 7 R
P 7 S
P 7 T
P 7 V
P 7 W
P 7 Y
I
8
I 8 A
I 8 C
I 8 D
I 8 E
I 8 F
I 8 G
I 8 H
no mutation
I 8 K
I 8 L
I 8 M
I 8 N
I 8 P
I 

Y 64 L
Y 64 M
Y 64 N
Y 64 P
Y 64 Q
Y 64 R
Y 64 S
Y 64 T
Y 64 V
Y 64 W
no mutation
A
65
no mutation
A 65 C
A 65 D
A 65 E
A 65 F
A 65 G
A 65 H
A 65 I
A 65 K
A 65 L
A 65 M
A 65 N
A 65 P
A 65 Q
A 65 R
A 65 S
A 65 T
A 65 V
A 65 W
A 65 Y
A
66
no mutation
A 66 C
A 66 D
A 66 E
A 66 F
A 66 G
A 66 H
A 66 I
A 66 K
A 66 L
A 66 M
A 66 N
A 66 P
A 66 Q
A 66 R
A 66 S
A 66 T
A 66 V
A 66 W
A 66 Y
E
67
E 67 A
E 67 C
E 67 D
no mutation
E 67 F
E 67 G
E 67 H
E 67 I
E 67 K
E 67 L
E 67 M
E 67 N
E 67 P
E 67 Q
E 67 R
E 67 S
E 67 T
E 67 V
E 67 W
E 67 Y
V
68
V 68 A
V 68 C
V 68 D
V 68 E
V 68 F
V 68 G
V 68 H
V 68 I
V 68 K
V 68 L
V 68 M
V 68 N
V 68 P
V 68 Q
V 68 R
V 68 S
V 68 T
no mutation
V 68 W
V 68 Y
A
69
no mutation
A 69 C
A 69 D
A 69 E
A 69 F
A 69 G
A 69 H
A 69 I
A 69 K
A 69 L
A 69 M
A 69 N
A 69 P
A 69 Q
A 69 R
A 69 S
A 69 T
A 69 V
A 69 W
A 69 Y
T
70
T 70 A
T 70 C
T 70 D
T 70 E
T 70 F
T 70 G
T 70 H
T 70 I
T 70 K
T 70 L
T 70 M
T 70 N
T 70 P
T 70 Q
T 70 R
T 70 S
no mutation
T 70 V
T 70 W
T 70 Y
I
71
I 71 A
I 71 C

A 130 G
A 130 H
A 130 I
A 130 K
A 130 L
A 130 M
A 130 N
A 130 P
A 130 Q
A 130 R
A 130 S
A 130 T
A 130 V
A 130 W
A 130 Y
P
131
P 131 A
P 131 C
P 131 D
P 131 E
P 131 F
P 131 G
P 131 H
P 131 I
P 131 K
P 131 L
P 131 M
P 131 N
no mutation
P 131 Q
P 131 R
P 131 S
P 131 T
P 131 V
P 131 W
P 131 Y
L
132
L 132 A
L 132 C
L 132 D
L 132 E
L 132 F
L 132 G
L 132 H
L 132 I
L 132 K
no mutation
L 132 M
L 132 N
L 132 P
L 132 Q
L 132 R
L 132 S
L 132 T
L 132 V
L 132 W
L 132 Y
N
133
N 133 A
N 133 C
N 133 D
N 133 E
N 133 F
N 133 G
N 133 H
N 133 I
N 133 K
N 133 L
N 133 M
no mutation
N 133 P
N 133 Q
N 133 R
N 133 S
N 133 T
N 133 V
N 133 W
N 133 Y
S
134
S 134 A
S 134 C
S 134 D
S 134 E
S 134 F
S 134 G
S 134 H
S 134 I
S 134 K
S 134 L
S 134 M
S 134 N
S 134 P
S 134 Q
S 134 R
no mutation
S 134 T
S 134 V
S 134 W
S 134 Y
W
135
W 135 A
W 135 C
W 135 D
W 135 E
W 135 F
W 135 G
W 135 H
W 135 I
W 135 K
W 135 L
W 135 M
W 135 N
W 135 P
W 135 Q
W 135 R
W 135 S
W 135 T
W 135 V
no mutation
W 135 Y
P
136
P 136 A
P 136 C
P 136 D


F 181 D
F 181 E
no mutation
F 181 G
F 181 H
F 181 I
F 181 K
F 181 L
F 181 M
F 181 N
F 181 P
F 181 Q
F 181 R
F 181 S
F 181 T
F 181 V
F 181 W
F 181 Y
G
182
G 182 A
G 182 C
G 182 D
G 182 E
G 182 F
no mutation
G 182 H
G 182 I
G 182 K
G 182 L
G 182 M
G 182 N
G 182 P
G 182 Q
G 182 R
G 182 S
G 182 T
G 182 V
G 182 W
G 182 Y
F
183
F 183 A
F 183 C
F 183 D
F 183 E
no mutation
F 183 G
F 183 H
F 183 I
F 183 K
F 183 L
F 183 M
F 183 N
F 183 P
F 183 Q
F 183 R
F 183 S
F 183 T
F 183 V
F 183 W
F 183 Y
G
184
G 184 A
G 184 C
G 184 D
G 184 E
G 184 F
no mutation
G 184 H
G 184 I
G 184 K
G 184 L
G 184 M
G 184 N
G 184 P
G 184 Q
G 184 R
G 184 S
G 184 T
G 184 V
G 184 W
G 184 Y
F
185
F 185 A
F 185 C
F 185 D
F 185 E
no mutation
F 185 G
F 185 H
F 185 I
F 185 K
F 185 L
F 185 M
F 185 N
F 185 P
F 185 Q
F 185 R
F 185 S
F 185 T
F 185 V
F 185 W
F 185 Y
G
186
G 186 A
G 186 C
G 186 D
G 186 E
G 186 F
no mutation
G 186 H
G 186 I
G 186 K
G 186 L
G 186 M
G 186 N
G 186 P
G 186 Q
G 186 R
G 186 S
G 186 T
G 186 V
G 186 W
G 186 Y
R


Y 229 W
no mutation
V
230
V 230 A
V 230 C
V 230 D
V 230 E
V 230 F
V 230 G
V 230 H
V 230 I
V 230 K
V 230 L
V 230 M
V 230 N
V 230 P
V 230 Q
V 230 R
V 230 S
V 230 T
no mutation
V 230 W
V 230 Y
N
231
N 231 A
N 231 C
N 231 D
N 231 E
N 231 F
N 231 G
N 231 H
N 231 I
N 231 K
N 231 L
N 231 M
no mutation
N 231 P
N 231 Q
N 231 R
N 231 S
N 231 T
N 231 V
N 231 W
N 231 Y
P
232
P 232 A
P 232 C
P 232 D
P 232 E
P 232 F
P 232 G
P 232 H
P 232 I
P 232 K
P 232 L
P 232 M
P 232 N
no mutation
P 232 Q
P 232 R
P 232 S
P 232 T
P 232 V
P 232 W
P 232 Y
E
233
E 233 A
E 233 C
E 233 D
no mutation
E 233 F
E 233 G
E 233 H
E 233 I
E 233 K
E 233 L
E 233 M
E 233 N
E 233 P
E 233 Q
E 233 R
E 233 S
E 233 T
E 233 V
E 233 W
E 233 Y
G
234
G 234 A
G 234 C
G 234 D
G 234 E
G 234 F
no mutation
G 234 H
G 234 I
G 234 K
G 234 L
G 234 M
G 234 N
G 234 P
G 234 Q
G 234 R
G 234 S
G 234 T
G 234 V
G 234 W
G 234 Y
P
235
P 235 A
P 235 C
P 235 D
P 235 E
P 235 F
P 235 G
P 235 H
P 235 I
P 235 K
P 235 L
P 235 M
P 235 N
no mutation
P 235 Q
P 235 R


W 300 C
W 300 D
W 300 E
W 300 F
W 300 G
W 300 H
W 300 I
W 300 K
W 300 L
W 300 M
W 300 N
W 300 P
W 300 Q
W 300 R
W 300 S
W 300 T
W 300 V
no mutation
W 300 Y
K
301
K 301 A
K 301 C
K 301 D
K 301 E
K 301 F
K 301 G
K 301 H
K 301 I
no mutation
K 301 L
K 301 M
K 301 N
K 301 P
K 301 Q
K 301 R
K 301 S
K 301 T
K 301 V
K 301 W
K 301 Y
S
302
S 302 A
S 302 C
S 302 D
S 302 E
S 302 F
S 302 G
S 302 H
S 302 I
S 302 K
S 302 L
S 302 M
S 302 N
S 302 P
S 302 Q
S 302 R
no mutation
S 302 T
S 302 V
S 302 W
S 302 Y
S
303
S 303 A
S 303 C
S 303 D
S 303 E
S 303 F
S 303 G
S 303 H
S 303 I
S 303 K
S 303 L
S 303 M
S 303 N
S 303 P
S 303 Q
S 303 R
no mutation
S 303 T
S 303 V
S 303 W
S 303 Y
Y
304
Y 304 A
Y 304 C
Y 304 D
Y 304 E
Y 304 F
Y 304 G
Y 304 H
Y 304 I
Y 304 K
Y 304 L
Y 304 M
Y 304 N
Y 304 P
Y 304 Q
Y 304 R
Y 304 S
Y 304 T
Y 304 V
Y 304 W
no mutation
G
305
G 305 A
G 305 C
G 305 D
G 305 E
G 305 F
no mutation
G 305 H
G 305 I
G 305 K
G 305 L
G 305 M
G 305 N
G 305 P
G 305 Q
G 305 R
G 305 S
G 305 T
G 305 V
G 305 W
G 

G 358 P
G 358 Q
G 358 R
G 358 S
G 358 T
G 358 V
G 358 W
G 358 Y
A
359
no mutation
A 359 C
A 359 D
A 359 E
A 359 F
A 359 G
A 359 H
A 359 I
A 359 K
A 359 L
A 359 M
A 359 N
A 359 P
A 359 Q
A 359 R
A 359 S
A 359 T
A 359 V
A 359 W
A 359 Y
G
360
G 360 A
G 360 C
G 360 D
G 360 E
G 360 F
no mutation
G 360 H
G 360 I
G 360 K
G 360 L
G 360 M
G 360 N
G 360 P
G 360 Q
G 360 R
G 360 S
G 360 T
G 360 V
G 360 W
G 360 Y
A
361
no mutation
A 361 C
A 361 D
A 361 E
A 361 F
A 361 G
A 361 H
A 361 I
A 361 K
A 361 L
A 361 M
A 361 N
A 361 P
A 361 Q
A 361 R
A 361 S
A 361 T
A 361 V
A 361 W
A 361 Y
G
362
G 362 A
G 362 C
G 362 D
G 362 E
G 362 F
no mutation
G 362 H
G 362 I
G 362 K
G 362 L
G 362 M
G 362 N
G 362 P
G 362 Q
G 362 R
G 362 S
G 362 T
G 362 V
G 362 W
G 362 Y
T
363
T 363 A
T 363 C
T 363 D
T 363 E
T 363 F
T 363 G
T 363 H
T 363 I
T 363 K
T 363 L
T 363 M
T 363 N
T 363 P
T 363 Q
T 363 R
T 363 S
no mutation
T 363 V
T 363 W
T 363 Y
I
364
I 364 A
I 364 C
I 364 D
I 364 E
I 364 F
I 364 G
I 364 H
no mutation
I 364 K
I 36

420
M 420 A
M 420 C
M 420 D
M 420 E
M 420 F
M 420 G
M 420 H
M 420 I
M 420 K
M 420 L
no mutation
M 420 N
M 420 P
M 420 Q
M 420 R
M 420 S
M 420 T
M 420 V
M 420 W
M 420 Y
G
421
G 421 A
G 421 C
G 421 D
G 421 E
G 421 F
no mutation
G 421 H
G 421 I
G 421 K
G 421 L
G 421 M
G 421 N
G 421 P
G 421 Q
G 421 R
G 421 S
G 421 T
G 421 V
G 421 W
G 421 Y
P
422
P 422 A
P 422 C
P 422 D
P 422 E
P 422 F
P 422 G
P 422 H
P 422 I
P 422 K
P 422 L
P 422 M
P 422 N
no mutation
P 422 Q
P 422 R
P 422 S
P 422 T
P 422 V
P 422 W
P 422 Y
V
423
V 423 A
V 423 C
V 423 D
V 423 E
V 423 F
V 423 G
V 423 H
V 423 I
V 423 K
V 423 L
V 423 M
V 423 N
V 423 P
V 423 Q
V 423 R
V 423 S
V 423 T
no mutation
V 423 W
V 423 Y
A
424
no mutation
A 424 C
A 424 D
A 424 E
A 424 F
A 424 G
A 424 H
A 424 I
A 424 K
A 424 L
A 424 M
A 424 N
A 424 P
A 424 Q
A 424 R
A 424 S
A 424 T
A 424 V
A 424 W
A 424 Y
R
425
R 425 A
R 425 C
R 425 D
R 425 E
R 425 F
R 425 G
R 425 H
R 425 I
R 425 K
R 425 L
R 425 M
R 425 N
R 425 P
R 425 Q
no mutation
R 425 S
R 425 T
R 425 

S 481 F
S 481 G
S 481 H
S 481 I
S 481 K
S 481 L
S 481 M
S 481 N
S 481 P
S 481 Q
S 481 R
no mutation
S 481 T
S 481 V
S 481 W
S 481 Y
S
482
S 482 A
S 482 C
S 482 D
S 482 E
S 482 F
S 482 G
S 482 H
S 482 I
S 482 K
S 482 L
S 482 M
S 482 N
S 482 P
S 482 Q
S 482 R
no mutation
S 482 T
S 482 V
S 482 W
S 482 Y
F
483
F 483 A
F 483 C
F 483 D
F 483 E
no mutation
F 483 G
F 483 H
F 483 I
F 483 K
F 483 L
F 483 M
F 483 N
F 483 P
F 483 Q
F 483 R
F 483 S
F 483 T
F 483 V
F 483 W
F 483 Y
R
484
R 484 A
R 484 C
R 484 D
R 484 E
R 484 F
R 484 G
R 484 H
R 484 I
R 484 K
R 484 L
R 484 M
R 484 N
R 484 P
R 484 Q
no mutation
R 484 S
R 484 T
R 484 V
R 484 W
R 484 Y
G
485
G 485 A
G 485 C
G 485 D
G 485 E
G 485 F
no mutation
G 485 H
G 485 I
G 485 K
G 485 L
G 485 M
G 485 N
G 485 P
G 485 Q
G 485 R
G 485 S
G 485 T
G 485 V
G 485 W
G 485 Y
S
486
S 486 A
S 486 C
S 486 D
S 486 E
S 486 F
S 486 G
S 486 H
S 486 I
S 486 K
S 486 L
S 486 M
S 486 N
S 486 P
S 486 Q
S 486 R
no mutation
S 486 T
S 486 V
S 486 W
S 486 Y
D
487
D 487 A
D 48

F 540 G
F 540 H
F 540 I
F 540 K
F 540 L
F 540 M
F 540 N
F 540 P
F 540 Q
F 540 R
F 540 S
F 540 T
F 540 V
F 540 W
F 540 Y
A
541
no mutation
A 541 C
A 541 D
A 541 E
A 541 F
A 541 G
A 541 H
A 541 I
A 541 K
A 541 L
A 541 M
A 541 N
A 541 P
A 541 Q
A 541 R
A 541 S
A 541 T
A 541 V
A 541 W
A 541 Y
D
542
D 542 A
D 542 C
no mutation
D 542 E
D 542 F
D 542 G
D 542 H
D 542 I
D 542 K
D 542 L
D 542 M
D 542 N
D 542 P
D 542 Q
D 542 R
D 542 S
D 542 T
D 542 V
D 542 W
D 542 Y
L
543
L 543 A
L 543 C
L 543 D
L 543 E
L 543 F
L 543 G
L 543 H
L 543 I
L 543 K
no mutation
L 543 M
L 543 N
L 543 P
L 543 Q
L 543 R
L 543 S
L 543 T
L 543 V
L 543 W
L 543 Y
V
544
V 544 A
V 544 C
V 544 D
V 544 E
V 544 F
V 544 G
V 544 H
V 544 I
V 544 K
V 544 L
V 544 M
V 544 N
V 544 P
V 544 Q
V 544 R
V 544 S
V 544 T
no mutation
V 544 W
V 544 Y
V
545
V 545 A
V 545 C
V 545 D
V 545 E
V 545 F
V 545 G
V 545 H
V 545 I
V 545 K
V 545 L
V 545 M
V 545 N
V 545 P
V 545 Q
V 545 R
V 545 S
V 545 T
no mutation
V 545 W
V 545 Y
L
546
L 546 A
L 546 C
L 546 D


K
600
K 600 A
K 600 C
K 600 D
K 600 E
K 600 F
K 600 G
K 600 H
K 600 I
no mutation
K 600 L
K 600 M
K 600 N
K 600 P
K 600 Q
K 600 R
K 600 S
K 600 T
K 600 V
K 600 W
K 600 Y
G
601
G 601 A
G 601 C
G 601 D
G 601 E
G 601 F
no mutation
G 601 H
G 601 I
G 601 K
G 601 L
G 601 M
G 601 N
G 601 P
G 601 Q
G 601 R
G 601 S
G 601 T
G 601 V
G 601 W
G 601 Y
N
602
N 602 A
N 602 C
N 602 D
N 602 E
N 602 F
N 602 G
N 602 H
N 602 I
N 602 K
N 602 L
N 602 M
no mutation
N 602 P
N 602 Q
N 602 R
N 602 S
N 602 T
N 602 V
N 602 W
N 602 Y
P
603
P 603 A
P 603 C
P 603 D
P 603 E
P 603 F
P 603 G
P 603 H
P 603 I
P 603 K
P 603 L
P 603 M
P 603 N
no mutation
P 603 Q
P 603 R
P 603 S
P 603 T
P 603 V
P 603 W
P 603 Y
L
604
L 604 A
L 604 C
L 604 D
L 604 E
L 604 F
L 604 G
L 604 H
L 604 I
L 604 K
no mutation
L 604 M
L 604 N
L 604 P
L 604 Q
L 604 R
L 604 S
L 604 T
L 604 V
L 604 W
L 604 Y
P
605
P 605 A
P 605 C
P 605 D
P 605 E
P 605 F
P 605 G
P 605 H
P 605 I
P 605 K
P 605 L
P 605 M
P 605 N
no mutation
P 605 Q
P 605 R
P 605 S
P 605 T
P 60

N
660
N 660 A
N 660 C
N 660 D
N 660 E
N 660 F
N 660 G
N 660 H
N 660 I
N 660 K
N 660 L
N 660 M
no mutation
N 660 P
N 660 Q
N 660 R
N 660 S
N 660 T
N 660 V
N 660 W
N 660 Y
L
661
L 661 A
L 661 C
L 661 D
L 661 E
L 661 F
L 661 G
L 661 H
L 661 I
L 661 K
no mutation
L 661 M
L 661 N
L 661 P
L 661 Q
L 661 R
L 661 S
L 661 T
L 661 V
L 661 W
L 661 Y
L
662
L 662 A
L 662 C
L 662 D
L 662 E
L 662 F
L 662 G
L 662 H
L 662 I
L 662 K
no mutation
L 662 M
L 662 N
L 662 P
L 662 Q
L 662 R
L 662 S
L 662 T
L 662 V
L 662 W
L 662 Y
D
663
D 663 A
D 663 C
no mutation
D 663 E
D 663 F
D 663 G
D 663 H
D 663 I
D 663 K
D 663 L
D 663 M
D 663 N
D 663 P
D 663 Q
D 663 R
D 663 S
D 663 T
D 663 V
D 663 W
D 663 Y
M
664
M 664 A
M 664 C
M 664 D
M 664 E
M 664 F
M 664 G
M 664 H
M 664 I
M 664 K
M 664 L
no mutation
M 664 N
M 664 P
M 664 Q
M 664 R
M 664 S
M 664 T
M 664 V
M 664 W
M 664 Y
G
665
G 665 A
G 665 C
G 665 D
G 665 E
G 665 F
no mutation
G 665 H
G 665 I
G 665 K
G 665 L
G 665 M
G 665 N
G 665 P
G 665 Q
G 665 R
G 665 S
G 665 T
G 66

V 721 A
V 721 C
V 721 D
V 721 E
V 721 F
V 721 G
V 721 H
V 721 I
V 721 K
V 721 L
V 721 M
V 721 N
V 721 P
V 721 Q
V 721 R
V 721 S
V 721 T
no mutation
V 721 W
V 721 Y
Q
722
Q 722 A
Q 722 C
Q 722 D
Q 722 E
Q 722 F
Q 722 G
Q 722 H
Q 722 I
Q 722 K
Q 722 L
Q 722 M
Q 722 N
Q 722 P
no mutation
Q 722 R
Q 722 S
Q 722 T
Q 722 V
Q 722 W
Q 722 Y
D
723
D 723 A
D 723 C
no mutation
D 723 E
D 723 F
D 723 G
D 723 H
D 723 I
D 723 K
D 723 L
D 723 M
D 723 N
D 723 P
D 723 Q
D 723 R
D 723 S
D 723 T
D 723 V
D 723 W
D 723 Y
F
724
F 724 A
F 724 C
F 724 D
F 724 E
no mutation
F 724 G
F 724 H
F 724 I
F 724 K
F 724 L
F 724 M
F 724 N
F 724 P
F 724 Q
F 724 R
F 724 S
F 724 T
F 724 V
F 724 W
F 724 Y
V
725
V 725 A
V 725 C
V 725 D
V 725 E
V 725 F
V 725 G
V 725 H
V 725 I
V 725 K
V 725 L
V 725 M
V 725 N
V 725 P
V 725 Q
V 725 R
V 725 S
V 725 T
no mutation
V 725 W
V 725 Y
A
726
no mutation
A 726 C
A 726 D
A 726 E
A 726 F
A 726 G
A 726 H
A 726 I
A 726 K
A 726 L
A 726 M
A 726 N
A 726 P
A 726 Q
A 726 R
A 726 S
A 726 T
A 726 V
A 

In [14]:
#dataframe with all possible KatG chain A mutations 
data = {'KatG mutations': KatG_mutations}
KatG_mutations_df = pd.DataFrame (data, columns = ['KatG mutations'])
print(KatG_mutations_df)


      KatG mutations
0                M1A
1                M1C
2                M1D
3                M1E
4                M1F
...              ...
14055          R740S
14056          R740T
14057          R740V
14058          R740W
14059          R740Y

[14060 rows x 1 columns]


In [15]:
all_mutations = (d_chem_features().dAA)
for i in all_mutations:
    print(str(i))

AA
AR
AN
AD
AC
AQ
AE
AG
AH
AI
AL
AK
AM
AF
AP
AS
AT
AW
AY
AV
RA
RR
RN
RD
RC
RQ
RE
RG
RH
RI
RL
RK
RM
RF
RP
RS
RT
RW
RY
RV
NA
NR
NN
ND
NC
NQ
NE
NG
NH
NI
NL
NK
NM
NF
NP
NS
NT
NW
NY
NV
DA
DR
DN
DD
DC
DQ
DE
DG
DH
DI
DL
DK
DM
DF
DP
DS
DT
DW
DY
DV
CA
CR
CN
CD
CC
CQ
CE
CG
CH
CI
CL
CK
CM
CF
CP
CS
CT
CW
CY
CV
QA
QR
QN
QD
QC
QQ
QE
QG
QH
QI
QL
QK
QM
QF
QP
QS
QT
QW
QY
QV
EA
ER
EN
ED
EC
EQ
EE
EG
EH
EI
EL
EK
EM
EF
EP
ES
ET
EW
EY
EV
GA
GR
GN
GD
GC
GQ
GE
GG
GH
GI
GL
GK
GM
GF
GP
GS
GT
GW
GY
GV
HA
HR
HN
HD
HC
HQ
HE
HG
HH
HI
HL
HK
HM
HF
HP
HS
HT
HW
HY
HV
IA
IR
IN
ID
IC
IQ
IE
IG
IH
II
IL
IK
IM
IF
IP
IS
IT
IW
IY
IV
LA
LR
LN
LD
LC
LQ
LE
LG
LH
LI
LL
LK
LM
LF
LP
LS
LT
LW
LY
LV
KA
KR
KN
KD
KC
KQ
KE
KG
KH
KI
KL
KK
KM
KF
KP
KS
KT
KW
KY
KV
MA
MR
MN
MD
MC
MQ
ME
MG
MH
MI
ML
MK
MM
MF
MP
MS
MT
MW
MY
MV
FA
FR
FN
FD
FC
FQ
FE
FG
FH
FI
FL
FK
FM
FF
FP
FS
FT
FW
FY
FV
PA
PR
PN
PD
PC
PQ
PE
PG
PH
PI
PL
PK
PM
PF
PP
PS
PT
PW
PY
PV
SA
SR
SN
SD
SC
SQ
SE
SG
SH
SI
SL
SK
SM
SF
SP
SS
ST
SW
SY
SV
TA
TR
TN
TD
TC
TQ
TE
TG
TH
TI
TL
TK
TM
T

In [16]:
KatG_mutations_df['KatG mutations']

0          M1A
1          M1C
2          M1D
3          M1E
4          M1F
         ...  
14055    R740S
14056    R740T
14057    R740V
14058    R740W
14059    R740Y
Name: KatG mutations, Length: 14060, dtype: object

In [17]:
#identifies the centre of mass of each residue in chain A
resid_no = 0
residue_no_A = []
residue_CoM_A = []
chain_A = []

for i in KatG_seq:
    resid_no = resid_no + 1
    residue_no_A.append(i + str(resid_no))
    resid_no_string = str(resid_no)
    select_atoms_string = 'resid '+ resid_no_string + ' and segid A'
    atom_coordinates = KatG.select_atoms(select_atoms_string)
    residue_CoM_A.append(atom_coordinates.center_of_mass())
    chain_A.append('A')
    
data = {'Residue': residue_no_A, 'Chain': chain_A, 'Centre of mass': residue_CoM_A}
residue_CoM_A_df = pd.DataFrame (data, columns = ['Residue','Chain','Centre of mass'])
print(residue_CoM_A_df)


    Residue Chain                                     Centre of mass
0        M1     A                                                 []
1        P2     A                                                 []
2        E3     A                                                 []
3        Q4     A                                                 []
4        H5     A                                                 []
..      ...   ...                                                ...
735    R736     A  [101.71184776487075, 134.8153974993157, 136.44...
736    F737     A  [104.94849311732212, 129.8637529291616, 138.16...
737    D738     A  [99.68625343362409, 132.32714759696879, 139.98...
738    V739     A  [98.00821963702562, 131.03244203077924, 136.44...
739    R740     A  [101.54049856367341, 128.31209317416, 134.9765...

[740 rows x 3 columns]


In [18]:
#identifies the centre of mass of each residue in chain A
resid_no = 0
residue_no_B = []
residue_CoM_B = []
chain_B = []

for i in KatG_seq:
    resid_no = resid_no + 1
    residue_no_B.append(i + str(resid_no))
    resid_no_string = str(resid_no)
    select_atoms_string = 'resid '+ resid_no_string + ' and segid B'
    atom_coordinates = KatG.select_atoms(select_atoms_string)
    residue_CoM_B.append(atom_coordinates.center_of_mass())
    chain_B.append('B')
    
data = {'Residue': residue_no_B, 'Chain': chain_B, 'Centre of mass': residue_CoM_B}
residue_CoM_B_df = pd.DataFrame (data, columns = ['Residue','Chain','Centre of mass'])
print(residue_CoM_B_df)

    Residue Chain                                     Centre of mass
0        M1     B                                                 []
1        P2     B                                                 []
2        E3     B                                                 []
3        Q4     B                                                 []
4        H5     B                                                 []
..      ...   ...                                                ...
735    R736     B  [102.56951717105538, 106.68485590620568, 102.0...
736    F737     B  [106.1688799400712, 111.39674823454799, 100.59...
737    D738     B  [100.97355567927137, 109.2970553620259, 98.230...
738    V739     B  [99.15080305467927, 110.36249033674973, 101.78...
739    R740     B  [102.86387110467739, 112.88535360473507, 103.8...

[740 rows x 3 columns]


In [19]:
# adds the chain A and chain B dataframes together
residue_CoM_df = residue_CoM_A_df.append(residue_CoM_B_df, ignore_index = True)

In [20]:
residue_CoM_df

Unnamed: 0,Residue,Chain,Centre of mass
0,M1,A,[]
1,P2,A,[]
2,E3,A,[]
3,Q4,A,[]
4,H5,A,[]
...,...,...,...
1475,R736,B,"[102.56951717105538, 106.68485590620568, 102.0..."
1476,F737,B,"[106.1688799400712, 111.39674823454799, 100.59..."
1477,D738,B,"[100.97355567927137, 109.2970553620259, 98.230..."
1478,V739,B,"[99.15080305467927, 110.36249033674973, 101.78..."


In [21]:
#adds a new column for distance from site 1 for each residue in that given chain
Site1_coord = []
for i in residue_CoM_df['Chain']:
    if (i == 'A'):
        Site1_coord.append(Site1_A_true_CoM)
    else:
        Site1_coord.append(Site1_B_true_CoM)
residue_CoM_df['Site 1 Coordinates'] = Site1_coord

Site1_dist = []
Site1_dist_no = -1

for i in residue_CoM_df['Centre of mass']:
    Site1_dist_no = Site1_dist_no + 1 
    chain = residue_CoM_df.loc[Site1_dist_no,'Chain']
    CoM = residue_CoM_df.loc[Site1_dist_no,'Centre of mass']
    if len(CoM) == 0:
        Site1_dist.append('No centre of mass')
    else:
        if chain == 'A':
            dist = numpy.linalg.norm(Site1_A_CoM-i)
            Site1_dist.append(dist)
        else:
            dist = numpy.linalg.norm(Site1_B_CoM-i)
            Site1_dist.append(dist)
            
residue_CoM_df['Distance from Site 1'] = Site1_dist
residue_CoM_df 

Unnamed: 0,Residue,Chain,Centre of mass,Site 1 Coordinates,Distance from Site 1
0,M1,A,[],"[135.5568728593392, 113.46202070912435, 148.68...",No centre of mass
1,P2,A,[],"[135.5568728593392, 113.46202070912435, 148.68...",No centre of mass
2,E3,A,[],"[135.5568728593392, 113.46202070912435, 148.68...",No centre of mass
3,Q4,A,[],"[135.5568728593392, 113.46202070912435, 148.68...",No centre of mass
4,H5,A,[],"[135.5568728593392, 113.46202070912435, 148.68...",No centre of mass
...,...,...,...,...,...
1475,R736,B,"[102.56951717105538, 106.68485590620568, 102.0...","[137.7765238725133, 126.92174901271466, 92.186...",41.195753
1476,F737,B,"[106.1688799400712, 111.39674823454799, 100.59...","[137.7765238725133, 126.92174901271466, 92.186...",35.615301
1477,D738,B,"[100.97355567927137, 109.2970553620259, 98.230...","[137.7765238725133, 126.92174901271466, 92.186...",40.836819
1478,V739,B,"[99.15080305467927, 110.36249033674973, 101.78...","[137.7765238725133, 126.92174901271466, 92.186...",42.519218


In [22]:
d_chem_features()

Unnamed: 0,dAA,d_volume,d_MW,d_hydropathy,d_Pi
0,AA,0.0,0.0,0.0,0.00
1,AR,-84.8,-85.1,6.3,-4.76
2,AN,-25.5,-43.0,5.3,0.59
3,AD,-22.5,-44.0,5.3,3.23
4,AC,-19.9,-32.0,-0.7,0.93
...,...,...,...,...,...
395,VS,51.0,12.0,5.0,0.28
396,VT,23.9,-2.0,4.9,0.36
397,VW,-87.8,-87.1,5.1,0.07
398,VY,-53.6,-64.1,5.5,0.30


In [23]:
KatG_mutations_df 

Unnamed: 0,KatG mutations
0,M1A
1,M1C
2,M1D
3,M1E
4,M1F
...,...
14055,R740S
14056,R740T
14057,R740V
14058,R740W


In [24]:
KatG_mutations_chain_A = []
for i in KatG_mutations_df['KatG mutations']:
    KatG_mutations_chain_A.append('A')

KatG_mutations_df['Chain'] =  KatG_mutations_chain_A

In [25]:
KatG_mutations_df_B = KatG_mutations_df.copy()

In [26]:
del KatG_mutations_df_B['Chain']
KatG_mutations_chain_B = []
for i in KatG_mutations_df_B['KatG mutations']:
    KatG_mutations_chain_B.append('B')

KatG_mutations_df_B['Chain'] =  KatG_mutations_chain_B

In [27]:
KatG_mutations_df_B

Unnamed: 0,KatG mutations,Chain
0,M1A,B
1,M1C,B
2,M1D,B
3,M1E,B
4,M1F,B
...,...,...
14055,R740S,B
14056,R740T,B
14057,R740V,B
14058,R740W,B


In [28]:
KatG_mutations_df = KatG_mutations_df.append(KatG_mutations_df_B, ignore_index = True)

In [29]:
residue_CoM_df

Unnamed: 0,Residue,Chain,Centre of mass,Site 1 Coordinates,Distance from Site 1
0,M1,A,[],"[135.5568728593392, 113.46202070912435, 148.68...",No centre of mass
1,P2,A,[],"[135.5568728593392, 113.46202070912435, 148.68...",No centre of mass
2,E3,A,[],"[135.5568728593392, 113.46202070912435, 148.68...",No centre of mass
3,Q4,A,[],"[135.5568728593392, 113.46202070912435, 148.68...",No centre of mass
4,H5,A,[],"[135.5568728593392, 113.46202070912435, 148.68...",No centre of mass
...,...,...,...,...,...
1475,R736,B,"[102.56951717105538, 106.68485590620568, 102.0...","[137.7765238725133, 126.92174901271466, 92.186...",41.195753
1476,F737,B,"[106.1688799400712, 111.39674823454799, 100.59...","[137.7765238725133, 126.92174901271466, 92.186...",35.615301
1477,D738,B,"[100.97355567927137, 109.2970553620259, 98.230...","[137.7765238725133, 126.92174901271466, 92.186...",40.836819
1478,V739,B,"[99.15080305467927, 110.36249033674973, 101.78...","[137.7765238725133, 126.92174901271466, 92.186...",42.519218


In [30]:
#creates unique ID for each possible KatG mutation

Unique_ID = []
mutation_no = -1

for i in KatG_mutations_df['KatG mutations']:
    mutation_no = mutation_no + 1
    if mutation_no < 14060:
        ID = i + ' ' + 'A'
    else:
        ID = i + ' ' + 'B'
    Unique_ID.append(ID)

KatG_mutations_df['Unique ID'] =  Unique_ID
KatG_mutations_df


Unnamed: 0,KatG mutations,Chain,Unique ID
0,M1A,A,M1A A
1,M1C,A,M1C A
2,M1D,A,M1D A
3,M1E,A,M1E A
4,M1F,A,M1F A
...,...,...,...
28115,R740S,B,R740S B
28116,R740T,B,R740T B
28117,R740V,B,R740V B
28118,R740W,B,R740W B


In [31]:
#adds distance from site 1 column

KatG_mutations_dist = []
mutation_no = -1

#cycles through all possible KatG mutations
for a in KatG_mutations_df['Unique ID']:
    mutation_no = mutation_no + 1
    #cycles through all KatG residues looking for a match to the residue being mutated
    for b in residue_CoM_df['Residue']:
        if a[:-3] == b:
            if mutation_no < 14060:
                row_no = residue_CoM_df[residue_CoM_df['Residue']==b].index[0]
            else:
                row_no = residue_CoM_df[residue_CoM_df['Residue']==b].index[1]
            Site1_dist = residue_CoM_df.loc[row_no,'Distance from Site 1']
    KatG_mutations_dist.append(Site1_dist)
    
KatG_mutations_df['Distance from Site 1'] =  KatG_mutations_dist
KatG_mutations_df
#dataframe1.set_index

Unnamed: 0,KatG mutations,Chain,Unique ID,Distance from Site 1
0,M1A,A,M1A A,No centre of mass
1,M1C,A,M1C A,No centre of mass
2,M1D,A,M1D A,No centre of mass
3,M1E,A,M1E A,No centre of mass
4,M1F,A,M1F A,No centre of mass
...,...,...,...,...
28115,R740S,B,R740S B,38.661813
28116,R740T,B,R740T B,38.661813
28117,R740V,B,R740V B,38.661813
28118,R740W,B,R740W B,38.661813


In [32]:
d_chem_features = d_chem_features()

In [33]:
#add d_chem_features columns to KatG_mutations_df
d_volume = []
d_MW = []
d_hydropathy = []
d_Pi = []

for a in KatG_mutations_df['KatG mutations']:
    res1 = a[0]
    res2 = a[-1]
    AA = res1 + res2
    for b in d_chem_features['dAA']:
        if str(AA) == str(b):
            row_no = d_chem_features[d_chem_features['dAA']==b].index[0]
            vol = d_chem_features.loc[row_no,'d_volume']
            d_volume.append(vol)
            MW = d_chem_features.loc[row_no,'d_MW']
            d_MW.append(MW)
            hydrop =d_chem_features.loc[row_no,'d_hydropathy']
            d_hydropathy.append(hydrop)
            pi =d_chem_features.loc[row_no,'d_Pi']
            d_Pi.append(pi)
            
KatG_mutations_df['d_volume'] = d_volume
KatG_mutations_df['d_MW'] = d_MW
KatG_mutations_df['d_hydropathy'] = d_hydropathy
KatG_mutations_df['d_Pi'] = d_Pi

KatG_mutations_df
        
#codons to codons, classified by numbr of snips, different probabiliities of mutations happening 
#some mutatns will have multiple mutations - can only consider solo's, works as tb mutates slowly 
# i had a dataset of 15,000 samples, x many had this many mutations in katg,ahpc had x amount etc. 
# promoter mutations too, ahpc 

Unnamed: 0,KatG mutations,Chain,Unique ID,Distance from Site 1,d_volume,d_MW,d_hydropathy,d_Pi
0,M1A,A,M1A A,No centre of mass,74.3,60.1,0.1,-0.26
1,M1C,A,M1C A,No centre of mass,54.4,28.1,-0.6,0.67
2,M1D,A,M1D A,No centre of mass,51.8,16.1,5.4,2.97
3,M1E,A,M1E A,No centre of mass,24.5,2.1,5.4,2.52
4,M1F,A,M1F A,No centre of mass,-27.0,-16.0,-0.9,0.26
...,...,...,...,...,...,...,...,...
28115,R740S,B,R740S B,38.661813,84.4,69.1,-3.7,5.08
28116,R740T,B,R740T B,38.661813,57.3,55.1,-3.8,5.16
28117,R740V,B,R740V B,38.661813,33.4,57.1,-8.7,4.80
28118,R740W,B,R740W B,38.661813,-54.4,-30.0,-3.6,4.87


In [34]:
codon_list = []
bases = ['T','A','C','G']
for i in bases:
    for h in bases:
        for z in bases:
            codon = (i,h,z)
            codon_list.append(codon)
len(codon_list)

64

In [35]:
def ListToString(s):
    str1 = ''
    for ele in s:
        str1 += ele
    return str1 

In [36]:
codon_list_start = []
codon_list_end = []
for i in codon_list:
    for a in codon_list:
        start_codon = ListToString(i)
        end_codon = ListToString(a)
        codon_list_start.append(start_codon)
        codon_list_end.append(end_codon)
        
data = {'Start codon': codon_list_start, 'End codon': codon_list_end}
codon_df = pd.DataFrame (data, columns = ['Start codon', 'End codon'])
codon_df
    

Unnamed: 0,Start codon,End codon
0,TTT,TTT
1,TTT,TTA
2,TTT,TTC
3,TTT,TTG
4,TTT,TAT
...,...,...
4091,GGG,GCG
4092,GGG,GGT
4093,GGG,GGA
4094,GGG,GGC


In [37]:
codon_1 = 'TAC'
codon_2 = 'GCC'
base_no = -1
errors = 0
for i in codon_1:
    base_no = base_no + 1
    if codon_2[int(base_no)] == i:
        errors += 0
    else:
        errors += 1
        
print(errors)
        

2


In [38]:
substitutions = []
row_no = -1
for i in codon_list_start:
    codon_1 = i
    row_no = row_no + 1
    codon_2 = codon_df.loc[row_no,'End codon']
    base_no = -1
    subs = 0
    for a in codon_1:
        base_no = base_no + 1
        if codon_2[int(base_no)] == a:
            subs += 0
        else:
            subs += 1
    substitutions.append(subs)

codon_df['Number of substitutions'] = substitutions 
codon_df    

Unnamed: 0,Start codon,End codon,Number of substitutions
0,TTT,TTT,0
1,TTT,TTA,1
2,TTT,TTC,1
3,TTT,TTG,1
4,TTT,TAT,1
...,...,...,...
4091,GGG,GCG,1
4092,GGG,GGT,1
4093,GGG,GGA,1
4094,GGG,GGC,1


In [39]:
aa_codons_drft = pd.read_csv(r'C:\Users\user\anaconda3\envs\mdaenv\Inh_resistance\amino_acid_codons.csv')

In [40]:
aa_codons = aa_codons_drft.iloc[:-3,]
aa_codons

Unnamed: 0,Codon,Amino Acid
0,TTT,F
1,TTA,L
2,TTC,F
3,TTG,L
4,TAT,Y
...,...,...
59,GCG,A
60,GGT,G
61,GGA,G
62,GGC,G


In [41]:
KatG_mutations_df

Unnamed: 0,KatG mutations,Chain,Unique ID,Distance from Site 1,d_volume,d_MW,d_hydropathy,d_Pi
0,M1A,A,M1A A,No centre of mass,74.3,60.1,0.1,-0.26
1,M1C,A,M1C A,No centre of mass,54.4,28.1,-0.6,0.67
2,M1D,A,M1D A,No centre of mass,51.8,16.1,5.4,2.97
3,M1E,A,M1E A,No centre of mass,24.5,2.1,5.4,2.52
4,M1F,A,M1F A,No centre of mass,-27.0,-16.0,-0.9,0.26
...,...,...,...,...,...,...,...,...
28115,R740S,B,R740S B,38.661813,84.4,69.1,-3.7,5.08
28116,R740T,B,R740T B,38.661813,57.3,55.1,-3.8,5.16
28117,R740V,B,R740V B,38.661813,33.4,57.1,-8.7,4.80
28118,R740W,B,R740W B,38.661813,-54.4,-30.0,-3.6,4.87


In [42]:
start_aa = []
end_aa = []
for i in KatG_mutations_df['KatG mutations']:
    start = i[0]
    end = i[-1]
    start_aa.append(start)
    end_aa.append(end)
KatG_mutations_df['Start residue'] = start_aa
KatG_mutations_df['End residue'] = end_aa
KatG_mutations_df

Unnamed: 0,KatG mutations,Chain,Unique ID,Distance from Site 1,d_volume,d_MW,d_hydropathy,d_Pi,Start residue,End residue
0,M1A,A,M1A A,No centre of mass,74.3,60.1,0.1,-0.26,M,A
1,M1C,A,M1C A,No centre of mass,54.4,28.1,-0.6,0.67,M,C
2,M1D,A,M1D A,No centre of mass,51.8,16.1,5.4,2.97,M,D
3,M1E,A,M1E A,No centre of mass,24.5,2.1,5.4,2.52,M,E
4,M1F,A,M1F A,No centre of mass,-27.0,-16.0,-0.9,0.26,M,F
...,...,...,...,...,...,...,...,...,...,...
28115,R740S,B,R740S B,38.661813,84.4,69.1,-3.7,5.08,R,S
28116,R740T,B,R740T B,38.661813,57.3,55.1,-3.8,5.16,R,T
28117,R740V,B,R740V B,38.661813,33.4,57.1,-8.7,4.80,R,V
28118,R740W,B,R740W B,38.661813,-54.4,-30.0,-3.6,4.87,R,W


In [43]:
aa_codons = aa_codons.rename(columns ={0:'Codon',1:'Amino Acid'})
aa_codons

Unnamed: 0,Codon,Amino Acid
0,TTT,F
1,TTA,L
2,TTC,F
3,TTG,L
4,TAT,Y
...,...,...
59,GCG,A
60,GGT,G
61,GGA,G
62,GGC,G


In [45]:
print(aa_codons)

   Codon  Amino Acid
0     TTT          F
1     TTA          L
2     TTC          F
3     TTG          L
4     TAT          Y
..    ...        ...
59    GCG          A
60    GGT          G
61    GGA          G
62    GGC          G
63    GGG          G

[64 rows x 2 columns]


In [46]:
aa_codons.set_index('Codon ', inplace = True)

In [47]:
aa_codons

Unnamed: 0_level_0,Amino Acid
Codon,Unnamed: 1_level_1
TTT,F
TTA,L
TTC,F
TTG,L
TAT,Y
...,...
GCG,A
GGT,G
GGA,G
GGC,G


In [48]:
codon_df

Unnamed: 0,Start codon,End codon,Number of substitutions
0,TTT,TTT,0
1,TTT,TTA,1
2,TTT,TTC,1
3,TTT,TTG,1
4,TTT,TAT,1
...,...,...,...
4091,GGG,GCG,1
4092,GGG,GGT,1
4093,GGG,GGA,1
4094,GGG,GGC,1


In [49]:
codon_df.set_index('Start codon', inplace = True)

In [50]:
combined_codon_df = codon_df.join(aa_codons)
combined_codon_df

Unnamed: 0,End codon,Number of substitutions,Amino Acid
AAA,TTT,3,K
AAA,TTA,2,K
AAA,TTC,3,K
AAA,TTG,3,K
AAA,TAT,2,K
...,...,...,...
TTT,GCG,3,F
TTT,GGT,2,F
TTT,GGA,3,F
TTT,GGC,3,F


In [51]:
combined_codon_df.reset_index(inplace = True)
combined_codon_df.rename(columns = {'Amino Acid':'Start AA','index':'Start codon'}, inplace = True)
combined_codon_df

Unnamed: 0,Start codon,End codon,Number of substitutions,Start AA
0,AAA,TTT,3,K
1,AAA,TTA,2,K
2,AAA,TTC,3,K
3,AAA,TTG,3,K
4,AAA,TAT,2,K
...,...,...,...,...
4091,TTT,GCG,3,F
4092,TTT,GGT,2,F
4093,TTT,GGA,3,F
4094,TTT,GGC,3,F


In [52]:
combined_codon_df.set_index('End codon', inplace = True)
combined_codon_df = combined_codon_df.join(aa_codons)

In [53]:
combined_codon_df.reset_index(inplace = True)
combined_codon_df.rename(columns = {'Amino Acid':'End AA','index':'End codon'}, inplace = True)
combined_codon_df

Unnamed: 0,End codon,Start codon,Number of substitutions,Start AA,End AA
0,AAA,AAA,0,K,K
1,AAA,AAC,1,N,K
2,AAA,AAG,1,K,K
3,AAA,AAT,1,N,K
4,AAA,ACA,1,T,K
...,...,...,...,...,...
4091,TTT,TGT,1,C,F
4092,TTT,TTA,1,L,F
4093,TTT,TTC,1,F,F
4094,TTT,TTG,1,L,F


In [54]:
combined_codon_df[['Start AA', 'End AA', 'Number of substitutions']].groupby(['Start AA','End AA']).agg(['min','max',numpy.mean,'count'])

Unnamed: 0_level_0,Unnamed: 1_level_0,Number of substitutions,Number of substitutions,Number of substitutions,Number of substitutions
Unnamed: 0_level_1,Unnamed: 1_level_1,min,max,mean,count
Start AA,End AA,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
A,A,0,1,0.750000,16
A,C,2,3,2.750000,8
A,D,1,2,1.750000,8
A,E,1,2,1.750000,8
A,F,2,3,2.750000,8
...,...,...,...,...,...
Y,Stop,1,2,1.333333,6
Y,T,2,3,2.750000,8
Y,V,2,3,2.750000,8
Y,W,2,2,2.000000,2


In [55]:
mutation_1 = []
row_no_1 = -1
for s_aa in combined_codon_df['Start AA']:
    row_no_1 = row_no_1 + 1 
    e_aa = combined_codon_df.loc[row_no_1,'End AA']
    d_aa = str(s_aa + e_aa)
    mutation_1.append(d_aa)
combined_codon_df['Mutation'] = mutation_1
combined_codon_df.set_index('Mutation', inplace = True)
combined_codon_df
    

Unnamed: 0_level_0,End codon,Start codon,Number of substitutions,Start AA,End AA
Mutation,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
KK,AAA,AAA,0,K,K
NK,AAA,AAC,1,N,K
KK,AAA,AAG,1,K,K
NK,AAA,AAT,1,N,K
TK,AAA,ACA,1,T,K
...,...,...,...,...,...
CF,TTT,TGT,1,C,F
LF,TTT,TTA,1,L,F
FF,TTT,TTC,1,F,F
LF,TTT,TTG,1,L,F


In [58]:
KatG_mutations_df
mutation_2 = []
row_no_2 = -1 
for s_aa in KatG_mutations_df['Start residue']:
    row_no_2 = row_no_2 + 1
    e_aa = KatG_mutations_df.loc[row_no_2,'End residue']
    d_aa = str(s_aa + e_aa)
    mutation_2.append(d_aa)
KatG_mutations_df['Mutation'] = mutation_2
KatG_mutations_df.set_index('Mutation', inplace = True)
KatG_mutations_df

Unnamed: 0_level_0,KatG mutations,Chain,Unique ID,Distance from Site 1,d_volume,d_MW,d_hydropathy,d_Pi,Start residue,End residue
Mutation,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
MA,M1A,A,M1A A,No centre of mass,74.3,60.1,0.1,-0.26,M,A
MC,M1C,A,M1C A,No centre of mass,54.4,28.1,-0.6,0.67,M,C
MD,M1D,A,M1D A,No centre of mass,51.8,16.1,5.4,2.97,M,D
ME,M1E,A,M1E A,No centre of mass,24.5,2.1,5.4,2.52,M,E
MF,M1F,A,M1F A,No centre of mass,-27.0,-16.0,-0.9,0.26,M,F
...,...,...,...,...,...,...,...,...,...,...
RS,R740S,B,R740S B,38.661813,84.4,69.1,-3.7,5.08,R,S
RT,R740T,B,R740T B,38.661813,57.3,55.1,-3.8,5.16,R,T
RV,R740V,B,R740V B,38.661813,33.4,57.1,-8.7,4.80,R,V
RW,R740W,B,R740W B,38.661813,-54.4,-30.0,-3.6,4.87,R,W


In [60]:
combined_codon_df.reset_index(inplace = True)
subs_df = combined_codon_df[['Mutation', 'Number of substitutions']].groupby(['Mutation']).agg(['min','max','count'])
subs_df

Unnamed: 0_level_0,Number of substitutions,Number of substitutions,Number of substitutions
Unnamed: 0_level_1,min,max,count
Mutation,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
AA,0,1,16
AC,2,3,8
AD,1,2,8
AE,1,2,8
AF,2,3,8
...,...,...,...
YStop,1,2,6
YT,2,3,8
YV,2,3,8
YW,2,2,2


In [61]:
subs_df.reset_index(inplace = True)
subs_df.set_index('Mutation', inplace = True)
subs_df


Unnamed: 0_level_0,Number of substitutions,Number of substitutions,Number of substitutions
Unnamed: 0_level_1,min,max,count
Mutation,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
AA,0,1,16
AC,2,3,8
AD,1,2,8
AE,1,2,8
AF,2,3,8
...,...,...,...
YStop,1,2,6
YT,2,3,8
YV,2,3,8
YW,2,2,2


In [62]:
subs_df.rename(columns={'index':'delete'}, inplace = True)

subs_df

Unnamed: 0_level_0,Number of substitutions,Number of substitutions,Number of substitutions
Unnamed: 0_level_1,min,max,count
Mutation,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
AA,0,1,16
AC,2,3,8
AD,1,2,8
AE,1,2,8
AF,2,3,8
...,...,...,...
YStop,1,2,6
YT,2,3,8
YV,2,3,8
YW,2,2,2


In [64]:
KatG_mutations_df = KatG_mutations_df.join(subs_df)

  return merge(


In [65]:
KatG_mutations_df

Unnamed: 0_level_0,KatG mutations,Chain,Unique ID,Distance from Site 1,d_volume,d_MW,d_hydropathy,d_Pi,Start residue,End residue,"(Number of substitutions, min)","(Number of substitutions, max)","(Number of substitutions, count)"
Mutation,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
AC,A15C,A,A15C A,No centre of mass,-19.9,-32.0,-0.7,0.93,A,C,2,3,8
AC,A16C,A,A16C A,No centre of mass,-19.9,-32.0,-0.7,0.93,A,C,2,3,8
AC,A53C,A,A53C A,35.218287,-19.9,-32.0,-0.7,0.93,A,C,2,3,8
AC,A55C,A,A55C A,31.791704,-19.9,-32.0,-0.7,0.93,A,C,2,3,8
AC,A60C,A,A60C A,38.926044,-19.9,-32.0,-0.7,0.93,A,C,2,3,8
...,...,...,...,...,...,...,...,...,...,...,...,...,...
YW,Y597W,B,Y597W B,40.613433,-34.2,-23.0,-0.4,-0.23,Y,W,2,2,2
YW,Y608W,B,Y608W B,33.528173,-34.2,-23.0,-0.4,-0.23,Y,W,2,2,2
YW,Y638W,B,Y638W B,46.007273,-34.2,-23.0,-0.4,-0.23,Y,W,2,2,2
YW,Y678W,B,Y678W B,48.498844,-34.2,-23.0,-0.4,-0.23,Y,W,2,2,2


In [66]:
KatG_mutations_df.reset_index(inplace=True)
KatG_mutations_df

Unnamed: 0,Mutation,KatG mutations,Chain,Unique ID,Distance from Site 1,d_volume,d_MW,d_hydropathy,d_Pi,Start residue,End residue,"(Number of substitutions, min)","(Number of substitutions, max)","(Number of substitutions, count)"
0,AC,A15C,A,A15C A,No centre of mass,-19.9,-32.0,-0.7,0.93,A,C,2,3,8
1,AC,A16C,A,A16C A,No centre of mass,-19.9,-32.0,-0.7,0.93,A,C,2,3,8
2,AC,A53C,A,A53C A,35.218287,-19.9,-32.0,-0.7,0.93,A,C,2,3,8
3,AC,A55C,A,A55C A,31.791704,-19.9,-32.0,-0.7,0.93,A,C,2,3,8
4,AC,A60C,A,A60C A,38.926044,-19.9,-32.0,-0.7,0.93,A,C,2,3,8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28115,YW,Y597W,B,Y597W B,40.613433,-34.2,-23.0,-0.4,-0.23,Y,W,2,2,2
28116,YW,Y608W,B,Y608W B,33.528173,-34.2,-23.0,-0.4,-0.23,Y,W,2,2,2
28117,YW,Y638W,B,Y638W B,46.007273,-34.2,-23.0,-0.4,-0.23,Y,W,2,2,2
28118,YW,Y678W,B,Y678W B,48.498844,-34.2,-23.0,-0.4,-0.23,Y,W,2,2,2


In [None]:
#freesasa code

In [90]:
structure = freesasa.Structure("7ag8.pdb")
result = freesasa.calc(structure)

ASA_dict_atom = {}
for i in range(0, structure.nAtoms()):
    ASA_dict_atom[i] = result.atomArea(i)
    
SASA_df = pd.DataFrame.from_dict(ASA_dict_atom, orient='index', columns = ['atom_sasa'])

res_codes = {'ALA':'A', 'ARG':'R', 'ASN':'N', 'ASP':'D', 'CYS':'C', 
             'GLU':'E', 'GLN':'Q', 'GLY':'G', 'HIS':'H', 'ILE':'I', 
             'LEU':'L', 'LYS':'K', 'MET':'M', 'PHE':'F', 'PRO':'P', 
             'SER':'S', 'THR':'T', 'TRP':'W', 'TYR':'Y', 'VAL':'V'}

#define max total sasa for each residue (also useful to define aa)
maxasa_dict = {'ALA':129, 'ARG':274, 'ASN':195, 'ASP':193, 'CYS':167, 'GLU':223,
                'GLN':225, 'GLY':104, 'HIS':224, 'ILE':197, 'LEU':201, 'LYS':236,
                'MET':224, 'PHE':240, 'PRO':159, 'SER':155, 'THR':172, 'TRP':285,
                'TYR':263, 'VAL':174}

In [91]:
    residue_list, atom_list, residue_number, chain_label, combined_ID = [], [], [], [], []
    for i in range(0, structure.nAtoms()):
        residue_list.append(structure.residueName(i))
        atom_list.append(structure.atomName(i))
        residue_number.append(structure.residueNumber(i))
        chain_label.append(structure.chainLabel(i))
        j = 0
        if int(structure.residueNumber(i)) < 10:
            j = '000' + str(structure.residueNumber(i)).strip()
        elif int(structure.residueNumber(i)) < 100:
            j = '00' + str(structure.residueNumber(i)).strip()
        elif int(structure.residueNumber(i)) < 1000:
            j = '0' + str(structure.residueNumber(i)).strip()
        else:
            j = structure.residueNumber(i).strip()
        combined_ID.append(structure.chainLabel(i) + '_' + str(j) + '_' + structure.residueName(i))

    SASA_df['atom'] = atom_list
    SASA_df['residue'] = residue_list
    SASA_df['residue_number'] = residue_number
    SASA_df['chain'] = chain_label
    SASA_df['combined_ID'] = combined_ID
    
    SASA_df

Unnamed: 0,atom_sasa,atom,residue,residue_number,chain,combined_ID
0,26.778193,N,GLY,24,A,A_0024_GLY
1,41.526774,CA,GLY,24,A,A_0024_GLY
2,1.935937,C,GLY,24,A,A_0024_GLY
3,30.605271,O,GLY,24,A,A_0024_GLY
4,0.000000,N,HIS,25,A,A_0025_HIS
...,...,...,...,...,...,...
10815,6.324353,NE,ARG,740,B,B_0740_ARG
10816,1.726302,CZ,ARG,740,B,B_0740_ARG
10817,2.320039,NH1,ARG,740,B,B_0740_ARG
10818,20.751627,NH2,ARG,740,B,B_0740_ARG


In [93]:

    for i in SASA_df.index:
        if SASA_df['residue'][i] not in maxasa_dict.keys():
            SASA_df.drop(i, inplace=True)
    SASA_df.reset_index(0, inplace=True)
    

In [96]:
    residue_list, residue_ID_list, CA_atom_ID_list = [], [], []
    Dict = {}
    SASA_sum = 0

    for i in SASA_df.index:
        try:
            if SASA_df['combined_ID'][i] == SASA_df['combined_ID'][i+1]:
                #atoms in same residue, so add current sasa and keep iterating
                SASA_sum += SASA_df['atom_sasa'][i]
            else:
                #atoms not in same residue, so add sasa and reset SASA_sum
                SASA_sum += SASA_df['atom_sasa'][i]
                Dict[SASA_df['combined_ID'][i]] = SASA_sum
                SASA_sum = 0
                residue_list.append(SASA_df['residue'][i])
                residue_ID_list.append(SASA_df['residue_number'][i])
        except KeyError:
            #for last index of df, so add sasa and reset SASA_sum
            SASA_sum += SASA_df['atom_sasa'][i]
            Dict[SASA_df['combined_ID'][i]] = SASA_sum
            SASA_sum = 0
            residue_list.append(SASA_df['residue'][i])
            residue_ID_list.append(SASA_df['residue_number'][i])
        if SASA_df['atom'][i].strip() == 'CA':
            #list indexes of CA 
            CA_atom_ID_list.append(SASA_df['index'][i])
    
    sasa_sum_df = pd.DataFrame.from_dict(Dict, orient='index', 
                columns=['SASA_residue']).reset_index(0).rename(columns={'index':'combined_ID'})
    sasa_sum_df['residue_ID'] = residue_ID_list
    sasa_sum_df['CA_ID'] = CA_atom_ID_list
    
        #insert predefined max sasa for each residue
    maxasa_list = []
    for i in sasa_sum_df['combined_ID']:
        for key, value in maxasa_dict.items():
            if i[7:] == key:
                maxasa_list.append(value)
    sasa_sum_df['maxASA'] = maxasa_list
    
    sasa_sum_df

Unnamed: 0,combined_ID,SASA_residue,residue_ID,CA_ID,maxASA
0,A_0024_GLY,100.846174,24,1,104
1,A_0025_HIS,127.637993,25,5,224
2,A_0026_MET,18.262181,26,15,224
3,A_0027_LYS,61.530393,27,23,236
4,A_0028_TYR,39.727088,28,32,263
...,...,...,...,...,...
1412,B_0736_ARG,0.125640,736,10772,274
1413,B_0737_PHE,34.730453,737,10783,240
1414,B_0738_ASP,79.101138,738,10794,193
1415,B_0739_VAL,81.799088,739,10802,174


In [106]:
    #calculate and insert relative SASA for each residue
    sasa_sum_df['RSA'] = sasa_sum_df['SASA_residue']/sasa_sum_df['maxASA']
    #filter for residues with SASA > 0.25 -- NEED TO FIND PAPER THAT RECOMMENDS GREATER THAT 0.25
    surface_residues_df = sasa_sum_df[sasa_sum_df['RSA'] >= 0.26].reset_index(0)
    surface_residues_df

Unnamed: 0,index,combined_ID,SASA_residue,residue_ID,CA_ID,maxASA,RSA
0,0,A_0024_GLY,100.846174,24,1,104,0.969675
1,1,A_0025_HIS,127.637993,25,5,224,0.569812
2,3,A_0027_LYS,61.530393,27,23,236,0.260722
3,13,A_0037_ASP,53.610179,37,96,193,0.277773
4,16,A_0040_PRO,77.247684,40,132,159,0.485834
...,...,...,...,...,...,...,...
418,1402,B_0726_ALA,63.733677,726,10692,129,0.494060
419,1406,B_0730_LYS,65.592781,730,10724,236,0.277936
420,1414,B_0738_ASP,79.101138,738,10794,193,0.409850
421,1415,B_0739_VAL,81.799088,739,10802,174,0.470110


In [121]:
    CA = KatG.select_atoms('name CA')
    sa_CA = AtomGroup(surface_residues_df['CA_ID'], KatG)
    distances = distance_array(CA.positions, sa_CA.positions)
    min_distances = []
    for i in distances:
        min_distances.append(min(i))

In [123]:
distance_df = pd.DataFrame(min_distances, columns=['Depth'])

In [128]:
    #insert chain and numbering
    chain_list, pdb_residue_list, resname_list = [], [], []
    for i in CA:
        chain_list.append(str(i.segment)[9])
        pdb_residue_list.append(i.resid)
        resname_list.append(i.resname)
    #create and fill depth dataframe
    distance_df['pdb_residue'] = pdb_residue_list
    distance_df['pdb_chain'] = chain_list

In [145]:
    #converts resnames to rescode IDs (ALA --> A e.g)
    rescode_list = []
    for i in resname_list:
        for k, v in res_codes.items():
            if i == k:
                rescode_list.append(v)
    distance_df['rescode'] = rescode_list
distance_df

Unnamed: 0,Depth,pdb_residue,pdb_chain,rescode
0,0.000000,24,A,G
1,0.000000,25,A,H
2,3.791682,26,A,M
3,0.000000,27,A,K
4,3.790789,28,A,Y
...,...,...,...,...
1414,3.407209,736,B,R
1415,1.528764,737,B,F
1416,1.454725,738,B,D
1417,4.155768,739,B,V


In [166]:
distance_df_unique_ID = []
distance_df_row_no = -1
for i in distance_df['Depth']:
    distance_df_row_no += 1
    Res = distance_df.loc[distance_df_row_no,'rescode']
    Num = distance_df.loc[distance_df_row_no,'pdb_residue']
    Chain = distance_df.loc[distance_df_row_no,'pdb_chain']
    Unique = str(Res+str(Num)+'_'+Chain)
    distance_df_unique_ID.append(Unique)
distance_df['Residue Unique ID'] = distance_df_unique_ID

In [167]:
KatG_mutations_df.rename(columns={'Unique ID':'Mutation Unique ID'}, inplace = True)

In [168]:
Residue_unique_ID = []
for i in KatG_mutations_df['Mutation Unique ID']:
    Res = str(i)[:-3]
    Chain = str(i)[-1]
    Unique = str(Res+'_'+Chain)
    Residue_unique_ID.append(Unique)
    
KatG_mutations_df['Residue Unique ID'] = Residue_unique_ID

In [178]:
distance_merge = distance_df.copy()
del distance_merge['pdb_residue'],distance_merge['pdb_chain'],distance_merge['rescode']

In [186]:
KatG_mutations_df.reset_index(inplace = True)

In [187]:
del KatG_mutations_df['index']
KatG_mutations_df

Unnamed: 0,Residue Unique ID,Mutation,KatG mutations,Chain,Mutation Unique ID,Distance from Site 1,d_volume,d_MW,d_hydropathy,d_Pi,Start residue,End residue,"(Number of substitutions, min)","(Number of substitutions, max)","(Number of substitutions, count)"
0,A15_A,AC,A15C,A,A15C A,No centre of mass,-19.9,-32.0,-0.7,0.93,A,C,2,3,8
1,A16_A,AC,A16C,A,A16C A,No centre of mass,-19.9,-32.0,-0.7,0.93,A,C,2,3,8
2,A53_A,AC,A53C,A,A53C A,35.218287,-19.9,-32.0,-0.7,0.93,A,C,2,3,8
3,A55_A,AC,A55C,A,A55C A,31.791704,-19.9,-32.0,-0.7,0.93,A,C,2,3,8
4,A60_A,AC,A60C,A,A60C A,38.926044,-19.9,-32.0,-0.7,0.93,A,C,2,3,8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28115,Y597_B,YW,Y597W,B,Y597W B,40.613433,-34.2,-23.0,-0.4,-0.23,Y,W,2,2,2
28116,Y608_B,YW,Y608W,B,Y608W B,33.528173,-34.2,-23.0,-0.4,-0.23,Y,W,2,2,2
28117,Y638_B,YW,Y638W,B,Y638W B,46.007273,-34.2,-23.0,-0.4,-0.23,Y,W,2,2,2
28118,Y678_B,YW,Y678W,B,Y678W B,48.498844,-34.2,-23.0,-0.4,-0.23,Y,W,2,2,2


In [188]:
KatG_mutations_df.set_index('Residue Unique ID', inplace = True)

In [190]:
distance_merge.set_index('Residue Unique ID', inplace = True)

In [192]:
KatG_mutations_df = KatG_mutations_df.join(distance_merge)
KatG_mutations_df

Unnamed: 0_level_0,Mutation,KatG mutations,Chain,Mutation Unique ID,Distance from Site 1,d_volume,d_MW,d_hydropathy,d_Pi,Start residue,End residue,"(Number of substitutions, min)","(Number of substitutions, max)","(Number of substitutions, count)",Depth
Residue Unique ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
A106_A,AC,A106C,A,A106C A,14.629585,-19.9,-32.0,-0.7,0.93,A,C,2,3,8,13.382314
A106_A,AD,A106D,A,A106D A,14.629585,-22.5,-44.0,5.3,3.23,A,D,1,2,8,13.382314
A106_A,AE,A106E,A,A106E A,14.629585,-49.8,-58.0,5.3,2.78,A,E,1,2,8,13.382314
A106_A,AF,A106F,A,A106F A,14.629585,-101.3,-76.1,-1.0,0.52,A,F,2,3,8,13.382314
A106_A,AG,A106G,A,A106G A,14.629585,28.5,14.0,2.2,0.03,A,G,1,2,16,13.382314
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Y98_B,YR,Y98R,B,Y98R B,17.788426,20.2,7.0,3.2,-5.10,Y,R,2,3,12,4.837233
Y98_B,YS,Y98S,B,Y98S B,17.788426,104.6,76.1,-0.5,-0.02,Y,S,1,3,12,4.837233
Y98_B,YT,Y98T,B,Y98T B,17.788426,77.5,62.1,-0.6,0.06,Y,T,2,3,8,4.837233
Y98_B,YV,Y98V,B,Y98V B,17.788426,53.6,64.1,-5.5,-0.30,Y,V,2,3,8,4.837233


In [197]:
KatG_mutations_df

Unnamed: 0,Residue Unique ID,Mutation,KatG mutations,Chain,Mutation Unique ID,Distance from Site 1,d_volume,d_MW,d_hydropathy,d_Pi,Start residue,End residue,"(Number of substitutions, min)","(Number of substitutions, max)","(Number of substitutions, count)",Depth
0,A106_A,AC,A106C,A,A106C A,14.629585,-19.9,-32.0,-0.7,0.93,A,C,2,3,8,13.382314
1,A106_A,AD,A106D,A,A106D A,14.629585,-22.5,-44.0,5.3,3.23,A,D,1,2,8,13.382314
2,A106_A,AE,A106E,A,A106E A,14.629585,-49.8,-58.0,5.3,2.78,A,E,1,2,8,13.382314
3,A106_A,AF,A106F,A,A106F A,14.629585,-101.3,-76.1,-1.0,0.52,A,F,2,3,8,13.382314
4,A106_A,AG,A106G,A,A106G A,14.629585,28.5,14.0,2.2,0.03,A,G,1,2,16,13.382314
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28153,Y98_B,YR,Y98R,B,Y98R B,17.788426,20.2,7.0,3.2,-5.10,Y,R,2,3,12,4.837233
28154,Y98_B,YS,Y98S,B,Y98S B,17.788426,104.6,76.1,-0.5,-0.02,Y,S,1,3,12,4.837233
28155,Y98_B,YT,Y98T,B,Y98T B,17.788426,77.5,62.1,-0.6,0.06,Y,T,2,3,8,4.837233
28156,Y98_B,YV,Y98V,B,Y98V B,17.788426,53.6,64.1,-5.5,-0.30,Y,V,2,3,8,4.837233
