# Figure 2

In [114]:
import numpy, pandas
import matplotlib
import matplotlib.pyplot as plt
import MDAnalysis

## Figure 2a

In [5]:
def find_position(row):
    """Return the codon number encoded in ``row.MUTATION``.

    The first and last characters of the mutation string are stripped and the
    remainder is parsed as an integer, e.g. ``'A102V'`` -> ``102``.
    """
    mutation = row.MUTATION
    return int(mutation[1:-1])


DATASET = pandas.read_csv('data/ds-traintest-phen.csv')
# Row-wise: derive the codon position of every mutation in the table.
DATASET['POSITION'] = DATASET.apply(find_position, axis='columns')
DATASET

Unnamed: 0,MUTATION,CONSISTENT_PHENOTYPE,POSITION
0,A102V,S,102
1,A134D,S,134
2,A134P,R,134
3,A134S,S,134
4,A134V,R,134
...,...,...,...
659,Y95N,S,95
660,Y99C,S,99
661,Y99D,S,99
662,Y99F,S,99


Cross-tabulate codon position against phenotype so we can pick out the codons whose mutations tend to be associated with resistance (R) or with susceptibility (S).

In [111]:
# Count resistant (R) and susceptible (S) mutations per codon position, then
# add the per-codon total and the fraction of mutations that are resistant.
df = (
    pandas.crosstab(DATASET['POSITION'], DATASET['CONSISTENT_PHENOTYPE'])
    .assign(TOTAL=lambda counts: counts['S'] + counts['R'])
    .assign(PROP_R=lambda counts: counts['R'] / counts['TOTAL'])
)
df

CONSISTENT_PHENOTYPE,R,S,TOTAL,PROP_R
POSITION,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,2,1,3,0.666667
2,0,1,1,0.000000
3,1,2,3,0.333333
4,4,1,5,0.800000
5,5,1,6,0.833333
...,...,...,...,...
181,1,1,2,0.500000
182,1,1,2,0.500000
183,1,2,3,0.333333
184,1,1,2,0.500000


What is the distribution of the proportion of resistance-associated mutations (`PROP_R`) across codons?

In [None]:
# Number of codons sharing each resistance proportion, ordered by proportion.
df['PROP_R'].value_counts().sort_index()

0.000000    38
0.166667     1
0.200000     3
0.250000    16
0.333333    25
0.400000     2
0.500000    21
0.600000     6
0.666667    19
0.750000     8
0.800000     6
0.833333     2
1.000000    37
Name: PROP_R, dtype: int64

First, let's pick out those codons where no more than 20% of mutations are associated with resistance.

In [109]:
# Codons where the resistant fraction is 0.2 or lower, rendered as a
# space-separated `resid ...` selection string in ascending order.
df2 = df.loc[df['PROP_R'].le(0.2)]
'resid ' + ' '.join(map(str, sorted(df2.index.unique())))

'resid 2 11 15 20 25 33 37 39 40 45 50 53 56 60 61 64 73 74 80 83 84 87 91 95 98 99 100 102 107 111 112 113 121 126 141 144 148 157 167 169 178 185'

Then those where at least 80% of mutations are associated with resistance.

In [108]:
# Codons where the resistant fraction is 0.8 or higher, rendered as a
# space-separated `resid ...` selection string in ascending order.
df2 = df.loc[df['PROP_R'].ge(0.8)]
'resid ' + ' '.join(map(str, sorted(df2.index.unique())))

'resid 4 5 7 8 10 12 13 14 24 31 41 47 49 51 54 57 62 63 68 69 71 72 76 93 94 96 97 103 104 108 116 119 132 135 138 139 142 146 155 159 164 165 170 175 180'

...and finally those where between 40% and 60% (inclusive) of mutations are associated with resistance.

In [112]:
# Codons where the resistant fraction lies in [0.4, 0.6] (both ends included),
# rendered as a space-separated `resid ...` selection string in ascending order.
df2 = df.loc[df['PROP_R'].between(0.4, 0.6)]
'resid ' + ' '.join(map(str, sorted(df2.index.unique())))

'resid 17 30 32 38 42 44 55 65 70 81 82 90 92 101 105 110 115 118 120 125 133 134 147 154 160 166 181 182 184'

Create a PDB file in which the percentage of resistance-associated mutations at each codon is stored in the B-factor (`BETA`) column.

In [118]:
u = MDAnalysis.Universe('data/reference/3PL1-PZA.pdb')

# Store each codon's percentage of resistance-associated mutations (0-100) in
# the B-factor (tempfactor) field of every atom of that residue.
#
# Grouping by the PROP_R values actually present in `df` replaces a separate
# hand-written bin for every expected proportion. That way a proportion that
# matches none of the hard-coded bins cannot be silently skipped, and an empty
# bin can never produce a bare 'resid ' selection string. The percentages
# computed here agree with the previously hard-coded constants to two decimal
# places. Rows whose PROP_R is NaN are skipped, as they matched no bin before.
#
# NOTE(review): residues that have no row in `df` keep whatever B-factor was
# read from the input PDB -- confirm that is what the figure expects.
for prop_r, codons in df.groupby('PROP_R'):
    positions = codons.index
    a = u.select_atoms('resid ' + ' '.join(str(i) for i in positions))
    a.atoms.tempfactors = 100 * float(prop_r)

# Named `all_atoms` rather than `all` so the builtin all() is not shadowed for
# the remainder of the notebook.
all_atoms = u.select_atoms('all')
all_atoms.write('data/reference/3PL1-PZA-beta.pdb')

