In [2]:
import pandas, numpy

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin

import sbmlcore

In [3]:
# Seed handed to train_test_split so the train/test partition is repeatable.
random_state = 42
# Holds the Series of mutation names for each dataset, keyed by a short tag.
mutations = dict()
# Integer encoding used for phenotype / prediction labels: 'S' -> 0, 'R' -> 1.
mapping_dict = dict(S=0, R=1)

We will need to map these predictions from the published Suspect-PZA model to the different training/test sets later, so let's read them in now

In [6]:
# Read the Suspect-PZA results table from the reference data folder.
suspectpza_csv = 'data/reference/ds-suspectpza-results.csv'
suspectpza = pandas.read_csv(suspectpza_csv)
def create_mutation(row):
    """Build the mutation label for one row of the Suspect-PZA table.

    The label is the wild-type residue, the position and the mutant residue
    joined together (e.g. ``C72G``). The residue names found in
    ``row['Wild Type']`` and ``row['Mutant']`` are translated through
    ``sbmlcore.amino_acid_3to1letter``; ``row['Position']`` is converted
    with ``str``.
    """
    to_one_letter = sbmlcore.amino_acid_3to1letter
    wild_type = to_one_letter[row['Wild Type']]
    position = str(row['Position'])
    mutant = to_one_letter[row['Mutant']]
    return wild_type + position + mutant

# Index the table by mutation label and keep only the encoded prediction.
suspectpza['mutation'] = suspectpza.apply(create_mutation, axis=1)
suspectpza = suspectpza.set_index('mutation')
# Only the first character of the ' Prediction' text is fed to the S/R mapping.
suspectpza['prediction'] = suspectpza[' Prediction'].str[0].map(mapping_dict)
unused_columns = ['Chain', 'Wild Type', 'Position', 'Mutant', 'Distance to ligand', ' Prediction']
suspectpza = suspectpza.drop(columns=unused_columns)
suspectpza

Unnamed: 0_level_0,prediction
mutation,Unnamed: 1_level_1
C72G,1
D129G,1
T167N,0
L85P,1
V139L,1
...,...
I31T,1
D8Y,1
S84I,0
R154G,0


Now let's read in the testtrain dataset so we can split it into training and test datasets

In [7]:
# Load the train/test table and drop the columns that are not carried forward.
df = pandas.read_csv('data/ds-traintest-phen-features.csv')
excluded_columns = ['secondary_structure_codes', 'd_MW', 'phi', 'd_volume', 'd_Pi', 'n_hbond_acceptors', 'rasp_wt_nlf']
df = df.drop(columns=excluded_columns)
mutations['tt'] = df['mutation']
# Every column from the fourth onwards is taken as a feature.
features = df.iloc[:, 3:]
labels = df['phenotype'].map({'S': 0, 'R': 1})
features.head(3)

Unnamed: 0,d_hydropathy_KD,d_rogov,psi,residue_sasa,n_hbond_donors,temp_factor,depth,deep_ddG,rasp_mt_nlf,rasp_score_ml_fermi,snap2_score,dist_FE2,dist_PZA,mcsm_stability_rsa,mcsm_stability_ddG,mapp_score
0,1.9,-0.389,-35.2,86.2,0.0,59.720001,2.103049,-0.77,2.433114,0.223921,20,29.948318,27.641663,54.8,-0.771,13.49
1,-2.6,-0.548,-35.2,86.2,0.0,59.720001,2.103049,-1.096,2.896883,0.281366,52,29.948318,27.641663,54.8,-0.367,25.97
2,2.3,-0.514,-35.2,86.2,0.0,59.720001,2.103049,-0.908,2.623741,0.258386,4,29.948318,27.641663,54.8,-0.912,30.01


Split the testtrain dataset and scale the numerical data (based on a fit *only* on the training dataset)

In [8]:
# Hold out 30% of the rows. Mutation names, features and labels are passed in
# one call so the three pieces are split with the same row assignment.
# No `stratify` argument is supplied.
split_parts = train_test_split(
    mutations['tt'], features, labels,
    test_size=0.3, random_state=random_state,
)
mutations_train, mutations_test, X_train, X_test, Y_train, Y_test = split_parts

class DataFrameSelector(BaseEstimator, TransformerMixin):
    """Pipeline step that picks named columns out of a DataFrame.

    ``transform`` returns ``X[attribute_names].values``, so the steps that
    follow receive the selected columns' values rather than a DataFrame.
    """

    def __init__(self, attribute_names):
        # Column labels to select in transform().
        self.attribute_names = attribute_names

    def fit(self, X, y=None):
        """Nothing is learned from the data; return ``self``."""
        return self

    def transform(self, X):
        """Return the values of the configured columns of ``X``."""
        selected = X[self.attribute_names]
        return selected.values

numerical_attribs = list(features)

# Record the feature names, one per line, next to the saved arrays.
headings = '\n'.join(numerical_attribs)

# A context manager closes the handle even if the write raises.
with open('data/ds-traintest-numericalattribs.txt', 'w') as OUTPUT:
    OUTPUT.write(headings)

pipe = Pipeline([
    ('selector', DataFrameSelector(numerical_attribs)),
    ('scaling', RobustScaler())
    ])

# The pipeline is fitted on the training rows only; the test rows are merely
# transformed with the already-fitted scaler.
X_train = pipe.fit_transform(X_train)
Y_train = Y_train.squeeze().to_numpy()
Z_train = mutations_train.to_numpy()

X_test = pipe.transform(X_test)
Y_test = Y_test.squeeze().to_numpy()
Z_test = mutations_test.to_numpy()

# Each .npy file holds three arrays written back to back, in the order
# labels (Y), scaled features (X), mutation names (Z); read them back with
# three successive numpy.load calls on the same handle.
with open('data/ds-train.npy', 'wb') as f:
    numpy.save(f, Y_train)
    numpy.save(f, X_train)
    numpy.save(f, Z_train)

with open('data/ds-test.npy', 'wb') as f:
    numpy.save(f, Y_test)
    numpy.save(f, X_test)
    numpy.save(f, Z_test)

# Tag each row by the index labels that train_test_split handed back rather
# than by matching mutation names: this marks exactly the rows that went into
# each split, even if a mutation name were ever repeated in the table.
df.loc[mutations_test.index, 'set'] = 'test'
df.loc[mutations_train.index, 'set'] = 'train'
df.to_csv('data/ds-traintest-phen-features-set.csv',index=False)

Let's check the % resistance is about the same between the train and test datasets

In [10]:
# Counts of each phenotype per set, with row/column totals.
pandas.crosstab(df['set'], df['phenotype'], margins=True)

phenotype,R,S,All
set,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
test,103,97,200
train,246,218,464
All,349,315,664


Now let's work out what the predictions using Suspect-PZA are for the training and test datasets for analysis later

In [11]:
# Key the train/test table by mutation so it can be joined on the
# Suspect-PZA index.
df = df.set_index('mutation')

# Outer join: mutations found in either table are kept.
phenotype_and_set = df[['phenotype', 'set']]
suspectpza = suspectpza.join(phenotype_and_set, how='outer')
suspectpza['traintest'] = suspectpza['phenotype'].map(mapping_dict)

def split_testtrain(row):
    """Place ``row.traintest`` in the slot matching ``row.set``.

    Returns a two-element ``pandas.Series`` laid out as (train, test):
    the value goes in the first slot when ``row.set`` is ``'train'``, in the
    second when it is ``'test'``, and both slots are ``None`` otherwise.
    """
    which_set = row.set
    train_value = row.traintest if which_set == 'train' else None
    test_value = row.traintest if which_set == 'test' else None
    return pandas.Series([train_value, test_value])


# Spread the encoded phenotype over separate 'train' and 'test' columns, then
# discard the helper columns that were only needed to build them.
train_test_columns = suspectpza.apply(split_testtrain, axis=1)
suspectpza[['train', 'test']] = train_test_columns
suspectpza = suspectpza.drop(columns=['phenotype', 'set', 'traintest'])
suspectpza

Unnamed: 0_level_0,prediction,phenotype,set,traintest
mutation,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
A102E,1,,,
A102G,0,,,
A102P,0,,,
A102R,1,,,
A102S,1,,,
...,...,...,...,...
Y99C,0,S,train,0.0
Y99D,0,S,test,0.0
Y99F,0,S,test,0.0
Y99H,0,,,


Using the (fitted) pipeline we can repeat the process for the `validation` dataset

In [13]:
# Same column handling as for the train/test file.
df = pandas.read_csv('data/ds-validation-phen-features.csv')
excluded_columns = ['secondary_structure_codes', 'd_MW', 'phi', 'd_volume', 'd_Pi', 'n_hbond_acceptors', 'rasp_wt_nlf']
df = df.drop(columns=excluded_columns)
mutations['v'] = df['mutation']
features = df.iloc[:, 3:]
labels = df['phenotype'].map({'S': 0, 'R': 1})

# transform() only: the pipeline is not refitted on this dataset.
X_validate = pipe.transform(features)
Y_validate = labels.squeeze().to_numpy()
Z_validate = mutations['v'].to_numpy()

# Arrays are written back to back in the order labels, features, mutations.
with open('data/ds-validation.npy', 'wb') as f:
    for array in (Y_validate, X_validate, Z_validate):
        numpy.save(f, array)

..and again label the mutations present in the `validation` dataset

In [67]:
# Join the validation phenotypes on the mutation index, encode them into a
# 'validation' column, then remove the temporary 'phenotype' column.
df = df.set_index('mutation')
validation_phenotype = df[['phenotype']]
suspectpza = suspectpza.join(validation_phenotype, how='outer')
suspectpza['validation'] = suspectpza['phenotype'].map(mapping_dict)
suspectpza = suspectpza.drop(columns=['phenotype'])
suspectpza

Unnamed: 0_level_0,prediction,train,test,validation
mutation,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
A102E,1,,,
A102G,0,,,
A102P,0,,,
A102R,1,,,0.0
A102S,1,,,
...,...,...,...,...
Y99C,0,0.0,,
Y99D,0,,0.0,
Y99F,0,,0.0,
Y99H,0,,,


In [68]:
# Same column handling as for the train/test file.
df = pandas.read_csv('data/ds-mic-phen-features.csv')
excluded_columns = ['secondary_structure_codes', 'd_MW', 'phi', 'd_volume', 'd_Pi', 'n_hbond_acceptors', 'rasp_wt_nlf']
df = df.drop(columns=excluded_columns)
mutations['mic'] = df['mutation']
features = df.iloc[:, 3:]
labels = df['phenotype'].map({'S': 0, 'R': 1})

# transform() only: the pipeline is not refitted on this dataset.
X_mic = pipe.transform(features)
Y_mic = labels.squeeze().to_numpy()
Z_mic = mutations['mic'].to_numpy()

# Arrays are written back to back in the order labels, features, mutations.
with open('data/ds-mic.npy', 'wb') as f:
    for array in (Y_mic, X_mic, Z_mic):
        numpy.save(f, array)

In [69]:
# Join the MIC phenotypes on the mutation index and encode them into 'mic'.
df = df.set_index('mutation')
mic_phenotype = df[['phenotype']]
suspectpza = suspectpza.join(mic_phenotype, how='outer')
suspectpza['mic'] = suspectpza['phenotype'].map(mapping_dict)
# NOTE(review): the joined 'phenotype' column is left in place here, whereas
# the validation cell drops it after mapping — confirm whether that is intended.
suspectpza.head(3)

Unnamed: 0_level_0,prediction,train,test,validation,phenotype,mic
mutation,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
A102E,1,,,,,
A102G,0,,,,,
A102P,0,,,,,


In [70]:
# The number of non-null entries in each set column must equal the length of
# the label array saved for that set.
expected_sizes = [
    ('mic', Y_mic),
    ('test', Y_test),
    ('train', Y_train),
    ('validation', Y_validate),
]
for column, saved_labels in expected_sizes:
    assert len(saved_labels) == suspectpza[column].notna().sum()


In [71]:
# Display the joined table with the per-set label columns added so far.
suspectpza

Unnamed: 0_level_0,prediction,train,test,validation,phenotype,mic
mutation,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
A102E,1,,,,,
A102G,0,,,,,
A102P,0,,,,,
A102R,1,,,0.0,,
A102S,1,,,,,
...,...,...,...,...,...,...
Y99C,0,0.0,,,,
Y99D,0,,0.0,,,
Y99F,0,,0.0,,,
Y99H,0,,,,,


In [72]:
# Keep only the mutations that have a label in at least one of the four sets,
# i.e. drop rows where all four set columns are missing.
set_columns = ['test', 'train', 'validation', 'mic']
suspectpza = suspectpza.dropna(subset=set_columns, how='all')
suspectpza.to_csv('data/reference/ds-suspectpza-results-sets.csv')

In [73]:
# Mutation names of the rows that carry a test-set label.
in_test_set = suspectpza['test'].notna()
suspectpza.loc[in_test_set].index.to_numpy()

array(['A143D', 'A143V', 'A146E', 'A161T', 'A178D', 'A20E', 'A20S',
       'A20T', 'A28P', 'A30D', 'A30P', 'A30S', 'A36S', 'A36V', 'A38S',
       'A39S', 'A3V', 'A46E', 'A46S', 'A79S', 'A79T', 'A89P', 'A92S',
       'C138S', 'C184Y', 'D110N', 'D110V', 'D129E', 'D12E', 'D158N',
       'D166V', 'D49E', 'D49G', 'D49Y', 'D56N', 'D63N', 'D80N', 'D86E',
       'D86N', 'D8E', 'D8G', 'D8H', 'E107Q', 'E127D', 'E127K', 'E144G',
       'E15D', 'E15G', 'E15Q', 'E173K', 'E173V', 'E174G', 'E181K', 'E91D',
       'F13I', 'F13V', 'F13Y', 'F50Y', 'F58I', 'F58L', 'F58Y', 'F81S',
       'F94C', 'G105S', 'G108R', 'G124A', 'G124C', 'G124D', 'G132R',
       'G16S', 'G16V', 'G17S', 'G17V', 'G23A', 'G23V', 'G24D', 'G60D',
       'G75D', 'G75V', 'G78D', 'G97D', 'G97V', 'H137Q', 'H43P', 'H43Y',
       'H57P', 'H57Q', 'H57R', 'H71D', 'H71N', 'H71P', 'H82L', 'H82R',
       'I133N', 'I31N', 'I52N', 'I6F', 'I90F', 'I90S', 'I90T', 'K48T',
       'K96Q', 'K96T', 'L116P', 'L116R', 'L120R', 'L156P', 'L159Q',
       'L1

In [74]:
# For every set, write three arrays back to back into one file: the set's
# labels, the Suspect-PZA predictions for the same rows, and the mutation names.
for i in ['test', 'train', 'validation', 'mic']:
    in_set = suspectpza[suspectpza[i].notna()]
    Y = in_set[i].squeeze().to_numpy()
    Z = in_set.prediction.squeeze().to_numpy()
    ZZ = in_set.index.to_numpy()
    with open('data/suspectpza-'+i+'.npy', 'wb') as f:
        for array in (Y, Z, ZZ):
            numpy.save(f, array)

In [75]:
# Number of rows per encoded label in the validation column.
suspectpza['validation'].value_counts()

1.0    155
0.0     44
Name: validation, dtype: int64