In [9]:
import pandas, numpy, joblib

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin

import sbmlcore

In [10]:
# Seed handed to train_test_split so the train/test partition is reproducible.
random_state = 42

# Mutation identifiers of each dataset, stored under a short dataset key.
mutations = dict()

# Integer code assigned to each phenotype letter.
mapping_dict = dict(S=0, R=1, U=2)

Let's read in the train/test dataset so we can split it into training and test sets. This time we drop the six features that the univariate logistic regression showed to be not much better than random.

In [11]:
# Read the combined train/test table and discard the six excluded features
# in a single expression.
df = pandas.read_csv('data/ds-traintest-phen-features.csv').drop(
    columns=[
        'secondary_structure_codes',
        'd_MW',
        'phi',
        'd_volume',
        'd_Pi',
        'n_hbond_acceptors',
    ]
)

# Keep the mutation names so they can be split alongside the features.
mutations['tt'] = df['mutation']

# Only S and R are encoded here; any other phenotype value becomes NaN.
labels = df['phenotype'].map({'S':0, 'R':1})

# Everything from the fourth column onwards is treated as a feature
# (presumably the first three are mutation, segid and phenotype -- TODO confirm).
features = df.iloc[:, 3:]
features.head(3)

Unnamed: 0,d_hydropathy_KD,d_rogov,psi,residue_sasa,n_hbond_donors,temp_factor,depth,deep_ddG,rasp_score_ml,snap2_score,dist_FE2,dist_PZA,mcsm_stability_rsa,mcsm_stability_ddG,mapp_score
0,1.9,-0.389,-35.2,86.2,0.0,59.720001,2.103049,-0.77,-0.107405,20,29.948318,27.641663,54.8,-0.771,13.49
1,-2.6,-0.548,-35.2,86.2,0.0,59.720001,2.103049,-1.096,0.655765,52,29.948318,27.641663,54.8,-0.367,25.97
2,2.3,-0.514,-35.2,86.2,0.0,59.720001,2.103049,-0.908,0.364061,4,29.948318,27.641663,54.8,-0.912,30.01


In [12]:
# Split the mutation names, features and labels in one call so the three stay
# row-aligned; 30% of the rows are held out as the test set.
(
    mutations_train, mutations_test,
    X_train, X_test,
    Y_train, Y_test,
) = train_test_split(
    mutations['tt'],
    features,
    labels,
    test_size=0.3,
    random_state=random_state,
)

class DataFrameSelector(BaseEstimator, TransformerMixin):
    """Pipeline step that picks a fixed list of columns out of a DataFrame.

    The selected columns are handed on as the array returned by ``.values``,
    so later steps receive an array rather than a DataFrame.
    """

    def __init__(self, attribute_names):
        # Column labels to select, in the order they should appear in the output.
        self.attribute_names = attribute_names

    def fit(self, X, y=None):
        """Learn nothing from the data; return the transformer unchanged."""
        return self

    def transform(self, X):
        """Return the ``attribute_names`` columns of ``X`` via ``.values``."""
        selected = X[self.attribute_names]
        return selected.values

# Every column of `features` goes through the pipeline, so the column names
# double as the list of attributes to select.
numerical_attribs = list(features)

headings = '\n'.join(numerical_attribs)

# Record the attribute names, one per line. The context manager guarantees the
# file is closed even if the write raises.
with open('data/ds-traintest-numericalattribs.txt', 'w') as OUTPUT:
    OUTPUT.write(headings)

# The pipeline is built and fitted here. It must not be loaded from
# 'data/pipeline.pkl' first: that file is only created by the joblib.dump
# below, so loading it would fail on a fresh run and the loaded object would be
# discarded immediately anyway.
pipe = Pipeline([
    ('selector', DataFrameSelector(numerical_attribs)),
    ('scaling', RobustScaler())
    ])

# Fit the scaler on the training rows only.
# Series.to_numpy() is called directly: a preceding .squeeze() would collapse a
# one-row Series to a scalar, which has no .to_numpy().
X_train = pipe.fit_transform(X_train)
Y_train = Y_train.to_numpy()
Z_train = mutations_train.to_numpy()

# let's save the pipeline to preserve the scaling etc
joblib.dump(pipe, 'data/pipeline.pkl')

# The test rows are transformed with the scaling learned from the training rows.
X_test = pipe.transform(X_test)
Y_test = Y_test.to_numpy()
Z_test = mutations_test.to_numpy()

# Three arrays are written back-to-back into each file, in the order
# labels, features, mutation names; they have to be read back with three
# successive numpy.load calls on the same open file.
with open('data/ds-train.npy', 'wb') as f:
    numpy.save(f, Y_train)
    numpy.save(f, X_train)
    numpy.save(f, Z_train)

with open('data/ds-test.npy', 'wb') as f:
    numpy.save(f, Y_test)
    numpy.save(f, X_test)
    numpy.save(f, Z_test)

# Tag each row of the source table with the partition it ended up in and save
# the annotated table. Membership is decided by mutation name, with 'train'
# written last.
df.loc[df.mutation.isin(mutations_test), 'set'] = 'test'
df.loc[df.mutation.isin(mutations_train), 'set'] = 'train'
df.to_csv('data/ds-traintest-phen-features-set.csv',index=False)

Let's check that the percentage of resistant samples is about the same in the train and test datasets.

In [13]:
# Count phenotypes per partition, with row and column totals.
pandas.crosstab(index=df['set'], columns=df['phenotype'], margins=True)

phenotype,R,S,All
set,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
test,103,97,200
train,246,218,464
All,349,315,664


Now let's work out what Suspect-PZA predicts for the training and test datasets, for analysis later.

In [14]:
def create_mutation(row):
    """Build the mutation label for one Suspect-PZA row.

    The label is the wild-type residue, the position and the mutant residue
    concatenated, with both residues translated through
    ``sbmlcore.amino_acid_3to1letter``.
    """
    to_one_letter = sbmlcore.amino_acid_3to1letter
    wild_type = to_one_letter[row['Wild Type']]
    position = str(row['Position'])
    mutant = to_one_letter[row['Mutant']]
    return wild_type + position + mutant


suspectpza = pandas.read_csv('data/reference/ds-suspectpza-results.csv')

# Index the table by mutation label so it can be joined to the other datasets.
suspectpza['mutation'] = suspectpza.apply(create_mutation, axis=1)
suspectpza = suspectpza.set_index('mutation')

# The column name really does start with a space. Only the first character of
# each prediction is looked up in mapping_dict.
suspectpza['prediction'] = suspectpza[' Prediction'].str[0].map(mapping_dict)

# Keep just the encoded prediction.
suspectpza = suspectpza.drop(
    columns=['Chain','Wild Type','Position','Mutant','Distance to ligand',' Prediction']
)
suspectpza

Unnamed: 0_level_0,prediction
mutation,Unnamed: 1_level_1
C72G,1
D129G,1
T167N,0
L85P,1
V139L,1
...,...
I31T,1
D8Y,1
S84I,0
R154G,0


In [15]:
# Key the train/test table by mutation so it lines up with the Suspect-PZA index.
df = df.set_index('mutation')

# The outer join keeps mutations that occur in only one of the two tables.
traintest_columns = df[['phenotype', 'set']]
suspectpza = suspectpza.join(traintest_columns, how='outer')

# Encode the observed phenotype with the shared integer mapping.
suspectpza['traintest'] = suspectpza['phenotype'].map(mapping_dict)

def split_testtrain(row):
    """Route a row's ``traintest`` value into a (train, test) pair.

    The value goes in the first slot when ``row.set`` is 'train', in the second
    slot when it is 'test', and in neither slot otherwise. The pair is returned
    as a two-element Series.
    """
    train_value = row.traintest if row.set == 'train' else None
    test_value = row.traintest if row.set == 'test' else None
    return pandas.Series([train_value, test_value])


# Spread the encoded phenotype over separate 'train' and 'test' columns, one
# row at a time.
split_columns = suspectpza.apply(split_testtrain, axis=1)
suspectpza[['train', 'test']] = split_columns

# The helper columns are no longer needed once the split columns exist.
suspectpza = suspectpza.drop(columns=['phenotype','set','traintest'])
suspectpza

Unnamed: 0_level_0,prediction,train,test
mutation,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
A102E,1,,
A102G,0,,
A102P,0,,
A102R,1,,
A102S,1,,
...,...,...,...
Y99C,0,0.0,
Y99D,0,,0.0
Y99F,0,,0.0
Y99H,0,,


Using the (fitted) pipeline, we can repeat the process for the two types of `validation` dataset, depending on whether it contains the individual clinical samples or the aggregated mutations.

In [19]:
# Process both validation tables the same way. The order matters: once the loop
# finishes, `df`, `features`, `labels` and the *_validate arrays hold the
# values from the last entry ('mutations').
for i in ['samples','mutations']:
    df = pandas.read_csv('data/ds-validation-' + i + '-phen-features.csv')

    # Drop the same six features that were removed from the train/test table.
    df = df.drop(columns=['secondary_structure_codes',
        'd_MW',
        'phi',
        'd_volume',
        'd_Pi',
        'n_hbond_acceptors'])

    mutations['v'] = df['mutation']
    features = df.iloc[:,3:]
    # Use the shared phenotype encoding rather than a second copy of the literal.
    labels = df['phenotype'].map(mapping_dict)

    # transform() only: the scaling fitted on the training data is reused.
    X_validate = pipe.transform(features)
    # Series.to_numpy() is called directly: a preceding .squeeze() would
    # collapse a one-row Series to a scalar, which has no .to_numpy().
    Y_validate = labels.to_numpy()
    Z_validate = mutations['v'].to_numpy()

    # Labels, features and mutation names are written back-to-back in one file.
    with open('data/ds-validation-' + i + '.npy', 'wb') as f:
        numpy.save(f, Y_validate)
        numpy.save(f, X_validate)
        numpy.save(f, Z_validate)

...and again label the mutations present in the `validation` dataset

In [20]:
# `df` is whatever the preceding loop left behind, i.e. the table read on its
# final pass.
df = df.set_index('mutation')

# Bring in the phenotype, encode it as its own column, then discard the raw
# letter column again.
suspectpza = suspectpza.join(df[['phenotype']], how='outer')
suspectpza['validation-mutations'] = suspectpza['phenotype'].map(mapping_dict)
suspectpza = suspectpza.drop(columns=['phenotype'])
suspectpza

Unnamed: 0_level_0,prediction,train,test,validation-mutations
mutation,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
A102E,1,,,
A102G,0,,,
A102P,0,,,
A102R,1,,,0.0
A102S,1,,,
...,...,...,...,...
Y99C,0,0.0,,
Y99D,0,,0.0,
Y99F,0,,0.0,
Y99H,0,,,


In [24]:
# Load the MIC table and drop the same six features that were removed from the
# train/test table.
df = pandas.read_csv('data/ds-mic-phen-features.csv').drop(
    columns=['secondary_structure_codes',
        'd_MW',
        'phi',
        'd_volume',
        'd_Pi',
        'n_hbond_acceptors'])


mutations['mic'] = df['mutation']
features = df.iloc[:,3:]
# Only S and R are encoded here; any other phenotype value becomes NaN.
labels = df['phenotype'].map({'S':0, 'R':1})

# transform() only: the scaling fitted on the training data is reused.
X_mic = pipe.transform(features)
# Series.to_numpy() is called directly: a preceding .squeeze() would collapse a
# one-row Series to a scalar, which has no .to_numpy().
Y_mic = labels.to_numpy()
Z_mic = mutations['mic'].to_numpy()

# Labels, features and mutation names are written back-to-back in one file.
with open('data/ds-mic.npy', 'wb') as f:
    numpy.save(f, Y_mic)
    numpy.save(f, X_mic)
    numpy.save(f, Z_mic)

In [26]:
# Key the MIC table by mutation, pull in its phenotype, encode it as the 'mic'
# column and discard the raw letter column again.
df = df.set_index('mutation')
suspectpza = suspectpza.join(df[['phenotype']], how='outer')
suspectpza['mic'] = suspectpza['phenotype'].map(mapping_dict)
suspectpza = suspectpza.drop(columns=['phenotype'])
suspectpza.head(3)

Unnamed: 0_level_0,prediction,train,test,validation-mutations,mic
mutation,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
A102E,1,,,,
A102G,0,,,,
A102P,0,,,,


In [27]:
# How many MIC rows carry each encoded phenotype (missing values are not counted).
suspectpza['mic'].value_counts()

mic
1.0    50
0.0     7
Name: count, dtype: int64

In [28]:
# Sanity check: each dataset column in `suspectpza` must hold exactly as many
# labels as the matching label array. `Y_validate` is the array left by the
# final pass of the validation loop, hence the comparison with
# 'validation-mutations'.
for column, expected in [
    ('mic', len(Y_mic)),
    ('test', len(Y_test)),
    ('train', len(Y_train)),
    ('validation-mutations', len(Y_validate)),
]:
    found = suspectpza[column].notna().sum()
    # Name the offending dataset and both counts so a failure is self-explanatory.
    assert expected == found, '{}: expected {} labelled rows, found {}'.format(column, expected, found)


In [29]:
# Display the table with the prediction and one encoded-phenotype column per dataset.
suspectpza

Unnamed: 0_level_0,prediction,train,test,validation-mutations,mic
mutation,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
A102E,1,,,,
A102G,0,,,,
A102P,0,,,,
A102R,1,,,0.0,
A102S,1,,,,
...,...,...,...,...,...
Y99C,0,0.0,,,
Y99D,0,,0.0,,
Y99F,0,,0.0,,
Y99H,0,,,,


In [30]:
# For every dataset, save the observed labels, the Suspect-PZA predictions and
# the mutation names of the rows that belong to it.
for i in ['test', 'train', 'validation-mutations', 'mic']:
    # A row belongs to dataset `i` when it has a label in column `i`; select
    # those rows once instead of recomputing the mask for each array.
    subset = suspectpza[suspectpza[i].notna()]
    # to_numpy() is called directly: a preceding .squeeze() would collapse a
    # one-row selection to a scalar, which has no .to_numpy().
    Y = subset[i].to_numpy()
    Z = subset.prediction.to_numpy()
    ZZ = subset.index.to_numpy()
    # The three arrays are written back-to-back into one file.
    with open('data/suspectpza-'+i+'.npy', 'wb') as f:
        numpy.save(f, Y)
        numpy.save(f, Z)
        numpy.save(f, ZZ)

In [31]:
# Reload the per-sample validation table, keyed by mutation.
df = pandas.read_csv('data/ds-validation-samples-phen-features.csv').set_index('mutation')
df.head(3)

Unnamed: 0_level_0,segid,phenotype,d_volume,d_hydropathy_KD,d_Pi,d_MW,d_rogov,phi,psi,residue_sasa,...,temp_factor,depth,deep_ddG,rasp_score_ml,snap2_score,dist_FE2,dist_PZA,mcsm_stability_rsa,mcsm_stability_ddG,mapp_score
mutation,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
M1I,A,R,3.8,2.6,0.28,-18.0,-0.452,0.0,-35.2,86.2,...,59.720001,2.103049,-0.407,0.076292,7,29.948318,27.641663,54.8,-0.771,21.99
M1I,A,S,3.8,2.6,0.28,-18.0,-0.452,0.0,-35.2,86.2,...,59.720001,2.103049,-0.407,0.076292,7,29.948318,27.641663,54.8,-0.771,21.99
M1I,A,S,3.8,2.6,0.28,-18.0,-0.452,0.0,-35.2,86.2,...,59.720001,2.103049,-0.407,0.076292,7,29.948318,27.641663,54.8,-0.771,21.99


In [34]:
# Attach the per-sample phenotypes. The same mutation can occur on several
# rows of `df`, and the join yields one output row per matching pair.
sample_phenotypes = df[['phenotype']]
suspectpza = suspectpza.join(sample_phenotypes, how='outer')

# Count the predictions, including any rows left without one.
suspectpza['prediction'].value_counts(dropna=False)

prediction
1    3513
0    1157
Name: count, dtype: int64

In [36]:
# Encode the per-sample phenotype with the shared integer mapping.
suspectpza['validation-samples'] = suspectpza['phenotype'].map(mapping_dict)
suspectpza.head(3)

In [38]:
# Save the observed labels, the Suspect-PZA predictions and the mutation names
# for the per-sample validation rows.
for i in ['validation-samples']:
    # A row belongs to dataset `i` when it has a label in column `i`; select
    # those rows once instead of recomputing the mask for each array.
    subset = suspectpza[suspectpza[i].notna()]
    # to_numpy() is called directly: a preceding .squeeze() would collapse a
    # one-row selection to a scalar, which has no .to_numpy().
    Y = subset[i].to_numpy()
    Z = subset.prediction.to_numpy()
    ZZ = subset.index.to_numpy()
    # The three arrays are written back-to-back into one file.
    with open('data/suspectpza-'+i+'.npy', 'wb') as f:
        numpy.save(f, Y)
        numpy.save(f, Z)
        numpy.save(f, ZZ)

In [None]:
# Keep a row only if it has a label in at least one of these four columns.
# NOTE(review): 'validation-samples' is not part of this filter -- confirm that
# is intended.
set_columns = ['test', 'train', 'validation-mutations', 'mic']
in_any_set = suspectpza[set_columns].notna().any(axis=1)
suspectpza = suspectpza[in_any_set]

suspectpza.to_csv('data/reference/ds-suspectpza-results-sets.csv')