In [26]:
import pandas, numpy

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin

In [15]:
# Fixed seed passed to train_test_split so the train/test partition is reproducible.
random_state = 42

In [16]:
# Load the combined table: one row per mutation with its phenotype label
# and the numerical descriptors.
df = pandas.read_csv('data/ds-traintest-phen-features.csv')

# Row identifiers, kept so the split membership can be traced back to `df`.
mutations = df['mutation']

# Binary target: phenotype 'S' -> 0, 'R' -> 1.
# NOTE(review): any other phenotype value silently becomes NaN — confirm the
# column only ever holds 'S' and 'R'.
labels = df['phenotype'].map({'S': 0, 'R': 1})

# Everything after the first three columns (mutation, segid, phenotype) is
# used as a feature.
features = df.iloc[:, 3:]

features.head(3)

Unnamed: 0,d_volume,d_hydropathy_KD,d_Pi,d_MW,d_rogov,phi,psi,residue_sasa,n_hbond_acceptors,n_hbond_donors,...,deep_ddG,rasp_wt_nlf,rasp_mt_nlf,rasp_score_ml_fermi,snap2_score,dist_FE2,dist_PZA,mcsm_stability_rsa,mcsm_stability_ddG,mapp_score
0,3.8,1.9,0.24,-18.0,-0.389,0.0,-35.2,86.2,2.0,0.0,...,-0.77,3.785957,2.433114,0.223921,20,29.948318,27.641663,54.8,-0.771,13.49
1,-46.8,-2.6,-0.14,-30.1,-0.548,0.0,-35.2,86.2,2.0,0.0,...,-1.096,3.785957,2.896883,0.281366,52,29.948318,27.641663,54.8,-0.367,25.97
2,-22.9,2.3,0.22,-32.1,-0.514,0.0,-35.2,86.2,2.0,0.0,...,-0.908,3.785957,2.623741,0.258386,4,29.948318,27.641663,54.8,-0.912,30.01


In [19]:
# Hold out 30% of the rows for testing. The mutation names, feature rows and
# labels are split together so the three stay aligned row-for-row.
(
    mutations_train, mutations_test,
    X_train, X_test,
    Y_train, Y_test,
) = train_test_split(
    mutations,
    features,
    labels,
    test_size=0.3,
    random_state=random_state,
)

class DataFrameSelector(BaseEstimator, TransformerMixin):
    """Pipeline step that picks columns out of a DataFrame and returns an array.

    Parameters
    ----------
    attribute_names : list
        Column labels to select from the frame passed to ``transform``.
    """

    def __init__(self, attribute_names):
        self.attribute_names = attribute_names

    def fit(self, X, y=None):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def transform(self, X):
        """Return the selected columns of ``X`` as a NumPy array."""
        selected = X[self.attribute_names]
        return selected.values

# Every column of `features` is fed through the numerical pipeline.
numerical_attribs = list(features)

# Column selection followed by robust scaling.
pipe = Pipeline([
    ('selector', DataFrameSelector(numerical_attribs)),
    ('scaling', RobustScaler()),
])

# Whole-dataset version, scaled with statistics from all rows. It is computed
# first so that `pipe` ends up fitted on the training split below.
# NOTE(review): X_prepared has seen the test rows during scaling — do not use
# it for held-out evaluation.
X_prepared = pipe.fit_transform(features)
Y_prepared = labels.squeeze().to_numpy()

# Fit the scaler on the training split only.
X_train_prepared = pipe.fit_transform(X_train)
Y_train_prepared = Y_train.squeeze().to_numpy()

# Apply the *training* scaling parameters to the test split. Re-fitting here
# (fit_transform) would scale the test rows with their own statistics,
# making them inconsistent with the training features and leaving `pipe`
# fitted on test data.
X_test_prepared = pipe.transform(X_test)
Y_test_prepared = Y_test.squeeze().to_numpy()

In [49]:
# One attribute name per line, in the column order fed to the pipeline.
headings = '\n'.join(numerical_attribs)
print(headings)

d_volume
d_hydropathy_KD
d_Pi
d_MW
d_rogov
phi
psi
residue_sasa
n_hbond_acceptors
n_hbond_donors
secondary_structure_codes
temp_factor
depth
deep_ddG
rasp_wt_nlf
rasp_mt_nlf
rasp_score_ml_fermi
snap2_score
dist_FE2
dist_PZA
mcsm_stability_rsa
mcsm_stability_ddG
mapp_score


In [50]:
# Persist the attribute names (one per line). The context manager closes the
# file even if the write raises.
with open('data/ds-traintest-numericalattribs.txt', 'w') as OUTPUT:
    OUTPUT.write(headings)

In [24]:
# Show the loaded table (identifier columns followed by the feature columns).
df

Unnamed: 0,mutation,segid,phenotype,d_volume,d_hydropathy_KD,d_Pi,d_MW,d_rogov,phi,psi,...,deep_ddG,rasp_wt_nlf,rasp_mt_nlf,rasp_score_ml_fermi,snap2_score,dist_FE2,dist_PZA,mcsm_stability_rsa,mcsm_stability_ddG,mapp_score
0,M1L,A,R,3.8,1.9,0.24,-18.0,-0.389,0.00,-35.20,...,-0.770,3.785957,2.433114,0.223921,20,29.948318,27.641663,54.8,-0.771,13.49
1,M1T,A,R,-46.8,-2.6,-0.14,-30.1,-0.548,0.00,-35.20,...,-1.096,3.785957,2.896883,0.281366,52,29.948318,27.641663,54.8,-0.367,25.97
2,M1V,A,S,-22.9,2.3,0.22,-32.1,-0.514,0.00,-35.20,...,-0.908,3.785957,2.623741,0.258386,4,29.948318,27.641663,54.8,-0.912,30.01
3,R2Q,A,S,-29.6,1.0,-5.11,-28.0,0.375,-117.94,133.99,...,-1.144,3.035567,3.338515,0.331465,-26,27.459823,24.722696,27.5,-0.790,2.68
4,A3E,A,R,49.8,-5.3,-2.78,58.0,-0.033,-126.53,153.85,...,-5.340,2.483982,2.712832,0.554393,56,24.149025,21.557456,0.0,-2.528,35.68
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
659,S185G,A,S,-28.9,0.4,0.29,-30.0,0.120,-73.76,0.00,...,-0.900,2.844502,2.560213,0.273806,69,27.305858,22.750537,12.8,-0.597,3.20
660,S185I,A,S,77.7,5.3,0.34,26.1,-0.026,-73.76,0.00,...,0.128,2.844502,2.854018,0.215031,62,27.305858,22.750537,12.8,-0.234,2.82
661,S185N,A,S,25.1,-2.7,-0.27,27.0,0.271,-73.76,0.00,...,-1.120,2.844502,3.127102,0.229262,53,27.305858,22.750537,12.8,-0.277,1.51
662,S185R,A,S,84.4,-3.7,5.08,69.1,0.127,-73.76,0.00,...,-0.811,2.844502,3.035567,0.274627,67,27.305858,22.750537,12.8,-0.124,2.78


In [37]:
# Record which split each row landed in. The split Series keep the row index
# of `df`, so rows are tagged by index rather than by mutation name: matching
# on the name would mislabel rows if a mutation string occurs more than once
# (the later 'train' assignment would overwrite 'test').
df.loc[mutations_test.index, 'set'] = 'test'
df.loc[mutations_train.index, 'set'] = 'train'

# Export the annotated table without the row index.
df.to_csv('data/ds-traintest-phen-features-set.csv', index=False)

In [38]:
# Each file holds two arrays written back to back: labels first, then the
# scaled feature matrix. Readers must call numpy.load twice on the same open
# handle, in that order.
for path, arrays in (
    ('data/ds-train.npy', (Y_train_prepared, X_train_prepared)),
    ('data/ds-test.npy', (Y_test_prepared, X_test_prepared)),
):
    with open(path, 'wb') as f:
        for array in arrays:
            numpy.save(f, array)