In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder

# PREDICTING STATE

In [2]:
df = pd.read_csv('data/364_interaction_energies_state_function_v3.txt', sep='\t')

#drop rows with 'other' state
df = df.drop(df[df['State'] == 'Other'].index)

#drop index
df.drop('index', axis = 1, inplace=True)

#copy df to structure_df
structure_df = df.copy()
structure_df.drop(['PDBID','Function'], axis = 1, inplace = True)

In [3]:
structure_df['State'].value_counts()

Inactive        213
Active          121
Intermediate     29
Name: State, dtype: int64

In [4]:
# get columns with 'sum' in their name
sum_cols = [col for col in structure_df.columns if 'sum' in col]

# create empty list for residue numbers
resnums = []

# loop through sum columns and count interactions that don't have nonzero energies
for col in sum_cols:
    resnum = col[:4]
    resnums.append(resnum)

# drop columns from df in which > 10% of entries are NaN
for resnum in resnums:
    intenergysum_col = resnum + '_intenergysum'
    inttype1_col = resnum + '_inttype1'
    intenergy1_col = resnum + '_intenergy1'
    inttype2_col = resnum + '_inttype2'
    intenergy2_col = resnum + '_intenergy2'
    
    print('structures with interactions at position', resnum, ':', structure_df[intenergysum_col][structure_df[intenergysum_col] != 0].value_counts().sum())
    if structure_df[intenergysum_col][structure_df[intenergysum_col] != 0].value_counts().sum() < 10:
        structure_df.drop([intenergysum_col, inttype1_col, intenergy1_col, inttype2_col, intenergy2_col], axis = 1, inplace = True)
        print('dropped columns for residue: ', resnum, '\n')

#     # for intenergysum columns
#     if 'intenergysum' in col:
#         if df[col][df[col] != 0.0].value_counts().sum() < (0.20 * len(df)):
#             df.drop([col], axis = 1, inplace = True)
#             print('dropped column: ', col)

structures with interactions at position 1.21 : 0
dropped columns for residue:  1.21 

structures with interactions at position 1.22 : 0
dropped columns for residue:  1.22 

structures with interactions at position 1.23 : 0
dropped columns for residue:  1.23 

structures with interactions at position 1.24 : 0
dropped columns for residue:  1.24 

structures with interactions at position 1.25 : 0
dropped columns for residue:  1.25 

structures with interactions at position 1.26 : 0
dropped columns for residue:  1.26 

structures with interactions at position 1.27 : 1
dropped columns for residue:  1.27 

structures with interactions at position 1.28 : 0
dropped columns for residue:  1.28 

structures with interactions at position 1.29 : 0
dropped columns for residue:  1.29 

structures with interactions at position 1.30 : 2
dropped columns for residue:  1.30 

structures with interactions at position 1.31 : 5
dropped columns for residue:  1.31 

structures with interactions at position 1.

dropped columns for residue:  3.61 

structures with interactions at position 3.62 : 0
dropped columns for residue:  3.62 

structures with interactions at position 3.63 : 0
dropped columns for residue:  3.63 

structures with interactions at position 3.64 : 0
dropped columns for residue:  3.64 

structures with interactions at position 3.65 : 0
dropped columns for residue:  3.65 

structures with interactions at position 3.66 : 0
dropped columns for residue:  3.66 

structures with interactions at position 3.67 : 0
dropped columns for residue:  3.67 

structures with interactions at position 3.68 : 0
dropped columns for residue:  3.68 

structures with interactions at position 3.69 : 0
dropped columns for residue:  3.69 

structures with interactions at position 3.70 : 0
dropped columns for residue:  3.70 

structures with interactions at position 3.71 : 0
dropped columns for residue:  3.71 

structures with interactions at position 4.29 : 0
dropped columns for residue:  4.29 

struct

structures with interactions at position 5.87 : 0
dropped columns for residue:  5.87 

structures with interactions at position 6.15 : 0
dropped columns for residue:  6.15 

structures with interactions at position 6.16 : 0
dropped columns for residue:  6.16 

structures with interactions at position 6.17 : 0
dropped columns for residue:  6.17 

structures with interactions at position 6.18 : 0
dropped columns for residue:  6.18 

structures with interactions at position 6.19 : 0
dropped columns for residue:  6.19 

structures with interactions at position 6.20 : 0
dropped columns for residue:  6.20 

structures with interactions at position 6.21 : 0
dropped columns for residue:  6.21 

structures with interactions at position 6.22 : 0
dropped columns for residue:  6.22 

structures with interactions at position 6.23 : 0
dropped columns for residue:  6.23 

structures with interactions at position 6.24 : 0
dropped columns for residue:  6.24 

structures with interactions at position 6.

In [5]:
structure_df

Unnamed: 0,State,1.35_intenergysum,1.35_inttype1,1.35_intenergy1,1.35_inttype2,1.35_intenergy2,1.39_intenergysum,1.39_inttype1,1.39_intenergy1,1.39_inttype2,...,7.42_intenergysum,7.42_inttype1,7.42_intenergy1,7.42_inttype2,7.42_intenergy2,7.43_intenergysum,7.43_inttype1,7.43_intenergy1,7.43_inttype2,7.43_intenergy2
0,Active,0.0,,0.0,,0.0,0.0,,0.0,,...,0.0,,0.0,,0.0,0.0,,0.0,,0.0
1,Active,0.0,,0.0,,0.0,0.0,,0.0,,...,0.0,,0.0,,0.0,0.0,,0.0,,0.0
2,Active,0.0,,0.0,,0.0,0.0,,0.0,,...,0.0,,0.0,,0.0,0.0,,0.0,,0.0
3,Active,0.0,,0.0,,0.0,0.0,,0.0,,...,0.0,,0.0,,0.0,0.0,,0.0,,0.0
4,Active,0.0,,0.0,,0.0,0.0,,0.0,,...,-0.1,Hbond,-0.1,,0.0,-0.6,Hbond,-0.5,Hbond,-0.1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
359,Active,0.0,,0.0,,0.0,0.0,,0.0,,...,0.0,,0.0,,0.0,-0.1,Arene,-0.1,,
360,Inactive,0.0,,0.0,,0.0,0.0,,0.0,,...,0.0,,0.0,,0.0,0.0,,0.0,,0.0
361,Inactive,0.0,,0.0,,0.0,0.0,,0.0,,...,0.0,,0.0,,0.0,-0.2,Arene,-0.2,,
362,Intermediate,0.0,Distance,0.0,Distance,0.0,-5.2,Hbond,-5.2,,...,0.0,,0.0,,0.0,0.0,,0.0,,0.0


In [6]:
actual_states = structure_df['State']

In [7]:
# label encoding

# create instance of labelencoder
labelencoder = LabelEncoder()

cols = [col for col in structure_df.columns if 'type' in col]

# loop though all columns and convert strings to categorical integer variables
for col in cols:
    structure_df[col] = labelencoder.fit_transform(structure_df[col])


# encode states as integers
# get columns with 'type' in their name
cols = [col for col in structure_df.columns if 'State' in col]

# loop though all columns and convert strings to categorical integer variables
for col in cols:
    structure_df[col] = labelencoder.fit_transform(structure_df[col])

In [8]:
labelencoder.classes_

array(['Active', 'Inactive', 'Intermediate'], dtype=object)

In [9]:
# assign target classes to y
y = structure_df['State']

# assign data to X
X = structure_df.drop(['State'], axis = 1)

# create actual_state column with non-encoded states
X['actual_state'] = actual_states

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [11]:
X_train['actual_state'].value_counts().sum()

272

In [12]:
def scale_impute(dataframe):
    # get colnames
    colnames = list(dataframe.drop(['actual_state'], axis = 1).columns)
    state_df = dataframe['actual_state']
    #state_df.reset_index(inplace=True)
    df = dataframe.drop(['actual_state'], axis = 1)

    # impute data
    from sklearn.impute import SimpleImputer
    my_imputer = SimpleImputer()
    df_imputed = pd.DataFrame(my_imputer.fit_transform(df))

    # scale data
    scaler = StandardScaler()
    to_scale = [col for col in df_imputed.columns.values]
    scaler.fit(df_imputed[to_scale])

    # predict z-scores on the test set
    df_imputed[to_scale] = scaler.transform(df_imputed[to_scale]) 

    # #rename columns
    df_imputed.columns = colnames

    # display scaled values
    display(df_imputed)
    
    return(df_imputed, state_df)

In [13]:
(X_train_imputed, X_train_states) = scale_impute(X_train)
(X_test_imputed, X_test_states) = scale_impute(X_test)

Unnamed: 0,1.35_intenergysum,1.35_inttype1,1.35_intenergy1,1.35_inttype2,1.35_intenergy2,1.39_intenergysum,1.39_inttype1,1.39_intenergy1,1.39_inttype2,1.39_intenergy2,...,7.42_intenergysum,7.42_inttype1,7.42_intenergy1,7.42_inttype2,7.42_intenergy2,7.43_intenergysum,7.43_inttype1,7.43_intenergy1,7.43_inttype2,7.43_intenergy2
0,0.167637,0.135135,0.164854,-0.075059,0.092924,0.148474,0.12943,0.144143,-0.081957,0.082245,...,0.279979,0.435272,0.24648,0.058075,0.210169,0.047721,-2.512331,0.010820,1.215579,0.00000
1,0.167637,0.135135,0.164854,-0.075059,0.092924,0.148474,0.12943,0.144143,-0.081957,0.082245,...,-0.083056,-1.186565,-0.17628,2.032618,0.000000,0.150508,0.533824,0.161744,0.021942,0.14679
2,0.167637,0.135135,0.164854,-0.075059,0.092924,0.148474,0.12943,0.144143,-0.081957,0.082245,...,0.279979,0.435272,0.24648,0.058075,0.210169,0.150508,0.533824,0.161744,0.021942,0.14679
3,0.167637,0.135135,0.164854,-0.075059,0.092924,0.148474,0.12943,0.144143,-0.081957,0.082245,...,0.279979,0.435272,0.24648,0.058075,0.210169,0.047721,-0.481561,0.010820,1.215579,0.00000
4,0.167637,0.135135,0.164854,-0.075059,0.092924,0.148474,0.12943,0.144143,-0.081957,0.082245,...,0.279979,0.435272,0.24648,0.058075,0.210169,0.150508,0.533824,0.161744,0.021942,0.14679
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
267,0.167637,0.135135,0.164854,-0.075059,0.092924,0.148474,0.12943,0.144143,-0.081957,0.082245,...,0.279979,0.435272,0.24648,0.058075,0.210169,0.150508,0.533824,0.161744,0.021942,0.14679
268,0.167637,0.135135,0.164854,-0.075059,0.092924,0.148474,0.12943,0.144143,-0.081957,0.082245,...,0.279979,0.435272,0.24648,0.058075,0.210169,0.150508,0.533824,0.161744,0.021942,0.14679
269,0.167637,0.135135,0.164854,-0.075059,0.092924,0.148474,0.12943,0.144143,-0.081957,0.082245,...,0.279979,0.435272,0.24648,0.058075,0.210169,0.150508,0.533824,0.161744,0.021942,0.14679
270,0.167637,0.135135,0.164854,-0.075059,0.092924,0.148474,0.12943,0.144143,-0.081957,0.082245,...,0.279979,0.435272,0.24648,0.058075,0.210169,0.150508,0.533824,0.161744,0.021942,0.14679


Unnamed: 0,1.35_intenergysum,1.35_inttype1,1.35_intenergy1,1.35_inttype2,1.35_intenergy2,1.39_intenergysum,1.39_inttype1,1.39_intenergy1,1.39_inttype2,1.39_intenergy2,...,7.42_intenergysum,7.42_inttype1,7.42_intenergy1,7.42_inttype2,7.42_intenergy2,7.43_intenergysum,7.43_inttype1,7.43_intenergy1,7.43_inttype2,7.43_intenergy2
0,0.110284,0.068789,0.110284,-0.241121,0.0,0.125085,0.226871,0.125085,-0.184637,0.0,...,0.220938,0.31046,0.229936,0.09932,0.18713,0.317042,0.528074,0.292855,0.204313,0.185323
1,0.110284,0.068789,0.110284,-0.241121,0.0,0.125085,-5.671783,0.125085,-0.184637,0.0,...,0.220938,0.31046,0.229936,0.09932,0.18713,0.317042,0.528074,0.292855,0.204313,0.185323
2,0.110284,0.068789,0.110284,-0.241121,0.0,0.125085,0.226871,0.125085,-0.184637,0.0,...,0.220938,0.31046,0.229936,0.09932,0.18713,-3.381779,-2.414051,-4.301933,1.182864,0.000000
3,0.110284,0.068789,0.110284,-0.241121,0.0,0.125085,0.226871,0.125085,-0.184637,0.0,...,0.220938,0.31046,0.229936,0.09932,0.18713,0.317042,0.528074,0.292855,0.204313,0.185323
4,0.110284,0.068789,0.110284,-0.241121,0.0,0.125085,0.226871,0.125085,-0.184637,0.0,...,0.220938,0.31046,0.229936,0.09932,0.18713,0.317042,0.528074,0.292855,0.204313,0.185323
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
86,0.110284,0.068789,0.110284,-0.241121,0.0,0.125085,0.226871,0.125085,-0.184637,0.0,...,0.220938,0.31046,0.229936,0.09932,0.18713,0.317042,-1.433343,0.292855,0.204313,0.185323
87,0.110284,0.068789,0.110284,-0.241121,0.0,0.125085,0.226871,0.125085,-0.184637,0.0,...,0.220938,0.31046,0.229936,0.09932,0.18713,0.317042,0.528074,0.292855,0.204313,0.185323
88,0.110284,0.068789,0.110284,-0.241121,0.0,0.125085,0.226871,0.125085,-0.184637,0.0,...,0.220938,0.31046,0.229936,0.09932,0.18713,0.317042,0.528074,0.292855,0.204313,0.185323
89,0.110284,0.068789,0.110284,-0.241121,0.0,0.125085,0.226871,0.125085,-0.184637,0.0,...,0.220938,0.31046,0.229936,0.09932,0.18713,0.317042,0.528074,0.292855,0.204313,0.185323


In [14]:
X_train['actual_state']

60           Active
196          Active
304        Inactive
132    Intermediate
221        Inactive
           ...     
71         Inactive
106    Intermediate
271        Inactive
349        Inactive
102        Inactive
Name: actual_state, Length: 272, dtype: object

In [15]:
X_train_states = X_train_states.reset_index()
X_train_states.drop(['index'], axis = 1, inplace = True)

X_test_states = X_test_states.reset_index()
X_test_states.drop(['index'], axis = 1, inplace = True)

In [16]:
#Import Random Forest Model
from sklearn.ensemble import RandomForestClassifier

#Create a Gaussian Classifier
clf=RandomForestClassifier(n_estimators=100, random_state=1, class_weight = 'balanced_subsample')

#Train the model using the training sets y_pred=clf.predict(X_test)
clf.fit(X_train_imputed,y_train)

RandomForestClassifier(class_weight='balanced_subsample', random_state=1)

In [17]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score, KFold
import numpy as np

# cross-validation
scores = cross_val_score(clf, X_train_imputed, y_train, cv=5)
print("Mean cross-validation score: %.2f" % scores.mean())

# k-fold CV
kfold = KFold(n_splits=10, shuffle=True, random_state = 1)
kf_cv_scores = cross_val_score(clf, X_train_imputed, y_train, cv=kfold )
print("K-fold CV average score: %.2f" % kf_cv_scores.mean())

Mean cross-validation score: 0.75
K-fold CV average score: 0.77


In [18]:
cross_val_score(clf, X_train_imputed, y_train, cv=5)

array([0.70909091, 0.83636364, 0.74074074, 0.75925926, 0.7037037 ])

In [19]:
y_pred=clf.predict(X_test_imputed)

#Import scikit-learn metrics module for accuracy calculation
from sklearn import metrics

# reverse label encoding
y_pred_actual = labelencoder.inverse_transform(y_pred)
y_test_actual = labelencoder.inverse_transform(y_test)

data = {'y_Actual':    y_test_actual,
        'y_Predicted': y_pred_actual
        }

df = pd.DataFrame(data, columns=['y_Actual','y_Predicted'])

confusion_matrix = pd.crosstab(df['y_Actual'], df['y_Predicted'], rownames=['Actual'], colnames=['Predicted'])
print (confusion_matrix, '\n')

# Model Accuracy, how often is the classifier correct?
acc = metrics.accuracy_score(y_test, y_pred)
precision = metrics.precision_score(y_test, y_pred, average = 'weighted', labels=np.unique(y_pred))
recall = metrics.recall_score(y_test, y_pred, average = 'weighted', labels=np.unique(y_pred))
print("Accuracy:","{:.2f}".format(acc))
print("Precision:","{:.2f}".format(precision))
print("Recall:","{:.2f}".format(recall), '\n')

Predicted     Active  Inactive  Intermediate
Actual                                      
Active            21         6             0
Inactive           5        55             0
Intermediate       1         2             1 

Accuracy: 0.85
Precision: 0.85
Recall: 0.85 



## XGBoost

In [371]:
import xgboost as xgb

xgbc = xgb.XGBClassifier(use_label_encoder=False,
                         eval_metric='mlogloss',
                         n_estimators=500,
                         random_state = 1,
                         learning_rate = 0.05
                        )

In [372]:
xgbc

XGBClassifier(base_score=None, booster=None, colsample_bylevel=None,
              colsample_bynode=None, colsample_bytree=None,
              enable_categorical=False, eval_metric='mlogloss', gamma=None,
              gpu_id=None, importance_type=None, interaction_constraints=None,
              learning_rate=0.05, max_delta_step=None, max_depth=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              n_estimators=500, n_jobs=None, num_parallel_tree=None,
              predictor=None, random_state=1, reg_alpha=None, reg_lambda=None,
              scale_pos_weight=None, subsample=None, tree_method=None,
              use_label_encoder=False, validate_parameters=None,
              verbosity=None)

In [373]:
xgbc.fit(X_train_imputed, y_train)

from sklearn.model_selection import cross_val_score, KFold

# cross-validation
scores = cross_val_score(xgbc, X_train_imputed, y_train, cv=5)
print("Mean cross-validation score: %.2f" % scores.mean())

# k-fold CV
kfold = KFold(n_splits=10, shuffle=True)
kf_cv_scores = cross_val_score(xgbc, X_train_imputed, y_train, cv=kfold )
print("K-fold CV average score: %.2f" % kf_cv_scores.mean())

from sklearn.metrics import confusion_matrix

y_pred = xgbc.predict(X_test_imputed)

print("Accuracy:",metrics.accuracy_score(y_test, y_pred), '\n')


# reverse label encoding
y_pred_actual = labelencoder.inverse_transform(y_pred)
y_test_actual = labelencoder.inverse_transform(y_test)

data = {'y_Actual':    y_test_actual,
        'y_Predicted': y_pred_actual
        }

df = pd.DataFrame(data, columns=['y_Actual','y_Predicted'])

confusion_matrix = pd.crosstab(df['y_Actual'], df['y_Predicted'], rownames=['Actual'], colnames=['Predicted'])
print (confusion_matrix)

Mean cross-validation score: 0.76
K-fold CV average score: 0.76
Accuracy: 0.7582417582417582 

Predicted     Active  Inactive  Intermediate
Actual                                      
Active            17         9             1
Inactive           8        51             1
Intermediate       2         1             1


In [449]:
len(X_train)

272