In [1]:
import os
import warnings
# Silence every warning raised from here on so the cell outputs stay uncluttered.
# NOTE(review): this also hides deprecation warnings from the libraries used below.
warnings.filterwarnings("ignore")

In [2]:
#import libraries
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn import model_selection
from scipy.stats import spearmanr
from scipy.stats import pearsonr
import math
from math import sqrt
import pandas as pd
import numpy as np

In [3]:
# Load the two feature matrices from the working directory:
#   df_TR - refined set, used for training and validation
#   df_TS - core set, used as the test set
df_TR, df_TS = (
    pd.read_csv(csv_file)
    for csv_file in ("REFINED-MATRIX_Int_Frag_3481.csv", "CORE-MATRIX_Int_Frag_180.csv")
)

In [4]:
# Dimensions of the training table as (rows, columns)
df_TR.shape

(3481, 2425)

In [5]:
# Dimensions of the test table as (rows, columns)
df_TS.shape

(180, 2425)

In [6]:
# Preview the first rows of the training table
df_TR.head()

Unnamed: 0,PDB_ID,Resolution,pKd,CN,CC,CO,CS,CH,HN,CCC,...,LYS_Aromatic_E/F,MET_Aromatic_E/F,PHE_Aromatic_E/F,PRO_Aromatic_E/F,SER_Aromatic_E/F,THR_Aromatic_E/F,TRP_Aromatic_E/F,TYR_Aromatic_E/F,VAL_Aromatic_E/F,GLU_Aromatic_E/F
0,184l,1.8,4.72,0,10,0,0,14,0,12,...,0,0,0,0,0,0,0,0,0,0
1,185l,1.8,3.54,2,8,0,0,6,1,9,...,0,0,0,0,0,0,0,0,0,0
2,186l,1.8,4.85,0,10,0,0,14,0,11,...,0,0,0,0,0,0,0,0,0,0
3,187l,1.8,3.37,0,8,0,0,10,0,10,...,0,0,0,0,0,0,0,0,0,0
4,188l,1.8,3.33,0,8,0,0,10,0,10,...,0,0,0,0,0,0,0,0,0,0


In [7]:
# Preview the first rows of the test table
df_TS.head()

Unnamed: 0,PDB_ID,Resolution,pKd,CN,CC,CO,CS,CH,HN,CCC,...,LYS_Aromatic_E/F,MET_Aromatic_E/F,PHE_Aromatic_E/F,PRO_Aromatic_E/F,SER_Aromatic_E/F,THR_Aromatic_E/F,TRP_Aromatic_E/F,TYR_Aromatic_E/F,VAL_Aromatic_E/F,GLU_Aromatic_E/F
0,10gs,2.2,6.4,5,21,6,2,21,5,21,...,0,0,0,0,0,0,0,0,0,0
1,1bcu,2.0,3.28,4,14,0,0,7,4,17,...,0,0,0,0,0,0,0,0,0,0
2,1e66,2.1,9.89,3,20,0,0,17,2,27,...,0,0,0,0,0,0,0,0,0,0
3,1f8b,1.8,5.4,2,9,9,0,11,1,7,...,0,0,0,0,0,0,0,0,0,0
4,1f8c,1.7,7.4,3,9,8,0,11,4,7,...,0,0,0,0,0,0,0,0,0,0


In [8]:
# Training set: separate the predictors from the regression target.
# NOTE(review): df_TR.head() lists an 'Unnamed: 0' column; if it is a real CSV
# column (e.g. a saved row index) it is not dropped here and stays in X_df_TR
# as a feature - confirm this is intended.
X_df_TR = df_TR.drop(columns=['PDB_ID', 'Resolution', 'pKd'])  # all remaining columns are used as features
y_df_TR = df_TR['pKd']  # target values

In [9]:
# Shapes of the training feature matrix and target vector
X_df_TR.shape, y_df_TR.shape

((3481, 2422), (3481,))

In [10]:
# Hold out 20% of the training table for validation; the fixed random_state
# makes the split reproducible across runs.
X_train, X_valid, Y_train, Y_valid = train_test_split(X_df_TR, y_df_TR, test_size=0.2, random_state=123456)

In [11]:
# Test set: separate the predictors from the regression target, using the
# same column selection as for the training table.
# NOTE(review): df_TS.head() lists an 'Unnamed: 0' column; if it is a real CSV
# column (e.g. a saved row index) it is not dropped here and stays in X_df_TS
# as a feature - confirm this is intended.
X_df_TS = df_TS.drop(columns=['PDB_ID', 'Resolution', 'pKd'])  # all remaining columns are used as features
y_df_TS = df_TS['pKd']  # target values

In [12]:
# Shapes of the test feature matrix and target vector
X_df_TS.shape, y_df_TS.shape

((180, 2422), (180,))

# Optimized parameters
## max_features = 'auto'
## n_estimators=100
## random_state = 1234

In [13]:
# Random forest regressor with the tuned settings (100 trees, fixed seed).
# Only the non-default arguments are passed. The previous call also spelled out
# `criterion='mse'`, `max_features='auto'` and `min_impurity_split=None`, which
# have been renamed or removed in newer scikit-learn releases and make the
# constructor/fit fail there. The library defaults select the same behaviour
# (squared-error splits, all features considered at every split), so the model
# definition is unchanged while the cell runs on both old and new versions.
models_RF_train = {
    "RF": RandomForestRegressor(
        n_estimators=100,   # kept explicit: the default differed in older releases
        random_state=1234,  # fixed seed for reproducible trees
    )
}

In [14]:
# Fit every model on the training split, then collect R^2, RMSE, MAE and the
# Pearson correlation for the training and validation splits (refined set).
scores = {}
for m, estimator in models_RF_train.items():
    estimator.fit(X_train, Y_train)

    # Predictions are kept as notebook-level names so they can be inspected later.
    Y_pred_train_rf = estimator.predict(X_train)
    Y_pred_valid_rf = estimator.predict(X_valid)

    scores.update({
        # Training split
        m + "_train_r2": estimator.score(X_train, Y_train),
        m + "_rmse_train": sqrt(mean_squared_error(Y_train, Y_pred_train_rf)),
        m + "_mae_train": mean_absolute_error(Y_train, Y_pred_train_rf),
        # pearsonr returns the coefficient together with its p-value, so this
        # entry is a pair rather than a single number.
        m + "_pcc_train": pearsonr(Y_train, Y_pred_train_rf),
        # Validation split
        m + "_valid_r2": r2_score(Y_valid, Y_pred_valid_rf),
        m + "_rmse_valid": sqrt(mean_squared_error(Y_valid, Y_pred_valid_rf)),
        m + "_mae_valid": mean_absolute_error(Y_valid, Y_pred_valid_rf),
        m + "_pcc_valid": pearsonr(Y_valid, Y_pred_valid_rf),
    })

# One labelled row per metric; displayed as the cell output.
scores_RF_train = pd.Series(scores).T
scores_RF_train

RF_train_r2                                          0.936698
RF_rmse_train                                        0.496215
RF_mae_train                                         0.381037
RF_pcc_train                        (0.9768039565933417, 0.0)
RF_valid_r2                                          0.556441
RF_rmse_valid                                          1.3269
RF_mae_valid                                            1.035
RF_pcc_valid     (0.7469407968886873, 2.620446316023756e-125)
dtype: object

In [15]:
# Score the already-fitted model(s) on the test set (core set): R^2, RMSE, MAE
# and the Pearson correlation between experimental and predicted values.
scores = {}
for m, estimator in models_RF_train.items():
    # Kept as a notebook-level name: the cell that saves the predictions reads it.
    Y_pred_test_rf = estimator.predict(X_df_TS)

    scores.update({
        m + "_test_r2": r2_score(y_df_TS, Y_pred_test_rf),
        m + "_rmse_test": sqrt(mean_squared_error(y_df_TS, Y_pred_test_rf)),
        m + "_mae_test": mean_absolute_error(y_df_TS, Y_pred_test_rf),
        # pearsonr returns the coefficient together with its p-value.
        m + "_pcc_test": pearsonr(y_df_TS, Y_pred_test_rf),
    })

# One labelled row per metric; displayed as the cell output.
scores_RF_test = pd.Series(scores).T
scores_RF_test

RF_test_r2                                         0.550425
RF_rmse_test                                        1.48926
RF_mae_test                                         1.22686
RF_pcc_test     (0.7714351905515262, 8.709166066145693e-37)
dtype: object

In [16]:
# Write the experimental and predicted test-set values side by side to a spreadsheet.
Exp_y = y_df_TS.to_frame()                               # experimental values, column named after the series
Pred_y = pd.DataFrame({'Y_pred_rf': Y_pred_test_rf})     # model predictions from the test-set evaluation
Prediction = pd.concat([Exp_y, Pred_y], axis=1)          # aligned on the row index
# NOTE(review): writing '.xls' depends on the optional xlwt engine, which recent
# pandas releases no longer support - confirm the pandas version in use.
Prediction.to_excel('RF_test_Pred_Values_Int_Frag.xls')