In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder

# PREDICTING FUNCTION

In [2]:
# read data for experimentally determined structures in complex with active molecules
df = pd.read_csv('data/364_interaction_energies_state_function_v4.txt', sep='\t')

#drop index
df.drop('index', axis = 1, inplace=True)

#copy df to structure_df
structure_df = df.copy()
structure_df.drop(['PDBID','State'], axis = 1, inplace = True)

In [3]:
# read data for best scoring docked poses of inactive molecules
df = pd.read_csv('data/dude_docking_data/dude_interaction_energies.txt', sep='\t')

#drop index
df.drop('index', axis = 1, inplace=True)

#copy df to sdock_tructure_df
dock_structure_df = df.copy()
dock_structure_df.drop(['PDBID','State'], axis = 1, inplace = True)

In [4]:
# join both datasets
frames = [structure_df, dock_structure_df] 
structure_df = pd.concat(frames)

In [5]:
structure_df['Function'].value_counts()

Inactive             285
Antagonist           183
Agonist              148
Inverse agonist       23
Agonist (partial)     10
Name: Function, dtype: int64

In [6]:
structure_df['1.21_intenergysum']

0     NaN
1     NaN
2     NaN
3     NaN
4     NaN
       ..
280   NaN
281   NaN
282   NaN
283   NaN
284   NaN
Name: 1.21_intenergysum, Length: 649, dtype: float64

In [7]:
# get columns with 'sum' in their name
sum_cols = [col for col in structure_df.columns if 'sum' in col]

# create empty list for residue numbers
resnums = []

# loop through sum columns and count interactions that don't have nonzero energies
for col in sum_cols:
    resnum = col[:4]
    resnums.append(resnum)

# drop columns from df in which > 10% of entries are NaN
for resnum in resnums:
    intenergysum_col = resnum + '_intenergysum'
    inttype1_col = resnum + '_inttype1'
    intenergy1_col = resnum + '_intenergy1'
    inttype2_col = resnum + '_inttype2'
    intenergy2_col = resnum + '_intenergy2'
    
    print('structures with interactions at position', resnum, ':', structure_df[intenergysum_col][structure_df[intenergysum_col] != 0].value_counts().sum())
    if structure_df[intenergysum_col][structure_df[intenergysum_col] != 0].value_counts().sum() < 10:
        structure_df.drop([intenergysum_col, inttype1_col, intenergy1_col, inttype2_col, intenergy2_col], axis = 1, inplace = True)
        print('dropped columns for residue: ', resnum, '\n')

structures with interactions at position 1.21 : 0
dropped columns for residue:  1.21 

structures with interactions at position 1.22 : 0
dropped columns for residue:  1.22 

structures with interactions at position 1.23 : 0
dropped columns for residue:  1.23 

structures with interactions at position 1.24 : 0
dropped columns for residue:  1.24 

structures with interactions at position 1.25 : 0
dropped columns for residue:  1.25 

structures with interactions at position 1.26 : 0
dropped columns for residue:  1.26 

structures with interactions at position 1.27 : 1
dropped columns for residue:  1.27 

structures with interactions at position 1.28 : 0
dropped columns for residue:  1.28 

structures with interactions at position 1.29 : 0
dropped columns for residue:  1.29 

structures with interactions at position 1.30 : 2
dropped columns for residue:  1.30 

structures with interactions at position 1.31 : 9
dropped columns for residue:  1.31 

structures with interactions at position 1.

dropped columns for residue:  3.64 

structures with interactions at position 3.65 : 0
dropped columns for residue:  3.65 

structures with interactions at position 3.66 : 0
dropped columns for residue:  3.66 

structures with interactions at position 3.67 : 0
dropped columns for residue:  3.67 

structures with interactions at position 3.68 : 0
dropped columns for residue:  3.68 

structures with interactions at position 3.69 : 0
dropped columns for residue:  3.69 

structures with interactions at position 3.70 : 0
dropped columns for residue:  3.70 

structures with interactions at position 3.71 : 0
dropped columns for residue:  3.71 

structures with interactions at position 4.29 : 0
dropped columns for residue:  4.29 

structures with interactions at position 4.30 : 0
dropped columns for residue:  4.30 

structures with interactions at position 4.31 : 0
dropped columns for residue:  4.31 

structures with interactions at position 4.32 : 0
dropped columns for residue:  4.32 

struct

dropped columns for residue:  6.49 

structures with interactions at position 6.50 : 2
dropped columns for residue:  6.50 

structures with interactions at position 6.51 : 361
structures with interactions at position 6.52 : 260
structures with interactions at position 6.53 : 1
dropped columns for residue:  6.53 

structures with interactions at position 6.54 : 17
structures with interactions at position 6.55 : 298
structures with interactions at position 6.56 : 0
dropped columns for residue:  6.56 

structures with interactions at position 6.57 : 3
dropped columns for residue:  6.57 

structures with interactions at position 6.58 : 49
structures with interactions at position 6.59 : 12
structures with interactions at position 6.60 : 3
dropped columns for residue:  6.60 

structures with interactions at position 6.61 : 1
dropped columns for residue:  6.61 

structures with interactions at position 6.62 : 3
dropped columns for residue:  6.62 

structures with interactions at position 6.63

In [11]:
structure_df

Unnamed: 0,Function,1.35_intenergysum,1.39_intenergysum,2.57_intenergysum,2.60_intenergysum,2.61_intenergysum,2.63_intenergysum,2.64_intenergysum,2.65_intenergysum,3.25_intenergysum,...,6.55_intenergysum,6.58_intenergysum,6.59_intenergysum,7.32_intenergysum,7.35_intenergysum,7.36_intenergysum,7.39_intenergysum,7.40_intenergysum,7.42_intenergysum,7.43_intenergysum
0,Agonist,0.0,0.0,0.0,0.0,0.0,0.0,-2.3,,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Agonist,0.0,0.0,0.0,-0.1,0.0,0.0,-0.1,,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Agonist,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,...,-0.4,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0
3,Agonist,0.0,0.0,0.0,-1.8,0.0,0.0,,,-1.9,...,-0.1,0.0,0.0,,,0.0,-0.2,0.0,0.0,0.0
4,Agonist,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-6.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.1,-0.6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
280,Inactive,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.2
281,Inactive,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
282,Inactive,,0.0,0.0,0.0,-0.6,0.0,0.0,0.0,0.0,...,-0.4,0.0,0.0,0.0,0.0,0.0,-0.1,0.0,0.0,0.0
283,Inactive,,0.0,0.0,0.0,-0.1,0.0,0.0,-0.2,0.0,...,-0.2,0.0,0.0,0.0,0.0,0.0,-0.2,0.0,0.0,0.0


In [10]:
# drop non-sum columns

for col in structure_df.columns[1:]:
    if 'sum' not in col:
        structure_df.drop([col], axis = 1, inplace = True)

In [12]:
structure_df1 = structure_df.replace('Agonist (partial)', 'Agonist')

In [13]:
structure_df2 = structure_df1.replace('Inverse agonist', 'Antagonist')

In [14]:
actual_fxns = structure_df['Function']
actual_fxns1 = structure_df1['Function']
actual_fxns2 = structure_df2['Function']

In [15]:
# label encoding
def encode_labels(df):
    # create instance of labelencoder
    encoder = OrdinalEncoder()
    le = LabelEncoder()

    cols = [col for col in df.columns if 'type' in col]

    # loop though all columns and convert strings to categorical integer variables
    for col in cols:
        df[col] = encoder.fit_transform(np.array(df[col].tolist()).reshape(-1, 1))


    # encode states as integers
    # get columns with 'type' in their name
    cols = [col for col in df.columns if 'Function' in col]

    # loop though all columns and convert strings to categorical integer variables
    for col in cols:
        df[col] = le.fit_transform(df[col])
        
    return (df, le)

In [16]:
(structure_df, encoder) = encode_labels(structure_df)
(structure_df1, encoder1) = encode_labels(structure_df1)
(structure_df2, encoder2) = encode_labels(structure_df2)

In [17]:
encoder.classes_

array(['Agonist', 'Agonist (partial)', 'Antagonist', 'Inactive',
       'Inverse agonist'], dtype=object)

In [18]:
# assign target classes to y
y = structure_df['Function']
y1 = structure_df1['Function']
y2 = structure_df2['Function']

# assign data to X
X = structure_df.drop(['Function'], axis = 1)
X1 = structure_df1.drop(['Function'], axis = 1)
X2 = structure_df2.drop(['Function'], axis = 1)

# create actual_state column with non-encoded states
X['actual_fxn'] = actual_fxns
X1['actual_fxn'] = actual_fxns1
X2['actual_fxn'] = actual_fxns2

In [19]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)
X_train1, X_test1, y_train1, y_test1 = train_test_split(X1, y1, test_size=0.25, random_state=42)
X_train2, X_test2, y_train2, y_test2 = train_test_split(X2, y2, test_size=0.25, random_state=42)

In [20]:
def scale_impute(dataframe):
    # get colnames
    colnames = list(dataframe.drop(['actual_fxn'], axis = 1).columns)
    fxns_df = dataframe['actual_fxn']
    #state_df.reset_index(inplace=True)
    df = dataframe.drop(['actual_fxn'], axis = 1)

    # impute data
    from sklearn.impute import SimpleImputer
    my_imputer = SimpleImputer()
    df_imputed = pd.DataFrame(my_imputer.fit_transform(df))

    # scale data
    scaler = StandardScaler()
    to_scale = [col for col in df_imputed.columns.values]
    scaler.fit(df_imputed[to_scale])

    # predict z-scores on the test set
    df_imputed[to_scale] = scaler.transform(df_imputed[to_scale]) 

    # #rename columns
    df_imputed.columns = colnames

    # display scaled values
    display(df_imputed)
    
    return(df_imputed, fxns_df)

In [21]:
(X_train_imputed, X_train_fxns) = scale_impute(X_train)
(X_test_imputed, X_test_fxns) = scale_impute(X_test)

(X_train_imputed1, X_train_fxns1) = scale_impute(X_train1)
(X_test_imputed1, X_test_fxns1) = scale_impute(X_test1)

(X_train_imputed2, X_train_fxns2) = scale_impute(X_train2)
(X_test_imputed2, X_test_fxns2) = scale_impute(X_test2)

Unnamed: 0,1.35_intenergysum,1.39_intenergysum,2.57_intenergysum,2.60_intenergysum,2.61_intenergysum,2.63_intenergysum,2.64_intenergysum,2.65_intenergysum,3.25_intenergysum,3.28_intenergysum,...,6.55_intenergysum,6.58_intenergysum,6.59_intenergysum,7.32_intenergysum,7.35_intenergysum,7.36_intenergysum,7.39_intenergysum,7.40_intenergysum,7.42_intenergysum,7.43_intenergysum
0,1.260374e-01,0.096267,0.174807,0.129828,0.164173,0.084369,0.195312,1.779136e-01,0.145582,0.209025,...,-2.489706,0.197572,0.087397,0.1462,0.057763,0.103455,0.245605,0.118411,0.202464,0.113632
1,1.260374e-01,0.096267,0.174807,0.129828,0.164173,0.084369,0.195312,1.779136e-01,0.145582,0.209025,...,-2.534055,0.197572,0.087397,0.1462,-0.198338,0.103455,0.213377,0.118411,0.202464,0.113632
2,1.260374e-01,0.096267,0.174807,0.129828,0.164173,0.084369,0.195312,1.779136e-01,0.145582,0.209025,...,-2.755798,0.197572,0.087397,0.1462,-0.262363,0.014789,0.213377,0.118411,0.202464,0.113632
3,1.260374e-01,0.096267,0.174807,0.129828,0.164173,0.084369,0.195312,1.779136e-01,0.145582,0.209025,...,0.525998,0.197572,0.087397,0.1462,0.121788,0.103455,0.342290,0.118411,0.202464,0.113632
4,1.260374e-01,0.096267,0.174807,0.129828,0.164173,0.084369,0.195312,1.779136e-01,0.145582,0.209025,...,-1.159249,0.197572,0.087397,0.1462,0.121788,0.103455,0.310062,0.118411,0.202464,0.113632
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
481,1.260374e-01,0.096267,0.174807,0.129828,-5.812456,0.084369,0.195312,1.779136e-01,0.145582,0.033090,...,0.525998,0.197572,0.087397,0.1462,0.121788,0.103455,0.342286,0.118411,0.202464,0.113632
482,1.260374e-01,0.096267,0.174807,-3.300124,0.164173,0.084369,0.195312,1.779136e-01,0.145582,0.209025,...,0.304255,-0.011779,-0.069019,0.1462,-0.070287,0.103455,0.310062,0.118411,0.202464,0.113632
483,1.605638e-17,0.096267,0.174807,0.129828,0.164173,0.084369,0.195312,1.779136e-01,0.145582,0.209025,...,0.304255,0.197572,0.087397,0.1462,0.121788,0.103455,0.342290,0.118411,0.202464,0.113632
484,1.260374e-01,0.096267,0.174807,0.129828,0.164173,0.084369,0.195312,-2.567546e-17,0.145582,0.209025,...,-1.070551,0.197572,0.087397,0.1462,0.121788,0.103455,0.342290,0.118411,0.202464,0.113632


Unnamed: 0,1.35_intenergysum,1.39_intenergysum,2.57_intenergysum,2.60_intenergysum,2.61_intenergysum,2.63_intenergysum,2.64_intenergysum,2.65_intenergysum,3.25_intenergysum,3.28_intenergysum,...,6.55_intenergysum,6.58_intenergysum,6.59_intenergysum,7.32_intenergysum,7.35_intenergysum,7.36_intenergysum,7.39_intenergysum,7.40_intenergysum,7.42_intenergysum,7.43_intenergysum
0,0.115743,0.152864,0.145185,-0.480213,0.220934,0.100659,0.063589,0.20384,0.151806,0.205955,...,0.414777,0.212804,0.096356,0.160857,0.158263,0.284036,0.403584,0.131101,0.193667,0.328884
1,0.115743,0.152864,0.145185,0.165352,0.220934,0.100659,0.241197,0.20384,0.151806,0.205955,...,0.414777,0.212804,0.096356,0.160857,0.071151,0.284036,-1.891632,0.131101,0.193667,0.328884
2,0.115743,0.152864,0.145185,0.165352,0.220934,0.100659,-0.114020,0.20384,0.151806,-0.336520,...,0.414777,0.000000,0.000000,0.160857,0.158263,-3.777179,0.426086,0.131101,0.193667,0.328884
3,0.115743,0.152864,-0.114872,0.165352,0.220934,0.100659,0.241197,0.00000,0.151806,-0.065282,...,0.269941,0.212804,0.096356,-0.019533,0.071151,0.284036,0.201065,0.131101,0.193667,0.328884
4,0.115743,0.152864,0.145185,0.165352,0.220934,0.100659,0.241197,0.20384,0.151806,0.205955,...,-0.418032,0.212804,0.096356,0.160857,0.158263,0.284036,0.426086,0.131101,0.193667,0.328884
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
158,0.115743,0.152864,0.145185,0.165352,-1.164152,0.100659,-0.114020,0.00000,0.151806,0.205955,...,-0.852541,0.212804,0.096356,0.160857,0.158263,0.284036,0.381082,0.131101,0.193667,0.328884
159,0.115743,0.152864,0.145185,0.165352,0.220934,0.100659,0.241197,0.20384,0.151806,0.205955,...,0.414777,0.212804,0.096356,0.160857,0.158263,0.284036,-2.116653,0.131101,0.193667,0.328884
160,0.115743,0.152864,0.145185,0.165352,0.220934,0.100659,0.241197,0.20384,0.151806,0.205955,...,0.414777,0.212804,0.096356,0.160857,0.158263,0.284036,0.021048,0.131101,-6.119875,0.328884
161,0.115743,0.152864,0.145185,0.165352,0.220934,0.100659,0.241197,0.20384,0.000000,0.205955,...,0.414777,0.212804,0.096356,0.160857,0.158263,0.284036,0.426086,0.131101,0.193667,0.328884


Unnamed: 0,1.35_intenergysum,1.39_intenergysum,2.57_intenergysum,2.60_intenergysum,2.61_intenergysum,2.63_intenergysum,2.64_intenergysum,2.65_intenergysum,3.25_intenergysum,3.28_intenergysum,...,6.55_intenergysum,6.58_intenergysum,6.59_intenergysum,7.32_intenergysum,7.35_intenergysum,7.36_intenergysum,7.39_intenergysum,7.40_intenergysum,7.42_intenergysum,7.43_intenergysum
0,1.260374e-01,0.096267,0.174807,0.129828,0.164173,0.084369,0.195312,1.779136e-01,0.145582,0.209025,...,-2.489706,0.197572,0.087397,0.1462,0.057763,0.103455,0.245605,0.118411,0.202464,0.113632
1,1.260374e-01,0.096267,0.174807,0.129828,0.164173,0.084369,0.195312,1.779136e-01,0.145582,0.209025,...,-2.534055,0.197572,0.087397,0.1462,-0.198338,0.103455,0.213377,0.118411,0.202464,0.113632
2,1.260374e-01,0.096267,0.174807,0.129828,0.164173,0.084369,0.195312,1.779136e-01,0.145582,0.209025,...,-2.755798,0.197572,0.087397,0.1462,-0.262363,0.014789,0.213377,0.118411,0.202464,0.113632
3,1.260374e-01,0.096267,0.174807,0.129828,0.164173,0.084369,0.195312,1.779136e-01,0.145582,0.209025,...,0.525998,0.197572,0.087397,0.1462,0.121788,0.103455,0.342290,0.118411,0.202464,0.113632
4,1.260374e-01,0.096267,0.174807,0.129828,0.164173,0.084369,0.195312,1.779136e-01,0.145582,0.209025,...,-1.159249,0.197572,0.087397,0.1462,0.121788,0.103455,0.310062,0.118411,0.202464,0.113632
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
481,1.260374e-01,0.096267,0.174807,0.129828,-5.812456,0.084369,0.195312,1.779136e-01,0.145582,0.033090,...,0.525998,0.197572,0.087397,0.1462,0.121788,0.103455,0.342286,0.118411,0.202464,0.113632
482,1.260374e-01,0.096267,0.174807,-3.300124,0.164173,0.084369,0.195312,1.779136e-01,0.145582,0.209025,...,0.304255,-0.011779,-0.069019,0.1462,-0.070287,0.103455,0.310062,0.118411,0.202464,0.113632
483,1.605638e-17,0.096267,0.174807,0.129828,0.164173,0.084369,0.195312,1.779136e-01,0.145582,0.209025,...,0.304255,0.197572,0.087397,0.1462,0.121788,0.103455,0.342290,0.118411,0.202464,0.113632
484,1.260374e-01,0.096267,0.174807,0.129828,0.164173,0.084369,0.195312,-2.567546e-17,0.145582,0.209025,...,-1.070551,0.197572,0.087397,0.1462,0.121788,0.103455,0.342290,0.118411,0.202464,0.113632


Unnamed: 0,1.35_intenergysum,1.39_intenergysum,2.57_intenergysum,2.60_intenergysum,2.61_intenergysum,2.63_intenergysum,2.64_intenergysum,2.65_intenergysum,3.25_intenergysum,3.28_intenergysum,...,6.55_intenergysum,6.58_intenergysum,6.59_intenergysum,7.32_intenergysum,7.35_intenergysum,7.36_intenergysum,7.39_intenergysum,7.40_intenergysum,7.42_intenergysum,7.43_intenergysum
0,0.115743,0.152864,0.145185,-0.480213,0.220934,0.100659,0.063589,0.20384,0.151806,0.205955,...,0.414777,0.212804,0.096356,0.160857,0.158263,0.284036,0.403584,0.131101,0.193667,0.328884
1,0.115743,0.152864,0.145185,0.165352,0.220934,0.100659,0.241197,0.20384,0.151806,0.205955,...,0.414777,0.212804,0.096356,0.160857,0.071151,0.284036,-1.891632,0.131101,0.193667,0.328884
2,0.115743,0.152864,0.145185,0.165352,0.220934,0.100659,-0.114020,0.20384,0.151806,-0.336520,...,0.414777,0.000000,0.000000,0.160857,0.158263,-3.777179,0.426086,0.131101,0.193667,0.328884
3,0.115743,0.152864,-0.114872,0.165352,0.220934,0.100659,0.241197,0.00000,0.151806,-0.065282,...,0.269941,0.212804,0.096356,-0.019533,0.071151,0.284036,0.201065,0.131101,0.193667,0.328884
4,0.115743,0.152864,0.145185,0.165352,0.220934,0.100659,0.241197,0.20384,0.151806,0.205955,...,-0.418032,0.212804,0.096356,0.160857,0.158263,0.284036,0.426086,0.131101,0.193667,0.328884
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
158,0.115743,0.152864,0.145185,0.165352,-1.164152,0.100659,-0.114020,0.00000,0.151806,0.205955,...,-0.852541,0.212804,0.096356,0.160857,0.158263,0.284036,0.381082,0.131101,0.193667,0.328884
159,0.115743,0.152864,0.145185,0.165352,0.220934,0.100659,0.241197,0.20384,0.151806,0.205955,...,0.414777,0.212804,0.096356,0.160857,0.158263,0.284036,-2.116653,0.131101,0.193667,0.328884
160,0.115743,0.152864,0.145185,0.165352,0.220934,0.100659,0.241197,0.20384,0.151806,0.205955,...,0.414777,0.212804,0.096356,0.160857,0.158263,0.284036,0.021048,0.131101,-6.119875,0.328884
161,0.115743,0.152864,0.145185,0.165352,0.220934,0.100659,0.241197,0.20384,0.000000,0.205955,...,0.414777,0.212804,0.096356,0.160857,0.158263,0.284036,0.426086,0.131101,0.193667,0.328884


Unnamed: 0,1.35_intenergysum,1.39_intenergysum,2.57_intenergysum,2.60_intenergysum,2.61_intenergysum,2.63_intenergysum,2.64_intenergysum,2.65_intenergysum,3.25_intenergysum,3.28_intenergysum,...,6.55_intenergysum,6.58_intenergysum,6.59_intenergysum,7.32_intenergysum,7.35_intenergysum,7.36_intenergysum,7.39_intenergysum,7.40_intenergysum,7.42_intenergysum,7.43_intenergysum
0,1.260374e-01,0.096267,0.174807,0.129828,0.164173,0.084369,0.195312,1.779136e-01,0.145582,0.209025,...,-2.489706,0.197572,0.087397,0.1462,0.057763,0.103455,0.245605,0.118411,0.202464,0.113632
1,1.260374e-01,0.096267,0.174807,0.129828,0.164173,0.084369,0.195312,1.779136e-01,0.145582,0.209025,...,-2.534055,0.197572,0.087397,0.1462,-0.198338,0.103455,0.213377,0.118411,0.202464,0.113632
2,1.260374e-01,0.096267,0.174807,0.129828,0.164173,0.084369,0.195312,1.779136e-01,0.145582,0.209025,...,-2.755798,0.197572,0.087397,0.1462,-0.262363,0.014789,0.213377,0.118411,0.202464,0.113632
3,1.260374e-01,0.096267,0.174807,0.129828,0.164173,0.084369,0.195312,1.779136e-01,0.145582,0.209025,...,0.525998,0.197572,0.087397,0.1462,0.121788,0.103455,0.342290,0.118411,0.202464,0.113632
4,1.260374e-01,0.096267,0.174807,0.129828,0.164173,0.084369,0.195312,1.779136e-01,0.145582,0.209025,...,-1.159249,0.197572,0.087397,0.1462,0.121788,0.103455,0.310062,0.118411,0.202464,0.113632
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
481,1.260374e-01,0.096267,0.174807,0.129828,-5.812456,0.084369,0.195312,1.779136e-01,0.145582,0.033090,...,0.525998,0.197572,0.087397,0.1462,0.121788,0.103455,0.342286,0.118411,0.202464,0.113632
482,1.260374e-01,0.096267,0.174807,-3.300124,0.164173,0.084369,0.195312,1.779136e-01,0.145582,0.209025,...,0.304255,-0.011779,-0.069019,0.1462,-0.070287,0.103455,0.310062,0.118411,0.202464,0.113632
483,1.605638e-17,0.096267,0.174807,0.129828,0.164173,0.084369,0.195312,1.779136e-01,0.145582,0.209025,...,0.304255,0.197572,0.087397,0.1462,0.121788,0.103455,0.342290,0.118411,0.202464,0.113632
484,1.260374e-01,0.096267,0.174807,0.129828,0.164173,0.084369,0.195312,-2.567546e-17,0.145582,0.209025,...,-1.070551,0.197572,0.087397,0.1462,0.121788,0.103455,0.342290,0.118411,0.202464,0.113632


Unnamed: 0,1.35_intenergysum,1.39_intenergysum,2.57_intenergysum,2.60_intenergysum,2.61_intenergysum,2.63_intenergysum,2.64_intenergysum,2.65_intenergysum,3.25_intenergysum,3.28_intenergysum,...,6.55_intenergysum,6.58_intenergysum,6.59_intenergysum,7.32_intenergysum,7.35_intenergysum,7.36_intenergysum,7.39_intenergysum,7.40_intenergysum,7.42_intenergysum,7.43_intenergysum
0,0.115743,0.152864,0.145185,-0.480213,0.220934,0.100659,0.063589,0.20384,0.151806,0.205955,...,0.414777,0.212804,0.096356,0.160857,0.158263,0.284036,0.403584,0.131101,0.193667,0.328884
1,0.115743,0.152864,0.145185,0.165352,0.220934,0.100659,0.241197,0.20384,0.151806,0.205955,...,0.414777,0.212804,0.096356,0.160857,0.071151,0.284036,-1.891632,0.131101,0.193667,0.328884
2,0.115743,0.152864,0.145185,0.165352,0.220934,0.100659,-0.114020,0.20384,0.151806,-0.336520,...,0.414777,0.000000,0.000000,0.160857,0.158263,-3.777179,0.426086,0.131101,0.193667,0.328884
3,0.115743,0.152864,-0.114872,0.165352,0.220934,0.100659,0.241197,0.00000,0.151806,-0.065282,...,0.269941,0.212804,0.096356,-0.019533,0.071151,0.284036,0.201065,0.131101,0.193667,0.328884
4,0.115743,0.152864,0.145185,0.165352,0.220934,0.100659,0.241197,0.20384,0.151806,0.205955,...,-0.418032,0.212804,0.096356,0.160857,0.158263,0.284036,0.426086,0.131101,0.193667,0.328884
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
158,0.115743,0.152864,0.145185,0.165352,-1.164152,0.100659,-0.114020,0.00000,0.151806,0.205955,...,-0.852541,0.212804,0.096356,0.160857,0.158263,0.284036,0.381082,0.131101,0.193667,0.328884
159,0.115743,0.152864,0.145185,0.165352,0.220934,0.100659,0.241197,0.20384,0.151806,0.205955,...,0.414777,0.212804,0.096356,0.160857,0.158263,0.284036,-2.116653,0.131101,0.193667,0.328884
160,0.115743,0.152864,0.145185,0.165352,0.220934,0.100659,0.241197,0.20384,0.151806,0.205955,...,0.414777,0.212804,0.096356,0.160857,0.158263,0.284036,0.021048,0.131101,-6.119875,0.328884
161,0.115743,0.152864,0.145185,0.165352,0.220934,0.100659,0.241197,0.20384,0.000000,0.205955,...,0.414777,0.212804,0.096356,0.160857,0.158263,0.284036,0.426086,0.131101,0.193667,0.328884


In [22]:
X_train_fxns = X_train_fxns.reset_index()
X_train_fxns.drop(['index'], axis = 1, inplace = True)

X_test_fxns = X_test_fxns.reset_index()
X_test_fxns.drop(['index'], axis = 1, inplace = True)

X_train_fxns1 = X_train_fxns1.reset_index()
X_train_fxns1.drop(['index'], axis = 1, inplace = True)

X_test_fxns1 = X_test_fxns1.reset_index()
X_test_fxns1.drop(['index'], axis = 1, inplace = True)

X_train_fxns2 = X_train_fxns2.reset_index()
X_train_fxns2.drop(['index'], axis = 1, inplace = True)

X_test_fxns2 = X_test_fxns2.reset_index()
X_test_fxns2.drop(['index'], axis = 1, inplace = True)

In [23]:
def train_test_predict(train_df, train_y, test_df, test_y, encoder):
    #Import Random Forest Model
    from sklearn.ensemble import RandomForestClassifier

    #Create a Gaussian Classifier
    clf=RandomForestClassifier(n_estimators=500, random_state=1)

    #Train the model using the training sets y_pred=clf.predict(X_test)
    clf.fit(train_df, train_y)
    
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.model_selection import cross_val_score, KFold
    import numpy as np

    # cross-validation
    scores = cross_val_score(clf, train_df, train_y, cv=5)
    print("Mean cross-validation score: %.2f" % scores.mean())

    # k-fold CV
    kfold = KFold(n_splits=10, shuffle=True, random_state = 1)
    kf_cv_scores = cross_val_score(clf, train_df, train_y, cv=kfold)
    print("K-fold CV average score: %.2f" % kf_cv_scores.mean())
    
    # test set predictions
    y_pred = clf.predict(test_df)

    #Import scikit-learn metrics module for accuracy calculation
    from sklearn import metrics

    # reverse label encoding
    y_pred_actual = encoder.inverse_transform(y_pred)
    y_test_actual = encoder.inverse_transform(test_y)

    data = {'y_Actual':    y_test_actual,
            'y_Predicted': y_pred_actual
            }

    df = pd.DataFrame(data, columns=['y_Actual','y_Predicted'])

    confusion_matrix = pd.crosstab(df['y_Actual'], df['y_Predicted'], rownames=['Actual'], colnames=['Predicted'])
    print ('\n', confusion_matrix, '\n')
    
     # Model Accuracy, how often is the classifier correct?
    acc = metrics.accuracy_score(test_y, y_pred)
    precision = metrics.precision_score(test_y, y_pred, average = 'weighted', labels=np.unique(y_pred))
    recall = metrics.recall_score(test_y, y_pred, average = 'weighted', labels=np.unique(y_pred))
    print("Accuracy:","{:.2f}".format(acc))
    print("Precision:","{:.2f}".format(precision))
    print("Recall:","{:.2f}".format(recall), '\n')

In [24]:
train_test_predict(X_train_imputed, y_train, X_test_imputed, y_test, encoder)

Mean cross-validation score: 0.79
K-fold CV average score: 0.80

 Predicted          Agonist  Antagonist  Inactive
Actual                                          
Agonist                 23           9         4
Agonist (partial)        1           0         2
Antagonist               5          39         8
Inactive                 1           2        57
Inverse agonist          1          10         1 

Accuracy: 0.73
Precision: 0.73
Recall: 0.80 



In [25]:
train_test_predict(X_train_imputed1, y_train1, X_test_imputed1, y_test1, encoder1)

Mean cross-validation score: 0.79
K-fold CV average score: 0.81

 Predicted        Agonist  Antagonist  Inactive
Actual                                        
Agonist               26           8         5
Antagonist             6          39         7
Inactive               1           3        56
Inverse agonist        1          10         1 

Accuracy: 0.74
Precision: 0.74
Recall: 0.80 



In [26]:
train_test_predict(X_train_imputed2, y_train2, X_test_imputed2, y_test2, encoder2)

Mean cross-validation score: 0.80
K-fold CV average score: 0.83

 Predicted   Agonist  Antagonist  Inactive
Actual                                   
Agonist          25           9         5
Antagonist        7          50         7
Inactive          1           3        56 

Accuracy: 0.80
Precision: 0.80
Recall: 0.80 



## GridSearch for best parameters

In [58]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

In [59]:
params_to_test = {'n_estimators':[10, 100, 500, 1000, 2000, 5000, 10000]}

In [61]:
rf_model = RandomForestClassifier(random_state=1)

grid_search = GridSearchCV(rf_model, param_grid=params_to_test, cv=10, scoring='accuracy')

grid_search.fit(X_train_imputed1, y_train1)

best_params = grid_search.best_params_ 

In [62]:
best_params

{'n_estimators': 500}

## XGBoost

In [33]:
def train_test_predict_xgboost(train_df, train_y, test_df, test_y, encoder):
    #Import Random Forest Model
    from sklearn.ensemble import RandomForestClassifier
    
    import xgboost as xgb

    xgbc = xgb.XGBClassifier(use_label_encoder=False,
                             eval_metric='mlogloss',
                             n_estimators=500,
                             random_state = 1,
                             learning_rate = 0.05
                            )


    #Train the model using the training sets y_pred=clf.predict(X_test)
    xgbc.fit(train_df, train_y)
    
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.model_selection import cross_val_score, KFold
    import numpy as np

    # cross-validation
    scores = cross_val_score(xgbc, train_df, train_y, cv=5)
    print("Mean cross-validation score: %.2f" % scores.mean())

    # k-fold CV
    kfold = KFold(n_splits=10, shuffle=True, random_state = 1)
    kf_cv_scores = cross_val_score(xgbc, train_df, train_y, cv=kfold)
    print("K-fold CV average score: %.2f" % kf_cv_scores.mean())
    
    # test set predictions
    y_pred = xgbc.predict(test_df)

    #Import scikit-learn metrics module for accuracy calculation
    from sklearn import metrics

    # reverse label encoding
    y_pred_actual = encoder.inverse_transform(y_pred)
    y_test_actual = encoder.inverse_transform(test_y)

    data = {'y_Actual':    y_test_actual,
            'y_Predicted': y_pred_actual
            }

    df = pd.DataFrame(data, columns=['y_Actual','y_Predicted'])

    confusion_matrix = pd.crosstab(df['y_Actual'], df['y_Predicted'], rownames=['Actual'], colnames=['Predicted'])
    print ('\n', confusion_matrix, '\n')
    
     # Model Accuracy, how often is the classifier correct?
    acc = metrics.accuracy_score(test_y, y_pred)
    precision = metrics.precision_score(test_y, y_pred, average = 'weighted', labels=np.unique(y_pred))
    recall = metrics.recall_score(test_y, y_pred, average = 'weighted', labels=np.unique(y_pred))
    print("Accuracy:","{:.2f}".format(acc))
    print("Precision:","{:.2f}".format(precision))
    print("Recall:","{:.2f}".format(recall), '\n')

In [34]:
train_test_predict_xgboost(X_train_imputed, y_train, X_test_imputed, y_test, encoder)

Mean cross-validation score: 0.84
K-fold CV average score: 0.85

 Predicted          Agonist  Antagonist  Inactive  Inverse agonist
Actual                                                           
Agonist                 29           2         4                1
Agonist (partial)        1           0         2                0
Antagonist              10          40         2                0
Inactive                 1           0        59                0
Inverse agonist          1           5         4                2 

Accuracy: 0.80
Precision: 0.79
Recall: 0.81 



In [35]:
train_test_predict_xgboost(X_train_imputed1, y_train1, X_test_imputed1, y_test1, encoder1)

Mean cross-validation score: 0.85
K-fold CV average score: 0.86

 Predicted        Agonist  Antagonist  Inactive  Inverse agonist
Actual                                                         
Agonist               31           2         5                1
Antagonist            10          40         2                0
Inactive               1           0        59                0
Inverse agonist        1           4         5                2 

Accuracy: 0.81
Precision: 0.80
Recall: 0.81 



In [37]:
train_test_predict_xgboost(X_train_imputed2, y_train2, X_test_imputed2, y_test2, encoder2)

Mean cross-validation score: 0.85
K-fold CV average score: 0.89

 Predicted   Agonist  Antagonist  Inactive
Actual                                   
Agonist          31           4         4
Antagonist       10          47         7
Inactive          1           0        59 

Accuracy: 0.84
Precision: 0.85
Recall: 0.84 



## LightGBM

In [52]:
def train_test_predict_lgbm(train_df, train_y, test_df, test_y, encoder):
    #Import Random Forest Model
    import lightgbm as lgb
    
    # create classifier
    clf = lgb.LGBMClassifier(n_estimators = 100, random_state = 1)

    #Train the model using the training sets y_pred=clf.predict(X_test)
    clf.fit(train_df, train_y)
    
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.model_selection import cross_val_score, KFold
    import numpy as np

    # cross-validation
    scores = cross_val_score(clf, train_df, train_y, cv=5)
    print("Mean cross-validation score: %.2f" % scores.mean())

    # k-fold CV
    kfold = KFold(n_splits=10, shuffle=True, random_state = 1)
    kf_cv_scores = cross_val_score(clf, train_df, train_y, cv=kfold)
    print("K-fold CV average score: %.2f" % kf_cv_scores.mean())
    
    # test set predictions
    y_pred = clf.predict(test_df)

    #Import scikit-learn metrics module for accuracy calculation
    from sklearn import metrics

    # reverse label encoding
    y_pred_actual = encoder.inverse_transform(y_pred)
    y_test_actual = encoder.inverse_transform(test_y)

    data = {'y_Actual':    y_test_actual,
            'y_Predicted': y_pred_actual
            }

    df = pd.DataFrame(data, columns=['y_Actual','y_Predicted'])

    confusion_matrix = pd.crosstab(df['y_Actual'], df['y_Predicted'], rownames=['Actual'], colnames=['Predicted'])
    print ('\n', confusion_matrix, '\n')
    
     # Model Accuracy, how often is the classifier correct?
    acc = metrics.accuracy_score(test_y, y_pred)
    precision = metrics.precision_score(test_y, y_pred, average = 'weighted', labels=np.unique(y_pred))
    recall = metrics.recall_score(test_y, y_pred, average = 'weighted', labels=np.unique(y_pred))
    print("Accuracy:","{:.2f}".format(acc))
    print("Precision:","{:.2f}".format(precision))
    print("Recall:","{:.2f}".format(recall), '\n')

In [53]:
train_test_predict_lgbm(X_train_imputed, y_train, X_test_imputed, y_test, encoder)

Mean cross-validation score: 0.84
K-fold CV average score: 0.86

 Predicted          Agonist  Agonist (partial)  Antagonist  Inactive  \
Actual                                                                
Agonist                 27                  0           5         4   
Agonist (partial)        1                  1           0         1   
Antagonist               7                  0          40         5   
Inactive                 0                  0           0        60   
Inverse agonist          0                  0           5         6   

Predicted          Inverse agonist  
Actual                              
Agonist                          0  
Agonist (partial)                0  
Antagonist                       0  
Inactive                         0  
Inverse agonist                  1   

Accuracy: 0.79
Precision: 0.81
Recall: 0.79 



In [54]:
train_test_predict_lgbm(X_train_imputed1, y_train1, X_test_imputed1, y_test1, encoder1)

Mean cross-validation score: 0.84
K-fold CV average score: 0.87

 Predicted        Agonist  Antagonist  Inactive  Inverse agonist
Actual                                                         
Agonist               30           3         6                0
Antagonist            10          39         3                0
Inactive               0           0        60                0
Inverse agonist        0           5         6                1 

Accuracy: 0.80
Precision: 0.81
Recall: 0.80 



In [55]:
train_test_predict_lgbm(X_train_imputed2, y_train2, X_test_imputed2, y_test2, encoder2)

Mean cross-validation score: 0.87
K-fold CV average score: 0.90

 Predicted   Agonist  Antagonist  Inactive
Actual                                   
Agonist          30           3         6
Antagonist       11          44         9
Inactive          0           0        60 

Accuracy: 0.82
Precision: 0.84
Recall: 0.82 



## SVM

In [77]:
def train_test_predict_svm(train_df, train_y, test_df, test_y, encoder):
    from sklearn import svm

    #Create a svm Classifier
    clf = svm.SVC(kernel='rbf', tol = 1e-4) # Linear Kernel

    #Train the model using the training sets y_pred=clf.predict(X_test)
    clf.fit(train_df, train_y)
    
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.model_selection import cross_val_score, KFold
    import numpy as np

    # cross-validation
    scores = cross_val_score(clf, train_df, train_y, cv=5)
    print("Mean cross-validation score: %.2f" % scores.mean())

    # k-fold CV
    kfold = KFold(n_splits=10, shuffle=True, random_state = 1)
    kf_cv_scores = cross_val_score(clf, train_df, train_y, cv=kfold)
    print("K-fold CV average score: %.2f" % kf_cv_scores.mean())
    
    # test set predictions
    y_pred = clf.predict(test_df)

    #Import scikit-learn metrics module for accuracy calculation
    from sklearn import metrics

    # reverse label encoding
    y_pred_actual = encoder.inverse_transform(y_pred)
    y_test_actual = encoder.inverse_transform(test_y)

    data = {'y_Actual':    y_test_actual,
            'y_Predicted': y_pred_actual
            }

    df = pd.DataFrame(data, columns=['y_Actual','y_Predicted'])

    confusion_matrix = pd.crosstab(df['y_Actual'], df['y_Predicted'], rownames=['Actual'], colnames=['Predicted'])
    print ('\n', confusion_matrix, '\n')
    
     # Model Accuracy, how often is the classifier correct?
    acc = metrics.accuracy_score(test_y, y_pred)
    precision = metrics.precision_score(test_y, y_pred, average = 'weighted', labels=np.unique(y_pred))
    recall = metrics.recall_score(test_y, y_pred, average = 'weighted', labels=np.unique(y_pred))
    print("Accuracy:","{:.2f}".format(acc))
    print("Precision:","{:.2f}".format(precision))
    print("Recall:","{:.2f}".format(recall), '\n')

In [78]:
train_test_predict_svm(X_train_imputed1, y_train1, X_test_imputed1, y_test1, encoder1)

Mean cross-validation score: 0.81
K-fold CV average score: 0.83

 Predicted        Agonist  Antagonist  Inactive
Actual                                        
Agonist               32           1         6
Antagonist            12          32         8
Inactive               0           1        59
Inverse agonist        3           4         5 

Accuracy: 0.75
Precision: 0.77
Recall: 0.81 

