# XGBOOST MODEL

In [1]:
# Packages
import pandas as pd
import numpy as np
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.ensemble import GradientBoostingClassifier

from sklearn.preprocessing import StandardScaler
# Metrics
from sklearn.metrics import matthews_corrcoef
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score

In [2]:
# Read file
data = pd.read_csv("features_table.csv", sep = ",")
data.head()

Unnamed: 0,1,2,3,4,5,7,8,9,10,11,...,182,183,184,185,188,189,190,191,code,potency
0,47,411,860,1562,2151,11,86,212,393,592,...,0,0,0,0,0,0,0,0,1--1,higher
1,45,329,674,1456,1886,6,69,174,385,540,...,0,0,0,0,0,0,0,0,1--10,lower
2,39,308,641,1254,1672,3,63,154,343,458,...,0,0,0,0,0,0,0,0,1--100,lower
3,51,406,789,1419,1876,9,80,184,369,501,...,0,0,0,0,0,0,0,0,1--101,lower
4,14,148,298,676,965,7,40,101,164,232,...,0,0,0,0,0,0,0,0,1--102,higher


In [3]:
# Select features columns
X_train = data.iloc[:,0:141]

# Shape and head of features data frame
print(X_train.shape)
X_train.head()

(668, 141)


Unnamed: 0,1,2,3,4,5,7,8,9,10,11,...,180,181,182,183,184,185,188,189,190,191
0,47,411,860,1562,2151,11,86,212,393,592,...,0,0,0,0,0,0,0,0,0,0
1,45,329,674,1456,1886,6,69,174,385,540,...,0,0,0,0,0,0,0,0,0,0
2,39,308,641,1254,1672,3,63,154,343,458,...,0,0,0,0,0,0,0,0,0,0
3,51,406,789,1419,1876,9,80,184,369,501,...,0,0,0,0,0,0,0,0,0,0
4,14,148,298,676,965,7,40,101,164,232,...,0,0,0,0,0,0,0,0,0,0


In [4]:
# Select potency column (label)
y_train = data['potency']

# Shape and head of potency data frame
print(y_train.shape)
y_train.head()

(668,)


0    higher
1     lower
2     lower
3     lower
4    higher
Name: potency, dtype: object

In [5]:
# Higher potency = 1, Lower potency = 0
y_train = np.where(data['potency']=='higher', 1, 0)

print(y_train.shape)
y_train

(668,)


array([1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0,
       0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1,
       1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1,
       0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1,
       0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1,
       0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1,
       1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1,
       1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0,
       1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,

In [6]:
# Features Normalization
feature_scaler = StandardScaler()
X_train = feature_scaler.fit_transform(X_train)

# Visualize data normalized
X_train

array([[ 0.28989026,  0.4006065 ,  0.5561509 , ..., -0.08684168,
        -0.08684168, -0.1478281 ],
       [ 0.14052914, -0.39882596, -0.49990089, ..., -0.08684168,
        -0.08684168, -0.1478281 ],
       [-0.30755423, -0.60355866, -0.68726492, ..., -0.08684168,
        -0.08684168, -0.1478281 ],
       ...,
       [ 0.43925139,  0.17637544,  0.27794371, ..., -0.08684168,
        -0.08684168, -0.1478281 ],
       [-1.7264849 , -1.63697135, -1.23800161, ..., -0.08684168,
        -0.08684168, -0.1478281 ],
       [ 0.14052914, -0.18434408, -0.17627212, ..., -0.08684168,
        -0.08684168, -0.1478281 ]])

In [7]:
# Split data
# Test size set to 0.2 (20%)
X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, test_size=0.2)

In [8]:
############# Training and Cross Validation
classifier = GradientBoostingClassifier(random_state=0)

############# Grid Search with Scikit-Learn

grid_param = {
     "loss":["deviance"],
    "learning_rate": [0.01, 0.025, 0.05, 0.075, 0.1, 0.15, 0.2],
    "min_samples_split": np.linspace(0.1, 0.5, 12),
    "min_samples_leaf": np.linspace(0.1, 0.5, 12),
    "max_depth":[3,5,8],
    "max_features":["log2","sqrt"],
    "criterion": ["friedman_mse",  "mae"],
    "subsample":[0.5, 0.618, 0.8, 0.85, 0.9, 0.95, 1.0],
    "n_estimators":[10]
}

gd_sr = GridSearchCV(estimator=classifier,
                     param_grid=grid_param,
                     scoring='accuracy',
                     cv=5,
                     n_jobs=-1)

In [9]:
import time
start_time = time.time()

###################### Fit the model
gd_sr.fit(X_train, y_train)

######################## Best parameter
best_parameters = gd_sr.best_params_
print(best_parameters)

########################## Best score
best_result = gd_sr.best_score_
print(best_result)

print(" %s seconds " % (time.time() - start_time))



{'criterion': 'friedman_mse', 'learning_rate': 0.2, 'loss': 'deviance', 'max_depth': 5, 'max_features': 'log2', 'min_samples_leaf': 0.1, 'min_samples_split': 0.28181818181818186, 'n_estimators': 10, 'subsample': 0.95}
0.7696629213483146
 522.6243476867676 seconds 


In [12]:
# Redifine classifier with the best parameters found
classifier = GradientBoostingClassifier(criterion = 'friedman_mse', learning_rate = 0.2, loss = 'deviance', max_depth = 5, max_features = 'log2', 
                                        min_samples_leaf =  0.1, min_samples_split = 0.28181818181818186, n_estimators = 10, subsample =  0.95,
                                        random_state=0)

In [13]:
best_result = gd_sr.best_score_
print(best_result)

0.7696629213483146


In [15]:
classifier.fit(X_train, y_train)

GradientBoostingClassifier(criterion='friedman_mse', init=None,
                           learning_rate=0.2, loss='deviance', max_depth=5,
                           max_features='log2', max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=0.1,
                           min_samples_split=0.28181818181818186,
                           min_weight_fraction_leaf=0.0, n_estimators=10,
                           n_iter_no_change=None, presort='auto',
                           random_state=0, subsample=0.95, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)

In [16]:
# Make predictions
y_test_predictions = classifier.predict(X_test)

## METRICS

In [17]:
# Matthews correlation coefficient
matthews_corrcoef(y_test, y_test_predictions)

0.6703294550993836

In [18]:
# Accuracy score
accuracy_score(y_test, y_test_predictions)

0.835820895522388

In [19]:
# Confusion matrix
confusion_matrix(y_test, y_test_predictions)

array([[48, 14],
       [ 8, 64]])

In [20]:
# Precission score
precision_score(y_test, y_test_predictions, average='macro')

0.8388278388278387

In [21]:
# F1-score
f1_score(y_test, y_test_predictions)

0.8533333333333333