# RANDOM FOREST MODEL

In [1]:
# Packages
import pandas as pd
import numpy as np
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
# Metrics
from sklearn.metrics import matthews_corrcoef
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score

In [2]:
# Read file
data = pd.read_csv("features_table.csv", sep = ",")
data.head()

Unnamed: 0,1,2,3,4,5,7,8,9,10,11,...,182,183,184,185,188,189,190,191,code,potency
0,47,411,860,1562,2151,11,86,212,393,592,...,0,0,0,0,0,0,0,0,1--1,higher
1,45,329,674,1456,1886,6,69,174,385,540,...,0,0,0,0,0,0,0,0,1--10,lower
2,39,308,641,1254,1672,3,63,154,343,458,...,0,0,0,0,0,0,0,0,1--100,lower
3,51,406,789,1419,1876,9,80,184,369,501,...,0,0,0,0,0,0,0,0,1--101,lower
4,14,148,298,676,965,7,40,101,164,232,...,0,0,0,0,0,0,0,0,1--102,higher


In [3]:
# Select features columns
X_train = data.iloc[:,0:141]

# Shape and head of features data frame
print(X_train.shape)
X_train.head()

(668, 141)


Unnamed: 0,1,2,3,4,5,7,8,9,10,11,...,180,181,182,183,184,185,188,189,190,191
0,47,411,860,1562,2151,11,86,212,393,592,...,0,0,0,0,0,0,0,0,0,0
1,45,329,674,1456,1886,6,69,174,385,540,...,0,0,0,0,0,0,0,0,0,0
2,39,308,641,1254,1672,3,63,154,343,458,...,0,0,0,0,0,0,0,0,0,0
3,51,406,789,1419,1876,9,80,184,369,501,...,0,0,0,0,0,0,0,0,0,0
4,14,148,298,676,965,7,40,101,164,232,...,0,0,0,0,0,0,0,0,0,0


In [4]:
# Select potency column (label)
y_train = data['potency']

# Shape and head of potency data frame
print(y_train.shape)
y_train.head()

(668,)


0    higher
1     lower
2     lower
3     lower
4    higher
Name: potency, dtype: object

In [5]:
# Higher potency = 1, Lower potency = 0
y_train = np.where(data['potency']=='higher', 1, 0)

print(y_train.shape)
y_train

(668,)


array([1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0,
       0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1,
       1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1,
       0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1,
       0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1,
       0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1,
       1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1,
       1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0,
       1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,

In [6]:
# Features Normalization
feature_scaler = StandardScaler()
X_train = feature_scaler.fit_transform(X_train)

# Visualize data normalized
X_train

array([[ 0.28989026,  0.4006065 ,  0.5561509 , ..., -0.08684168,
        -0.08684168, -0.1478281 ],
       [ 0.14052914, -0.39882596, -0.49990089, ..., -0.08684168,
        -0.08684168, -0.1478281 ],
       [-0.30755423, -0.60355866, -0.68726492, ..., -0.08684168,
        -0.08684168, -0.1478281 ],
       ...,
       [ 0.43925139,  0.17637544,  0.27794371, ..., -0.08684168,
        -0.08684168, -0.1478281 ],
       [-1.7264849 , -1.63697135, -1.23800161, ..., -0.08684168,
        -0.08684168, -0.1478281 ],
       [ 0.14052914, -0.18434408, -0.17627212, ..., -0.08684168,
        -0.08684168, -0.1478281 ]])

In [7]:
# Split data
# Test size set to 0.2 (20%)
X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, test_size=0.2)

In [8]:
############# Training and Cross Validation
classifier = RandomForestClassifier(n_estimators=300, random_state=0)

############# Grid Search with Scikit-Learn

grid_param = {
    'n_estimators': [100, 300, 500, 800, 1000],
    'criterion': ['gini', 'entropy'],
    'bootstrap': [True, False]
}

gd_sr = GridSearchCV(estimator=classifier,
                     param_grid=grid_param,
                     scoring='accuracy',
                     cv=5,
                     n_jobs=-1)
                     
###################### Fit the model
gd_sr.fit(X_train, y_train)

######################## Best parameter
best_parameters = gd_sr.best_params_
print(best_parameters)

########################## Best score
best_result = gd_sr.best_score_
print(best_result)

{'bootstrap': True, 'criterion': 'gini', 'n_estimators': 100}
0.8071161048689138


In [11]:
# Redifine classifier with the best parameters found
classifier = RandomForestClassifier(n_estimators=100, criterion='gini', bootstrap = True, random_state=0)

In [12]:
best_result = gd_sr.best_score_
print(best_result)

0.8071161048689138


In [13]:
classifier.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=0, verbose=0,
                       warm_start=False)

In [14]:
# Make predictions
y_test_predictions = classifier.predict(X_test)

## METRICS

In [15]:
# Matthews correlation coefficient
matthews_corrcoef(y_test, y_test_predictions)

0.6242329036347697

In [16]:
# Accuracy score
accuracy_score(y_test, y_test_predictions)

0.8059701492537313

In [18]:
# Confusion matrix
confusion_matrix(y_test, y_test_predictions)

array([[51,  7],
       [19, 57]])

In [19]:
# Precission score
precision_score(y_test, y_test_predictions, average='macro')

0.8095982142857143

In [20]:
# F1-score
f1_score(y_test, y_test_predictions)

0.8142857142857143