In [1]:
# Parameters
# Default compound used when this notebook is run on its own.
# NOTE(review): when executed through run.ipynb, a parameters cell like the one
# below presumably re-assigns `compound` - confirm against run.ipynb.
compound = "caspofungin"

In [2]:
# Parameters
# NOTE(review): this looks like the injected-parameters cell of a parameterized run.
# It re-assigns `compound`, so its value takes precedence over the default set earlier.
compound = "caspofungin"


# Notebook to optimize Random Forest Classifier

Initially written by M. Giguere and adapted by R. Durand, this notebook reshapes the dataframe of classified DMS data from single mutants of FKS1-HS1 to build a training set. The model, a Random Forest Classifier, uses a combination of position and amino acid properties (from [Expasy ProtScale](https://web.expasy.org/protscale/)) as features. It is meant to be used to predict echinocandin resistance for wild-type FKS1-HS1 sequences found in homologs. 

A randomized search, followed by a grid search, is used to tune the hyperparameters.

## Import libraries

In [3]:
import pandas as pd
import numpy as np

# Point the `np.bool` / `np.int` names at the corresponding NumPy scalar types.
# NOTE(review): presumably a compatibility shim for a dependency that still uses
# these aliases, which recent NumPy releases no longer provide - confirm it is still needed.
np.bool = np.bool_
np.int = np.int_

from sklearn.metrics import (
    balanced_accuracy_score,
    classification_report,
    confusion_matrix,
)
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV
from sklearn.preprocessing import StandardScaler

import pickle

## Specify paths

In [4]:
# Input: classified DMS data (loaded with pd.read_csv, then filtered on `compound`)
data = "../classified/BY4741_FKS1-HS1/refined_classification.csv"
# Input: amino acid property table (loaded with pd.read_table)
aa_properties = "../../general_data/aminoAcidProperties.txt"
# Output directory for the training set CSV and the pickled model
outpath = "../ML/"

## Import amino acid properties

In [5]:
# Load the amino acid property table, then give the one-letter code column a short name
aa_prop_raw = pd.read_table(aa_properties)
df_aa_prop = aa_prop_raw.rename(columns={"Aminoacid.1.letter": "aa"})
df_aa_prop

Unnamed: 0,aa,alpha_helix_chou,alpha_helix_deleage,alpha_helix_levitt,aminoacid_composition_swissprot_bairoch,antiparallel_beta_strand_lifson,average_area_buried_folding_rose,average_flexibility_bhaskaran,average_surrounding_hydrophobicity_manavalan,beta_sheet_chou,...,recognition_factors,refractivity,relative_mutability_ala100_dayhoff,retention_coefficient_hfba_browne,retention_coefficient_ph2.1_meek,retention_coefficient_ph7.4_meek,retention_coefficient_tfa_browne,total_beta_strand_lifson,transmembrane_tendency_zhao,levy_propensity
0,A,1.42,1.489,1.29,8.25,0.9,86.6,0.36,12.97,0.83,...,78,4.34,100,3.9,-0.1,0.5,7.3,0.92,0.38,0.0062
1,C,0.7,0.966,1.11,1.37,1.24,132.3,0.35,14.63,1.19,...,89,35.77,20,-14.3,-2.2,-6.8,-9.2,1.16,-0.3,1.0372
2,D,1.01,0.924,1.04,5.45,0.47,97.8,0.51,10.85,0.54,...,81,12.0,106,-2.8,-2.8,-8.2,-2.9,0.48,-3.27,-0.7485
3,E,1.51,1.504,1.44,6.75,0.62,113.9,0.5,11.89,0.37,...,78,17.26,102,-7.5,-7.5,-16.9,-7.1,0.61,-2.9,-0.7893
4,F,1.13,1.195,1.07,3.86,1.23,194.1,0.31,14.0,1.38,...,81,29.4,41,14.7,13.9,13.2,19.2,1.25,1.98,1.2727
5,G,0.57,0.51,0.56,7.07,0.56,62.9,0.54,12.43,0.75,...,84,0.0,49,-2.3,-0.5,0.0,-1.2,0.61,-0.19,-0.1771
6,H,1.0,1.003,1.22,2.27,1.12,155.8,0.32,12.16,0.87,...,84,21.81,66,2.0,0.8,-3.5,-2.1,0.93,-1.44,0.1204
7,I,1.08,1.003,0.97,5.96,1.54,158.0,0.46,15.67,1.6,...,88,19.06,96,11.0,11.8,13.9,6.6,1.81,1.97,1.1109
8,K,1.16,1.172,1.23,5.84,0.74,115.5,0.47,11.36,0.74,...,87,21.29,56,-2.5,-3.2,0.1,-3.7,0.7,-3.46,-1.1806
9,L,1.21,1.236,1.3,9.66,1.26,164.1,0.37,14.9,1.3,...,85,18.78,40,15.0,10.0,8.8,20.0,1.3,1.82,0.9138


## Reshape dataframe

In [6]:
all_comp = pd.read_csv(data)

# Keep only the rows for the selected drug and the three columns used downstream;
# the class column is exposed under the name "resistance"
is_selected_drug = all_comp["compound"] == compound
df = all_comp.loc[is_selected_drug, ["compound", "aa_seq", "sensres"]].rename(
    columns={"sensres": "resistance"}
)

# Number of distinct sequences retained for this compound
print(len(df["aa_seq"].unique()))
df

177


Unnamed: 0,compound,aa_seq,resistance
170,caspofungin,*LVLSLRDP,sensitive
171,caspofungin,ALVLSLRDP,resistant
172,caspofungin,DLVLSLRDP,resistant
173,caspofungin,ELVLSLRDP,resistant
174,caspofungin,F*VLSLRDP,sensitive
...,...,...,...
689,caspofungin,FLVGSLRDP,sensitive
693,caspofungin,FLVKSLRDP,resistant
697,caspofungin,FLVLSLRDN,resistant
701,caspofungin,FLVLSLRDQ,resistant


In [7]:
# Explode aa_seq into many columns
list_pos = np.arange(639, 648)
wtaa_cols = [f"aa{x}" for x in list_pos]
df[wtaa_cols] = df["aa_seq"].apply(lambda x: pd.Series(list(x)))

In [8]:
# Merge dataframe with AAproperties
merged = df.copy()

for i in list_pos:
    AA = df_aa_prop.set_index("aa").add_suffix(f"_aa{i}")
    AA.index.name = f"aa{i}"
    merged = pd.merge(
        left=merged,
        right=AA.reset_index(),
        how="inner",
        suffixes=(None, f"_aa{i}"),
        on=f"aa{i}",
    )

merged

Unnamed: 0,compound,aa_seq,resistance,aa639,aa640,aa641,aa642,aa643,aa644,aa645,...,recognition_factors_aa647,refractivity_aa647,relative_mutability_ala100_dayhoff_aa647,retention_coefficient_hfba_browne_aa647,retention_coefficient_ph2.1_meek_aa647,retention_coefficient_ph7.4_meek_aa647,retention_coefficient_tfa_browne_aa647,total_beta_strand_lifson_aa647,transmembrane_tendency_zhao_aa647,levy_propensity_aa647
0,caspofungin,ALVLSLRDP,resistant,A,L,V,L,S,L,R,...,91,10.93,56,5.6,8.0,6.1,5.1,0.40,-1.44,-0.1799
1,caspofungin,DLVLSLRDP,resistant,D,L,V,L,S,L,R,...,91,10.93,56,5.6,8.0,6.1,5.1,0.40,-1.44,-0.1799
2,caspofungin,ELVLSLRDP,resistant,E,L,V,L,S,L,R,...,91,10.93,56,5.6,8.0,6.1,5.1,0.40,-1.44,-0.1799
3,caspofungin,FAVLSLRDP,sensitive,F,A,V,L,S,L,R,...,91,10.93,56,5.6,8.0,6.1,5.1,0.40,-1.44,-0.1799
4,caspofungin,FCVLSLRDP,sensitive,F,C,V,L,S,L,R,...,91,10.93,56,5.6,8.0,6.1,5.1,0.40,-1.44,-0.1799
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
163,caspofungin,FLVGSLRDP,sensitive,F,L,V,G,S,L,R,...,91,10.93,56,5.6,8.0,6.1,5.1,0.40,-1.44,-0.1799
164,caspofungin,FLVKSLRDP,resistant,F,L,V,K,S,L,R,...,91,10.93,56,5.6,8.0,6.1,5.1,0.40,-1.44,-0.1799
165,caspofungin,FLVLSLRDN,resistant,F,L,V,L,S,L,R,...,94,13.28,134,-2.8,-1.6,0.8,-5.7,0.60,-1.62,-0.2693
166,caspofungin,FLVLSLRDQ,resistant,F,L,V,L,S,L,R,...,87,17.56,93,1.8,-2.5,-4.8,-0.3,0.95,-1.84,-0.4114


In [9]:
# Save the sequences and labels that make up the training set
training_csv = f"{outpath}/{compound}_training.csv"
merged.loc[:, ["compound", "aa_seq", "resistance"]].to_csv(training_csv, index=False)

In [10]:
# Get training data for machine learning.
cols_of_interest = [x for x in merged.columns if "_aa" in x]
All_singles_x = merged[cols_of_interest]
All_singles_x

Unnamed: 0,alpha_helix_chou_aa639,alpha_helix_deleage_aa639,alpha_helix_levitt_aa639,aminoacid_composition_swissprot_bairoch_aa639,antiparallel_beta_strand_lifson_aa639,average_area_buried_folding_rose_aa639,average_flexibility_bhaskaran_aa639,average_surrounding_hydrophobicity_manavalan_aa639,beta_sheet_chou_aa639,beta_sheet_deleage_aa639,...,recognition_factors_aa647,refractivity_aa647,relative_mutability_ala100_dayhoff_aa647,retention_coefficient_hfba_browne_aa647,retention_coefficient_ph2.1_meek_aa647,retention_coefficient_ph7.4_meek_aa647,retention_coefficient_tfa_browne_aa647,total_beta_strand_lifson_aa647,transmembrane_tendency_zhao_aa647,levy_propensity_aa647
0,1.42,1.489,1.29,8.25,0.90,86.6,0.36,12.97,0.83,0.709,...,91,10.93,56,5.6,8.0,6.1,5.1,0.40,-1.44,-0.1799
1,1.01,0.924,1.04,5.45,0.47,97.8,0.51,10.85,0.54,0.541,...,91,10.93,56,5.6,8.0,6.1,5.1,0.40,-1.44,-0.1799
2,1.51,1.504,1.44,6.75,0.62,113.9,0.50,11.89,0.37,0.567,...,91,10.93,56,5.6,8.0,6.1,5.1,0.40,-1.44,-0.1799
3,1.13,1.195,1.07,3.86,1.23,194.1,0.31,14.00,1.38,1.393,...,91,10.93,56,5.6,8.0,6.1,5.1,0.40,-1.44,-0.1799
4,1.13,1.195,1.07,3.86,1.23,194.1,0.31,14.00,1.38,1.393,...,91,10.93,56,5.6,8.0,6.1,5.1,0.40,-1.44,-0.1799
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
163,1.13,1.195,1.07,3.86,1.23,194.1,0.31,14.00,1.38,1.393,...,91,10.93,56,5.6,8.0,6.1,5.1,0.40,-1.44,-0.1799
164,1.13,1.195,1.07,3.86,1.23,194.1,0.31,14.00,1.38,1.393,...,91,10.93,56,5.6,8.0,6.1,5.1,0.40,-1.44,-0.1799
165,1.13,1.195,1.07,3.86,1.23,194.1,0.31,14.00,1.38,1.393,...,94,13.28,134,-2.8,-1.6,0.8,-5.7,0.60,-1.62,-0.2693
166,1.13,1.195,1.07,3.86,1.23,194.1,0.31,14.00,1.38,1.393,...,87,17.56,93,1.8,-2.5,-4.8,-0.3,0.95,-1.84,-0.4114


In [11]:
# Get labels
All_singles_y = merged["resistance"]
All_singles_y

0      resistant
1      resistant
2      resistant
3      sensitive
4      sensitive
         ...    
163    sensitive
164    resistant
165    resistant
166    resistant
167    resistant
Name: resistance, Length: 168, dtype: object

In the following sections, some code was initially written by F.D. Rouleau and M. Giguere, then adapted by R. Durand.

## Training

In [12]:
# Train-test split on all single mutants
X_train, X_test, y_train, y_test = train_test_split(
    All_singles_x, All_singles_y, test_size=0.3, random_state=18
)

In [13]:
# Scale
scaler = StandardScaler()
scaler.fit(X_train)
X_train_s = pd.DataFrame(scaler.transform(X_train), columns=X_train.columns)
X_test_s = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns)

## Hyperparameter tuning

In [14]:
# RandomizedSearchCV

# Copy paste cell to perform tests, then copy param_grid in cell below

max_depth = [
    int(x) for x in np.linspace(1, 50, num=10)
]  # in our case the complexity does not warrant a large tree depth and instead often leads to overtraining
max_depth.append(None)

param_grid = {
    "max_depth": max_depth,
    "min_samples_split": [2, 5, 10, 20, 30, 40, 50],
    "n_estimators": [int(x) for x in np.linspace(start=50, stop=300, num=20)],
    "min_samples_leaf": [1, 2, 5, 10, 20, 30, 50, 100],
    "max_features": ("sqrt", "log2"),
    "bootstrap": [True, False],
    "class_weight": ["balanced", None],
}

# Repeat the randomized search several times to see how stable the selected
# parameters are. The iteration index seeds the candidate sampler, so each repeat
# draws different candidates while a full re-run prints the same results. The
# forest itself is seeded as well, otherwise the cross-validation scores (and
# therefore the best parameters) would still change from run to run.
for i in range(0, 5):
    CV_rf = RandomizedSearchCV(
        RandomForestClassifier(random_state=18),
        param_grid,
        n_jobs=-1,
        cv=5,
        scoring="balanced_accuracy",
        random_state=i,
    )
    CV_rf.fit(X_train_s, y_train)

    print(CV_rf.best_params_)

{'n_estimators': 63, 'min_samples_split': 40, 'min_samples_leaf': 5, 'max_features': 'sqrt', 'max_depth': 28, 'class_weight': 'balanced', 'bootstrap': False}


{'n_estimators': 300, 'min_samples_split': 10, 'min_samples_leaf': 5, 'max_features': 'log2', 'max_depth': 11, 'class_weight': 'balanced', 'bootstrap': False}


{'n_estimators': 63, 'min_samples_split': 2, 'min_samples_leaf': 5, 'max_features': 'log2', 'max_depth': 39, 'class_weight': 'balanced', 'bootstrap': False}


{'n_estimators': 181, 'min_samples_split': 40, 'min_samples_leaf': 1, 'max_features': 'log2', 'max_depth': 44, 'class_weight': 'balanced', 'bootstrap': True}


{'n_estimators': 300, 'min_samples_split': 5, 'min_samples_leaf': 5, 'max_features': 'sqrt', 'max_depth': 6, 'class_weight': 'balanced', 'bootstrap': False}


In [15]:
#### WORKING CELL COPIED FROM ABOVE - EDIT AND RUN AS NEEDED UNTIL CONVERGENCE OF PARAMETER VALUES


# RandomizedSearchCV

max_depth = [int(x) for x in np.linspace(5, 30, num=10)]

param_grid = {
    "max_depth": max_depth,
    "min_samples_split": [2, 5, 10, 20, 30, 40, 50],
    "n_estimators": [int(x) for x in np.linspace(start=10, stop=300, num=20)],
    "min_samples_leaf": [1, 2, 5],
    "max_features": ("sqrt", "log2"),
    "bootstrap": [True, False],
    "class_weight": ["balanced", None],
}

# Repeat the randomized search to judge whether the selected values converge.
# The iteration index seeds the candidate sampler (different candidates per
# repeat, identical results on a re-run) and the forest is seeded too, so the
# cross-validation scores no longer vary between executions.
for i in range(0, 10):
    CV_rf = RandomizedSearchCV(
        RandomForestClassifier(random_state=18),
        param_grid,
        n_jobs=-1,
        cv=5,
        scoring="balanced_accuracy",
        random_state=i,
    )
    CV_rf.fit(X_train_s, y_train)

    print(CV_rf.best_params_)

{'n_estimators': 300, 'min_samples_split': 40, 'min_samples_leaf': 2, 'max_features': 'log2', 'max_depth': 24, 'class_weight': 'balanced', 'bootstrap': True}


{'n_estimators': 25, 'min_samples_split': 20, 'min_samples_leaf': 5, 'max_features': 'sqrt', 'max_depth': 24, 'class_weight': 'balanced', 'bootstrap': True}


{'n_estimators': 116, 'min_samples_split': 40, 'min_samples_leaf': 5, 'max_features': 'log2', 'max_depth': 10, 'class_weight': 'balanced', 'bootstrap': False}


{'n_estimators': 10, 'min_samples_split': 2, 'min_samples_leaf': 2, 'max_features': 'sqrt', 'max_depth': 27, 'class_weight': None, 'bootstrap': True}


{'n_estimators': 193, 'min_samples_split': 10, 'min_samples_leaf': 5, 'max_features': 'sqrt', 'max_depth': 21, 'class_weight': 'balanced', 'bootstrap': True}


{'n_estimators': 116, 'min_samples_split': 20, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'max_depth': 5, 'class_weight': 'balanced', 'bootstrap': False}


{'n_estimators': 55, 'min_samples_split': 10, 'min_samples_leaf': 2, 'max_features': 'sqrt', 'max_depth': 16, 'class_weight': 'balanced', 'bootstrap': False}


{'n_estimators': 101, 'min_samples_split': 30, 'min_samples_leaf': 5, 'max_features': 'sqrt', 'max_depth': 13, 'class_weight': 'balanced', 'bootstrap': True}


{'n_estimators': 284, 'min_samples_split': 50, 'min_samples_leaf': 5, 'max_features': 'log2', 'max_depth': 7, 'class_weight': 'balanced', 'bootstrap': True}


{'n_estimators': 55, 'min_samples_split': 40, 'min_samples_leaf': 5, 'max_features': 'sqrt', 'max_depth': 27, 'class_weight': 'balanced', 'bootstrap': True}


In [16]:
# Use Gridsearch & Random Forest

print(compound)

# Compound-specific search grids. Every grid pins random_state so that the
# exhaustive search gives the same result on each run.
grids_by_compound = {
    "caspofungin": {
        "max_depth": [int(x) for x in np.linspace(5, 15, num=5)],
        "min_samples_split": [10, 20, 50],
        "n_estimators": [int(x) for x in np.linspace(start=50, stop=300, num=5)],
        "min_samples_leaf": [2, 5],
        "max_features": ["log2", "sqrt"],
        "bootstrap": [True, False],
        "class_weight": ["balanced"],
        "random_state": [18],
    },
    "anidulafungin": {
        "max_depth": [int(x) for x in np.linspace(5, 30, num=5)],
        "min_samples_split": [5, 10, 50],
        "n_estimators": [int(x) for x in np.linspace(start=20, stop=300, num=5)],
        "min_samples_leaf": [1, 2, 5],
        "max_features": ["log2", "sqrt"],
        "bootstrap": [False],
        "class_weight": ["balanced"],
        "random_state": [18],
    },
    "micafungin": {
        "max_depth": [int(x) for x in np.linspace(10, 30, num=5)],
        "min_samples_split": [2, 5],
        "n_estimators": [int(x) for x in np.linspace(start=30, stop=260, num=5)],
        "min_samples_leaf": [1, 2, 5],
        "max_features": ["log2", "sqrt"],
        "bootstrap": [True, False],
        "class_weight": ["balanced", None],
        "random_state": [18],
    },
}

# Fail loudly on an unsupported compound. Merely printing a message would let the
# search continue with whatever `param_grid` an earlier cell left in the namespace.
if compound not in grids_by_compound:
    raise ValueError(
        f"Wrong compound: {compound!r} (expected one of {sorted(grids_by_compound)})"
    )
param_grid = grids_by_compound[compound]


model = GridSearchCV(
    estimator=RandomForestClassifier(),
    param_grid=param_grid,
    n_jobs=-1,
    cv=5,
    scoring="balanced_accuracy",
)
model.fit(X_train_s, y_train)

print(model.best_params_)

caspofungin


{'bootstrap': False, 'class_weight': 'balanced', 'max_depth': 5, 'max_features': 'sqrt', 'min_samples_leaf': 5, 'min_samples_split': 20, 'n_estimators': 175, 'random_state': 18}


In [17]:
# Random Forest on optimized parameters

print(compound)

# Fixed hyperparameters used for the final model of each compound
# (single values here, not lists of candidates).
best_params_by_compound = {
    "caspofungin": {
        "bootstrap": False,
        "class_weight": "balanced",
        "max_depth": 5,
        "max_features": "sqrt",
        "min_samples_leaf": 5,
        "min_samples_split": 20,
        "n_estimators": 175,
        "random_state": 18,
    },
    "anidulafungin": {
        "bootstrap": False,
        "class_weight": "balanced",
        "max_depth": 11,
        "max_features": "sqrt",
        "min_samples_leaf": 1,
        "min_samples_split": 5,
        "n_estimators": 230,
        "random_state": 18,
    },
    "micafungin": {
        "bootstrap": False,
        "class_weight": "balanced",
        "max_depth": 15,
        "max_features": "log2",
        "min_samples_leaf": 1,
        "min_samples_split": 5,
        "n_estimators": 30,
        "random_state": 18,
    },
}

# Fail loudly on an unsupported compound. Merely printing a message would let the
# classifier be built from whatever `param_grid` an earlier cell left in the
# namespace (e.g. a search grid made of lists).
if compound not in best_params_by_compound:
    raise ValueError(
        f"Wrong compound: {compound!r} "
        f"(expected one of {sorted(best_params_by_compound)})"
    )
param_grid = best_params_by_compound[compound]


model = RandomForestClassifier(**param_grid)
model.fit(X_train_s, y_train)

caspofungin


0,1,2
,n_estimators,175
,criterion,'gini'
,max_depth,5
,min_samples_split,20
,min_samples_leaf,5
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,False


In [18]:
# Class predictions: the train-set predictions are there to evaluate overfitting,
# the test-set predictions to evaluate performance on held-out data
train_pred = model.predict(X_train_s)
test_pred = model.predict(X_test_s)

# Class probabilities for the same two sets
train_pred_prob = model.predict_proba(X_train_s)
test_pred_prob = model.predict_proba(X_test_s)

## Performance on training and test sets

In [19]:
# Display performance metrics


def confusion_rates(cm):
    """Derive summary rates from a 2x2 confusion matrix.

    Parameters
    ----------
    cm : array of shape (2, 2)
        Confusion matrix with true classes as rows and predicted classes as
        columns. Index 0 is treated as the negative class and index 1 as the
        positive class.

    Returns
    -------
    dict
        Accuracy, TPR, TNR, PPV and NPV, each rounded to 2 decimals, in that order.
    """
    # Row-major order of a 2x2 matrix: [0, 0], [0, 1], [1, 0], [1, 1]
    TN, FP, FN, TP = cm.ravel()
    return {
        "Accuracy": round((TP + TN) / np.sum(cm), 2),
        "TPR": round(TP / (TP + FN), 2),  # true positive rate, sensitivity, recall
        "TNR": round(TN / (TN + FP), 2),  # true negative rate, specificity
        "PPV": round(TP / (TP + FP), 2),  # positive predictive value, precision
        "NPV": round(TN / (TN + FN), 2),  # negative predictive value
    }


def print_confusion_rates(set_name, cm):
    """Print the rates of `confusion_rates(cm)` on one line, prefixed with the set name."""
    rates = ", ".join(f"{name}: {value}" for name, value in confusion_rates(cm).items())
    print(f"{set_name} set - {rates}")


print(
    "Balanced accuracy on train set: %0.1f %%"
    % (100 * balanced_accuracy_score(y_train, train_pred))
)
print(
    "Balanced accuracy on test set: %0.1f %%\n\n"
    % (100 * balanced_accuracy_score(y_test, test_pred))
)
print(classification_report(y_train, train_pred))
print(classification_report(y_test, test_pred))

# confusion_matrix orders the classes by sorted label, so with the
# "resistant" / "sensitive" labels index 0 is "resistant" and index 1 is
# "sensitive": the "positive" class in TPR/PPV below is therefore "sensitive".
cm_train = confusion_matrix(y_train, train_pred)
print_confusion_rates("Train", cm_train)

cm_test = confusion_matrix(y_test, test_pred)
print_confusion_rates("Test", cm_test)

Balanced accuracy on train set: 87.1 %
Balanced accuracy on test set: 83.5 %


              precision    recall  f1-score   support

   resistant       0.98      0.76      0.86        67
   sensitive       0.75      0.98      0.85        50

    accuracy                           0.85       117
   macro avg       0.87      0.87      0.85       117
weighted avg       0.88      0.85      0.86       117

              precision    recall  f1-score   support

   resistant       0.95      0.71      0.82        28
   sensitive       0.73      0.96      0.83        23

    accuracy                           0.82        51
   macro avg       0.84      0.84      0.82        51
weighted avg       0.85      0.82      0.82        51

Train set - Accuracy: 0.85, TPR: 0.98, TNR: 0.76, PPV: 0.75, NPV: 0.98
Test set - Accuracy: 0.82, TPR: 0.96, TNR: 0.71, PPV: 0.73, NPV: 0.95


## Export model

In [20]:
# Serialize the fitted classifier to the output directory.
# NOTE(review): only the classifier is saved. It was fitted on StandardScaler-
# transformed features, so new inputs need the same scaling - confirm that the
# scaler is rebuilt or stored wherever this model is loaded.
model_path = f"{outpath}/{compound}_model.pkl"
with open(model_path, "wb") as handle:
    pickle.dump(model, handle)