In [None]:
import pandas as pd

# Path of the descriptor table that the rest of the notebook reads.
AllDesc_file = '/users/file_classification_Smiles_AllDesc.txt'

# Load the table, using the first column of the file as the row index,
# and show it.
AllDesc_df = pd.read_csv(AllDesc_file, index_col=0)
print(AllDesc_df)

# Pull out the 'SMILES' and 'Classification' columns.
smiles = AllDesc_df['SMILES']
labels = AllDesc_df['Classification']

# Report the number of rows (one entry of `smiles` per row of the table).
print("Number of data points in the dataset:", len(smiles), sep="\n")

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.inspection import permutation_importance
import numpy as np

# Re-read the descriptor table from disk.
# NOTE(review): unlike the load at the top of the notebook, no index_col is
# passed here, so the file's first column stays an ordinary column of this
# frame -- confirm that this second read is intentional.
AllDesc_df = pd.read_csv(AllDesc_file)

# Names of the descriptor columns used as model inputs.
descnm = ['MaxAbsEStateIndex', 'MaxEStateIndex', 'MinAbsEStateIndex', 'MinEStateIndex', 'qed', 'SPS', 'MolWt',
          'HeavyAtomMolWt', 'ExactMolWt', 'NumValenceElectrons', 'NumRadicalElectrons', 'MaxPartialCharge',
          'MinPartialCharge', 'MaxAbsPartialCharge', 'MinAbsPartialCharge', 'FpDensityMorgan1', 'FpDensityMorgan2',
          'FpDensityMorgan3', 'AvgIpc', 'BalabanJ', 'BertzCT', 'Chi0', 'Chi0n', 'Chi0v', 'Chi1', 'Chi1n', 'Chi1v',
          'Chi2n', 'Chi2v', 'Chi3n', 'Chi3v', 'Chi4n', 'Chi4v', 'HallKierAlpha', 'Ipc', 'Kappa1', 'Kappa2',
          'Kappa3', 'LabuteASA', 'PEOE_VSA1', 'PEOE_VSA10', 'PEOE_VSA11', 'PEOE_VSA12', 'PEOE_VSA13', 'PEOE_VSA14',
          'PEOE_VSA2', 'PEOE_VSA3', 'PEOE_VSA4', 'PEOE_VSA5', 'PEOE_VSA6', 'PEOE_VSA7', 'PEOE_VSA8', 'PEOE_VSA9',
          'SMR_VSA1', 'SMR_VSA10', 'SMR_VSA2', 'SMR_VSA3', 'SMR_VSA4', 'SMR_VSA5', 'SMR_VSA6', 'SMR_VSA7',
          'SMR_VSA8', 'SMR_VSA9', 'SlogP_VSA1', 'SlogP_VSA10', 'SlogP_VSA11', 'SlogP_VSA12', 'SlogP_VSA2',
          'SlogP_VSA3', 'SlogP_VSA4', 'SlogP_VSA5', 'SlogP_VSA6', 'SlogP_VSA7', 'SlogP_VSA8', 'SlogP_VSA9',
          'TPSA', 'EState_VSA1', 'EState_VSA10', 'EState_VSA11', 'EState_VSA2', 'EState_VSA3', 'EState_VSA4',
          'EState_VSA5', 'EState_VSA6', 'EState_VSA7', 'EState_VSA8', 'EState_VSA9', 'VSA_EState1', 'VSA_EState10',
          'VSA_EState2', 'VSA_EState3', 'VSA_EState4', 'VSA_EState5', 'VSA_EState6', 'VSA_EState7', 'VSA_EState8',
          'VSA_EState9', 'FractionCSP3', 'HeavyAtomCount', 'NHOHCount', 'NOCount', 'NumAliphaticCarbocycles',
          'NumAliphaticHeterocycles', 'NumAliphaticRings', 'NumAromaticCarbocycles', 'NumAromaticHeterocycles',
          'NumAromaticRings', 'NumHAcceptors', 'NumHDonors', 'NumHeteroatoms', 'NumRotatableBonds',
          'NumSaturatedCarbocycles', 'NumSaturatedHeterocycles', 'NumSaturatedRings', 'RingCount', 'MolLogP',
          'MolMR', 'fr_Al_COO', 'fr_Al_OH', 'fr_Al_OH_noTert', 'fr_ArN', 'fr_Ar_COO', 'fr_Ar_N', 'fr_Ar_NH',
          'fr_Ar_OH', 'fr_COO', 'fr_COO2', 'fr_C_O', 'fr_C_O_noCOO', 'fr_C_S', 'fr_HOCCN', 'fr_Imine', 'fr_NH0',
          'fr_NH1', 'fr_NH2', 'fr_N_O', 'fr_Ndealkylation1', 'fr_Ndealkylation2', 'fr_Nhpyrrole', 'fr_SH',
          'fr_aldehyde', 'fr_alkyl_carbamate', 'fr_alkyl_halide', 'fr_allylic_oxid', 'fr_amide', 'fr_amidine',
          'fr_aniline', 'fr_aryl_methyl', 'fr_azide', 'fr_azo', 'fr_barbitur', 'fr_benzene', 'fr_benzodiazepine',
          'fr_bicyclic', 'fr_diazo', 'fr_dihydropyridine', 'fr_epoxide', 'fr_ester', 'fr_ether', 'fr_furan',
          'fr_guanido', 'fr_halogen', 'fr_hdrzine', 'fr_hdrzone', 'fr_imidazole', 'fr_imide', 'fr_isocyan',
          'fr_isothiocyan', 'fr_ketone', 'fr_ketone_Topliss', 'fr_lactam', 'fr_lactone', 'fr_methoxy',
          'fr_morpholine', 'fr_nitrile', 'fr_nitro', 'fr_nitro_arom', 'fr_nitro_arom_nonortho', 'fr_nitroso',
          'fr_oxazole', 'fr_oxime', 'fr_para_hydroxylation', 'fr_phenol', 'fr_phenol_noOrthoHbond', 'fr_phos_acid',
          'fr_phos_ester', 'fr_piperdine', 'fr_piperzine', 'fr_priamide', 'fr_prisulfonamd', 'fr_pyridine', 'fr_quatN',
          'fr_sulfide', 'fr_sulfonamd', 'fr_sulfone', 'fr_term_acetylene', 'fr_tetrazole', 'fr_thiazole',
          'fr_thiocyan', 'fr_thiophene', 'fr_unbrch_alkane', 'fr_urea']

# Model inputs are the descriptor columns; the target is the
# 'Classification' column.
features_org = AllDesc_df[descnm]
labels = AllDesc_df['Classification']

# Hold out 33% of the rows as a test set; the fixed seed keeps the split
# reproducible between runs.
feat_train, feat_test, label_train, label_test = train_test_split(
    features_org,
    labels,
    test_size=0.33,
    random_state=100,
)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

def _save_confusion_heatmap(conf_matrix, output_path):
    """Draw a 2x2 confusion matrix as an annotated greyscale heatmap and save it.

    The matrix must have been built with labels=[negative, positive] so that
    its flattened cells line up with TN, FP, FN, TP.

    Parameters
    ----------
    conf_matrix : 2x2 array of counts as returned by ``confusion_matrix``.
    output_path : str, path of the image file to write.
    """
    cell_names = ['True Negative', 'False Positive', 'False Negative', 'True Positive']
    cell_counts = [f"{count:0.0f}" for count in conf_matrix.flatten()]
    # Kept in a local name so the notebook-level `labels` (the target
    # column) is no longer overwritten by plot annotations.
    annotations = np.asarray(
        [f"{name}\n{count}" for name, count in zip(cell_names, cell_counts)]
    ).reshape(2, 2)
    fig, ax = plt.subplots(figsize=(5, 4))
    sns.heatmap(conf_matrix, annot=annotations, fmt='', cmap='Greys', ax=ax)
    fig.savefig(output_path)


# Define the model
RF_Name = RandomForestClassifier(n_estimators=75, random_state=100, max_depth=5, max_features='sqrt')

# Fit the model on the training split.
# (The original called the misspelled, undefined name `RF_Namne`, which
# raised NameError before the model was ever fitted.)
RF_Name.fit(feat_train, label_train)

# Predict on the test split
predictions = RF_Name.predict(feat_test)
# Column 1 of predict_proba holds the probability of RF_Name.classes_[1].
# NOTE(review): with the two string labels used below that should be
# 'Loop Binder' -- confirm before using these scores.
prob_predictions = RF_Name.predict_proba(feat_test)[:, 1]

# Test set accuracy
accuracy = accuracy_score(label_test, predictions)

# Helix (pos_label = Helix Binder)
precision_Helix = precision_score(label_test, predictions, pos_label='Helix Binder')
recall_Helix = recall_score(label_test, predictions, pos_label='Helix Binder')
f1_Helix = f1_score(label_test, predictions, pos_label='Helix Binder')

# Loop (pos_label = Loop Binder)
precision_Loop = precision_score(label_test, predictions, pos_label='Loop Binder')
recall_Loop = recall_score(label_test, predictions, pos_label='Loop Binder')
f1_Loop = f1_score(label_test, predictions, pos_label='Loop Binder')

# Confusion matrices. `labels` fixes the row/column order, so the second
# entry is the class treated as positive: H -> Helix positive, L -> Loop positive.
conf_matrixH = confusion_matrix(label_test, predictions, labels=['Loop Binder', 'Helix Binder'])
conf_matrixL = confusion_matrix(label_test, predictions, labels=['Helix Binder', 'Loop Binder'])

# Training set accuracy (compare with the test accuracy to spot overfitting)
training_accuracy = RF_Name.score(feat_train, label_train)

# Print metrics
print(f"Training set accuracy: {training_accuracy:.5f}")
print(f"Test set accuracy: {accuracy:.5f}")
print(f"Precision Pos=Helix: {precision_Helix:.5f}")
print(f"Recall Pos=Helix: {recall_Helix:.5f}")
print(f"F1 Pos=Helix: {f1_Helix:.5f}")
print(f"Precision Pos=Loop: {precision_Loop:.5f}")
print(f"Recall Pos=Loop: {recall_Loop:.5f}")
print(f"F1 Pos=Loop: {f1_Loop:.5f}")
print("Confusion Matrix L = 1:")
print(conf_matrixL)
print("Confusion Matrix H = 1:")
print(conf_matrixH)

# Heatmap of the confusion matrix - Helix Binder as the positive class
print("Helix Binders")
Output_Helix = '/users/HeatMap_Helix_CM_RF_Name.png'
_save_confusion_heatmap(conf_matrixH, Output_Helix)

# Heatmap of the confusion matrix - Loop Binder as the positive class
print("Loop Binders")
Output_Loop = '/users/HeatMap_Loop_CM_RF_Name.png'
_save_confusion_heatmap(conf_matrixL, Output_Loop)

In [None]:
# Compute permutation importance of every feature on feat_test / label_test,
# shuffling each feature 10 times with a fixed seed.
perm_importance = permutation_importance(
    RF_Name,
    feat_test,
    label_test,
    n_repeats=10,
    random_state=100,
    n_jobs=-1,
)

# List, from highest to lowest mean importance, the features whose mean
# stays above zero after subtracting two standard deviations.
for i in perm_importance.importances_mean.argsort()[::-1]:
    mean_i = perm_importance.importances_mean[i]
    std_i = perm_importance.importances_std[i]
    if not (mean_i - 2 * std_i > 0):
        continue
    print(f"{descnm[i]:<30} {mean_i:.3f} +/- {std_i:.3f}")

# Reorder names, means and standard deviations by ascending mean importance
# so the plot rises from left to right.
sorted_idx = perm_importance.importances_mean.argsort()
features = np.array(descnm)[sorted_idx]
importances_mean = perm_importance.importances_mean[sorted_idx]
importances_std = perm_importance.importances_std[sorted_idx]

# Stem plot of the mean importances, with the standard deviation drawn as
# red error bars on top.
plt.figure(figsize=(35, 15))
markerline, stemlines, baseline = plt.stem(
    features, importances_mean, linefmt='-', markerfmt='o', basefmt=' '
)
plt.errorbar(features, importances_mean, yerr=importances_std, fmt='none', ecolor='red', capsize=4)

# Titles and axis labels.
plt.title("Permutation Importance of Features", fontsize='30')
plt.xlabel("Feature", fontsize='30')
plt.ylabel("Permutation Importance", fontsize='30')

# Tick styling: feature names are rotated so that they do not overlap.
plt.xticks(rotation=90, fontsize='12')
plt.yticks(fontsize='15')

plt.margins(0.01)
plt.tight_layout()


# Write the figure to disk.
output_file = '/users/Plot_Permutation_Importance_RF_Name.png'
plt.savefig(output_file)