In [1]:
import pandas as pd

In [17]:
# Four source tables that are stacked into `combined_data` below.
# 15,618 compounds
Mtb_published_regression_AC_Cleaned = pd.read_csv(
    '/rds/general/user/qg622/home/Y2/Antibacterial/data/Mtb_published_regression_AC_Cleaned.csv'
)
# 1,193 compounds
RCB_Mtb_inhibition_20072019 = pd.read_csv(
    '/rds/general/user/qg622/home/Y2/Antibacterial/data/RCB_Mtb_inhibition_2007-2019.csv'
)
# 258 compounds
Mtb_testset_2017 = pd.read_csv(
    '/rds/general/user/qg622/home/Y2/Antibacterial/data/2017_Mtb_testset.csv'
)
# 1,196 compounds
Mtb_predictions_Vadim_100nM = pd.read_csv(
    '/rds/general/user/qg622/home/Y2/Antibacterial/data/Mtb_predictions_Vadim_100nM.csv'
)

# Row-wise stack of the four tables; the original indices are discarded.
combined_data = pd.concat(
    [
        Mtb_published_regression_AC_Cleaned,
        RCB_Mtb_inhibition_20072019,
        Mtb_testset_2017,
        Mtb_predictions_Vadim_100nM,
    ],
    ignore_index=True,
)

# ChEMBL (CSV) and PubChem (TSV) exports; rows without a SMILES value are
# dropped immediately after reading.
chembl = pd.read_csv(
    '/rds/general/user/qg622/home/Y2/Antibacterial/data/data_from_pubchem_chembl/chembl.csv'
).dropna(subset=['SMILES'])
pubchem = pd.read_csv(
    '/rds/general/user/qg622/home/Y2/Antibacterial/data/data_from_pubchem_chembl/pubchem.tsv',
    sep='\t',
).dropna(subset=['SMILES'])

In [18]:
# Keep PubChem rows whose 'acname' mentions 'MIC' (case-insensitive; missing
# values count as "no match").
pubchem_filtered = pubchem.loc[pubchem['acname'].str.contains('MIC', case=False, na=False)]

# Keep ChEMBL rows whose 'Standard Type' mentions 'MIC' (same matching rules).
chembl_filtered = chembl.loc[chembl['Standard Type'].str.contains('MIC', case=False, na=False)]

# SMILES already present in the combined (Ekins) data.
# NOTE(review): membership is an exact string comparison; no SMILES
# canonicalisation is visible here - confirm all sources share one convention.
ekins_smiles = combined_data['SMILES']

# Rows from each external source whose SMILES is absent from the Ekins data.
molecules_in_chembl_not_in_ekins = chembl_filtered.loc[~chembl_filtered['SMILES'].isin(ekins_smiles)]
molecules_in_pubchem_not_in_ekins = pubchem_filtered.loc[~pubchem_filtered['SMILES'].isin(ekins_smiles)]

# Report the sizes. These are row counts, so a SMILES that occurs in several
# rows is counted once per row.
print("Molecules in ChEMBL not in Ekins:", len(molecules_in_chembl_not_in_ekins), sep="\n")
print("\nMolecules in PubChem not in Ekins:", len(molecules_in_pubchem_not_in_ekins), sep="\n")


Molecules in ChEMBL not in Ekins:
139

Molecules in PubChem not in Ekins:
148


In [19]:
# Write both "not in Ekins" tables to CSV, omitting the index column.
# NOTE(review): these paths are relative to the kernel's working directory,
# whereas the inputs were read from absolute paths - confirm the cwd.
molecules_in_chembl_not_in_ekins.to_csv(
    path_or_buf='./data/data_from_pubchem_chembl/molecules_in_chembl_not_in_ekins.csv',
    index=False,
)
molecules_in_pubchem_not_in_ekins.to_csv(
    path_or_buf='./data/data_from_pubchem_chembl/molecules_in_pubchem_not_in_ekins.csv',
    index=False,
)

In [20]:
# Number of distinct SMILES among the ChEMBL rows missing from the Ekins data
# (dropna=False so the count matches len(Series.unique())).
molecules_in_chembl_not_in_ekins['SMILES'].nunique(dropna=False)

113

In [21]:
# Number of distinct SMILES among the PubChem rows missing from the Ekins data
# (dropna=False so the count matches len(Series.unique())).
molecules_in_pubchem_not_in_ekins['SMILES'].nunique(dropna=False)

143

In [29]:
# Size of the union of the distinct SMILES from both "not in Ekins" tables.
len(
    set(molecules_in_chembl_not_in_ekins['SMILES'].unique()).union(
        molecules_in_pubchem_not_in_ekins['SMILES'].unique()
    )
)

256

# Results calculation

In [1]:
import numpy as np
import pandas as pd
from scipy.stats import pearsonr, spearmanr, gaussian_kde
from sklearn.metrics import (accuracy_score, auc, confusion_matrix,
                             matthews_corrcoef, mean_absolute_error,
                             mean_squared_error, r2_score, roc_curve)




def calculate_resutls(y_test, y_pred, threshold=6):
    """Compute regression and thresholded-classification metrics for predictions.

    Both inputs are flattened to 1-D arrays. Regression metrics are computed on
    the raw values. For the classification metrics each value is binarised as
    class 0 if it is below ``threshold`` and class 1 otherwise.

    Parameters
    ----------
    y_test : array-like
        Observed values.
    y_pred : array-like
        Predicted values, same length as ``y_test``.
    threshold : float, default 6
        Cut-off used to binarise both observed and predicted values.

    Returns
    -------
    dict
        Keys: 'y_test', 'y_pred' (flattened arrays), 'pearson_rp',
        'spearman_rs', 'rmse', 'r2', 'accuracy', 'mcc', 'hit_rate', 'mae',
        'cm' (2x2 confusion-matrix DataFrame, rows = observed class,
        columns = predicted class), 'roc_auc', 'tp', 'fp', 'tn', 'fn'.

    Notes
    -----
    * 'hit_rate' is TP / (TP + FP) * 100, and 0 when nothing is predicted as
      class 1.
    * The ROC curve uses the continuous predictions as scores against the
      binarised observations.
    * A one-line summary of the main metrics is printed as a side effect.
    """
    y_pred = np.array(y_pred).reshape(-1)
    y_test = np.array(y_test).reshape(-1)

    # Regression metrics on the raw values.
    pearson_rp = pearsonr(y_test, y_pred)[0]
    spearman_rs = spearmanr(y_test, y_pred)[0]
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    r2 = r2_score(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)

    # Binarise: below the threshold -> 0, otherwise -> 1.
    obs = np.where(y_test < threshold, 0, 1)
    pred = np.where(y_pred < threshold, 0, 1)
    mcc = matthews_corrcoef(obs, pred)
    accuracy = accuracy_score(obs, pred)

    # Fix the label set so the matrix is always 2x2, even when only one class
    # occurs; otherwise the [1, 1] look-ups below raise IndexError.
    cm = pd.DataFrame(confusion_matrix(obs, pred, labels=[0, 1]))
    tn, fp = cm.iloc[0, 0], cm.iloc[0, 1]
    fn, tp = cm.iloc[1, 0], cm.iloc[1, 1]

    # Guard the division explicitly: a 0/0 on numpy integers yields NaN (not an
    # exception), and an `is np.nan` identity test does not detect it.
    predicted_positives = tp + fp
    hit_rate = tp / predicted_positives * 100 if predicted_positives > 0 else 0

    fpr, tpr, thresholds = roc_curve(obs, y_pred)
    roc_auc = auc(fpr, tpr)

    test_results = {
        'y_test': y_test,
        'y_pred': y_pred,
        'pearson_rp': np.float64(pearson_rp),
        'spearman_rs': np.float64(spearman_rs),
        'rmse': np.float64(rmse),
        'r2': np.float64(r2),
        'accuracy': np.float64(accuracy),
        'mcc': np.float64(mcc),
        'hit_rate': np.float64(hit_rate),
        'mae': np.float64(mae),
        'cm': cm,
        'roc_auc': np.float64(roc_auc),
        'tp': float(tp),
        'fp': float(fp),
        'tn': float(tn),
        'fn': float(fn)
    }
    print(f'RMSE: {test_results["rmse"]:.3f}|MAE: {test_results["mae"]:.3f}|$R_p$: {test_results["pearson_rp"]:.3f}|$R_s$: {test_results["spearman_rs"]:.3f}|MCC: {test_results["mcc"]:.3f}|ROC AUC: {test_results["roc_auc"]:.3f}|Hit rate (%): {test_results["hit_rate"]:.1f}')

    return test_results

In [4]:
# Common path prefix for the prediction files loaded below.
root = '/rds/general/user/qg622/home/'
results_dir = f'{root}Y2/Antibacterial/results/'

# One table of predictions per evaluation set.
df = pd.read_csv(results_dir + 'Mtb_20leftout_ypred.csv')
df_rcb = pd.read_csv(results_dir + 'rcb_ypred.csv')
df_2017 = pd.read_csv(results_dir + '2017_ypred.csv')
                 


In [5]:
# Score the predictions in `df`: observed 'activity' against predicted 'pred'.
calculate_resutls(y_test=df['activity'], y_pred=df['pred'])

RMSE: 0.615|MAE: 0.466|$R_p$: 0.675|$R_s$: 0.627|MCC: 0.518|ROC AUC: 0.709|Hit rate (%): 71.5


{'y_test': array([4.9742 , 5.39794, 6.69627, ..., 4.79588, 4.87128, 5.00877]),
 'y_pred': array([5.1838684, 4.9218383, 6.4496317, ..., 5.2007904, 4.4993353,
        5.057248 ]),
 'pearson_rp': 0.6749657408682188,
 'spearman_rs': 0.6274087699235991,
 'rmse': 0.6153878313023959,
 'r2': 0.43621697168454077,
 'accuracy': 0.9106914212548015,
 'mcc': 0.5176408190498287,
 'hit_rate': 71.48936170212767,
 'mae': 0.4657545190460948,
 'cm':       0    1
 0  2677   67
 1   212  168,
 'roc_auc': 0.7088441767684517,
 'tp': 168.0,
 'fp': 67.0,
 'tn': 2677.0,
 'fn': 212.0}