In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import rdkit.Chem as Chem
from scipy.stats import spearmanr
from sklearn.metrics import r2_score

In [2]:
def calculate_percentile(subset_df, experimental_value):
    """Return the percentile rank of ``experimental_value`` within ``subset_df['True']``.

    Parameters
    ----------
    subset_df : pandas.DataFrame
        Frame with a ``'True'`` column holding the reference values.
    experimental_value : scalar
        The value to rank against the ``'True'`` column.

    Returns
    -------
    float
        Percentage (0-100) of rows whose ``'True'`` value is less than or
        equal to ``experimental_value``.
    """
    # Boolean mask of rows at or below the query value; its mean is the fraction.
    at_or_below = subset_df['True'].le(experimental_value)
    fraction = at_or_below.mean()
    return fraction * 100

In [4]:
# Load the substrate key table and build a lookup from SMILES string to
# substrate number (a later duplicate SMILES overrides an earlier one).
substrates = pd.read_csv('substrate_key.csv')
substrate_key = {
    smiles: number
    for smiles, number in zip(substrates['smiles'], substrates['substrate_number'])
}

In [8]:
# Read every model prediction file into one mapping keyed by a short
# model/split label, then expose each frame under its own variable name.

dfs = {
    # control experiments and one-hot-encoded baseline (MLP)
    'xscramble': pd.read_csv('../Modeling/Results/mlp_preds_Xscrambled.csv'),
    'yscramble': pd.read_csv('../Modeling/Results/mlp_preds_yscrambled.csv'),
    'OHE': pd.read_csv('../Modeling/Results/mlp_preds_ohe.csv'),
    # other model families on the random split
    'rf_random': pd.read_csv('../Modeling/Results/RF_preds_random.csv'),
    'lgbm_random': pd.read_csv('../Modeling/Results/LGBM_preds_random.csv'),
    'svm_random': pd.read_csv('../Modeling/Results/SVM_preds_random.csv'),
    # MLP on the random, leave-one-out and external splits
    'mlp_random': pd.read_csv('../Modeling/Results/mlp_preds_random.csv'),
    'mlp_loocv': pd.read_csv('../Modeling/Results/mlp_preds_loocv.csv'),
    'mlp_external': pd.read_csv('../Modeling/Results/mlp_preds_external.csv'),
    # one-hot-encoded MLP on the leave-one-out and external splits
    'ohe_loocv': pd.read_csv('../Modeling/Results/mlp_preds_ohe_loocv.csv'),
    'ohe_external': pd.read_csv('../Modeling/Results/mlp_preds_ohe_external.csv'),
}

# Individual handles for the frames (same objects as the entries in `dfs`).
df_xscramble = dfs['xscramble']
df_yscramble = dfs['yscramble']
df_OHE = dfs['OHE']

df_rf_random = dfs['rf_random']
df_lgbm_random = dfs['lgbm_random']
df_svm_random = dfs['svm_random']

df_random = dfs['mlp_random']
df_loocv = dfs['mlp_loocv']
df_external = dfs['mlp_external']

df_ohe_loocv = dfs['ohe_loocv']
df_ohe_external = dfs['ohe_external']

In [10]:
# Calculate APYR per substrate: for every model/split in `dfs`, every boronic
# acid and every sulfonamide, take the conditions whose predicted yield is
# close to the best predicted yield and average the percentile rank of their
# experimental ('True') yields among all conditions for that substrate.

# Absolute window below the best predicted yield that still counts as a
# "top" condition.
# NOTE(review): assumes yields are stored as fractions (0-1), so 0.05 == 5% — confirm.
top_prediction_window = 0.05

percentile_results = {}

for key, value in dfs.items():
    df_ = value
    percentile_results[key] = {}

    df_ome = df_[df_['boronic_acid'] == 'COc1ccc(B(O)O)cc1']
    df_cf3 = df_[df_['boronic_acid'] == 'OB(O)c1ccc(C(F)(F)F)cc1']

    subsets = {'ome': df_ome, 'cf3': df_cf3}

    for k, v in subsets.items():
        # Skip only this boronic acid when the split has no rows for it.
        # (`break` here would also discard any remaining, non-empty subsets.)
        if v.shape[0] == 0:
            continue

        percents = []

        for substrate, substrate_df in v.groupby('sulfonamide'):

            # Best predicted yield for the current substrate
            max_predicted = substrate_df['Predicted'].max()

            # Conditions whose predicted yield lies within the window of the best
            # prediction. The best-predicted row always satisfies this, so the
            # selection is only empty when `max_predicted` is NaN.
            threshold = max_predicted - top_prediction_window
            filtered_df = substrate_df[substrate_df['Predicted'] >= threshold]

            # Percentile of each selected condition's experimental yield among
            # all experimental yields recorded for this substrate
            percentiles = [calculate_percentile(substrate_df, true_yield)
                           for true_yield in filtered_df['True']]

            # Average percentile for this substrate (None if nothing was selected)
            avg_percentile = sum(percentiles) / len(percentiles) if percentiles else None

            # Store the result for this substrate
            percents.append({
                'substrate': substrate,
                'average_percentile': avg_percentile,
                'number_of_conditions': len(filtered_df),
                'max_predicted': max_predicted
            })

        percentile_results[key][k] = pd.DataFrame(percents)

# Stack everything into one frame indexed by (dataset, boronic_acid, row).
# Splits that produced no results are left out, because pd.concat raises on
# an empty mapping.
concat_df = pd.concat({outer_key: pd.concat(inner_dict, names=['boronic_acid'])
                       for outer_key, inner_dict in percentile_results.items()
                       if inner_dict},
                      names=['dataset'])

# The unnamed innermost index level comes out of reset_index() as 'level_2'.
concat_df = concat_df.reset_index().rename(columns={'level_2': 'original_index'})
#concat_df.to_csv('../../../../Downloads/average_percentile_metrics_final_2024-10-28.csv', index=False)

In [11]:
# Show the combined per-substrate average-percentile table (one row per
# dataset / boronic acid / substrate).
concat_df

Unnamed: 0,dataset,boronic_acid,original_index,substrate,average_percentile,number_of_conditions,max_predicted
0,xscramble,ome,0,CC(C)(C)OC(=O)N1CCC(S(N)(=O)=O)CC1,61.718750,2,0.558521
1,xscramble,ome,1,CCOC(=O)c1ccccc1S(N)(=O)=O,73.255814,2,0.633508
2,xscramble,ome,2,CCOc1ccc(S(N)(=O)=O)cc1OCC,67.201166,1,0.593551
3,xscramble,ome,3,CN(C)C(=O)c1cccnc1S(N)(=O)=O,22.204724,6,0.445201
4,xscramble,ome,4,CN(C)CCS(N)(=O)=O,100.000000,1,0.499819
...,...,...,...,...,...,...,...
327,ohe_external,ome,17,NS(=O)(=O)c1ccc(Cl)s1,62.500000,2,0.516113
328,ohe_external,ome,18,NS(=O)(=O)c1ccc(OC(F)(F)F)cc1Cl,63.333333,2,0.516113
329,ohe_external,ome,19,NS(=O)(=O)c1cccc2nonc12,68.750000,2,0.516113
330,ohe_external,ome,20,NS(=O)(=O)c1ccccc1,87.500000,2,0.516113


In [12]:
# Build a dataframe of the top-predicted conditions for the LOOCV split: for
# each (sulfonamide, boronic_acid) pair keep the conditions whose predicted
# yield is within 0.05 of that pair's maximum predicted yield, and annotate
# each with the percentile of its experimental yield within the pair.

top_condition_frames = []

for k, v in df_loocv.groupby(['sulfonamide', 'boronic_acid']):

    max_predicted = v['Predicted'].max()

    # Conditions whose predicted yield is within 0.05 of the maximum predicted yield
    threshold = max_predicted - 0.05
    # Take an explicit copy: adding a column to a filtered slice of `v` is what
    # triggers pandas' SettingWithCopyWarning.
    filtered_df = v[v['Predicted'] >= threshold].copy()

    # Percentile of each kept condition's experimental yield among all
    # conditions of this (sulfonamide, boronic_acid) pair
    filtered_df['Percentile'] = [calculate_percentile(v, true_yield)
                                 for true_yield in filtered_df['True']]

    top_condition_frames.append(filtered_df)

preds = pd.concat(top_condition_frames)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df['Percentile'] = percentiles


In [13]:
# Calculate Spearman's rank correlation between predicted and experimental
# yields, per sulfonamide, for every data split in `dfs`.

# Results keyed as spearman_results[split][sulfonamide] -> {correlation, p-value}
spearman_results = {}

# Columns every prediction frame must provide for the correlation
required_columns = {'Predicted', 'True'}

for key, value in dfs.items():
    spearman_results[key] = {}

    # Check once per split (every group shares the parent frame's columns)
    # and skip the split instead of reporting the same problem per group.
    if not required_columns.issubset(value.columns):
        print(f"Missing columns in dataset split: {key}")
        continue

    for sulf, df_ in value.groupby('sulfonamide'):
        spearman_corr, p_value = spearmanr(df_['Predicted'], df_['True'])

        # Store the result in the dictionary
        spearman_results[key][sulf] = {
            'Spearman_Correlation': spearman_corr,
            'P_value': p_value
        }

# Combine the results into a single df. Per-split frames are collected in a
# list and concatenated once rather than growing a DataFrame inside the loop.
output_columns = ['dataset', 'smiles', 'Spearman_Correlation', 'P_value']
split_frames = []
for k, per_sulfonamide in spearman_results.items():
    # A split with no results has none of the output columns; selecting them
    # would raise KeyError, so leave it out.
    if not per_sulfonamide:
        continue

    split_df = pd.DataFrame(per_sulfonamide).T.reset_index().rename(columns={'index': 'smiles'})
    split_df['dataset'] = k
    split_frames.append(split_df[output_columns])

# Each split keeps its own 0..n-1 row index, as before.
spearman_df = pd.concat(split_frames) if split_frames else pd.DataFrame(columns=output_columns)

In [15]:
# Show the per-sulfonamide Spearman correlations and p-values for every split.
spearman_df

Unnamed: 0,dataset,smiles,Spearman_Correlation,P_value
0,xscramble,CC(C)(C)OC(=O)N1CCC(S(N)(=O)=O)CC1,-0.066100,0.079666
1,xscramble,CCOC(=O)c1ccccc1S(N)(=O)=O,-0.064609,0.018760
2,xscramble,CCOc1ccc(S(N)(=O)=O)cc1OCC,-0.033281,0.384112
3,xscramble,CN(C)C(=O)c1cccnc1S(N)(=O)=O,-0.049412,0.213705
4,xscramble,CN(C)CCS(N)(=O)=O,-0.018696,0.494745
...,...,...,...,...
17,ohe_external,NS(=O)(=O)c1ccc(Cl)s1,0.479412,0.060239
18,ohe_external,NS(=O)(=O)c1ccc(OC(F)(F)F)cc1Cl,-0.217857,0.435393
19,ohe_external,NS(=O)(=O)c1cccc2nonc12,0.035294,0.896753
20,ohe_external,NS(=O)(=O)c1ccccc1,0.858824,0.000020


In [16]:
import statistics as stats

# Print the mean and sample standard deviation of the per-substrate average
# percentile for each dataset, in order of first appearance in `concat_df`.
for dataset_name in concat_df['dataset'].unique():
    dataset_rows = concat_df[concat_df['dataset'] == dataset_name]
    apyr_values = dataset_rows['average_percentile']

    print(dataset_name)
    print('avg: ' + str(np.average(apyr_values)))
    print('stdev: ' + str(stats.stdev(apyr_values)))
    print('    ')

xscramble
avg: 49.016105724914205
stdev: 24.71361731986486
    
yscramble
avg: 60.888508711686484
stdev: 20.291859380676684
    
OHE
avg: 94.21881057827476
stdev: 7.922178717186792
    
rf_random
avg: 90.89683854561332
stdev: 15.33156363017017
    
lgbm_random
avg: 92.10599553488325
stdev: 13.577824628466072
    
svm_random
avg: 90.32994944331972
stdev: 14.362225082523006
    
mlp_random
avg: 94.82569505631923
stdev: 6.269607318183303
    
mlp_loocv
avg: 85.13075575575576
stdev: 14.480942754085481
    
mlp_external
avg: 75.23516414141415
stdev: 15.853107618429767
    
ohe_loocv
avg: 83.39506917631918
stdev: 16.573902206008142
    
ohe_external
avg: 77.25378787878788
stdev: 19.97183412331372
    
