In [62]:
import pandas as pd
import numpy as np

# Load the order results and keep only compounds with a recorded 'Hit' value.
# Rows with a missing 'Hit' are:
#   - 13 cpds that were not ordered
#   - 4 cpds that were ordered but failed to dissolve
res_df = (
    pd.read_csv('../datasets/ams_order_results.csv.gz')
    .dropna(subset=['Hit'])
)

---
## Total Hits of Ordered Compounds

In [63]:
# Overall hit count and hit rate over every row kept in res_df.
n_hits = res_df['Hit'].sum()
n_ordered = res_df.shape[0]
print('Total hits: {0:.0f} out of {1:.0f}. Hit Rate: {2:.2f} %'.format(
    n_hits, n_ordered, 100.0*(n_hits/n_ordered)))

Total hits: 420 out of 1024. Hit Rate: 41.02 %


---
## Hits by Selection/Model Type

In [64]:
# Split the ordered compounds by the value of the 'Selection' column.
# 'Both' rows count towards the RF ('Prediction') and the baseline subsets alike.
selection_col = res_df['Selection']
RF_all = res_df[selection_col.isin(['Prediction', 'Both'])]
BL_all = res_df[selection_col.isin(['Baseline', 'Both'])]
RF_but_not_BL = res_df[selection_col.isin(['Prediction'])]
BL_but_not_RF = res_df[selection_col.isin(['Baseline'])]
overlap_selection = res_df[selection_col == 'Both']

# One [label, compound count, hit count] row per subset, in display order.
summary_by_selection = [
    [label, subset.shape[0], subset['Hit'].sum()]
    for label, subset in (
        ('RF or BL All', res_df),
        ('RF All', RF_all),
        ('BL All', BL_all),
        ('RF AND BL overlap', overlap_selection),
        ('RF but not BL', RF_but_not_BL),
        ('BL but not RF', BL_but_not_RF),
    )
]

# Use the labels as the row index; the temporary column name 0 is dropped
# afterwards so the index renders without a header.
summary_df = pd.DataFrame(
    data=summary_by_selection,
    columns=[0, 'Count', 'Hits'],
).set_index(0)
summary_df.index.name = None

# Derived columns: compounds that were not hits, and the hit rate in percent.
summary_df['Misses'] = summary_df['Count'] - summary_df['Hits']
summary_df['Hit Rate'] = 100.0*(summary_df['Hits'] / summary_df['Count'])
summary_df

Unnamed: 0,Count,Hits,Misses,Hit Rate
RF or BL All,1024,420.0,604.0,41.015625
RF All,701,339.0,362.0,48.359486
BL All,705,264.0,441.0,37.446809
RF AND BL overlap,382,183.0,199.0,47.905759
RF but not BL,319,156.0,163.0,48.902821
BL but not RF,323,81.0,242.0,25.077399


- We can see that the RF model, when given a similar budget to the Baseline (701 vs 705 cpds), achieves a hit rate roughly 11 percentage points higher (48.4% vs 37.4%). 
- RF and the baseline both select the same 382 cpds; 183 of these shared selections are hits. 
- RF selects 319 cpds not selected by the baseline. It finds 156 hits among these. 
- Baseline selects 323 cpds not selected by the RF. It finds 81 hits among these. 
- From the above, the RF finds 75 more hits than the baseline (156 vs 81 among the non-overlapping selections; equivalently 339 vs 264 overall). 
- Need to see diversity/novelty of RF vs BL hits. 

---
## Look at rf_rank for cpds selected by Baseline but not by RF
How close were the hits selected only by the baseline to being selected by the RF model?

In [65]:
# RF rank and hit flag for the baseline-only selections, ordered by ascending rf_rank.
rank_rf_bl = BL_but_not_RF.loc[:, ['rf_rank', 'Hit']].sort_values(by='rf_rank')
rank_rf_bl

Unnamed: 0,rf_rank,Hit
148,1778.0,0.0
200,1779.0,0.0
186,1793.0,0.0
1022,1801.0,1.0
474,1823.0,0.0
308,1826.0,1.0
844,1846.0,1.0
797,1874.0,1.0
687,1898.0,1.0
959,1899.0,1.0


In [66]:
# Sum the hit flags of baseline-only compounds whose rf_rank falls below each cutoff.
rf_rank_col = rank_rf_bl['rf_rank']
hit_col = rank_rf_bl['Hit']
print('Top-2500 RF hit increase estimate: {} hits'.format(hit_col[rf_rank_col < 2500].sum()))
print('Top-5000 RF hit increase estimate: {} hits'.format(hit_col[rf_rank_col < 5000].sum()))
print('Top-10000 RF hit increase estimate: {} hits'.format(hit_col[rf_rank_col < 10000].sum()))

Top-2500 RF hit increase estimate: 20.0 hits
Top-5000 RF hit increase estimate: 44.0 hits
Top-10000 RF hit increase estimate: 59.0 hits


By looking at cpds selected by the baseline but not by RF, we can estimate how many of those hits the RF model would have found if given a larger budget. 