# Collect Metrics from HADDOCK 3 Experiment Outputs

In [7]:
import os
import pandas as pd

In [8]:
## Load the 'Experiments' sheet of the project workbook into a DataFrame
experiments = pd.read_excel(
    io='../../Experiments.xlsx',
    sheet_name='Experiments',
)

## For testing, a dummy experiment row can be added, e.g.:
# experiments = pd.concat([experiments, pd.DataFrame([{'experiment_id': 'TEST_5A3I'}])], ignore_index=True)

In [27]:
## Empty results frame: identifying columns first, then every metric paired
## with its "<metric>_std" companion, then the rank and best-PDB path columns.
## NOTE(review): no later cell visible in this notebook references metrics_df.
metrics_df = pd.DataFrame(columns=[
    'experiment_id', 'cluster_rank', 'cluster_id', 'n', 'under_eval',
    *(
        column
        for metric in ('score', 'irmsd', 'fnat', 'lrmsd', 'dockq', 'air',
                       'bsa', 'desolv', 'elec', 'total', 'vdw')
        for column in (metric, f'{metric}_std')
    ),
    'caprieval_rank', 'best_pdb_path',
])

# metrics_df

In [17]:
## Start with an empty frame for the per-model metrics read from each
## experiment's capri_ss.tsv; the experiment loop below adds rows to it
ss_df = pd.DataFrame()

In [18]:
## Collect the per-model CAPRI metrics (capri_ss.tsv) of every experiment that
## has a 10_caprieval output directory.
## The per-experiment frames are gathered in a list and concatenated once:
## DataFrame.append() was removed in pandas 2.0, and growing a frame inside a
## loop copies all previous rows on every iteration. Rebuilding ss_df from the
## list also makes this cell safe to re-run (no duplicated rows).
ss_frames = []

for index, experiment in experiments.iterrows():
    experiment_id = experiment['experiment_id']

    ## Experiments without a caprieval output directory are skipped silently
    outputs_dir = f"../../data/experiments/{experiment_id}/output/10_caprieval"
    if not os.path.exists(outputs_dir):
        continue

    print(f"Getting metrics for experiment: {experiment_id}")

    ## Read in the individual metrics file (lines starting with '#' are ignored)
    ss_metrics_path = f"{outputs_dir}/capri_ss.tsv"
    ss_df_iter = pd.read_csv(ss_metrics_path, sep='\t', comment='#')
    ss_df_iter['experiment_id'] = experiment_id

    ## Flag the model with the lowest vdw value as the best PDB.
    ## method='first' breaks ties by row order so that at most one model per
    ## experiment is flagged; with method='min' every tied model would be
    ## flagged and the experiment would appear several times after the later
    ## merge on experiment_id. Rows with a missing vdw get a NaN rank -> False.
    ss_df_iter['best_pdb_flag'] = (
        ss_df_iter['vdw'].rank(method='first', ascending=True).eq(1)
    )

    ss_frames.append(ss_df_iter)

## Single concatenation; fall back to an empty frame when nothing was found
ss_df = pd.concat(ss_frames) if ss_frames else pd.DataFrame()

Getting metrics for experiment: 12H5__EPI1158808
Getting metrics for experiment: FLD21.140__EPI355443


In [77]:
## Inspect the column names of the combined per-model metrics table
ss_df.columns

Index(['model', 'md5', 'caprieval_rank', 'score', 'irmsd', 'fnat', 'lrmsd',
       'ilrmsd', 'dockq', 'cluster-id', 'cluster-ranking',
       'model-cluster-ranking', 'air', 'angles', 'bonds', 'bsa', 'cdih',
       'coup', 'dani', 'desolv', 'dihe', 'elec', 'improper', 'rdcs', 'rg',
       'total', 'vdw', 'vean', 'xpcs', 'experiment_id', 'best_pdb_flag'],
      dtype='object')

In [78]:
## Columns removed from the best-PDB table before it is summarised
discard_cols = [
    'md5',
    'cluster-id', 'cluster-ranking', 'model-cluster-ranking',
    'angles', 'bonds', 'cdih', 'coup', 'dani', 'improper',
    'rdcs', 'rg', 'vean', 'xpcs',
]

## Identifying columns that are kept under their own name and placed first
meta_cols = [
    'experiment_id',
    'model',
    'best_pdb_flag',
    'caprieval_rank',
]

## Metric columns; these receive the "_best" suffix in the best-PDB table
metric_cols = [
    'score',
    'irmsd', 'fnat', 'lrmsd', 'ilrmsd', 'dockq',
    'air', 'bsa', 'desolv', 'dihe', 'elec', 'total', 'vdw',
]

In [90]:
## Best-PDB table: keep only the rows flagged as best, remove the discarded
## columns, and suffix every metric column with "_best" (experiment_id, model
## and the other non-metric columns keep their names)
ss_best = (
    ss_df.loc[ss_df['best_pdb_flag'] == True]
    .drop(columns=discard_cols)
    .rename(columns={metric: f"{metric}_best" for metric in metric_cols})
)

In [91]:
## Per-experiment summary: each metric is reduced to one extreme value, its
## mean and its standard deviation over all rows of the experiment.
## dockq and bsa use 'max' as the extreme, every other metric uses 'min'.
## NOTE(review): fnat is aggregated with 'min' while dockq/bsa use 'max' --
## confirm which extreme is intended for fnat.
agg_spec = {
    metric: ['max' if metric in ('dockq', 'bsa') else 'min', 'mean', 'std']
    for metric in ('score', 'irmsd', 'fnat', 'lrmsd', 'dockq', 'air',
                   'bsa', 'desolv', 'elec', 'total', 'vdw')
}
ss_summary = ss_df.groupby(['experiment_id']).agg(agg_spec)

## Collapse the two-level (metric, statistic) header into "<metric>_<statistic>"
## and turn the experiment_id index back into a regular column
ss_summary.columns = [f"{metric}_{stat}" for metric, stat in ss_summary.columns]
ss_summary = ss_summary.reset_index()

## Attach the "_best" metrics of the flagged model to each experiment
ss_summary = ss_summary.merge(ss_best, how='left', on=['experiment_id'])

## Meta columns first, all remaining columns in alphabetical order; the
## best_pdb_flag helper column is not part of the final table
other_cols = sorted(set(ss_summary.columns) - set(meta_cols))
ss_summary = ss_summary[meta_cols + other_cols].drop(columns=['best_pdb_flag'])

In [92]:
## Display the final per-experiment summary table
ss_summary

Unnamed: 0,experiment_id,model,caprieval_rank,air_best,air_mean,air_min,air_std,bsa_best,bsa_max,bsa_mean,...,score_min,score_std,total_best,total_mean,total_min,total_std,vdw_best,vdw_mean,vdw_min,vdw_std
0,12H5__EPI1158808,../08_mdscoring/mdscoring_5.pdb,1,0.0,0.0,0.0,0.0,2320.38,2320.38,2129.0825,...,-163.029,16.511306,-426.065,-386.461875,-460.116,50.024223,-88.917,-72.71325,-88.917,11.899694
1,FLD21.140__EPI355443,../08_mdscoring/mdscoring_10.pdb,1,0.0,0.0,0.0,0.0,2249.86,2249.86,2112.828889,...,-138.378,15.918808,-416.168,-356.645556,-416.168,37.354104,-75.733,-64.392556,-75.733,7.767551


In [93]:
## Save the per-experiment summary next to the notebook, without the row index
ss_summary.to_csv(
    path_or_buf='caprieval_ss_summary.csv',
    index=False,
)