# Collect Metrics from HADDOCK 3 Experiment Outputs
## capri_ss.tsv Files

In [1]:
import os
import pandas as pd

In [2]:
## Load the 'Experiments' worksheet; the collection loop below reads its
## 'experiment_id' column. (Chain .head(0) onto the read for an empty dry run.)
experiments_path = '../../Experiments.xlsx'
experiments = pd.read_excel(experiments_path, sheet_name='Experiments')

## For testing, add a dummy experiment row:
# experiments = pd.concat([experiments, pd.DataFrame([{'experiment_id': 'TEST_5A3I'}])], ignore_index=True)

# experiments.head()

In [3]:
## Create empty dataframe to store results
## (ss_df holds the combined capri_ss.tsv rows of all experiments)
ss_df = pd.DataFrame()

In [4]:
## Gather one DataFrame per experiment in a list and concatenate once after
## the loop. DataFrame.append() was removed in pandas 2.0, and appending inside
## the loop re-copied the accumulated frame on every iteration. Building ss_df
## from the list also makes this cell safe to re-run without duplicating rows.
ss_frames = []

for _, experiment in experiments.iterrows():
    experiment_id = experiment['experiment_id']

    ## Skip experiments that have no output/ directory
    # outputs_dir = f"../../data/experiments_test/{experiment_id}/output/10_caprieval"
    outputs_dir = f"../../data/results/{experiment_id}/output"
    if not os.path.exists(outputs_dir):
        continue
    print(f"Getting metrics for experiment: {experiment_id}")

    ## Skip experiments that have no 10_caprieval/capri_ss.tsv file
    ss_metrics_path = f"{outputs_dir}/10_caprieval/capri_ss.tsv"
    if not os.path.exists(ss_metrics_path):
        continue
    print("\tSS file exists...")

    ## Read the individual metrics file ('#' lines are treated as comments)
    ss_df_iter = pd.read_csv(ss_metrics_path, sep='\t', comment='#')
    ss_df_iter['experiment_id'] = experiment_id

    ## Flag the row(s) with the lowest vdw value as best_pdb_flag=True.
    ## rank(method='min') gives every tied minimum rank 1, so ties are all
    ## flagged; rows with a missing vdw get a NaN rank and are flagged False.
    ss_df_iter['best_pdb_flag'] = (
        ss_df_iter['vdw'].rank(method='min', ascending=True).eq(1)
    )

    ss_frames.append(ss_df_iter)

## Per-file row indices are preserved, as they were with append().
## Fall back to an empty frame when no capri_ss.tsv file was found.
ss_df = pd.concat(ss_frames) if ss_frames else pd.DataFrame()

Getting metrics for experiment: H5.3__EPI2800361
Getting metrics for experiment: AVFluIgG01__EPI454493
Getting metrics for experiment: 13D4__EPI687187
Getting metrics for experiment: FLD194__EPI340788
Getting metrics for experiment: 13D4__EPI1158808
Getting metrics for experiment: H5.3__EPI1328383
Getting metrics for experiment: 12H5__EPI467792
Getting metrics for experiment: 12H5__EPI355443
Getting metrics for experiment: 3C11__EPI275862
Getting metrics for experiment: FLD21.140__EPI275758
Getting metrics for experiment: 13D4__EPI275790
Getting metrics for experiment: 100F4__EPI3158642
Getting metrics for experiment: 3C11__EPI340399
Getting metrics for experiment: 3C11__EPI234494
Getting metrics for experiment: 13D4__EPI275974
Getting metrics for experiment: AVFluIgG01__EPI1381432
Getting metrics for experiment: FLD194__EPI214889
Getting metrics for experiment: 3C11__EPI2580113
Getting metrics for experiment: FLD194__EPI235171
Getting metrics for experiment: 65C6__EPI25011
Getting met

In [8]:
## List the columns of the combined results (used to pick the column groups below)
ss_df.columns

Index(['model', 'md5', 'caprieval_rank', 'score', 'irmsd', 'fnat', 'lrmsd',
       'ilrmsd', 'dockq', 'cluster-id', 'cluster-ranking',
       'model-cluster-ranking', 'air', 'angles', 'bonds', 'bsa', 'cdih',
       'coup', 'dani', 'desolv', 'dihe', 'elec', 'improper', 'rdcs', 'rg',
       'total', 'vdw', 'vean', 'xpcs', 'experiment_id', 'best_pdb_flag'],
      dtype='object')

In [9]:
# ss_df.columns.values

## Columns to discard (dropped when the best-model rows are extracted)
discard_cols = [
    'md5',
    'cluster-id', 'cluster-ranking', 'model-cluster-ranking',
    'angles', 'bonds', 'cdih', 'coup', 'dani', 'improper',
    'rdcs', 'rg', 'vean', 'xpcs',
]

## Columns to keep
## - meta_cols: identifying columns, placed first in the summary table
meta_cols = [
    'experiment_id',
    'model',
    'best_pdb_flag',
    'caprieval_rank',
]

## - metric_cols: metric columns, which receive a "_best" suffix for the best model
metric_cols = [
    'score',
    'irmsd', 'fnat', 'lrmsd', 'ilrmsd', 'dockq',
    'air', 'bsa', 'desolv', 'dihe', 'elec', 'total', 'vdw',
]

In [10]:
## Get the metrics from the best PDB
ss_best = ss_df[ss_df['best_pdb_flag'] == True].drop(columns=discard_cols)

## add "_best" to the column names except for 'experiment_id' and model
ss_best.columns = [f"{col}_best" if col in metric_cols else col for col in ss_best.columns]

In [11]:
## Summarize the results by experiment_id, summarizing the metrics
ss_summary = ss_df.groupby(['experiment_id']).agg({'score': ['min','mean', 'std'],
                                                   'irmsd': ['min', 'mean', 'std'],
                                                   'fnat': ['min','mean', 'std'],
                                                   'lrmsd': ['min','mean', 'std'],
                                                   'dockq': ['max','mean', 'std'],
                                                   'air': ['min','mean', 'std'],
                                                   'bsa': ['max','mean', 'std'],
                                                   'desolv': ['min','mean', 'std'],
                                                   'elec': ['min','mean', 'std'],
                                                   'total': ['min','mean', 'std'],
                                                   'vdw': ['min','mean', 'std']}).reset_index()

## Flatten the column names
ss_summary.columns = ['experiment_id'] + ['_'.join(col).strip() for col in ss_summary.columns.values if col[0] in metric_cols]

## Join the summary and best metrics
ss_summary = ss_summary.merge(ss_best, on=['experiment_id'], how='left')

## Reorder columns starting with the meta columns and then alphabetically thereafter
ss_summary = ss_summary[meta_cols + sorted([col for col in ss_summary.columns if col not in meta_cols])]

## Remove the 'best_pdb_flag' column
ss_summary = ss_summary.drop(columns=['best_pdb_flag'])

In [12]:
## Display the per-experiment summary table
ss_summary

Unnamed: 0,experiment_id,model,caprieval_rank,air_best,air_mean,air_min,air_std,bsa_best,bsa_max,bsa_mean,...,score_min,score_std,total_best,total_mean,total_min,total_std,vdw_best,vdw_mean,vdw_min,vdw_std
0,100F4__EPI101477,../08_mdscoring/mdscoring_1.pdb,1,0.0,0.0,0.0,0.0,2489.36,2489.36,2193.21,...,-152.676,14.390202,-384.496,-331.5614,-384.496,31.555943,-95.826,-79.5587,-95.826,11.011542
1,100F4__EPI101875,../08_mdscoring/mdscoring_6.pdb,1,0.0,0.0,0.0,0.0,2278.42,2278.42,2121.084,...,-144.845,11.83539,-299.983,-277.666,-316.884,26.342276,-97.768,-77.5758,-97.768,8.901668
2,12H5__EPI101875,../08_mdscoring/mdscoring_4.pdb,5,0.0,0.0,0.0,0.0,1975.49,1975.49,1688.56,...,-131.849,6.581853,-225.688,-290.055333,-369.805,57.607843,-70.406,-59.976778,-70.406,5.182947


In [93]:
## Write out to CSV
## (written to the current working directory; index=False omits the row index)
ss_summary.to_csv('caprieval_ss_summary.csv', index=False)