# Collect Metrics from HADDOCK 3 Experiment Outputs
## PDB REMARK Metrics

In [1]:
import os
import pandas as pd
from biopandas.pdb import PandasPdb

In [2]:
## Helper to fetch the text of the first REMARK line containing a literal marker
def _first_remark_entry(remark_lines, marker, pdb_path):
    """Return the 'entry' text of the first REMARK row containing *marker*.

    Args:
        remark_lines: DataFrame of REMARK records with an 'entry' column.
        marker: Literal substring to look for (not a regular expression).
        pdb_path: Path of the PDB file, used only for the error message.

    Raises:
        IndexError: If no REMARK entry contains *marker*.
    """
    ## regex=False: the markers are plain text; na=False: missing entries never match
    is_match = remark_lines['entry'].str.contains(marker, regex=False, na=False)
    matches = remark_lines.loc[is_match, 'entry']
    if matches.empty:
        raise IndexError(f"No REMARK line containing {marker!r} found in {pdb_path}")
    return matches.iloc[0]


## Function to return a dataframe of REMARK lines with energy scores from a PDB file
def get_remark_energy_df(pdb_path):
    """Parse the energy-related REMARK lines of a PDB file into a one-row dataframe.

    The column names come from the first REMARK line containing 'total,' and the
    values from the first REMARK line containing 'energies: '. Two further
    columns, 'desolv' and 'bsa', are taken from the 'Desolvation energy:' and
    'buried surface area:' REMARK lines.

    Args:
        pdb_path: Path to the PDB file to read.

    Returns:
        A pandas DataFrame with a single row. All values are kept as
        whitespace-stripped strings; callers convert to numeric types as needed.

    Raises:
        IndexError: If one of the expected REMARK lines is missing.
        ValueError: If the number of energy values differs from the number of
            header names (raised by the DataFrame constructor).
    """
    ppdb = PandasPdb()
    ppdb.read_pdb(pdb_path)
    others_df = ppdb.df['OTHERS']
    ## get REMARK Lines
    remark_lines = others_df[others_df['record_name'] == 'REMARK']

    ## Get the energy scores: one line holds the names, another the values
    header_entry = _first_remark_entry(remark_lines, 'total,', pdb_path)
    values_entry = _first_remark_entry(remark_lines, 'energies: ', pdb_path)
    ## Removing every space also drops the padding in front of the first name
    energy_names = header_entry.replace(' ', '').split(',')
    ## Strip each value so no leading blanks end up in the dataframe
    energy_values = [value.strip() for value in values_entry.split(': ')[-1].split(',')]

    ## Make a dataframe
    energies_df = pd.DataFrame([energy_values], columns=energy_names)

    ## Get the desolvation energy and buried surface area (text after the last ': ')
    desolv_entry = _first_remark_entry(remark_lines, 'Desolvation energy:', pdb_path)
    bsa_entry = _first_remark_entry(remark_lines, 'buried surface area:', pdb_path)

    ## Append to dataframe as scalars
    energies_df['desolv'] = desolv_entry.split(': ')[-1].strip()
    energies_df['bsa'] = bsa_entry.split(': ')[-1].strip()
    return energies_df

In [3]:
## Load the 'Experiments' worksheet of the experiments workbook
## (append .head(0) to the call to get an empty frame for a dry run)
experiments = pd.read_excel(io='../../Experiments.xlsx', sheet_name='Experiments')
## For testing...
# experiments = experiments.append({'experiment_id': 'TEST_5A3I'}, ignore_index=True)

# experiments.head()

In [4]:
## Create an empty dataframe to store the energy scores;
## the collection loop appends one row per model to it
haddock_energy_df = pd.DataFrame()

In [5]:
## Collect the REMARK metrics of every model listed in each experiment's
## clustfcc table. The per-model frames are gathered in a list and concatenated
## once at the end: calling pd.concat inside the loop would re-copy all
## previously accumulated rows on every iteration.
model_frames = []
for _, experiment in experiments.iterrows():
    experiment_id = experiment['experiment_id']

    ## Skip experiments whose output/ directory does not exist
    # outputs_dir = f"../../data/experiments_test/{experiment_id}/output/10_caprieval"
    outputs_dir = f"../../data/results/{experiment_id}/output"
    if not os.path.exists(outputs_dir):
        continue

    ## Skip experiments without the FCC clustering file
    clustfcc_path = f"{outputs_dir}/09_clustfcc/clustfcc.tsv"
    if not os.path.exists(clustfcc_path):
        continue

    ## Read in the individual metrics file
    clustfcc_df = pd.read_csv(clustfcc_path, sep='\t', comment='#')
    ## An empty table has no first row to take as the best model
    if clustfcc_df.empty:
        continue
    ## The first row of the table is treated as the best model
    best_model = clustfcc_df.iloc[0]['model_name']

    for model_name, haddock_score in zip(clustfcc_df['model_name'], clustfcc_df['score']):
        ## Read in REMARK lines of PDB to get metrics
        pdb_path = f"{outputs_dir}/08_mdscoring/{model_name}"

        model_df = get_remark_energy_df(pdb_path)
        model_df['experiment_id'] = experiment_id
        model_df['model_name'] = model_name
        model_df['haddock_score'] = haddock_score
        model_df['best_model'] = model_name == best_model

        model_frames.append(model_df)

## Append everything collected above to the main dataframe in a single concat
if model_frames:
    haddock_energy_df = pd.concat([haddock_energy_df, *model_frames], ignore_index=True)

In [7]:
## rename total energy columns
# haddock_energy_df = haddock_energy_df.rename(columns={'            total': 'total'})

In [6]:
## Preview the first rows of the collected per-model metrics
haddock_energy_df.head()

Unnamed: 0,total,bonds,angles,improper,dihe,vdw,elec,air,cdih,coup,...,vean,dani,xpcs,rg,desolv,bsa,experiment_id,model_name,haddock_score,best_model
0,-293.204,0,0,0,0,-70.9005,-222.303,0,0,0,...,0,0,0,0,-9.71438,1906.58,H5.3__EPI2800361,mdscoring_2.pdb,-125.08,True
1,-296.149,0,0,0,0,-61.1804,-234.968,0,0,0,...,0,0,0,0,-1.67054,1798.04,H5.3__EPI2800361,mdscoring_1.pdb,-109.84,False
2,-239.974,0,0,0,0,-58.3351,-181.639,0,0,0,...,0,0,0,0,-12.6004,1629.27,H5.3__EPI2800361,mdscoring_3.pdb,-107.26,False
3,-253.808,0,0,0,0,-57.9537,-195.854,0,0,0,...,0,0,0,0,-9.7026,1628.78,H5.3__EPI2800361,mdscoring_5.pdb,-106.83,False
4,-176.442,0,0,0,0,-59.6672,-116.774,0,0,0,...,0,0,0,0,-13.3127,1575.13,H5.3__EPI2800361,mdscoring_4.pdb,-96.33,False


In [7]:
## Reorder columns
# haddock_energy_df_subset = haddock_energy_df[['experiment_id', 'model_name', 'haddock_score', 'vdw', 'elec', 'desolv', 'air', 'bsa', 'total']]

## Write the complete per-model metrics table to CSV, leaving out the row index
haddock_energy_df.to_csv(path_or_buf='full_metrics.csv', index=False)


In [8]:
## Columns that identify a model, and the numeric metrics that are kept
meta_cols = ['experiment_id', 'model_name', 'best_model']
metric_cols = ['haddock_score', 'vdw', 'elec', 'total', 'desolv', 'bsa']
## Every remaining column is listed for removal; the lookup set is built once
keep_cols = set(meta_cols) | set(metric_cols)
discard_cols = [col for col in haddock_energy_df.columns if col not in keep_cols]

In [9]:
## Cast every metric column to float in a single multi-column assignment
haddock_energy_df[metric_cols] = haddock_energy_df[metric_cols].astype(float)

In [10]:
## Keep only the rows flagged as the best model, without the discarded columns
is_best = haddock_energy_df['best_model'].eq(True)
best_df = haddock_energy_df.loc[is_best].drop(columns=discard_cols)

## Suffix the metric columns with "_best"; all other columns keep their names
best_df = best_df.rename(columns={col: f"{col}_best" for col in metric_cols})

best_df

Unnamed: 0,total_best,vdw_best,elec_best,desolv_best,bsa_best,experiment_id,model_name,haddock_score_best,best_model
0,-293.204,-70.9005,-222.303,-9.714380,1906.58,H5.3__EPI2800361,mdscoring_2.pdb,-125.08,True
5,-457.699,-59.2844,-398.415,4.387480,1834.48,AVFluIgG01__EPI454493,mdscoring_5.pdb,-134.58,True
15,-332.098,-64.9787,-267.119,-12.358200,2030.32,13D4__EPI687187,mdscoring_2.pdb,-130.76,True
25,-344.726,-74.0325,-270.693,6.837410,2057.39,FLD194__EPI340788,mdscoring_1.pdb,-121.33,True
35,-432.380,-103.9800,-328.401,-14.243500,2328.86,13D4__EPI1158808,mdscoring_4.pdb,-183.90,True
...,...,...,...,...,...,...,...,...,...
17580,-257.672,-61.9519,-195.720,-9.953200,1710.13,AVFluIgG01__EPI3358339,mdscoring_1.pdb,-111.05,True
17589,-234.465,-51.9564,-182.509,-5.601900,1515.16,H5.3__EPI3358339,mdscoring_2.pdb,-94.06,True
17597,-572.162,-95.4312,-476.731,-3.396350,2591.28,12H5__EPI3358339,mdscoring_3.pdb,-194.17,True
17606,-275.158,-88.1618,-186.996,-0.593567,2257.26,FLD194__EPI3358339,mdscoring_7.pdb,-126.15,True


In [11]:
## Aggregate each metric per experiment_id with the same four statistics
summary_stats = ['min', 'mean', 'max', 'std']
summary_metrics = ['haddock_score', 'bsa', 'desolv', 'elec', 'total', 'vdw']
summary_df = (
    haddock_energy_df
    .groupby(['experiment_id'])
    .agg({metric: list(summary_stats) for metric in summary_metrics})
    .reset_index()
)

## Collapse the two-level column labels into "<metric>_<statistic>" names
flat_metric_names = [
    f"{metric}_{stat}".strip()
    for metric, stat in summary_df.columns
    if metric in metric_cols
]
summary_df.columns = ['experiment_id'] + flat_metric_names

## Attach the best-model metrics to each experiment's summary row
summary_df = pd.merge(summary_df, best_df, on=['experiment_id'], how='left')

## Put the meta columns first and sort the remaining columns alphabetically
other_cols = sorted(col for col in summary_df.columns if col not in meta_cols)
summary_df = summary_df[meta_cols + other_cols]

## Remove the 'best_model' flag column
summary_df = summary_df.drop('best_model', axis=1)

In [12]:
## Display the per-experiment summary table
summary_df

Unnamed: 0,experiment_id,model_name,bsa_best,bsa_max,bsa_mean,bsa_min,bsa_std,desolv_best,desolv_max,desolv_mean,...,total_best,total_max,total_mean,total_min,total_std,vdw_best,vdw_max,vdw_mean,vdw_min,vdw_std
0,100F4__EPI101477,mdscoring_1.pdb,2489.36,2489.36,2193.210,1872.05,169.194572,0.884684,1.64994,-0.008279,...,-384.496,-297.726,-331.5614,-384.496,31.555943,-95.8264,-60.9915,-79.55872,-95.8264,11.011752
1,100F4__EPI101843,mdscoring_1.pdb,2119.87,2119.87,1864.976,1645.39,139.236097,10.543400,11.23880,6.815673,...,-342.299,-226.121,-285.5899,-342.299,44.571376,-69.5806,-54.2197,-63.70992,-69.5806,5.507458
2,100F4__EPI101875,mdscoring_6.pdb,2278.42,2278.42,2121.084,1995.20,96.284216,-6.634460,8.87359,-2.709008,...,-299.983,-232.368,-277.6660,-316.884,26.342276,-97.7681,-66.7991,-77.57578,-97.7681,8.901679
3,100F4__EPI105698,mdscoring_9.pdb,2298.98,2357.72,2210.712,2045.72,101.961028,4.961610,10.97920,6.380264,...,-309.841,-261.931,-312.6339,-357.212,27.696873,-89.3576,-54.4627,-74.30865,-89.3576,9.672789
4,100F4__EPI107813,mdscoring_2.pdb,2457.20,2457.20,2254.410,2040.74,133.822794,12.996800,13.04750,8.983050,...,-459.815,-294.580,-392.6997,-472.745,63.883094,-92.6668,-65.4496,-75.17936,-92.6668,8.123491
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1799,H5M9__EPI839728,mdscoring_10.pdb,2403.71,2403.71,2039.787,1883.95,148.028146,3.700050,6.74896,0.054910,...,-542.023,-285.601,-409.6040,-542.023,72.602584,-87.3332,-53.4843,-67.49255,-87.3332,9.415405
1800,H5M9__EPI893474,mdscoring_4.pdb,2193.81,2215.39,1888.962,1346.08,294.580924,-34.090700,-18.31660,-28.338530,...,-450.131,-146.999,-264.9159,-450.131,106.113525,-81.3142,-35.9050,-63.71721,-81.3142,14.084651
1801,H5M9__EPI940674,mdscoring_2.pdb,2225.26,2414.18,2194.100,1929.95,129.734054,3.147680,10.92250,2.937184,...,-500.550,-352.635,-441.5855,-500.550,43.568098,-77.3231,-56.2349,-70.01376,-82.5066,8.774875
1802,H5M9__EPI9640,mdscoring_4.pdb,2630.31,2630.31,2195.309,2021.77,183.528722,-37.830700,-18.21440,-31.721210,...,-405.999,-206.201,-306.8762,-405.999,69.217515,-86.9604,-74.4822,-81.92368,-90.8386,5.497224


In [13]:
## Write the per-experiment summary table to CSV, leaving out the row index
summary_df.to_csv(path_or_buf='summary_metrics.csv', index=False)