# Collect Metrics from HADDOCK 3 Experiment Outputs
## PDB REMARK Metrics

In [1]:
import os
import pandas as pd
from biopandas.pdb import PandasPdb

In [2]:
def _first_remark_entry(remark_lines, marker, pdb_path):
    """Return the 'entry' text of the first REMARK row whose entry contains *marker*."""
    ## Literal (non-regex) match; na=False treats missing entries as "no match"
    matches = remark_lines[remark_lines['entry'].str.contains(marker, regex=False, na=False)]
    if matches.empty:
        ## Same exception type as a failed .iloc[0], but naming the marker and file
        raise IndexError(f"No REMARK line containing {marker!r} found in {pdb_path}")
    return matches.iloc[0]['entry']


def get_remark_energy_df(pdb_path):
    """Return a one-row dataframe of the energy terms recorded in a PDB file's REMARK lines.

    Column names are taken from the REMARK line containing 'total,' and the
    matching values from the REMARK line containing 'energies: '. Two more
    columns are appended: 'desolv' (from the 'Desolvation energy:' line) and
    'bsa' (from the 'buried surface area:' line).

    All values are returned as strings with surrounding whitespace removed;
    converting them to numbers is left to the caller.

    Args:
        pdb_path: Path of the PDB file to read.

    Returns:
        A pandas DataFrame with a single row.

    Raises:
        IndexError: If any of the four expected REMARK lines is missing.
    """
    ppdb = PandasPdb()
    ppdb.read_pdb(pdb_path)
    others_df = ppdb.df['OTHERS']
    ## get REMARK Lines
    remark_lines = others_df[others_df['record_name'] == 'REMARK']

    ## Get the energy scores: one line holds the column names, another the values
    header_entry = _first_remark_entry(remark_lines, 'total,', pdb_path)
    values_entry = _first_remark_entry(remark_lines, 'energies: ', pdb_path)
    energies_header = header_entry.replace(' ', '').split(',')
    ## Strip each value so the blank that follows every comma is not stored
    energies_values = [value.strip() for value in values_entry.split(': ')[-1].split(',')]

    ## Make a dataframe
    energies_df = pd.DataFrame([energies_values], columns=energies_header)

    ## Get the desolvation energy and buried surface area (text after the last ': ')
    de_entry = _first_remark_entry(remark_lines, 'Desolvation energy:', pdb_path)
    bsa_entry = _first_remark_entry(remark_lines, 'buried surface area:', pdb_path)
    energies_df['desolv'] = de_entry.split(': ')[-1].strip()
    energies_df['bsa'] = bsa_entry.split(': ')[-1].strip()
    return energies_df

In [3]:
## Load the 'Experiments' sheet of the project workbook into a dataframe
experiments = pd.read_excel(
    io='../../Experiments.xlsx',
    sheet_name='Experiments',
)

In [4]:
## Create a dataframe to store the energy scores.
## It starts empty; the per-model rows are concatenated onto it later.
haddock_energy_df = pd.DataFrame()

In [5]:
## Collect one row of REMARK metrics per model for every experiment that has
## an FCC clustering table. The per-model frames are gathered in a list and
## concatenated once at the end: calling pd.concat inside the loop re-copies
## all previously accumulated rows on every iteration.
model_frames = []
for _, experiment in experiments.iterrows():
    experiment_id = experiment['experiment_id']

    ## Skip experiments without a clustering table (this also covers a
    ## missing output/ directory, since the table lives inside it)
    outputs_dir = f"../../data/results/{experiment_id}/output"
    clustfcc_path = f"{outputs_dir}/09_clustfcc/clustfcc.tsv"
    if not os.path.exists(clustfcc_path):
        continue

    ## Read in the individual metrics file
    clustfcc_df = pd.read_csv(clustfcc_path, sep='\t', comment='#')
    if clustfcc_df.empty:
        ## Nothing to collect, and there is no first row to take the best model from
        continue

    ## The model in the first row of the table is flagged as the best model
    best_model = clustfcc_df.iloc[0]['model_name']

    for _, model in clustfcc_df.iterrows():
        model_name = model['model_name']

        ## Read in REMARK lines of PDB to get metrics
        pdb_path = f"{outputs_dir}/08_mdscoring/{model_name}"
        model_df = get_remark_energy_df(pdb_path)
        model_df['experiment_id'] = experiment_id
        model_df['model_name'] = model_name
        model_df['haddock_score'] = model['score']
        model_df['best_model'] = model_name == best_model
        model_frames.append(model_df)

## Append to the main dataframe (left untouched when nothing was collected)
if model_frames:
    haddock_energy_df = pd.concat([haddock_energy_df, *model_frames], ignore_index=True)

In [7]:
## rename total energy columns
# haddock_energy_df = haddock_energy_df.rename(columns={'            total': 'total'})

In [6]:
## Preview the first rows of the collected per-model metrics
haddock_energy_df.head()

Unnamed: 0,total,bonds,angles,improper,dihe,vdw,elec,air,cdih,coup,...,vean,dani,xpcs,rg,desolv,bsa,experiment_id,model_name,haddock_score,best_model
0,-366.545,0,0,0,0,-71.0052,-295.54,0,0,0,...,0,0,0,0,-13.1443,1913.26,FLD194__YP_308669.1,mdscoring_2.pdb,-143.26,True
1,-374.306,0,0,0,0,-57.8712,-316.435,0,0,0,...,0,0,0,0,-7.03946,1699.06,FLD194__YP_308669.1,mdscoring_8.pdb,-128.2,False
2,-351.645,0,0,0,0,-62.1045,-289.54,0,0,0,...,0,0,0,0,-6.02133,1765.15,FLD194__YP_308669.1,mdscoring_10.pdb,-126.03,False
3,-341.893,0,0,0,0,-59.1243,-282.768,0,0,0,...,0,0,0,0,-8.031,1714.84,FLD194__YP_308669.1,mdscoring_5.pdb,-123.71,False
4,-347.168,0,0,0,0,-59.7354,-287.432,0,0,0,...,0,0,0,0,-6.44042,1759.46,FLD194__YP_308669.1,mdscoring_3.pdb,-123.66,False


In [7]:
## Write every collected per-model row to a CSV file, leaving out the row index
haddock_energy_df.to_csv(
    path_or_buf='full_metrics.csv',
    index=False,
)


In [8]:
## Identifier columns, the metric columns of interest, and every remaining
## column of the collected dataframe (to be discarded later)
meta_cols = ['experiment_id', 'model_name', 'best_model']
metric_cols = ['haddock_score', 'vdw', 'elec', 'total', 'desolv', 'bsa']
discard_cols = [
    col
    for col in haddock_energy_df.columns
    if not (col in meta_cols or col in metric_cols)
]

In [9]:
## Cast all metric columns to float in a single multi-column assignment
haddock_energy_df[metric_cols] = haddock_energy_df[metric_cols].astype(float)

In [10]:
## Keep only the rows flagged as the best model, without the discarded columns
is_best = haddock_energy_df['best_model'].eq(True)
best_df = haddock_energy_df.loc[is_best].drop(columns=discard_cols)

## Suffix the metric columns with "_best"; all other columns keep their names
best_df = best_df.rename(columns={col: f"{col}_best" for col in metric_cols})

best_df

Unnamed: 0,total_best,vdw_best,elec_best,desolv_best,bsa_best,experiment_id,model_name,haddock_score_best,best_model
0,-366.545,-71.0052,-295.54,-13.1443,1913.26,FLD194__YP_308669.1,mdscoring_2.pdb,-143.26,True
10,-354.275,-73.2893,-280.986,3.94494,1774.51,H5.3__YP_308669.1,mdscoring_1.pdb,-125.54,True
20,-318.151,-115.665,-202.486,-47.2402,2785.14,65C6__YP_308669.1,mdscoring_2.pdb,-203.4,True
30,-273.023,-59.5087,-213.514,-6.44934,1774.16,100F4__YP_308669.1,mdscoring_2.pdb,-108.66,True
40,-405.459,-83.3447,-322.115,-17.7333,2167.31,3C11__YP_308669.1,mdscoring_3.pdb,-165.5,True
50,-452.712,-59.2069,-393.505,11.4281,1665.44,AVFluIgG01__YP_308669.1,mdscoring_2.pdb,-126.48,True
60,-385.602,-82.32,-303.282,13.5362,2278.86,FLD21.140__YP_308669.1,mdscoring_2.pdb,-129.44,True
70,-527.47,-88.1391,-439.33,-34.8402,2509.71,H5M9__YP_308669.1,mdscoring_1.pdb,-210.85,True
80,-224.452,-99.4782,-124.974,-39.8949,2451.79,13D4__YP_308669.1,mdscoring_3.pdb,-164.37,True
88,-362.397,-58.8937,-303.504,0.919882,1458.03,AVFluIgG03__YP_308669.1,mdscoring_4.pdb,-118.67,True


In [11]:
## Per-experiment min / mean / max / std of each metric
stat_names = ['min', 'mean', 'max', 'std']
summarized_metrics = ['haddock_score', 'bsa', 'desolv', 'elec', 'total', 'vdw']
summary_df = haddock_energy_df.groupby(['experiment_id']).agg(
    {metric: stat_names for metric in summarized_metrics}
)

## Collapse the two-level (metric, statistic) header into "<metric>_<statistic>"
## names, then turn the experiment_id index back into a regular column
summary_df.columns = [f"{metric}_{stat}" for metric, stat in summary_df.columns]
summary_df = summary_df.reset_index()

## Attach the metrics of the best model to each experiment's summary row
summary_df = summary_df.merge(best_df, how='left', on=['experiment_id'])

## Meta columns come first; the remaining columns follow in alphabetical order
other_cols = sorted(col for col in summary_df.columns if col not in meta_cols)
summary_df = summary_df[meta_cols + other_cols]

## Remove the 'best_model' flag column
summary_df = summary_df.drop(columns=['best_model'])

In [12]:
## Display the per-experiment summary table
summary_df

Unnamed: 0,experiment_id,model_name,bsa_best,bsa_max,bsa_mean,bsa_min,bsa_std,desolv_best,desolv_max,desolv_mean,...,total_best,total_max,total_mean,total_min,total_std,vdw_best,vdw_max,vdw_mean,vdw_min,vdw_std
0,100F4__AAT73273.1,mdscoring_8.pdb,2035.28,2035.28,1869.221,1643.57,108.076808,1.16299,14.1306,2.43838,...,-266.42,-156.12,-218.1571,-269.286,38.491213,-67.9428,-56.3198,-64.21763,-67.9428,3.958831
1,100F4__WYN03019.1,mdscoring_1.pdb,1946.33,1989.63,1835.67,1707.52,90.54302,1.81619,7.58335,-0.134731,...,-265.289,-173.68,-229.1409,-265.289,26.385022,-71.5234,-56.5829,-67.45902,-73.8372,5.598631
2,100F4__YP_308669.1,mdscoring_2.pdb,1774.16,1774.16,1631.4,1482.17,95.539894,-6.44934,3.95852,-4.318197,...,-273.023,-151.218,-222.96,-273.023,39.834203,-59.5087,-42.0628,-55.05737,-61.5103,6.010659
3,12H5__AAT73273.1,mdscoring_2.pdb,2369.87,2498.8,2324.539,2160.74,95.482327,-14.8061,-12.241,-16.21246,...,-545.71,-466.177,-506.2439,-589.549,45.530418,-77.0364,-53.0692,-68.48722,-79.0584,7.648401
4,12H5__WYN03019.1,mdscoring_1.pdb,3031.07,3031.07,2531.798,2304.98,229.706702,-14.5555,-14.2294,-20.63088,...,-667.699,-350.468,-451.5763,-667.699,95.891683,-96.4252,-58.6318,-75.19136,-96.4252,10.110915
5,12H5__YP_308669.1,mdscoring_4.pdb,2870.92,2870.92,2702.682,2472.69,122.67608,-32.329,-19.0637,-32.03648,...,-688.549,-378.319,-531.7856,-688.549,100.095637,-106.539,-93.8748,-102.75443,-118.808,8.562616
6,13D4__AAT73273.1,mdscoring_1.pdb,2405.51,2405.51,2245.043,2036.1,120.855786,-10.9315,-0.344272,-6.004612,...,-557.985,-420.521,-525.8166,-601.375,53.175592,-87.795,-60.7659,-77.58593,-87.795,8.231218
7,13D4__WYN03019.1,mdscoring_6.pdb,1980.97,2090.05,1845.553333,1614.26,155.266377,-9.86129,2.56239,-6.44917,...,-235.202,-149.766,-190.822333,-246.093,32.418921,-85.7513,-47.9538,-69.755544,-85.7513,11.947595
8,13D4__YP_308669.1,mdscoring_3.pdb,2451.79,2669.11,2371.03625,2271.23,136.094797,-39.8949,-26.4457,-33.311975,...,-224.452,-182.342,-232.162375,-294.237,33.267166,-99.4782,-75.9147,-87.758725,-100.732,9.712107
9,3C11__AAT73273.1,mdscoring_1.pdb,2512.99,2748.27,2567.878571,2390.74,126.928566,-19.3317,-8.43162,-14.68836,...,-521.438,-434.989,-477.471286,-521.438,39.142421,-92.3186,-80.5852,-91.195271,-98.7223,5.570761


In [13]:
## Write the per-experiment summary to a CSV file, leaving out the row index
summary_df.to_csv(
    path_or_buf='summary_metrics.csv',
    index=False,
)