# Collect Metrics from HADDOCK 3 Experiment Outputs
## PDB REMARK Metrics

In [24]:
import os
import pandas as pd
from biopandas.pdb import PandasPdb

In [25]:
## Helper: fetch the text of the first REMARK entry that contains a given marker
def _first_remark_entry(remark_lines, marker):
    """Return the 'entry' text of the first REMARK row containing ``marker``.

    ``marker`` is matched literally (not as a regular expression) and rows
    whose entry is missing are ignored.

    Raises:
        IndexError: if no REMARK row contains ``marker``. This is the same
            exception type a bare ``.iloc[0]`` on an empty selection raises,
            but with a message that names the missing line.
    """
    matches = remark_lines.loc[
        remark_lines['entry'].str.contains(marker, regex=False, na=False), 'entry'
    ]
    if matches.empty:
        raise IndexError(f"No REMARK line containing {marker!r} was found")
    return matches.iloc[0]


## Function to return a dataframe of REMARK lines with energy scores from a PDB file
def get_remark_energy_df(pdb_path):
    """Build a one-row dataframe of the energy terms stored in a PDB's REMARK lines.

    The REMARK records are read through biopandas. Two lines are paired up:
    the comma-separated header line (the one containing ``'total,'``) supplies
    the column names and the ``'energies: '`` line supplies the values. The
    ``'Desolvation energy:'`` and ``'buried surface area:'`` lines are added
    as the ``desolv`` and ``bsa`` columns.

    Args:
        pdb_path: path of the PDB file to read.

    Returns:
        pandas.DataFrame with a single row. Values are kept as strings exactly
        as written in the file, with surrounding whitespace removed from both
        the column names and the values.

    Raises:
        IndexError: if one of the expected REMARK lines is absent.
        ValueError: raised by pandas if the header and the energies line do
            not hold the same number of comma-separated fields.
    """
    ppdb = PandasPdb()
    ppdb.read_pdb(pdb_path)
    others_df = ppdb.df['OTHERS']
    ## get REMARK Lines
    remark_lines = others_df[others_df['record_name'] == 'REMARK']

    ## Get the energy scores. Labels and values are cut out of free-form REMARK
    ## text, so padding around each token is stripped to keep column names such
    ## as 'total' free of leading blanks.
    header_entry = _first_remark_entry(remark_lines, 'total,')
    values_entry = _first_remark_entry(remark_lines, 'energies: ')
    energy_names = [name.strip() for name in header_entry.split(',')]
    energy_values = [value.strip() for value in values_entry.split(': ')[-1].split(',')]

    ## Make a dataframe
    energies_df = pd.DataFrame([energy_values], columns=energy_names)

    ## Get the desolvation energy and buried surface area (one value per line)
    desolv_entry = _first_remark_entry(remark_lines, 'Desolvation energy:')
    bsa_entry = _first_remark_entry(remark_lines, 'buried surface area:')
    energies_df['desolv'] = desolv_entry.split(': ')[-1].strip()
    energies_df['bsa'] = bsa_entry.split(': ')[-1].strip()
    return energies_df

In [26]:
## Load the 'Experiments' sheet of the workbook; the 'experiment_id' column of
## each row is used later to locate that experiment's results.
## (For a quick dry run, chain .head(0) onto the call or add a single test row
## such as {'experiment_id': 'TEST_5A3I'}.)
experiments_workbook = '../../Experiments.xlsx'
experiments = pd.read_excel(experiments_workbook, sheet_name='Experiments')

In [33]:
## Empty frame that will hold the per-model energy scores
haddock_energy_df = pd.DataFrame(data=None)

In [34]:
## Collect one row of REMARK metrics per clustered model, for every experiment.
## The per-model frames are gathered in a list and concatenated once at the end:
## DataFrame.append was deprecated in pandas 1.4 and removed in 2.0, and growing
## a frame inside a loop re-copies all earlier rows on every iteration.
## Building the result from scratch also means re-running this cell does not
## duplicate rows.
model_frames = []

for experiment_id in experiments['experiment_id']:
    ## Skip experiments that have no output/ directory
    outputs_dir = f"../../data/results/{experiment_id}/output"
    if not os.path.exists(outputs_dir):
        continue

    ## Skip experiments without a clustfcc.tsv file
    clustfcc_path = f"{outputs_dir}/09_clustfcc/clustfcc.tsv"
    if not os.path.exists(clustfcc_path):
        continue

    clustfcc_df = pd.read_csv(clustfcc_path, sep='\t', comment='#')
    ## A table with no rows has no first model to flag; nothing to collect
    if clustfcc_df.empty:
        continue

    ## The model in the first row of clustfcc.tsv is flagged as the best one
    best_model = clustfcc_df.iloc[0]['model_name']

    for model_name, haddock_score in zip(clustfcc_df['model_name'], clustfcc_df['score']):
        ## Read in REMARK lines of the model's PDB to get its metrics
        pdb_path = f"{outputs_dir}/08_mdscoring/{model_name}"

        model_df = get_remark_energy_df(pdb_path)
        model_df['experiment_id'] = experiment_id
        model_df['model_name'] = model_name
        model_df['haddock_score'] = haddock_score
        model_df['best_model'] = model_name == best_model

        model_frames.append(model_df)

## Single concatenation; fall back to an empty frame when nothing was collected
if model_frames:
    haddock_energy_df = pd.concat(model_frames, ignore_index=True)
else:
    haddock_energy_df = pd.DataFrame()




Getting metrics for experiment: H5.3__EPI2800361
Getting metrics for experiment: AVFluIgG01__EPI454493
	ClustFCC file exists...
Getting metrics for experiment: 13D4__EPI687187
	ClustFCC file exists...
Getting metrics for experiment: FLD194__EPI340788
	ClustFCC file exists...
Getting metrics for experiment: 13D4__EPI1158808
	ClustFCC file exists...
Getting metrics for experiment: H5.3__EPI1328383
Getting metrics for experiment: 12H5__EPI467792
	ClustFCC file exists...
Getting metrics for experiment: 12H5__EPI355443
	ClustFCC file exists...
Getting metrics for experiment: 3C11__EPI275862
	ClustFCC file exists...
Getting metrics for experiment: FLD21.140__EPI275758
	ClustFCC file exists...
Getting metrics for experiment: 13D4__EPI275790
	ClustFCC file exists...
Getting metrics for experiment: 100F4__EPI3158642
	ClustFCC file exists...
Getting metrics for experiment: 3C11__EPI340399
	ClustFCC file exists...
Getting metrics for experiment: 3C11__EPI234494
	ClustFCC file exists...
Getting me

In [38]:
## Display the collected metrics (pandas elides the middle rows of a long frame)
haddock_energy_df

Unnamed: 0,total,bonds,angles,improper,dihe,vdw,elec,air,cdih,coup,rdcs,vean,dani,xpcs,rg,desolv,bsa,experiment_id,model_name,haddock_score
0,-457.699,0,0,0,0,-59.2844,-398.415,0,0,0,0,0,0,0,0,4.38748,1834.48,AVFluIgG01__EPI454493,mdscoring_5.pdb,-134.58
1,-391.874,0,0,0,0,-71.2821,-320.592,0,0,0,0,0,0,0,0,6.25026,1819.62,AVFluIgG01__EPI454493,mdscoring_6.pdb,-129.15
2,-454.118,0,0,0,0,-62.6626,-391.455,0,0,0,0,0,0,0,0,12.9567,1907.67,AVFluIgG01__EPI454493,mdscoring_2.pdb,-128.00
3,-340.512,0,0,0,0,-71.6936,-268.818,0,0,0,0,0,0,0,0,-0.780262,2000.43,AVFluIgG01__EPI454493,mdscoring_7.pdb,-126.24
4,-419.698,0,0,0,0,-57.2754,-362.422,0,0,0,0,0,0,0,0,4.22794,1593.3,AVFluIgG01__EPI454493,mdscoring_3.pdb,-125.53
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17385,-265.028,0,0,0,0,-64.5099,-200.518,0,0,0,0,0,0,0,0,5.34697,1848,FLD194__EPI808932,mdscoring_7.pdb,-99.27
17386,-268.558,0,0,0,0,-58.5129,-210.045,0,0,0,0,0,0,0,0,4.79726,1782.54,FLD194__EPI808932,mdscoring_3.pdb,-95.72
17387,-289.629,0,0,0,0,-52.4469,-237.182,0,0,0,0,0,0,0,0,8.85921,1826.16,FLD194__EPI808932,mdscoring_10.pdb,-91.02
17388,-273.179,0,0,0,0,-46.7802,-226.399,0,0,0,0,0,0,0,0,4.76302,1676.56,FLD194__EPI808932,mdscoring_1.pdb,-87.30


In [39]:
## Optional: keep only the main score columns, in this order, before saving
# haddock_energy_df_subset = haddock_energy_df[['experiment_id', 'model_name', 'haddock_score', 'vdw', 'elec', 'desolv', 'air', 'bsa', 'total']]

## Write every collected column to a CSV in the working directory, without the row index
scores_csv = 'haddock_energy_scores.csv'
haddock_energy_df.to_csv(scores_csv, index=False)


In [None]:
## Columns that identify a model
meta_cols = [
    'experiment_id',
    'model_name',
]

## Metric columns, listed in their original order
metric_cols = [
    'score',
    'irmsd',
    'fnat',
    'lrmsd',
    'ilrmsd',
    'dockq',
    'air',
    'bsa',
    'desolv',
    'dihe',
    'elec',
    'total',
    'vdw',
]