In [1]:
benchmark_path = '/Users/rodrigo/repos/gdock/benchmark'
data_file = f'{benchmark_path}/benchmark_v1.0.0.dat'

In [2]:
import glob
import os
import pandas as pd

In [3]:
if not os.path.isfile(data_file):
    print(f'Data file not found, reading from {benchmark_path}')
    data = []
    for target in [f for f in glob.glob(f'{benchmark_path}/*') if not '.' in f]:
        name = target.split('/')[-1]
        data_f = f'{benchmark_path}/{name}/run/analysis/gdock.dat'
        if os.path.isfile(data_f):
            with open(data_f) as fh:
                for line in fh.readlines()[1:]:
                    generation, model, fitness, irmsd, cluster_id, _ = line.split(',')
                    data.append([name, int(generation), int(model), float(fitness), float(irmsd), float(cluster_id)])
    df = pd.DataFrame(data, columns = ['target', 'generation','individual','fitness','irmsd','cluster_id'])
    with open(data_file, 'w') as fh:
        fh.write(df.to_csv(index=False))
    fh.close()
else:
    print(f'Data file found, reading {data_file}')
    df = pd.read_csv(data_file)

Data file found, reading /Users/rodrigo/repos/gdock/benchmark/benchmark_v1.0.0.dat


In [4]:
df.head()

Unnamed: 0,target,generation,individual,fitness,irmsd,cluster_id
0,2A9K,1,0,19.166,7.32,32.0
1,2A9K,1,1,41.281,18.72,
2,2A9K,1,2,-6.214,18.18,1.0
3,2A9K,1,3,16.972,10.62,
4,2A9K,1,4,18.558,14.46,


In [5]:
# Single structure
top_list = [1, 5, 10, 50, 200, 400, 1000]
for top in top_list:
    d = {'high': 0, 'medium': 0, 'acceptable': 0}
    for target in df['target'].unique():
        subdf = df[df['target'] == target].sort_values(by='fitness')
        filtereddf = subdf.drop_duplicates(subset='fitness')
        topdf = filtereddf[:top]
        if (topdf['irmsd'] <= 1).sum():
            d['high'] += 1
        if (topdf['irmsd'] <= 2).sum():
            d['medium'] += 1
        if (topdf['irmsd'] <= 4).sum():
            d['acceptable'] += 1

    print('top', top, d)

print('total targets', len(df['target'].unique()))

top 1 {'high': 0, 'medium': 0, 'acceptable': 4}
top 5 {'high': 0, 'medium': 0, 'acceptable': 6}
top 10 {'high': 0, 'medium': 2, 'acceptable': 12}
top 50 {'high': 0, 'medium': 2, 'acceptable': 26}
top 200 {'high': 0, 'medium': 2, 'acceptable': 46}
top 400 {'high': 0, 'medium': 3, 'acceptable': 51}
top 1000 {'high': 0, 'medium': 3, 'acceptable': 60}
total targets 227


In [6]:
top_cluster_range = [1, 3, 5, 10]

for top in top_cluster_range:
    d = {'high': 0, 'medium': 0, 'acceptable': 0, 'incorrect': 0}
    for target in df['target'].unique():
        c_df = df[df['target'] == target].sort_values(['fitness', 'cluster_id'])
        c_df = c_df.dropna()
        c_ranking = c_df.drop_duplicates(subset='cluster_id', keep='first')
        cluster_ranking_dic = dict((i+1, j) for i, j in enumerate(c_ranking['cluster_id']) if i+1 <= top)
        
        high_l = []
        medium_l = []
        acceptable_l = []
        incorrect_l = []
        for cluster in cluster_ranking_dic:
            topdf = c_df[c_df['cluster_id'] == cluster][:5]
            if (topdf['irmsd'] <= 1).sum():
                high_l.append(1)
            if (topdf['irmsd'] <= 2).sum():
                medium_l.append(1)
            if (topdf['irmsd'] <= 4).sum():
                acceptable_l.append(1)
            elif (topdf['irmsd'] > 4).sum():
                incorrect_l.append(1)
        
        if high_l:
            d['high'] += 1
        if medium_l:
            d['medium'] += 1
        if acceptable_l:
            d['acceptable'] += 1
        elif incorrect_l:
            d['incorrect'] += 1
    print('top', top, d)
            
print('total targets', len(df['target'].unique()))

top 1 {'high': 0, 'medium': 0, 'acceptable': 4, 'incorrect': 221}
top 3 {'high': 0, 'medium': 1, 'acceptable': 8, 'incorrect': 218}
top 5 {'high': 0, 'medium': 1, 'acceptable': 9, 'incorrect': 217}
top 10 {'high': 0, 'medium': 1, 'acceptable': 14, 'incorrect': 212}
total targets 227
