In [46]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import time
import random
import glob
import re, os
import os.path
%matplotlib inline

In [47]:
#function -- it calculates the topN percent & will look at N poses or all poses for a given key whichever is smaller
def topN(n, dic, thresh, redkeys=None,perpocket=False):
    '''
    Return the fraction of keys that have at least one value below
    ``thresh`` among their first ``n`` values.

    Assumes dic is pocket -> key -> list of values, with each list already
    ordered by rank. If a list holds fewer than n values, all of them are
    examined.

    n         -- number of leading values to inspect per key
    dic       -- pocket -> key -> list of values
    thresh    -- a key is a success when any inspected value is < thresh
    redkeys   -- optional collection of keys; when given and non-empty,
                 only those keys are counted
    perpocket -- if True, the success fraction is computed for each pocket
                 and the per-pocket fractions are averaged; otherwise all
                 keys from all pockets are pooled

    Returns a fraction in [0, 1]. Returns NaN when no key was selected.
    In per-pocket mode, pockets without any selected key are left out of
    the average instead of contributing a 0/0 term.
    '''
    # A set gives O(1) membership and also copes with one-shot iterables.
    selected = set(redkeys) if redkeys else None

    def _pocket_counts(key_to_values):
        # Returns (successful keys, keys considered) for one pocket.
        hits = 0
        considered = 0
        for key2, values in key_to_values.items():
            if selected is not None and key2 not in selected:
                continue
            considered += 1
            # Slicing clamps to the list length, so no explicit min() is needed.
            if any(value < thresh for value in values[:n]):
                hits += 1
        return hits, considered

    counts = [_pocket_counts(data) for data in dic.values()]

    if perpocket:
        # Pockets where every key was filtered out have no defined rate.
        rates = [hits / float(considered) for hits, considered in counts if considered]
        if not rates:
            return np.nan
        return np.mean(rates)

    total_hits = sum(hits for hits, _ in counts)
    total_keys = sum(considered for _, considered in counts)
    if not total_keys:
        return np.nan
    return total_hits / float(total_keys)

In [48]:
def make_dict(filename,is_sweep=False,has_cnnscore=False,tag_prefix=None):
    '''
    Parse a comma-separated results file into nested dicts of RMSD lists.

    The first line is treated as a header and skipped; blank lines are
    ignored. Columns read from every row:
      column 0 -- sweep tag (only used when is_sweep)
      column 2 -- rmsd (converted to float)
      has_cnnscore=True  -- pocket is column 6, key is columns 7 and 8
                            joined with ':'
      has_cnnscore=False -- pocket is column 3, key is columns 4 and 5
                            joined with ':'

    Returns
      is_sweep=False -- pocket -> key -> [rmsds]        (file order)
      is_sweep=True  -- tag -> pocket -> key -> [rmsds]

    Tag derivation when is_sweep: column 0 is optionally cut down to the
    text after the last occurrence of tag_prefix. If what remains contains
    '_', the tag is the text before the first '_' (kept as a string, with
    '' and 'rescore' mapped to '0'); otherwise the tag is that text
    converted to float.
    '''
    datadic={}

    if has_cnnscore:
        pocket_col, key_cols = 6, (7, 8)
    else:
        pocket_col, key_cols = 3, (4, 5)

    with open(filename) as infile:
        for i,line in enumerate(infile):
            # Skip the header and any empty (e.g. trailing) lines.
            if i==0 or not line.strip():
                continue
            items=line.rstrip().split(',')

            pocket=items[pocket_col]
            key=items[key_cols[0]]+':'+items[key_cols[1]]
            rmsd=float(items[2])

            if is_sweep:
                if tag_prefix:
                    check=items[0].split(tag_prefix)[-1]
                else:
                    check=items[0]

                if '_' in check:
                    checkval=check.split('_')[0]
                    tag='0' if checkval in ('', 'rescore') else checkval
                else:
                    # Convert the prefix-stripped text; converting the raw
                    # column would fail whenever tag_prefix is in use.
                    tag=float(check)
                target=datadic.setdefault(tag, {})
            else:
                #no need to stratify by tag
                target=datadic

            target.setdefault(pocket, {}).setdefault(key, []).append(rmsd)
    return datadic

In [84]:
def filter_csv(subset_file, remove_files=['/home/anm329/Docking/2017_general.INDEX','/home/anm329/Docking/Crossdock2020_Lig.txt','/home/anm329/Docking/Crossdock2020_Prot.txt'],new_suffix="no2017_noCD2020"):
    '''
    Write a copy of a results CSV with excluded entries removed.

    subset_file  -- path of the CSV to filter; it must have 'rec' and 'lig'
                    columns
    remove_files -- text files listing one entry per line; each line is
                    stripped and upper-cased before comparison
    new_suffix   -- text appended (after '_') to the input file name, before
                    the '.csv' extension

    A row is dropped when the last '/'-separated component of its 'rec'
    value, or its 'lig' value, equals any listed entry. The comparison is
    exact, so lower-case values in the CSV never match.
    NOTE(review): confirm the case of 'rec'/'lig' values in the real files.

    Returns the path of the filtered CSV that was written.
    '''
    subset_csv = pd.read_csv(subset_file,sep=',')

    # Identifier derived from the receptor path; kept out of the frame so
    # nothing has to be dropped again before writing.
    pdbid = subset_csv['rec'].apply(lambda x: x.split('/')[-1])

    keep = pd.Series(True, index=subset_csv.index)
    for filename in remove_files:
        with open(filename) as remove_file:
            remove_recs = [rec.strip().upper() for rec in remove_file]
        keep &= ~pdbid.isin(remove_recs)
        keep &= ~subset_csv['lig'].isin(remove_recs)
    subset_csv = subset_csv[keep]

    # splitext only strips the final extension, so dots elsewhere in the
    # path (e.g. a 'GNINA-1.0' directory) no longer truncate the output name.
    root, _ = os.path.splitext(subset_file)
    subset_name = f"{root}_{new_suffix}.csv"
    subset_csv.to_csv(subset_name,sep=',',index=False)

    return subset_name

In [50]:
%ls /home/anm329/Docking/cnn_gnina/gnina_out/cd_results/

final_all_ensemble_rescore_defaults.csv
final_all_ensemble_rescore_defaults_no2017.csv
final_all_ensemble_rescore_defaults_no2017_nocd2020.csv
final_all_ensemble_rescore_defaults_no2017_nocd2020_no2017_noCD2020.csv
final_cd_def2018_ensemble_rescore_defaults.csv
final_cd_def2018_ensemble_rescore_defaults_no2017.csv
final_cd_def2018_ensemble_rescore_defaults_no2017_nocd2020.csv
final_cd_def2018_ensemble_rescore_defaults_no2017_nocd2020_no2017_noCD2020.csv
final_cd_def2018_rescore_defaults.csv
final_cd_def2018_rescore_defaults_no2017.csv
final_cd_def2018_rescore_defaults_no2017_nocd2020.csv
final_cd_def2018_rescore_defaults_no2017_noCD2020.csv
final_def2017_rescore_defaults.csv
final_def2017_rescore_defaults_no2017.csv
final_def2017_rescore_defaults_no2017_nocd2020.csv
final_def2017_rescore_defaults_no2017_noCD2020.csv
final_default_ensemble_cnn_emp_weight_sweep.csv
final_default_ensemble_default_refinement.csv
final_def_ensemble_aba_sweep.csv
final_def_ensemble_cnn_emp

In [51]:
# Directory holding the result CSVs; later cells prefix file names with it.
basepath='/home/anm329/Docking/cnn_gnina/gnina_out/cd_results/'

# Rescore vs Refine (with Vina)

In [52]:
# Build the Top-1..Top-9 table (rows) for the refinement, Vina and
# default-ensemble rescoring runs (one column per file). Each value is the
# per-pocket averaged topN fraction at threshold 2.
files = [f'{basepath}final_default_ensemble_default_refinement.csv']
files.append(f'{basepath}final_vina_rescore_defaults.csv')
files.append(f'{basepath}final_def_ensemble_rescore_defaults.csv')
final_dataframe = pd.DataFrame(index=list(range(1,10)))
for file in files:
    # Files without 'vina' in the name are parsed with the has_cnnscore
    # column layout of make_dict.
    if 'vina' not in file:
        has_cnnscore=True
    else:
        has_cnnscore=False
    if 'sweep' in file:
        is_sweep=True
    else:
        is_sweep=False
    tmp=make_dict(file,has_cnnscore=has_cnnscore, is_sweep=is_sweep)
    if is_sweep:
        # None of the three file names above contains 'sweep', so this
        # branch does not run for the list as written.
        sorted_keys=sorted(tmp.keys())
        print(sorted_keys)
        for key in sorted_keys:
            if 'cnnrot' in file and key != 0:
                print('continuing')
                continue
#             if 'exhaustiveness' in file and key != 8:
#                 print('continuing')
#                 continue
            xs=[]
            for top in range(1,10):
                val=topN(top,tmp[key],2,perpocket=True)
                # NOTE(review): stored as a fraction here, while the
                # non-sweep branch below stores a percentage.
                xs.append(val)
            final_dataframe[key] = xs
    else:
        xs=[]
        for top in range(1,10):
                val=topN(top,tmp,2,perpocket=True)
                xs.append(val*100)
        # Column name is the file name up to its first '.'.
        final_dataframe[os.path.basename(file).split('.')[0]] = xs
    print(xs)

[35.781060073442085, 42.63052064648246, 46.03357627127276, 48.21403002416913, 49.432561127624105, 50.19969809196505, 50.92088262632205, 51.45123552091182, 51.95717800136441]
[25.63183048611532, 31.591785971397478, 35.477988073560915, 37.72633005674337, 39.66288725319867, 41.751609560469895, 43.35032691270833, 44.313173948833786, 45.01977875089139]
[36.02247467337353, 42.700199322819294, 45.634749332586765, 47.848220874041196, 49.093727713788404, 49.85878041081866, 50.39838501291977, 50.86157228536371, 51.15182020247096]


In [53]:
# Labels are assigned by position, so they must follow the order in which the
# columns were created (refinement, Vina, ensemble rescore).
final_dataframe.columns = ['Default Ensemble Refine',
                           'Vina',
                             'Default Ensemble Rescore',]
# Reorder the columns for the written table.
final_dataframe = final_dataframe.reindex(columns=['Vina','Default Ensemble Rescore','Default Ensemble Refine'])
file_name = 'rescore_vs_refine'
final_dataframe.to_csv(f'/home/anm329/GNINA-1.0/data/crossdocking/{file_name}.csv')

In [54]:
# Keep the three curves of the current table under their own names so that
# later cells can add them to other tables as reference columns.
final_vina, final_defens, final_defens_ref = (
    final_dataframe[label]
    for label in ('Vina', 'Default Ensemble Rescore', 'Default Ensemble Refine')
)

# Single Model CSV

In [55]:
# Top-1..Top-9 table for the single-model rescoring runs: every
# final_*_def2018_rescore_defaults.csv (sorted by name) followed by the
# def2017 and dense runs.
files = sorted(glob.glob(f'{basepath}final_*_def2018_rescore_defaults.csv'))
files.append(f'{basepath}final_def2017_rescore_defaults.csv')
files.append(f'{basepath}final_dense_rescore_defaults.csv')
final_dataframe = pd.DataFrame(index=list(range(1,10)))
for file in files:
    # Unlike the other table cells, the column layout here is chosen from
    # 'rescore'/'refine' appearing in the file name.
    if 'rescore' in file or 'refine' in file:
        has_cnnscore=True
    else:
        has_cnnscore=False
    if 'sweep' in file:
        is_sweep=True
    else:
        is_sweep=False
    tmp=make_dict(file,has_cnnscore=has_cnnscore, is_sweep=is_sweep)
    if is_sweep:
        # Not reached for the file names collected above (none contains
        # 'sweep').
        sorted_keys=sorted(tmp.keys())
        print(sorted_keys)
        for key in sorted_keys:
            if 'cnnrot' in file and key != 0:
                print('continuing')
                continue
#             if 'exhaustiveness' in file and key != 8:
#                 print('continuing')
#                 continue
            xs=[]
            for top in range(1,10):
                val=topN(top,tmp[key],2,perpocket=True)
                xs.append(val)
            final_dataframe[key] = xs
    else:
        xs=[]
        for top in range(1,10):
                val=topN(top,tmp,2,perpocket=True)
                # Stored as a percentage.
                xs.append(val*100)
        final_dataframe[os.path.basename(file).split('.')[0]] = xs
    print(xs)

[31.093995510411844, 37.42333149858466, 41.48202545600984, 43.78036332148187, 45.417258960905826, 46.51165622066878, 47.50651756131129, 48.24038862635346, 48.94706625972389]
[33.22745930991062, 40.59836106390178, 44.10253688750036, 46.739224303921574, 47.84401319902955, 49.01688315568712, 49.71334770854291, 50.30497020479374, 50.68007085195884]
[29.024372393061988, 35.55968064809053, 38.63546883001202, 41.45720264959001, 42.89336835492398, 44.09579674067834, 45.00910964787025, 45.97966192548425, 46.778934965246414]
[24.893677993470202, 32.371224121459406, 36.73921150656269, 40.30439173397049, 42.21744812761945, 43.91172617755797, 45.30707322802723, 46.85113767005728, 47.53653105379796]
[31.426207888348845, 38.654185588909826, 42.096535122393945, 44.70363950554172, 46.67530763653588, 47.78570086075201, 48.59380709896523, 49.39638580639068, 49.882866682729556]


In [56]:
# Show the column order; the next cell relabels the columns by position.
final_dataframe.columns

Index(['final_cd_def2018_rescore_defaults',
       'final_gen_def2018_rescore_defaults',
       'final_rd_def2018_rescore_defaults', 'final_def2017_rescore_defaults',
       'final_dense_rescore_defaults'],
      dtype='object')

In [57]:
# Positional relabel: the order must match the column order displayed by the
# previous cell.
final_dataframe.columns = ['Crossdock Default2018',
                             'General Default2018',
                             'Redock Default2018',
                             'Default2017',
                             'Crossdock Dense']
# Add the reference curves saved from the rescore-vs-refine table; both frames
# use the index 1..9.
final_dataframe['Vina'] = final_vina
final_dataframe['Default Ensemble'] = final_defens
final_dataframe = final_dataframe.reindex(columns=['Default Ensemble','Crossdock Default2018','Crossdock Dense','General Default2018','Redock Default2018','Default2017','Vina'])

In [58]:
# file_name is the stem only; '.csv' is appended when the path is built.
file_name = 'rescore_single_models'
final_dataframe.to_csv(f'/home/anm329/GNINA-1.0/data/crossdocking/{file_name}.csv')

### No PDBbind2017 and No CD2020

In [77]:
# Same Top-1..Top-9 table as the single-model section, but computed on
# filtered copies of the inputs (plus the default ensemble and Vina runs).
files = sorted(glob.glob(f'{basepath}final_*_def2018_rescore_defaults.csv'))
files.append(f'{basepath}final_def2017_rescore_defaults.csv')
files.append(f'{basepath}final_dense_rescore_defaults.csv')
files.append(f'{basepath}final_def_ensemble_rescore_defaults.csv')
files.append(f'{basepath}final_vina_rescore_defaults.csv')
# filter_csv writes a filtered CSV next to each input and returns its path;
# from here on `files` holds those filtered paths.
files  = [filter_csv(f) for f in files]
print(files)
final_dataframe = pd.DataFrame(index=list(range(1,10)))
for file in files:
    if 'vina' not in file:
        has_cnnscore=True
    else:
        has_cnnscore=False
    if 'sweep' in file:
        is_sweep=True
    else:
        is_sweep=False
    tmp=make_dict(file,has_cnnscore=has_cnnscore, is_sweep=is_sweep)
    if is_sweep:
        # Not reached for the file names collected above (none contains
        # 'sweep').
        sorted_keys=sorted(tmp.keys())
        print(sorted_keys)
        for key in sorted_keys:
            if 'cnnrot' in file and key != 0:
                print('continuing')
                continue
#             if 'exhaustiveness' in file and key != 8:
#                 print('continuing')
#                 continue
            xs=[]
            for top in range(1,10):
                val=topN(top,tmp[key],2,perpocket=True)
                xs.append(val)
            final_dataframe[key] = xs
    else:
        xs=[]
        for top in range(1,10):
                val=topN(top,tmp,2,perpocket=True)
                # Stored as a percentage.
                xs.append(val*100)
        final_dataframe[os.path.basename(file).split('.')[0]] = xs
    print(xs)

['/home/anm329/Docking/cnn_gnina/gnina_out/cd_results/final_cd_def2018_rescore_defaults_no2017_noCD2020.csv', '/home/anm329/Docking/cnn_gnina/gnina_out/cd_results/final_gen_def2018_rescore_defaults_no2017_noCD2020.csv', '/home/anm329/Docking/cnn_gnina/gnina_out/cd_results/final_rd_def2018_rescore_defaults_no2017_noCD2020.csv', '/home/anm329/Docking/cnn_gnina/gnina_out/cd_results/final_def2017_rescore_defaults_no2017_noCD2020.csv', '/home/anm329/Docking/cnn_gnina/gnina_out/cd_results/final_dense_rescore_defaults_no2017_noCD2020.csv', '/home/anm329/Docking/cnn_gnina/gnina_out/cd_results/final_def_ensemble_rescore_defaults_no2017_noCD2020.csv', '/home/anm329/Docking/cnn_gnina/gnina_out/cd_results/final_vina_rescore_defaults_no2017_noCD2020.csv']
[33.989114060542626, 37.604387675816255, 40.532174175031315, 41.3016428016428, 42.433510933510924, 42.433510933510924, 42.75493950493951, 46.423437673437675, 46.69816294816295]
[34.62671455528598, 40.46386153529011, 42.96553842982414, 45.311692275

In [78]:
# Positional relabel: follows the order in which the filtered files were
# processed (sorted *_def2018 files, def2017, dense, default ensemble, Vina).
final_dataframe.columns = ['Crossdock Default2018',
                             'General Default2018',
                             'Redock Default2018',
                             'Default2017',
                             'Crossdock Dense',
                             'Default Ensemble',
                          'Vina']
final_dataframe = final_dataframe.reindex(columns=['Default Ensemble','Crossdock Default2018','Crossdock Dense','General Default2018','Redock Default2018','Default2017','Vina'])

In [79]:
# file_name is the stem only: '.csv' is appended when the path is built, so
# including the extension here would write '<name>.csv.csv'.
file_name = 'single_models_no2017_nocd2020'
final_dataframe.to_csv(f'/home/anm329/GNINA-1.0/data/crossdocking/{file_name}.csv')

In [80]:
# Keep the filtered Vina and default-ensemble curves for reuse as reference
# columns in the filtered ensemble table.
final_vina_no17_nocd2020, final_defens_no17_nocd2020 = (
    final_dataframe[label] for label in ('Vina', 'Default Ensemble')
)

# Ensemble CSV

In [63]:
# Top-1..Top-9 table for the ensemble rescoring runs. Only files whose path
# contains '_ensemble_' and none of 'sweep', 'refine', 'no2017', 'nocd2020'
# are used.
files = glob.glob('/home/anm329/Docking/cnn_gnina/gnina_out/cd_results/*.csv')
final_dataframe = pd.DataFrame(index=list(range(1,10)))
for fname in [f for f in files if '_ensemble_' in f and 'sweep' not in f and 'refine' not in f and 'no2017' not in f and 'nocd2020' not in f]:
    print(fname)
    # Map the file name to a column label. The first matching substring wins.
    # NOTE(review): there is no final else, so a file matching none of these
    # would reuse `col` from the previous iteration.
    if '_gen_' in fname:
        col='General Default2018 Ensemble'
    elif '_cd_def2018' in fname:
        col='Crossdock Default2018 Ensemble'
    elif '_rd_def2018' in fname:
        col='Redock Default2018 Ensemble'
    elif '_dense_' in fname:
        col='Crossdock Dense Ensemble'
    elif 'def_' in fname:
        continue
    elif 'default_' in fname:
        continue ## Already calculated above
    elif 'all' in fname:
        col='All Ensemble'

    if 'rescore' in fname:
        has_cnnscore=True
    else:
        has_cnnscore=False

    # Always False here: paths containing 'sweep' were filtered out above.
    if 'sweep' in fname:
        is_sweep=True
    else:
        is_sweep=False
    tmp=make_dict(fname,has_cnnscore=has_cnnscore, is_sweep=is_sweep)
    if is_sweep:
        sorted_keys=sorted(tmp.keys())
        print(sorted_keys)
        for key in sorted_keys:
            if key == '100':
                continue
            xs=[]
            for top in range(1,10):
                val=topN(top,tmp[key],2,perpocket=True)
                xs.append(val)
            final_dataframe[col] = xs
    else:
        xs=[]
        for top in range(1,10):
                val=topN(top,tmp,2,perpocket=True)
                # Stored as a percentage.
                xs.append(val*100)
        final_dataframe[col] = xs
    print(xs)

/home/anm329/Docking/cnn_gnina/gnina_out/cd_results/final_all_ensemble_rescore_defaults.csv
[35.97165260709284, 42.57032626529192, 45.586153502198925, 48.03838364902211, 49.08707549392741, 49.72912016034093, 50.29863596190845, 50.81632006602314, 51.19169631631172]
/home/anm329/Docking/cnn_gnina/gnina_out/cd_results/final_gen_def2018_ensemble_rescore_defaults.csv
[34.102441349798966, 41.086560605303326, 44.51518528500551, 46.89071531345455, 48.1397907616121, 49.20033341276991, 49.82363815053041, 50.37776872742696, 50.732779021208295]
/home/anm329/Docking/cnn_gnina/gnina_out/cd_results/final_rd_def2018_ensemble_rescore_defaults.csv
[29.80437967234296, 35.93237830658064, 39.52782489788632, 42.112579592800046, 43.673748992007944, 44.787027738351206, 45.74351084621555, 46.53282068706818, 47.22371832370216]
/home/anm329/Docking/cnn_gnina/gnina_out/cd_results/final_def_ensemble_rescore_defaults.csv
/home/anm329/Docking/cnn_gnina/gnina_out/cd_results/final_cd_def2018_ensemble_rescore_defaults.

In [64]:
# Show which ensemble columns were produced (order follows glob order).
final_dataframe.columns

Index(['All Ensemble', 'General Default2018 Ensemble',
       'Redock Default2018 Ensemble', 'Crossdock Default2018 Ensemble',
       'Crossdock Dense Ensemble'],
      dtype='object')

In [65]:
# Add the reference curves saved from the rescore-vs-refine table, fix the
# column order and write the table.
final_dataframe['Vina'] = final_vina
final_dataframe['Default Ensemble'] = final_defens
final_dataframe = final_dataframe.reindex(columns=['Default Ensemble','Crossdock Default2018 Ensemble','Crossdock Dense Ensemble','General Default2018 Ensemble','Redock Default2018 Ensemble','All Ensemble','Vina'])
file_name = 'rescore_ensembles'
final_dataframe.to_csv(f'/home/anm329/GNINA-1.0/data/crossdocking/{file_name}.csv')

## Top1 PerPocket

In [66]:
# Top-1 success percentage for every pocket (rows), one column per ensemble
# run plus Vina. Also builds `pockets` (row position -> pocket name), which is
# used to label the rows.
files = glob.glob('/home/anm329/Docking/cnn_gnina/gnina_out/cd_results/*.csv')
files.append(f'{basepath}final_vina_rescore_defaults.csv')
files.append(f'{basepath}final_def_ensemble_rescore_defaults.csv')
# The two appended paths can already be in the glob result; drop repeats
# (keeping first-seen order) so that no file is parsed twice.
files = list(dict.fromkeys(files))
final_dataframe = pd.DataFrame(index=list(range(0,92)))
pockets = dict()
for idx, fname in enumerate([f for f in files if ('_ensemble_' in f or 'vina' in f ) and 'sweep' not in f and 'refine' not in f and 'no2017' not in f and 'nocd2020' not in f]):
    print(fname)
    # Map the file name to a column label. The first matching substring wins.
    # NOTE(review): there is no final else, so a file matching none of these
    # would reuse `col` from the previous iteration.
    if '_gen_' in fname:
        col='General Default2018 Ensemble'
    elif '_cd_def2018' in fname:
        col='Crossdock Default2018 Ensemble'
    elif '_rd_def2018' in fname:
        col='Redock Default2018 Ensemble'
    elif '_dense_' in fname:
        col='Crossdock Dense Ensemble'
    elif 'def_' in fname:
        col = 'Default Ensemble'
    elif 'vina' in fname:
        col = 'Vina'
    elif 'all' in fname:
        col='All Ensemble'

    has_cnnscore = 'vina' not in fname
    is_sweep=False
    tmp=make_dict(fname,has_cnnscore=has_cnnscore, is_sweep=is_sweep)
    xs=[]
    for pock_num, pock in enumerate(tmp.keys()):
        # Pocket names are taken from the second file processed only.
        # NOTE(review): rows are matched across files by position, which
        # assumes every file lists the pockets in the same order.
        if idx == 1:
            pockets[pock_num] = pock.split('/')[0]
        # Restrict the data to one pocket so that topN scores it alone.
        pocket_only = {pock: tmp[pock]}
        val=topN(1,pocket_only,2,perpocket=False)
        xs.append(val*100)
    # Assignment requires len(xs) to equal the 92-row index.
    final_dataframe[col] = xs
final_dataframe.rename(index=pockets,inplace=True)
final_dataframe = final_dataframe.reindex(columns=['Default Ensemble','Crossdock Default2018 Ensemble','Crossdock Dense Ensemble','General Default2018 Ensemble','Redock Default2018 Ensemble','All Ensemble','Vina'])
file_name = 'top1_per_pocket'
final_dataframe.to_csv(f'/home/anm329/GNINA-1.0/data/crossdocking/{file_name}.csv')

/home/anm329/Docking/cnn_gnina/gnina_out/cd_results/final_all_ensemble_rescore_defaults.csv
/home/anm329/Docking/cnn_gnina/gnina_out/cd_results/final_gen_def2018_ensemble_rescore_defaults.csv
/home/anm329/Docking/cnn_gnina/gnina_out/cd_results/final_rd_def2018_ensemble_rescore_defaults.csv
/home/anm329/Docking/cnn_gnina/gnina_out/cd_results/final_def_ensemble_rescore_defaults.csv
/home/anm329/Docking/cnn_gnina/gnina_out/cd_results/final_cd_def2018_ensemble_rescore_defaults.csv
/home/anm329/Docking/cnn_gnina/gnina_out/cd_results/final_vina_rescore_defaults.csv
/home/anm329/Docking/cnn_gnina/gnina_out/cd_results/final_dense_ensemble_rescore_defaults.csv
/home/anm329/Docking/cnn_gnina/gnina_out/cd_results/final_vina_rescore_defaults.csv
/home/anm329/Docking/cnn_gnina/gnina_out/cd_results/final_def_ensemble_rescore_defaults.csv


### No PDBbind2017 and No CD2020

In [81]:
# Ensemble Top-1..Top-9 table computed on filtered copies of the inputs.
# Only unfiltered ensemble files are selected; each is filtered inside the loop.
files = glob.glob('/home/anm329/Docking/cnn_gnina/gnina_out/cd_results/*.csv')
final_dataframe = pd.DataFrame(index=list(range(1,10)))
for fname in [f for f in files if '_ensemble_' in f and 'sweep' not in f and 'refine' not in f and 'no2017' not in f and 'nocd2020' not in f]:
    # filter_csv writes the filtered CSV and returns its path. This happens
    # for every selected file, including ones skipped by `continue` below.
    fname  = filter_csv(fname)
    print(fname)
    # NOTE(review): there is no final else, so a file matching none of these
    # would reuse `col` from the previous iteration.
    if '_gen_' in fname:
        col='General Default2018 Ensemble'
    elif '_cd_def2018' in fname:
        col='Crossdock Default2018 Ensemble'
    elif '_rd_def2018' in fname:
        col='Redock Default2018 Ensemble'
    elif '_dense_' in fname:
        col='Crossdock Dense Ensemble'
    elif 'def_' in fname:
        continue
    elif 'all' in fname:
        col='All Ensemble'

    if 'rescore' in fname:
        has_cnnscore=True
    else:
        has_cnnscore=False

    # Always False here: paths containing 'sweep' were filtered out above.
    if 'sweep' in fname:
        is_sweep=True
    else:
        is_sweep=False
    tmp=make_dict(fname,has_cnnscore=has_cnnscore, is_sweep=is_sweep)
    if is_sweep:
        sorted_keys=sorted(tmp.keys())
        print(sorted_keys)
        for key in sorted_keys:
            if key == '100':
                continue
            xs=[]
            for top in range(1,10):
                val=topN(top,tmp[key],2,perpocket=True)
                xs.append(val)
            final_dataframe[col] = xs
    else:
        xs=[]
        for top in range(1,10):
                val=topN(top,tmp,2,perpocket=True)
                # Stored as a percentage.
                xs.append(val*100)
        final_dataframe[col] = xs
    print(xs)

/home/anm329/Docking/cnn_gnina/gnina_out/cd_results/final_all_ensemble_rescore_defaults_no2017_noCD2020.csv
[37.74081474081474, 40.820219463076604, 42.27351616637331, 45.48176030318888, 46.3059361273647, 46.48450755593613, 47.22260279403137, 48.09256616399474, 48.687804259232834]
/home/anm329/Docking/cnn_gnina/gnina_out/cd_results/final_gen_def2018_ensemble_rescore_defaults_no2017_noCD2020.csv
[35.661513090084526, 40.38222888222888, 42.918704311561456, 45.396726289583434, 46.95546516975089, 46.95546516975089, 47.230190444476165, 48.224434295862864, 48.224434295862864]
/home/anm329/Docking/cnn_gnina/gnina_out/cd_results/final_rd_def2018_ensemble_rescore_defaults_no2017_noCD2020.csv
[32.16450216450217, 35.84156716299574, 37.73193869622441, 39.80336726765298, 40.24816056958914, 40.843398664827234, 41.1648272362558, 41.1648272362558, 41.1648272362558]
/home/anm329/Docking/cnn_gnina/gnina_out/cd_results/final_def_ensemble_rescore_defaults_no2017_noCD2020.csv
/home/anm329/Docking/cnn_gnina/g

In [82]:
# Show which filtered ensemble columns were produced.
final_dataframe.columns

Index(['All Ensemble', 'General Default2018 Ensemble',
       'Redock Default2018 Ensemble', 'Crossdock Default2018 Ensemble',
       'Crossdock Dense Ensemble'],
      dtype='object')

In [83]:
# Add the filtered Vina / default-ensemble reference curves saved earlier, fix
# the column order and write the table.
final_dataframe['Vina'] = final_vina_no17_nocd2020
final_dataframe['Default Ensemble'] = final_defens_no17_nocd2020
final_dataframe = final_dataframe.reindex(columns=['Default Ensemble','Crossdock Default2018 Ensemble','Crossdock Dense Ensemble','General Default2018 Ensemble','Redock Default2018 Ensemble','All Ensemble','Vina'])
file_name = 'ensemble_models_no2017_nocd2020'
final_dataframe.to_csv(f'/home/anm329/GNINA-1.0/data/crossdocking/{file_name}.csv')

# Doing all of the Sweeps
Whole-protein runs (files tagged `_wp_`) are excluded here.

In [70]:
# One Top-1..Top-9 table per default-ensemble sweep file, written to
# '<prefix>.csv'. Files with '_wp_' in the path are skipped.
todo=glob.glob('/home/anm329/Docking/cnn_gnina/gnina_out/cd_results/final_def_ensemble*sweep.csv')
# print(todo)
for filename in [file for file in todo if 'sweep' in file and '_wp_' not in file]:
    final_dataframe = pd.DataFrame(index=list(range(1,10)))
    prefix='sweep_'
    # Pick the output name and one float format spec per expected sweep value.
    # NOTE(review): if no branch matches, `formats` keeps the value from an
    # earlier iteration (or is undefined on the first one).
    if 'nummodes' in filename:
        prefix+='num_modes'
        formats=['.0f','.0f']
    elif 'minRMSDfilter' in filename:
        # The trailing '_' is removed by prefix.strip('_') when it is used.
        prefix+='rmsdf_'
        formats=['.1f','.1f','.1f']
    elif '_aba_' in filename:
        prefix+='autobox_add'
        formats=['.0f','.0f','.0f','.0f']
    elif '_numMCsaved_' in filename:
        prefix+='mcsaved'
        # Reference column: the default-ensemble rescore curve saved earlier.
        final_dataframe['DefE_Default'] = final_defens
        formats=['.0f','.0f','.0f','.0f','.0f']
    elif 'exhaustiveness' in filename:
        continue ## doing this one separately
        # The next two lines are unreachable after the continue.
        prefix+='exhaustiveness'
        formats=['.0f','.0f','.0f','.0f','.0f']
    elif '_cnnrot_' in filename:
        prefix+='cnnrot'
        formats=['.0f','.0f','.0f','.0f','.0f']
    elif '_cnn_ew_' in filename or 'cnn_emp_weight' in filename:
        if 'rescore' in filename:
            continue
        else:
            prefix = 'refine_sweep_'
            # Reference column: the default-ensemble refine curve saved earlier.
            final_dataframe['DefE_No Empirical Weight'] = final_defens_ref
            formats=['.2f','.1f','.0f','.0f']
        prefix+='cnn_empirical_weight'

    tmp=make_dict(filename,is_sweep=True,has_cnnscore=True)
    sorted_keys=sorted(tmp.keys())
    # zip stops at the shorter sequence, so sweep values beyond len(formats)
    # are dropped without notice.
    for key,form in zip(sorted_keys,formats):
        xs=[]
        for top in range(1,10):
            val=topN(top,tmp[key],2,perpocket=True)
            xs.append(val*100)
        # The float format spec needs a numeric key; make_dict returns float
        # tags only when the tag column contains no '_'.
        final_dataframe[f'DefE_{key:{form}}'] = xs

    print(filename,prefix.strip('_'),final_dataframe.columns.tolist())
    print('')
    final_dataframe.to_csv(f"/home/anm329/GNINA-1.0/data/crossdocking/{prefix.strip('_')}.csv")

/home/anm329/Docking/cnn_gnina/gnina_out/cd_results/final_def_ensemble_cnn_emp_weight_sweep.csv refine_sweep_cnn_empirical_weight ['DefE_No Empirical Weight', 'DefE_0.01', 'DefE_0.1', 'DefE_1', 'DefE_10']

/home/anm329/Docking/cnn_gnina/gnina_out/cd_results/final_def_ensemble_cnnrot_sweep.csv sweep_cnnrot ['DefE_0', 'DefE_1', 'DefE_5', 'DefE_10', 'DefE_20']

/home/anm329/Docking/cnn_gnina/gnina_out/cd_results/final_def_ensemble_nummodes_sweep.csv sweep_num_modes ['DefE_9', 'DefE_100']

/home/anm329/Docking/cnn_gnina/gnina_out/cd_results/final_def_ensemble_aba_sweep.csv sweep_autobox_add ['DefE_2', 'DefE_4', 'DefE_6', 'DefE_8']

/home/anm329/Docking/cnn_gnina/gnina_out/cd_results/final_def_ensemble_minRMSDfilter_sweep.csv sweep_rmsdf ['DefE_0.5', 'DefE_1.0', 'DefE_1.5']

/home/anm329/Docking/cnn_gnina/gnina_out/cd_results/final_def_ensemble_numMCsaved_sweep.csv sweep_mcsaved ['DefE_Default', 'DefE_20', 'DefE_40', 'DefE_60', 'DefE_80', 'DefE_100']



In [71]:
# Exhaustiveness sweep: a single table combining the default-ensemble and
# Vina sweep files. Files with '_wp_' in the path are skipped.
todo=glob.glob('/home/anm329/Docking/cnn_gnina/gnina_out/cd_results/*exhaustiveness*sweep.csv')
print(todo)
final_dataframe = pd.DataFrame(index=list(range(1,10)))
for filename in [file for file in todo if 'sweep' in file and '_wp_' not in file]:
    prefix='sweep_'
    if 'exhaustiveness' in filename:
        prefix+='exhaustiveness'
        formats=['.0f','.0f','.0f','.0f','.0f']
    # Column prefix and CSV layout depend on the scoring method in the name.
    # NOTE(review): a file matching neither branch would reuse colpre and
    # has_cnnsc from the previous iteration.
    if 'def_ensemble' in filename:
        colpre = 'DefE'
        has_cnnsc = True
    elif 'vina' in filename:
        colpre = 'Vina'
        has_cnnsc = False
    tmp=make_dict(filename,is_sweep=True,has_cnnscore=has_cnnsc)
    sorted_keys=sorted(tmp.keys())
    # zip stops at the shorter sequence: at most five sweep values are used.
    for key,form in zip(sorted_keys,formats):
        xs=[]
        for top in range(1,10):
            val=topN(top,tmp[key],2,perpocket=True)
            xs.append(val*100)
        final_dataframe[f'{colpre}_{key:{form}}'] = xs

print(final_dataframe.columns.tolist())
print('')
# `prefix` is whatever the last loop iteration left behind; it is undefined
# if the loop body never ran.
final_dataframe.to_csv(f"/home/anm329/GNINA-1.0/data/crossdocking/{prefix.strip('_')}.csv")

['/home/anm329/Docking/cnn_gnina/gnina_out/cd_results/final_vina_scoring_wp_exhaustiveness_sweep.csv', '/home/anm329/Docking/cnn_gnina/gnina_out/cd_results/final_def_ensemble_exhaustiveness_sweep.csv', '/home/anm329/Docking/cnn_gnina/gnina_out/cd_results/final_vina_exhaustiveness_sweep.csv', '/home/anm329/Docking/cnn_gnina/gnina_out/cd_results/final_ensemble_wp_rescore_exhaustiveness_sweep.csv']
['DefE_4', 'DefE_8', 'DefE_16', 'Vina_4', 'Vina_8', 'Vina_16']



## Threshold CNNscore

In [85]:
# For each ensemble rescoring file, apply 100 evenly spaced cnnscore cutoffs
# between 0 and 1 and record two columns per model:
#   '<model> Good' -- top-1 per-pocket success (percent) on the rows kept
#   '<model> Left' -- percentage of rows kept relative to the first cutoff (0)
files = glob.glob(f'{basepath}final_*ensemble_rescore_defaults.csv')
final_dataframe = pd.DataFrame(index=list(np.linspace(0,1,100)))
for file in files:
    if '_gen_' in file:
        col='General Default2018 Ensemble'
    elif '_cd_def2018' in file:
        col='Crossdock Default2018 Ensemble'
    elif '_rd_def2018' in file:
        col='Redock Default2018 Ensemble'
    elif '_dense_' in file:
        col='Crossdock Dense Ensemble'
    elif '_all_' in file:
        col='All Ensemble'
    elif 'def_ensemble' in file:
        col='Default Ensemble'
        # keep_mult is assigned but never read in this cell.
        keep_mult = True
    else:
        col='Vina'
        keep_mult = True
    if 'rescore' in file:
        has_cnnscore=True
    else:
        has_cnnscore=False
    if 'sweep' in file:
        is_sweep=True
    else:
        is_sweep=False
    remaining_sys = []
    base = pd.read_csv(file)
    if is_sweep:
#         sorted_keys=sorted(tmp.keys())
#         print(sorted_keys)
#         for key in sorted_keys:
#             if key == '100':
#                 continue
        print(f'sweep:{file}')
        # The break leaves the file loop; the rest of this branch never runs.
        break
        xs=[]
        for x in range(0,100):
            check=x/100.
            new=base[base['cnnscore']>check]
            remaining_sys.append(len(new))
            outname=f"{file.split('.')[0]}_{check}.csv"
            new.to_csv(outname,index=False)
            tmp=make_dict(outname,is_sweep=is_sweep, has_cnnscore=has_cnnscore)
            val=topN(1,tmp['9'],2,perpocket=True)
            xs.append(val)
        final_dataframe[f"{col} Good"] = xs
        final_dataframe[f"{col} Left"] = remaining_sys
    else:
        print(f'{file}:{col}')
        xs=[]
        base_amount=0
        for check in np.linspace(0,1,100):
            # Keep only rows scoring strictly above the cutoff.
            new=base[base['cnnscore']>check]
            if check==0:
                # Row count at the first cutoff is the 100% reference.
                # NOTE(review): the division below fails if this count is 0.
                base_amount=len(new)
            remaining_sys.append((len(new)/base_amount)*100)
            if len(new):
                # make_dict reads from a path, so the subset is written to a
                # temporary CSV next to the input and removed afterwards.
                outname=f"{file.split('.')[0]}_{check}.csv"
                new.to_csv(outname,index=False)
                tmp=make_dict(outname,is_sweep=is_sweep, has_cnnscore=has_cnnscore)
                val=topN(1,tmp,2,perpocket=True)
                xs.append(val*100)
                os.remove(outname)
            else:
                # No rows left at this cutoff: empty-string placeholder.
                xs.append('')
        final_dataframe[f"{col} Good"] = xs
        final_dataframe[f"{col} Left"] = remaining_sys

/home/anm329/Docking/cnn_gnina/gnina_out/cd_results/final_all_ensemble_rescore_defaults.csv:All Ensemble
/home/anm329/Docking/cnn_gnina/gnina_out/cd_results/final_gen_def2018_ensemble_rescore_defaults.csv:General Default2018 Ensemble
/home/anm329/Docking/cnn_gnina/gnina_out/cd_results/final_rd_def2018_ensemble_rescore_defaults.csv:Redock Default2018 Ensemble
/home/anm329/Docking/cnn_gnina/gnina_out/cd_results/final_def_ensemble_rescore_defaults.csv:Default Ensemble
/home/anm329/Docking/cnn_gnina/gnina_out/cd_results/final_cd_def2018_ensemble_rescore_defaults.csv:Crossdock Default2018 Ensemble
/home/anm329/Docking/cnn_gnina/gnina_out/cd_results/final_dense_ensemble_rescore_defaults.csv:Crossdock Dense Ensemble


In [86]:
# Arrange the threshold table as (Good, Left) column pairs per model, in a
# fixed model order, and write it out.
threshold_columns = [
    'Default Ensemble Good',
    'Default Ensemble Left',
    'Crossdock Default2018 Ensemble Good',
    'Crossdock Default2018 Ensemble Left',
    'Crossdock Dense Ensemble Good',
    'Crossdock Dense Ensemble Left',
    'General Default2018 Ensemble Good',
    'General Default2018 Ensemble Left',
    'Redock Default2018 Ensemble Good',
    'Redock Default2018 Ensemble Left',
    'All Ensemble Good',
    'All Ensemble Left',
]
final_dataframe1 = final_dataframe.reindex(columns=threshold_columns)
final_dataframe1.to_csv(
    '/home/anm329/GNINA-1.0/data/crossdocking/thresh_cnnscore_ensembles.csv',
    sep=',',
)

## Top1 PerPocket For Thresholds

In [75]:
# Top1 success rate per pocket for the Default Ensemble, computed after keeping
# only the rows whose cnnscore exceeds each threshold (0, 0.5, 0.8).
# Relies on `basepath`, `pockets`, `make_dict` and `topN` already being defined
# in the kernel.
files = glob.glob(f'{basepath}final_*ensemble_rescore_defaults.csv')
# Every column below is built by iterating pockets.values(), so the frame needs
# exactly one row per entry of `pockets` (a hard-coded row count raises on any
# mismatch). Using the dict's own keys also keeps the rename at the end aligned.
final_dataframe = pd.DataFrame(index=list(pockets.keys()))
for file in files:
    # Only the Default Ensemble results are used in this table.
    if 'def_ensemble' in file:
        col='Default Ensemble'
        keep_mult = True
    else:
        continue
    # col / keep_mult / remaining_sys are not read in this cell; they are still
    # assigned because cell variables live at notebook scope.
    is_sweep = False
    has_cnnscore = True
    remaining_sys = []
    base = pd.read_csv(file)
    for check in [0,0.5,0.8]:
        xs=[]
        new=base[base['cnnscore']>check]
        # splitext strips only the final extension; splitting on the first '.'
        # would cut the path short at any dotted directory name.
        outname=f"{os.path.splitext(file)[0]}_{check}.csv"
        # make_dict takes a file path, so the filtered rows go through a
        # temporary CSV that is deleted again below.
        new.to_csv(outname,index=False)
        try:
            tmp=make_dict(outname,is_sweep=is_sweep, has_cnnscore=has_cnnscore)
            for pock in pockets.values():
                if f'{pock}/PDB_Structures/' in tmp.keys():
                    # Score one pocket at a time by handing topN a single-pocket dict.
                    pocket = {f'{pock}/PDB_Structures/': tmp[f'{pock}/PDB_Structures/']}
                    val=topN(1,pocket,2,perpocket=True)
                    xs.append(val*100)
                else:
                    # No rows for this pocket survived the threshold.
                    xs.append('--')
        finally:
            # Remove the temporary CSV even if make_dict/topN raised.
            os.remove(outname)
        final_dataframe[f"{check}"] = xs
# Replace the index labels with the pocket names stored in `pockets`.
final_dataframe.rename(index=pockets,inplace=True)
file_name = 'thresh_top1_per_pocket'
final_dataframe.to_csv(f'/home/anm329/GNINA-1.0/data/crossdocking/{file_name}.csv')

# Whole protein
This section produces a single CSV containing the Vina and Default Ensemble results at exhaustiveness 8, 16, 32, and 64.

In [24]:
# Whole-protein sweep: for every sweep key in each matching file (key 4.0 is
# skipped), compute the per-pocket TopN percentage for N = 1..9 and store it as
# one column named '<col>_<key>'.
files = glob.glob('/home/anm329/Docking/cnn_gnina/gnina_out/cd_results/*wp*exhaustiveness*filtered.csv')
final_dataframe = pd.DataFrame(index=list(range(1,10)))
for fname in files:
    print(fname)
    # Column prefix and parsing flags are inferred from the file name.
    col = 'Vina' if 'vina' in fname else 'DefE'
    has_cnnscore = 'rescore' in fname
    if 'sweep' not in fname:
        # Stop processing entirely at the first non-sweep file.
        print(f'{fname} is not a sweep')
        break
    is_sweep = True
    tmp = make_dict(fname, has_cnnscore=has_cnnscore, is_sweep=is_sweep)
    sorted_keys = sorted(tmp.keys())
    print(sorted_keys)
    for key in sorted_keys:
        if key != 4.0:
            xs = []
            for top in range(1, 10):
                val = topN(top, tmp[key], 2, perpocket=True)
                xs.append(val * 100)
            final_dataframe[f'{col}_{key:.0f}'] = xs

/home/anm329/Docking/cnn_gnina/gnina_out/cd_results/final_vina_scoring_wp_exhaustiveness_sweep_filtered.csv
[8.0, 16.0, 32.0, 64.0]
/home/anm329/Docking/cnn_gnina/gnina_out/cd_results/final_ensemble_wp_rescore_exhaustiveness_sweep_filtered.csv
[4.0, 8.0, 16.0, 32.0, 64.0]


In [25]:
# Order the sweep columns (Default Ensemble first, then Vina) and export them.
final_dataframe = final_dataframe.reindex(
    columns=[
        'DefE_8',
        'DefE_16',
        'DefE_32',
        'DefE_64',
        'Vina_8',
        'Vina_16',
        'Vina_32',
        'Vina_64',
    ]
)
file_name = 'whole_ptn_sweep_exhaustiveness'
final_dataframe.to_csv(
    f'/home/anm329/GNINA-1.0/data/crossdocking/{file_name}.csv'
)

## Whole Protein Top1 PerPocket

In [91]:
# Whole-protein Top1 percentage for each pocket, using only sweep key 8.0 from
# every matching file. One column per method ('Vina' / 'Default Ensemble').
files = glob.glob('/home/anm329/Docking/cnn_gnina/gnina_out/cd_results/*wp*exhaustiveness*.csv')
# NOTE(review): the row count is fixed at 94; a column assignment below raises
# if a file yields a different number of pockets at key 8.0 -- confirm.
final_dataframe = pd.DataFrame(index=list(range(0,94)))
# Maps row position -> pocket name; used at the end to relabel the index.
pockets = dict()
for idx,fname in enumerate(files):
    print(fname)
    # Column name and parsing flags are inferred from the file name.
    if 'vina' in fname:
        col='Vina'
    else:
        col='Default Ensemble'
    if 'rescore' in fname:
        has_cnnscore=True
    else:
        has_cnnscore=False
    if 'sweep' in fname:
        is_sweep=True
    else:
        print(f'{fname} is not a sweep')
        break
    tmp=make_dict(fname,has_cnnscore=has_cnnscore, is_sweep=is_sweep)
    sorted_keys=sorted(tmp.keys())
    print(sorted_keys)
    for key in sorted_keys:
        # Only key 8.0 is reported in this table.
        if key != 8.0:
            continue
        xs=[]
        for pock_num, pock in enumerate(tmp[key].keys()):
            # Pocket names are recorded from the second globbed file only
            # (idx == 1); this assumes both files list pockets in the same
            # order -- TODO confirm.
            if idx == 1:
                pockets[pock_num] = pock.split('/')[0]
            # Score one pocket at a time by handing topN a single-pocket dict.
            pocket_only = {pock: tmp[key][pock]}
            # Top1 is requested explicitly. The previous code passed `top`,
            # a variable this cell never defines: it only existed as a
            # leftover from another cell's loop, so the value depended on
            # kernel history and failed with NameError on a fresh kernel.
            val=topN(1,pocket_only,2,perpocket=True)
            xs.append(val*100)
        final_dataframe[f'{col}'] = xs
# Replace positional row labels with the pocket names collected above.
final_dataframe.rename(index=pockets,inplace=True)
file_name = 'whole_ptn_top1_per_pocket'
final_dataframe.to_csv(f'/home/anm329/GNINA-1.0/data/crossdocking/{file_name}.csv')

/home/anm329/Docking/cnn_gnina/gnina_out/cd_results/final_vina_scoring_wp_exhaustiveness_sweep.csv
[8.0, 16.0, 32.0, 64.0]
/home/anm329/Docking/cnn_gnina/gnina_out/cd_results/final_ensemble_wp_rescore_exhaustiveness_sweep.csv
[4.0, 8.0, 16.0, 32.0, 64.0]
