In [None]:
# This notebook reads the GLIDE score files (<type>_<clustering>_scores.csv) stored under ~/Formatted_Data/LogFiles/GLIDE
# For each ligand it computes a weighted-average score: the score of the ligand docked to each cluster centroid is multiplied by the fraction of frames in that cluster, and the products are summed
# There are 2 sets of weights because clustering was done twice (once on the APO trajectory and once on the HOLO trajectory)
# The output is a file with the Kendall's Tau values of the weighted-average rankings for GLIDE

In [None]:
import numpy as np
import scipy.stats as stats

In [None]:
# Docking protocols to evaluate. Each name is used both as the sub-directory
# and as the file-name prefix of the score files read by the main loop.
# 'holo' and 'holo_core' are scored with weights_Holo; every other entry
# is scored with weights_APO.
GLIDE_types = [
    'holo',
    'holo_core',
    'SP',
    'SP_core',
    'XP',
    'XP_core',
]

# Clustering methods. The position of a name in this list is also the row
# index used to pick its weights out of weights_APO / weights_Holo, so the
# order here must match the row order of those tables.
clustering = [
    'GROMOS',
    'GROMOS_CBA',
    'PCA',
    'PCA_CBA',
    'TICA',
    'TICA_CBA',
]

In [None]:
# Cluster weights used for every GLIDE type other than 'holo' / 'holo_core'.
# One row per entry of `clustering` (same order: GROMOS, GROMOS_CBA, PCA,
# PCA_CBA, TICA, TICA_CBA); each row holds ten weights, and the main loop
# applies the weight at position k to the score whose cluster number is k+1.
# The first two rows are written as <count>/4500.0 -- presumably frames in the
# cluster over total APO frames; TODO confirm.
# NOTE(review): rows are not all normalised to 1 (the first row sums to
# 3713/4500) -- confirm this is intended, e.g. only the ten largest clusters
# were kept.
# NOTE(review): 0.1025824 in the fifth row has one more decimal digit than its
# neighbours -- possible typo, verify against the clustering output.
weights_APO = [[1149/4500.0,687/4500.0,607/4500.0,370/4500.0,243/4500.0,175/4500.0,141/4500.0,123/4500.0,121/4500.0,97/4500.0],
              [2988/4500.0,363/4500.0,287/4500.0,173/4500.0,113/4500.0,58/4500.0,54/4500.0,41/4500.0,41/4500.0,31/4500.0],
              [0.11090, 0.09478, 0.07956, 0.08656, 0.10929, 0.08838, 0.11913, 0.06789, 0.09734, 0.14616],
              [0.102591 , 0.066524 , 0.073911 , 0.091118 , 0.109569 , 0.071591 , 0.125113 , 0.103242 , 0.085047 , 0.171293],
              [0.1025824 , 0.16532, 0.115838, 0.043793, 0.040069,0.200336, 0.014096, 0.057756, 0.144047, 0.09292],
              [0.098862 , 0.025933 , 0.199980 , 0.032904 , 0.157293, 0.06856 , 0.111398 , 0.006767 , 0.12446 , 0.173842]]

# Cluster weights used for the 'holo' and 'holo_core' GLIDE types.
# Same layout as weights_APO: one row per entry of `clustering`, ten weights
# per row. The first two rows are written as <count>/5000.0 and the remaining
# rows as <percent>/100.0 -- presumably frame counts and percentages from the
# HOLO trajectory clustering; TODO confirm.
weights_Holo = [[2562/5000.0, 795/5000.0, 348/5000.0, 220/5000.0, 113/5000.0, 91/5000.0, 66/5000.0, 53/5000.0, 42/5000.0, 40/5000.0],
                [2310/5000.0, 1278/5000.0, 413/5000.0, 329/5000.0, 303/5000.0, 95/5000.0, 55/5000.0, 39/5000.0, 36/5000.0, 29/5000.0],
                [10.3672/100.0, 4.1514/100.0, 20.0/100.0, 20.0/100.0, 20.0/100.0, 3.9142/100.0, 3.8454/100.0, 9.6328/100.0, 4.0424/100.0, 4.0466/100.0 ],
                [22.2222/100.0, 4.7282/100.0, 22.2222/100.0, 22.2222/100.0, 10.6476/100.0, 3.8551/100.0, 3.8356/100.0, 4.9569/100.0, 11.5747/100.0, 4.8464/100.0],
                [1.8729/100.0, 1.0536/100.0, 16.7702/100.0, 10.5036/100.0, 19.5209/100.0, 13.4773/100.0, 5.2311/100.0, 13.5818/100.0, 10.6167/100.0, 18.4831/100.0],
                [8.4302/100.0, 9.3036/100.0, 22.0269/100.0, 13.3356/100.0, 11.7727/100.0, 4.6838/100.0, 8.8867/100.0, 8.1551/100.0, 10.4496/100.0, 14.0671/100.0]]

In [None]:
# Score every (GLIDE type, clustering method) combination against the answers:
#   1. read the per-cluster docking scores of each ligand,
#   2. collapse them into one weighted-average score per ligand,
#   3. rank the ligands by that score (lowest score gets rank 1),
#   4. compute Kendall's Tau between the answer values and the predicted ranks,
#   5. print the result and append it to the Kendall's Tau output file.

GLIDE_DIR = '/home/dhkumar/Formatted_Data/LogFiles/GLIDE/'
ANSWERS_PATH = "/net/jam-amaro-shared/bccgc4/Answers.csv"
KT_PATH = GLIDE_DIR + "Kendalls_Tau_WA.txt"

N_LIGANDS = 459       # ligands are identified as CatS_1 .. CatS_459
N_CLUSTERS = 10       # number of weights in each row of weights_APO / weights_Holo
N_HEADER_LINES = 11   # leading lines of every *_scores.csv that are skipped
HOLO_TYPES = ('holo', 'holo_core')   # GLIDE types scored with weights_Holo


def _read_cluster_scores(path):
    """Read one *_scores.csv file into a ligand-by-cluster score table.

    Each data line is expected to look like ``<prefix>_<ligand number>,<score>,
    <cluster number>[,...]``. The first N_HEADER_LINES lines are skipped.

    Returns an (N_LIGANDS, N_CLUSTERS) float array in which row ``k`` holds the
    scores of ligand ``k + 1`` and column ``c`` the score for cluster ``c + 1``.
    Ligand/cluster pairs that never appear in the file keep a score of 0, and
    if a pair appears more than once the last occurrence wins. Lines whose
    ligand number falls outside 1..N_LIGANDS are ignored, as are blank lines.

    Raises IndexError for a cluster number above N_CLUSTERS.
    NOTE(review): cluster numbers are assumed to be 1-based; a 0 would wrap
    around to the last column -- confirm against the score files.
    """
    table = np.zeros((N_LIGANDS, N_CLUSTERS))
    with open(path, 'r') as input_file:
        data_lines = input_file.readlines()[N_HEADER_LINES:]
    # Single pass over the file: every line is split and parsed exactly once.
    for line in data_lines:
        if not line.strip():
            continue
        fields = line.split(',')
        lig_num = int(fields[0].split('_')[1])
        if 1 <= lig_num <= N_LIGANDS:
            table[lig_num - 1, int(fields[2]) - 1] = float(fields[1])
    return table


def _rank_ligands(avg_scores):
    """Map each ligand ID ('CatS_<n>') to its 1-based rank by ascending score.

    ``avg_scores[k]`` is the score of ligand ``k + 1``. The sort is stable, so
    ligands with equal scores stay in ligand-number order.
    """
    order = sorted(range(len(avg_scores)), key=lambda idx: avg_scores[idx])
    return dict(('CatS_' + str(idx + 1), rank)
                for rank, idx in enumerate(order, start=1))


def _read_answers(path):
    """Read the answers file as a list of (ligand ID, value) pairs in file order.

    Each non-blank line is expected to look like ``<ligand ID>,<integer>[,...]``;
    the integer in the second column is the value the predicted ranks are
    compared against. A malformed line raises IndexError or ValueError.
    """
    pairs = []
    with open(path, "r") as answer_file:
        for line in answer_file:
            if not line.strip():
                continue
            fields = line.strip().split(',')
            pairs.append((fields[0].strip(), int(fields[1])))
    return pairs


# The answers do not depend on the GLIDE type or clustering method, so they
# are read and validated once, before the output file is created.
answers = _read_answers(ANSWERS_PATH)
ans = [value for _, value in answers]

known_ids = set('CatS_' + str(num) for num in range(1, N_LIGANDS + 1))
unknown_ids = [lig_id for lig_id, _ in answers if lig_id not in known_ids]
if unknown_ids:
    # Without a prediction for every answer the two rank lists cannot be
    # aligned; fail with the offending IDs instead of a length-mismatch error.
    raise ValueError("Answers contain ligands with no predicted rank: "
                     + ', '.join(unknown_ids))

# `with` guarantees the output file is closed even if an iteration fails.
with open(KT_PATH, "w+") as KT_file:
    for typ in GLIDE_types:
        weights = weights_Holo if typ in HOLO_TYPES else weights_APO
        # The position in `clustering` doubles as the row index into `weights`.
        for zzz, clust in enumerate(clustering):
            file_name = typ + '_' + clust + '_scores.csv'
            path = GLIDE_DIR + typ + '/' + file_name

            score_table = _read_cluster_scores(path)
            # One weighted-average score per ligand, in ligand-number order.
            avg_score = [np.dot(lig_scores, weights[zzz]) for lig_scores in score_table]
            predicted_rank = _rank_ligands(avg_score)

            # Predicted ranks listed in the same ligand order as the answers.
            gues = [predicted_rank[lig_id] for lig_id, _ in answers]

            tau, p_value = stats.kendalltau(ans, gues)

            print(typ + '----' + clust + ': ' + str(tau))

            KT_file.write(typ + ' ' + clust + ": Kendall's Tau = " + str(tau) + "  p-value: " + str(p_value) + '\n')
