In [1]:
import os.path, sys

import numpy as np
import pandas as pd
from scipy import linalg
from scipy.sparse import csr_matrix
from sklearn.preprocessing import OneHotEncoder

import Bio.PDB, warnings
pdb_list = Bio.PDB.PDBList()
pdb_parser = Bio.PDB.PDBParser()
from scipy.spatial import distance_matrix
from Bio import BiopythonWarning
warnings.simplefilter('ignore', BiopythonWarning)

from joblib import Parallel, delayed
import timeit

import matplotlib.pyplot as plt

# # --- Import our Code ---# #
#import emachine as EM
from direct_info import direct_info

# import data processing and general DCA_ER tools
from data_processing import data_processing_msa2pdb
import ecc_tools as tools
from pathlib import Path
np.random.seed(1)

from Bio import SeqIO
from Bio.PDB import *
from scipy.spatial import distance_matrix
from Bio import pairwise2
#from Bio.SubsMat.MatrixInfo import blosum62
pdb_parser = Bio.PDB.PDBParser()

from prody import *



# Z-test comparison

https://glassboxmedicine.com/2020/02/04/comparing-aucs-of-machine-learning-models-with-delongs-test/
* (code) https://github.com/yandexdataschool/roc_comparison

In [2]:
data_path = Path('/home/ecresswell/Pfam-A.full')
data_path = Path('/data/cresswellclayec/Pfam-A.full')

# Define data directories
DCA_ER_dir = '/home/ecresswell/DCA_ER' # Set DCA_ER directory
DCA_ER_dir = '/data/cresswellclayec/DCA_ER' # Set DCA_ER directory

biowulf_dir = '%s/biowulf_full' % DCA_ER_dir

out_dir = '%s/protein_data/di/' % biowulf_dir
processed_data_dir = "%s/protein_data/data_processing_output" % biowulf_dir
pdb_dir = '%s/protein_data/pdb_data/' % biowulf_dir

# pdb_path = "/pdb/pdb/zd/pdb1zdr.ent.gz"
pdb_path = "/pdb/pdb/as/pdb1ass.ent.gz"
pdb_id = '1ass'
pfam_dir = "/fdb/fastadb/pfam"
pdb_path = "/pdb/pdb/zd/pdb1zdr.ent.gz"

pdb_path = '%s%s/pdb%s.ent.gz' % (pdb_path, pdb_id[1:3], pdb_id)
unzipped_pdb_filename = os.path.basename(pdb_path).replace(".gz", "")
pdb_out_path = "%s%s" % (pdb_dir, unzipped_pdb_filename)
print('Unzipping %s to %s' % (pdb_path, pdb_out_path))


Unzipping /pdb/pdb/zd/pdb1zdr.ent.gzas/pdb1ass.ent.gz to /data/cresswellclayec/DCA_ER/biowulf_full/protein_data/pdb_data/pdb1ass.ent


In [39]:
import pandas as pd
import numpy as np
import scipy.stats

# AUC comparison adapted from
# https://github.com/Netflix/vmaf/
def compute_midrank(x):
    """Computes midranks.
    Args:
       x - a 1D numpy array
    Returns:
       array of midranks
    """
    J = np.argsort(x)
    Z = x[J]
    N = len(x)
    T = np.zeros(N, dtype=np.float)
    i = 0
    while i < N:
        j = i
        while j < N and Z[j] == Z[i]:
            j += 1
        T[i:j] = 0.5*(i + j - 1)
        i = j
    T2 = np.empty(N, dtype=np.float)
    # Note(kazeevn) +1 is due to Python using 0-based indexing
    # instead of 1-based in the AUC formula in the paper
    T2[J] = T + 1
    return T2


def fastDeLong(predictions_sorted_transposed, label_1_count):
    """
    The fast version of DeLong's method for computing the covariance of
    unadjusted AUC.
    Args:
       predictions_sorted_transposed: a 2D numpy.array[n_classifiers, n_examples]
          sorted such as the examples with label "1" are first
    Returns:
       (AUC value, DeLong covariance)
    Reference:
     @article{sun2014fast,
       title={Fast Implementation of DeLong's Algorithm for
              Comparing the Areas Under Correlated Receiver Operating Characteristic Curves},
       author={Xu Sun and Weichao Xu},
       journal={IEEE Signal Processing Letters},
       volume={21},
       number={11},
       pages={1389--1393},
       year={2014},
       publisher={IEEE}
     }
    """
    # Short variables are named as they are in the paper
    m = label_1_count
    n = predictions_sorted_transposed.shape[1] - m
    positive_examples = predictions_sorted_transposed[:, :m]
    negative_examples = predictions_sorted_transposed[:, m:]
    k = predictions_sorted_transposed.shape[0]

    tx = np.empty([k, m], dtype=np.float)
    ty = np.empty([k, n], dtype=np.float)
    tz = np.empty([k, m + n], dtype=np.float)
    for r in range(k):
        tx[r, :] = compute_midrank(positive_examples[r, :])
        ty[r, :] = compute_midrank(negative_examples[r, :])
        tz[r, :] = compute_midrank(predictions_sorted_transposed[r, :])
    aucs = tz[:, :m].sum(axis=1) / m / n - float(m + 1.0) / 2.0 / n
    v01 = (tz[:, :m] - tx[:, :]) / n
    v10 = 1.0 - (tz[:, m:] - ty[:, :]) / m
    sx = np.cov(v01)
    sy = np.cov(v10)
    delongcov = sx / m + sy / n
    print('aucs: ', aucs)
    print('delong cov:', delongcov)
    return aucs, delongcov


def calc_pvalue(aucs, sigma):
    """Computes log(10) of p-values.
    Args:
       aucs: 1D array of AUCs
       sigma: AUC DeLong covariances
    Returns:
       log10(pvalue)
    """
    l = np.array([[1, -1]])
    z = np.abs(np.diff(aucs)) / np.sqrt(np.dot(np.dot(l, sigma), l.T))
    return np.log10(2) + scipy.stats.norm.logsf(z, loc=0, scale=1) / np.log(10)


def compute_ground_truth_statistics(ground_truth):
    assert np.array_equal(np.unique(ground_truth), [0, 1])
    order = (-ground_truth).argsort()
    label_1_count = int(ground_truth.sum())
    return order, label_1_count


def delong_roc_variance(ground_truth, predictions):
    """
    Computes ROC AUC variance for a single set of predictions
    Args:
       ground_truth: np.array of 0 and 1
       predictions: np.array of floats of the probability of being class 1
    """
    order, label_1_count = compute_ground_truth_statistics(ground_truth)
    predictions_sorted_transposed = predictions[np.newaxis, order]
    aucs, delongcov = fastDeLong(predictions_sorted_transposed, label_1_count)
    assert len(aucs) == 1, "There is a bug in the code, please forward this to the developers"
    return aucs[0], delongcov


def delong_roc_test(ground_truth, predictions_one, predictions_two):
    """
    Computes log(p-value) for hypothesis that two ROC AUCs are different
    Args:
       ground_truth: np.array of 0 and 1
       predictions_one: predictions of the first model,
          np.array of floats of the probability of being class 1
       predictions_two: predictions of the second model,
          np.array of floats of the probability of being class 1
    """
    order, label_1_count = compute_ground_truth_statistics(ground_truth)
    predictions_sorted_transposed = np.vstack((predictions_one, predictions_two))[:, order]
    aucs, delongcov = fastDeLong(predictions_sorted_transposed, label_1_count)
    return calc_pvalue(aucs, delongcov)

In [40]:
prody_df = pd.read_csv('%s/%s_pdb_df.csv' % (pdb_dir, pdb_id))

from data_processing import pdb2msa, data_processing_pdb2msa

pdb2msa_row = prody_df.iloc[0]
pfam_id = pdb2msa_row['Pfam']

s0 = np.load("%s/%s_%s_preproc_msa.npy" % (processed_data_dir, pfam_id, pdb_id))
s_index = np.load("%s/%s_%s_preproc_sindex.npy" % (processed_data_dir, pfam_id, pdb_id))
#pdb_s_index = np.load("%s/%s_%s_preproc_pdb_sindex.npy" % (processed_data_dir, pfam_id, pdb_id))
removed_cols = np.load("%s/%s_%s_removed_cols.npy" % (processed_data_dir, pfam_id, pdb_id))
ref_seq = np.load("%s/%s_%s_preproc_refseq.npy" % (processed_data_dir, pfam_id, pdb_id))

In [41]:
ER_di = np.load("%s/%s_%s_ER_di.npy" % (out_dir, pdb_id, pfam_id))
MF_di = np.load("%s/%s_%s_MF_di.npy" % (out_dir, pdb_id, pfam_id))
PMF_di_data = np.load("%s/%s_%s_PMF_di.npy" % (out_dir, pdb_id, pfam_id),allow_pickle=True)
PLM_di_data = np.load("%s/%s_%s_PLM_di.npy" % (out_dir, pdb_id, pfam_id),allow_pickle=True)

# PMF ROC
# translate PMF di tuple to contact matrix
PMF_di = np.zeros(ER_di.shape)
PMF_di_dict = {}
for score_set in PMF_di_data:
    PMF_di_dict[(score_set[0][0], score_set[0][1])] = score_set[1]
for i, index_i in enumerate(s_index):
    for j, index_j in enumerate(s_index):
        if i==j:
            PMF_di[i,j] = 1.
            continue
        try:
            PMF_di[i,j] = PMF_di_dict[(index_i, index_j)]
            PMF_di[j,i] = PMF_di_dict[(index_i, index_j)] # symetric
        except(KeyError):
            continue

# PLM ROC
# translate PMF di tuple to contact matrix
PLM_di = np.zeros(ER_di.shape)
PLM_di_dict = {}
for score_set in PLM_di_data:
    PLM_di_dict[(score_set[0][0], score_set[0][1])] = score_set[1]
for i, index_i in enumerate(s_index):
    for j, index_j in enumerate(s_index):
        if i==j:
            PLM_di[i,j] = 1.
            continue
        try:
            PLM_di[i,j] = PLM_di_dict[(index_i, index_j)]
            PLM_di[j,i] = PLM_di_dict[(index_i, index_j)] # symetric
        except(KeyError):
            continue


di_not = ER_di


In [42]:
# updated version of roc_curve_new 4/7/2022 
# flat-binary-contact array
from sklearn.metrics import roc_curve as roc_scikit
from sklearn.metrics import auc, precision_recall_curve                                                  
from math import comb

pdb_s_index = s_index
ct_thres = 6
ld_thresh = 0

ct, ct_full = tools.contact_map_pdb2msa_new(pdb2msa_row, pdb_out_path, removed_cols, pdb_s_index, pdb_out_dir=pdb_dir, printing=True)

ct1 = ct.copy()

ct_pos = ct1 < ct_thres
ct1[ct_pos] = 1
ct1[~ct_pos] = 0

mask = np.triu(np.ones(di_not.shape[0], dtype=bool), k=1)
# argsort sorts from low to high. [::-1] reverses 
order = di_not[mask].argsort()[::-1]
ct_flat = ct1[mask][order]

linear_distance = np.zeros((len(s_index),len(s_index)))
for i, ii in enumerate(s_index):
    for j, jj in enumerate(s_index):
        linear_distance[i,j] = abs(ii - jj)
ld = linear_distance >= ld_thresh
ld_flat = ld[mask][order]
#print(ld_flat)
old_len = len(ct_flat)
ct_flat = ct_flat[ld_flat]
ct_pos_flat = ct[mask][order][ld_flat]

print('observations: ', len(ct_flat))
fpr, tpr, thresholds = roc_scikit(ct_flat, di_not[mask][order][ld_flat])
roc_auc= auc(fpr, tpr)
print('ct thresh %f gives auc = %f' % (ct_thres, roc_auc))

MF_di_compare = MF_di[mask][order]
PMF_di_compare = PMF_di[mask][order]
PLM_di_compare = PLM_di[mask][order]

print('\nMF_di %d' % len(MF_di_compare))
print('PMF_di %d' % len(PMF_di_compare))
print('PLM_di %d' % len(PLM_di_compare))
print('ct shape choose 2: ', comb(ct.shape[0], 2))




#-----------------------#
Generating Contact Map
#----------------------------#

1 152
new poly seq list:  METSERGLYILEVALILEASPLYSGLULYSVALHISSERLYSMETPROASPVALVALLYSASNALALYSILEALALEUILEASPSERALALEUGLUILELYSLYSTHRGLUILEGLUALALYSVALGLNILESERASPPROSERLYSILEGLNASPPHELEUASNGLNGLUTHRASNTHRPHELYSGLNMETVALGLULYSILELYSLYSSERGLYALAASNVALVALLEUCYSGLNLYSGLYILEASPASPVALALAGLNHISTYRLEUALALYSGLUGLYILETYRALAVALARGARGVALLYSLYSSERASPMETGLULYSLEUALALYSALATHRGLYALALYSILEVALTHRASPLEUASPASPLEUTHRPROSERVALLEUGLYGLUALAGLUTHRVALGLUGLUARGLYSILEGLYASPASPARGMETTHRPHEVALMETGLYCYSLYS

Chain  <Chain id=A> :
 MSGIVIDKEKVHSKMPDVVKNAKIALIDSALEIKKTEIEAKVQISDPSKIQDFLNQETNTFKQMVEKIKKSGANVVLCQKGIDDVAQHYLAKEGIYAVRRVKKSDMEKLAKATGAKIVTDLDDLTPSVLGEAETVEERKIGDDRMTFVMGCK

 SGIVIDKEKVHSKMPDVVKNAKIALIDSALEIKKTEIEAKVQISDPSKIQDFLNQETNTFKQMVEKIKKSGANVVLCQKGIDDVAQHYLAKEGIYAVRRVKKSDMEKLAKATGAKIVTDLDDLTPSVLGEAETVEERKIGDDRMTFVMGCK 

poly_seq_range (151)
pp seq coordinates (152)
Aligned poly_seq: (len 119) ['G', 'I', 'V', 'I', 'D', '

### How likely is it that these models are the same and produced these results:

In [48]:
print('ER vs MF: ', delong_roc_test(ct_flat, ER_di[mask][order], MF_di_compare), '\n')
print('ER vs PMF: ', delong_roc_test(ct_flat, ER_di[mask][order], PMF_di_compare), '\n')
print('ER vs PLM: ', delong_roc_test(ct_flat, ER_di[mask][order], PLM_di_compare), '\n')
print('MF vs PMF: ', delong_roc_test(ct_flat, MF_di_compare, PMF_di_compare), '\n')
print('MF vs PLM: ', delong_roc_test(ct_flat, MF_di_compare, PLM_di_compare), '\n')
print('PLM vs PMF: ', delong_roc_test(ct_flat, PLM_di_compare, PMF_di_compare), '\n')

aucs:  [0.82536924 0.75169848]
delong cov: [[0.00019858 0.0001521 ]
 [0.0001521  0.00028365]]
ER vs MF:  [[-7.47326201]] 

aucs:  [0.82536924 0.75477344]
delong cov: [[0.00019858 0.00015546]
 [0.00015546 0.00029141]]
ER vs PMF:  [[-6.87816687]] 

aucs:  [0.82536924 0.77137546]
delong cov: [[0.00019858 0.00016489]
 [0.00016489 0.00027697]]
ER vs PLM:  [[-5.1111778]] 

aucs:  [0.75169848 0.75477344]
delong cov: [[0.00028365 0.00027295]
 [0.00027295 0.00029141]]
MF vs PMF:  [[-0.24484515]] 

aucs:  [0.75169848 0.77137546]
delong cov: [[0.00028365 0.00020248]
 [0.00020248 0.00027697]]
MF vs PLM:  [[-0.94019347]] 

aucs:  [0.77137546 0.75477344]
delong cov: [[0.00027697 0.00022941]
 [0.00022941 0.00029141]]
PLM vs PMF:  [[-0.94802737]] 

