In [1]:
# Number of CPU cores handed to the scikit-learn estimators via `n_jobs`.
NCPU = 20

## Load and process the dataset

In [2]:
import gzip, json
import numpy as np
import utils 
from sklearn.model_selection import train_test_split

# Integer index of each of the 20 standard amino acids (one-letter codes);
# a letter's position in the string below is its index.
aa2idx = {aa: idx for idx, aa in enumerate('ARNDCQEGHILKMFPSTWYV')}

# Read the dataset through the project helper.
dataset = utils.load_phipsi()

# Hold out 10% of the entries for testing (90% remain for training);
# the fixed seed keeps the split identical between runs.
train, test = train_test_split(
    dataset,
    random_state=42,
    test_size=0.1,
)


## Randomness in clustering

K-means clustering is not guaranteed to converge to the globally optimal solution each time you run it; instead it returns a locally optimal (and possibly suboptimal) partition that depends on the random initialization. To check how this randomness in the k-means clusters affects the full angle-prediction pipeline, we repeat the clustering-training-testing procedure multiple times (the number of clusters is fixed to 20) and compare the results.

In [None]:
WINDOW = 15

# The inputs X (1-hot-encoded sequence, per the original note) do not depend
# on the clustering, so they are built a single time for both splits.
X_train, X_test = (utils.getX(part, WINDOW) for part in (train, test))

# Reference phi / psi values for the test split; these are likewise
# independent of the clustering and are computed only once.
phi_ref, psi_ref = utils.getPHI(test, WINDOW), utils.getPSI(test, WINDOW)


In [None]:
from sklearn.cluster import KMeans
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import log_loss

WINDOW = 15

# number of times the clustering-learning-testing pipeline
# will be repeated
niter = 15

# every fitted clustering is kept here so that later cells can reuse one
KMEANS = []
# one [rmse_phi, rmse_psi, loss] row per repetition
scores1 = []

for i in range(niter):

    # cluster the 'avec' vectors of all training entries;
    # a different seed per repetition yields a different, yet reproducible,
    # local solution
    # NOTE(review): `n_jobs` was removed from KMeans in scikit-learn 1.0 -
    # drop the argument when running on a recent release
    KM = KMeans(n_clusters=20, max_iter=5, n_jobs=NCPU, random_state=i)
    KM.fit(np.vstack([item['avec'] for item in train]))

    # save current solution
    KMEANS.append(KM)

    # update Y vectors (cluster IDs for central residues)
    Y_train = utils.getY(train, WINDOW, KM)
    Y_test = utils.getY(test, WINDOW, KM)

    # train logistic regression model; its seed is fixed so that the spread
    # between repetitions comes from the clustering only
    # NOTE(review): loss='log' was renamed to 'log_loss' in scikit-learn 1.1
    # and the old spelling was removed in 1.3
    sgd = SGDClassifier(max_iter=10, tol=1e-3, loss='log', n_jobs=NCPU,
                        random_state=0)
    sgd.fit(X_train, Y_train)

    # class probabilities on the test set (computed once, used twice below)
    proba = sgd.predict_proba(X_test)

    # probability-weighted average of the cluster centers
    # NOTE(review): assumes every cluster ID occurs in Y_train, so that the
    # columns of `proba` line up with the rows of cluster_centers_ - confirm
    avec = np.matmul(proba, KM.cluster_centers_)

    # convert angle vectors to angles; arctan2 depends only on the direction
    # of its (y, x) arguments, so no explicit normalisation is needed and a
    # zero-length vector no longer produces NaN
    phi_pred = np.arctan2(avec[:,0], avec[:,1])
    psi_pred = np.arctan2(avec[:,2], avec[:,3])

    # calculate scores; passing the classifier's classes keeps log_loss
    # well-defined even if some class is absent from Y_test
    loss = log_loss(Y_test, proba, labels=sgd.classes_)
    rmse_phi = utils.ang_rmse(phi_ref, phi_pred) * 180 / np.pi
    rmse_psi = utils.ang_rmse(psi_ref, psi_pred) * 180 / np.pi

    scores1.append([rmse_phi, rmse_psi, loss])

    print("iter {:2d} | loss {:.5f} | rmse(phi) {:9.5f} | rmse(psi) {:9.5f}".
          format(i, loss, rmse_phi, rmse_psi))

In [None]:
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline

# one row per repetition: rmse(phi), rmse(psi), loss
xyc = np.array(scores1)

# rc settings only apply to axes created afterwards, so this must be set
# before the figure exists for the grid to be drawn underneath the markers
plt.rc('axes', axisbelow=True)

fig, ax = plt.subplots(figsize=(6,6))
ax.set_xlabel('rmse(phi)', fontsize=15)
ax.set_ylabel('rmse(psi)', fontsize=15)
ax.grid()
ax.set_title('Variation in accuracy', fontsize=20)
points = ax.scatter(xyc[:,0], xyc[:,1], c=xyc[:,2], s=100)
# the marker colour encodes the log-loss, so show its scale
fig.colorbar(points, ax=ax, label='loss')
plt.show()


## Effect of window size

In [None]:
# reuse the first clustering solution for every window size
KM = KMEANS[0]

# one [window, rmse_phi, rmse_psi, loss] row per window size
scores2 = []

# odd window sizes 1, 3, ..., 31; the lower-case loop variable and the `_w`
# suffixed arrays leave the WINDOW / X_train / X_test / phi_ref / psi_ref
# globals of the fixed-window cells intact, so those cells stay re-runnable
for window in range(1,32,2):

    # the window changes, so does X
    X_train_w = utils.getX(train, window)
    X_test_w = utils.getX(test, window)

    # update Y vectors (cluster IDs for central residues)
    Y_train = utils.getY(train, window, KM)
    Y_test = utils.getY(test, window, KM)

    # same for reference angles
    phi_ref_w = utils.getPHI(test, window)
    psi_ref_w = utils.getPSI(test, window)

    # train logistic regression model; the fixed seed makes the curve
    # reproducible between runs
    # NOTE(review): loss='log' was renamed to 'log_loss' in scikit-learn 1.1
    # and the old spelling was removed in 1.3
    sgd = SGDClassifier(max_iter=10, tol=1e-3, loss='log', n_jobs=NCPU,
                        random_state=0)
    sgd.fit(X_train_w, Y_train)

    # class probabilities on the test set (computed once, used twice below)
    proba = sgd.predict_proba(X_test_w)

    # probability-weighted average of the cluster centers
    # NOTE(review): assumes every cluster ID occurs in Y_train, so that the
    # columns of `proba` line up with the rows of cluster_centers_ - confirm
    avec = np.matmul(proba, KM.cluster_centers_)

    # convert angle vectors to angles; arctan2 depends only on the direction
    # of its (y, x) arguments, so no explicit normalisation is needed and a
    # zero-length vector no longer produces NaN
    phi_pred = np.arctan2(avec[:,0], avec[:,1])
    psi_pred = np.arctan2(avec[:,2], avec[:,3])

    # calculate scores; passing the classifier's classes keeps log_loss
    # well-defined even if some class is absent from Y_test
    loss = log_loss(Y_test, proba, labels=sgd.classes_)
    rmse_phi = utils.ang_rmse(phi_ref_w, phi_pred) * 180 / np.pi
    rmse_psi = utils.ang_rmse(psi_ref_w, psi_pred) * 180 / np.pi

    scores2.append([window, rmse_phi, rmse_psi, loss])

    print("WINDOW {:2d} | loss {:.5f} | rmse(phi) {:9.5f} | rmse(psi) {:9.5f}".
          format(window, loss, rmse_phi, rmse_psi))
    

In [None]:
# three side-by-side panels, each plotted against the window size
fig, ax = plt.subplots(nrows=1, ncols=3)
fig.set_size_inches(15, 4)

# columns of scores2: window size, rmse(phi), rmse(psi), loss
xyc = np.array(scores2)

# common x-axis styling
for panel in ax:
    panel.set_xlim([0,32])
    panel.set_xlabel("window size", fontsize=15)

# (y-axis label, column of xyc) for each panel, left to right
panel_specs = (("loss", 3), ("rmse(phi)", 1), ("rmse(psi)", 2))
for panel, (ylabel, column) in zip(ax, panel_specs):
    panel.set_ylabel(ylabel, fontsize=15)
    panel.plot(xyc[:,0], xyc[:,column], linestyle='-', marker='o', color='g')

plt.subplots_adjust(wspace = 0.3)

plt.show()

## Number of clusters

## Regularization strength