In [1]:
# Degree of parallelism passed to scikit-learn estimators through `n_jobs`
NCPU = 4

## Load and process the dataset

In [2]:
import json, gzip
import numpy as np
from sklearn.model_selection import train_test_split

# One-letter amino-acid code -> integer label (0..19), in the order listed here.
aa2idx = {aa: idx for idx, aa in enumerate('ARNDCQEGHILKMFPSTWYV')}

# Read the gzip-compressed JSON file and decode it as UTF-8.
with gzip.open('../data/phipsi.json.gz', 'rb') as f:
    raw_bytes = f.read()
dataset = json.loads(raw_bytes.decode('utf-8'))

# Reduce the dataset to the list stored under its single top-level key.
dataset = dataset['phipsi10882']

# Convert every entry to numpy arrays, dropping the first and last residues.
for item in dataset:
    n = len(item['sequence'])
    trimmed = item['sequence'][1:n-1]
    item['sequence'] = np.array([aa2idx[aa] for aa in trimmed], dtype=np.int8)

    # The angle arrays are trimmed with the same bounds as the sequence
    # (assumes phi/psi hold one value per residue -- TODO confirm).
    for angle in ('phi', 'psi'):
        item[angle] = np.array(item[angle], dtype=np.float32)[1:n-1]

    # Encode each (phi, psi) pair as four numbers, one row per residue:
    # sin(phi), cos(phi), sin(psi), cos(psi).
    item['avec'] = np.vstack([
        trig(item[angle]).T
        for angle in ('phi', 'psi')
        for trig in (np.sin, np.cos)]).T

# Hold out 10% of the entries for testing; the seed keeps the split fixed.
train, test = train_test_split(dataset, test_size=0.1, random_state=42)


In [3]:
%%time
from sklearn.cluster import KMeans

# Split the train set into 20 clusters in (sin, cos) angle space.
#  * `n_jobs` is intentionally not passed: KMeans deprecated the argument in
#    scikit-learn 0.23 and removed it in 1.0, where it raises TypeError.
#  * `n_init=10` pins the historical default so results do not shift with the
#    newer 'auto' default.
#  * `random_state` makes the cluster IDs (and everything derived from them)
#    reproducible across kernel restarts.
KMEANS = KMeans(n_clusters=20, max_iter=5, n_init=10, random_state=42)
KMEANS.fit(np.vstack([item['avec'] for item in train]))

CPU times: user 54.9 s, sys: 47.2 s, total: 1min 42s
Wall time: 43.3 s


## Helper functions

In [4]:
# label every (phi,psi) row with the ID of its cluster
# (KMEANS is the already-fitted clustering model)
def set_clusters(train, test, KMEANS):
    """Store per-row cluster IDs in ``entry['abin']`` for every entry.

    Each entry of ``train`` and ``test`` must provide ``'avec'``; it is passed
    to ``KMEANS.predict`` and the result is saved in place as an int8 array.
    Nothing is returned.
    """
    for entries in (train, test):
        for entry in entries:
            labels = KMEANS.predict(entry['avec'])
            entry['abin'] = np.array(labels, dtype=np.int8)
            

In [5]:
# split all sequences into chunks of size WINDOW
def set_window(train, test, WINDOW):
    """Build per-item features ``'X'`` and labels ``'Y'`` from sequence windows.

    For every item in ``train`` and ``test`` (dicts holding equally long numpy
    arrays ``'sequence'`` and ``'abin'``), all full-length windows of size
    ``WINDOW`` are enumerated: first the non-overlapping windows starting at
    offset 0, then those starting at offset 1, and so on up to ``WINDOW - 1``.

    Written in place:
      * ``item['Y']`` -- 1-D array with the ``abin`` value of the central
        element (index ``WINDOW // 2``) of every window.
      * ``item['X']`` -- int8 array of shape ``(n_windows, 20 * WINDOW)`` with
        the flattened one-hot encoding of the residues in every window.

    Items shorter than ``WINDOW`` get empty arrays of shape ``(0,)`` and
    ``(0, 20 * WINDOW)`` instead of raising, so they can still be stacked
    with the others.
    """
    half = WINDOW // 2
    offsets = np.arange(WINDOW)
    # build the lookup table directly as int8 to avoid a float64 intermediate
    one_hot = np.eye(20, dtype=np.int8)

    for subset in (train, test):
        for item in subset:
            abin = item['abin']
            seq = item['sequence']
            l = len(seq)

            # start index of every full window, grouped by shift
            starts = np.array([start
                               for shift in range(WINDOW)
                               for start in range(shift, l - WINDOW + 1, WINDOW)],
                              dtype=np.intp)

            # for every window, pick the element in the middle and
            # save corresponding dihedral cluster ID in item['Y']
            item['Y'] = abin[starts + half]

            # use 1-hot encoding for every sequence chunk; the explicit
            # column count keeps the reshape valid when there are no windows
            seq_chunks = seq[starts[:, None] + offsets]
            item['X'] = one_hot[seq_chunks].reshape((len(starts), 20 * WINDOW))
            

## Optimal window size

In [None]:
%%time
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import log_loss

# Cluster labels do not depend on the window size, so assign them once.
set_clusters(train, test, KMEANS)

# Scan odd window sizes 1, 3, ..., 31.
for WINDOW in range(1,33,2):
    # rebuild item['X'] / item['Y'] in place for this window size
    set_window(train, test, WINDOW)

    X_train = np.vstack([item['X'] for item in train])
    Y_train = np.hstack([item['Y'] for item in train])
    X_test = np.vstack([item['X'] for item in test])
    Y_test = np.hstack([item['Y'] for item in test])

    # Logistic regression trained with SGD; the seed makes the scores
    # repeatable between runs.
    # NOTE: scikit-learn 1.1 renamed loss='log' to 'log_loss' and 1.3 removed
    # the old spelling -- switch the string when running on a newer release.
    sgd = SGDClassifier(max_iter=10, tol=1e-3, loss='log', n_jobs=NCPU,
                        random_state=42)
    sgd.fit(X_train, Y_train)

    score_train = sgd.score(X_train, Y_train)
    score_test = sgd.score(X_test, Y_test)

    # Pass the class list explicitly so the probability columns are matched
    # to the right labels even if some cluster is absent from the test set.
    loss = log_loss(Y_test, sgd.predict_proba(X_test), labels=sgd.classes_)

    # columns: window size, train accuracy, test accuracy, test log-loss
    print(WINDOW, score_train, score_test, loss)

1 0.27731598535191937 0.27676418816894827 2.374077414593405
3 0.291750679077452 0.29088582445151623 2.275495572812937
5 0.3029658723360613 0.30326357557373845 2.2337286323513728
7 0.310950939033964 0.3112886350382491 2.2092355370879977
9 0.3155080990072185 0.3161927022464615 2.1956344308986053


In [None]:
# Display the test labels left over from the most recent window-size iteration.
Y_test

## Regularization strength tuning

In [None]:
from sklearn.model_selection import GridSearchCV

# Grid over the elastic-net mixing ratio l1_ratio, 11 evenly spaced values
# from 0.0 to 1.0. A joint search that also varies alpha is kept for reference:
#params = {'l1_ratio' : np.linspace(0.0, 1.0, 11), 'alpha' : [1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1] }
params = {'l1_ratio' : np.linspace(0.0, 1.0, 11)}

# Tune on every 10th training item only.
# NOTE: item['X'] / item['Y'] hold whatever window size set_window() was last
# called with, so run set_window() with the intended WINDOW before this cell.
X_train = np.vstack([item['X'] for item in train[::10]])
Y_train = np.hstack([item['Y'] for item in train[::10]])

# l1_ratio is only used by SGDClassifier when penalty='elasticnet'; with the
# default 'l2' penalty every grid candidate would be the very same model.
# Parallelism is applied at the grid-search level (bounded by NCPU) and the
# inner estimator stays single-job to avoid oversubscribing the CPUs.
sgd = SGDClassifier(penalty='elasticnet', max_iter=100, tol=1e-3, loss='log', n_jobs=1)
hsearch = GridSearchCV(sgd, params, cv=5, verbose=2, n_jobs=NCPU)


In [None]:
# Run the cross-validated grid search on the subsampled training data.
hsearch.fit(X_train, Y_train)

In [None]:
# Mean and standard deviation of the cross-validated test score for every
# grid candidate, taken from the fitted search (these names were previously
# used without being defined, which raised NameError).
means = hsearch.cv_results_['mean_test_score']
stds = hsearch.cv_results_['std_test_score']

# One line per candidate: mean score, +/- two standard deviations, parameters.
# The loop variable is not called `params` so the grid dict is not overwritten.
for mean, std, candidate in zip(means, stds, hsearch.cv_results_['params']):
    print("%0.5f (+/-%0.05f) for %r" % (mean, std * 2, candidate))

## Number of clusters