## Load the dataset

In [1]:
import json, gzip
import numpy as np

aa2idx = {'A':0, 'R':1, 'N':2, 'D':3, 'C':4, 'Q':5, 'E':6, 'G':7, 'H':8, 'I':9,
          'L':10, 'K':11, 'M':12, 'F':13, 'P':14, 'S':15, 'T':16, 'W':17, 'Y':18, 'V':19}

# read .json file
with gzip.open('../data/phipsi.json.gz', 'rb') as f:
    dataset = json.load(f)

# reduse dataset to a list for simpler access
dataset = dataset['phipsi10882']

# convert data to numpy arrays skipping first and last residues
for item in dataset:
    n = len(item['sequence'])
    item['sequence'] = np.array([aa2idx[aa] for aa in item['sequence'][1:n-1]], dtype=np.int8)
    item['phi'] = np.array(item['phi'], dtype=np.float32)[1:n-1]
    item['psi'] = np.array(item['psi'], dtype=np.float32)[1:n-1]
    
    # set angle vector:
    # convert (phi,psi) to their sin() and cos()
    # (4 numbers per angle pair)
    item['avec'] = np.vstack([
        np.sin(item['phi']).T,
        np.cos(item['phi']).T,
        np.sin(item['psi']).T,
        np.cos(item['psi']).T ]).T

## Train / test split

In [2]:
from sklearn.model_selection import train_test_split

train,test = train_test_split(dataset, test_size=0.1, random_state=42)

print("Train size: {} proteins".format(len(train)))
print("Test size: {} proteins".format(len(test)))


Train size: 9793 proteins
Test size: 1089 proteins


## K-means clusters from angle vectors

In [3]:
from sklearn.cluster import KMeans

# combine all proteins in the training set
# into one vector X
X = np.vstack([item['avec'] for item in train])

# split into 20 clusters
kmeans = KMeans(n_clusters=20, max_iter=5, n_jobs=4).fit(X)


Assign all dihedral pairs (&phi;,&psi;) to the inferred clusters

In [4]:
# save cluster IDs for both train and test sets
for subset in (train, test):
    for item in subset:
        item['abin'] = np.array(kmeans.predict(item['avec']), dtype=np.int8)

## Split sequences into chunks of equal length

In [7]:
def set_window(train, test, WINDOW):
    for subset in (train, test):
        for item in subset:
            l = len(item['sequence'])

            abin = item['abin']
            seq = item['sequence']

            # for every window, pick the element in the middle and
            # save corresponding dihedral cluster ID in item['Y']
            item['Y'] = np.hstack([item[WINDOW//2] 
                                   for shift in range(0,WINDOW,1) 
                                   for item in np.split(abin[shift:],range(shift,l,WINDOW)) 
                                   if len(item) == WINDOW])

            # use 1-hot encoding for every sequence chunk
            seq_chunks = np.vstack([item for shift in range(0,WINDOW,1) 
                                    for item in np.split(seq[shift:],range(shift,l,WINDOW)) 
                                    if len(item) == WINDOW])
            item['X'] = np.array(np.eye(20)[seq_chunks], dtype=np.int8).reshape((seq_chunks.shape[0],-1))

Test ```set_window(..)``` function (might take ~1 min)

In [6]:
WINDOW = 13

set_window(train, test, WINDOW)

item = test[42]
print("Sequence length: {}".format(len(item['sequence'])))
print("Number of {}-mers: {}".format(WINDOW, item['X'].shape[0]))
print("Number of features: {}".format(item['X'].shape[1]))

Sequence length: 80
Number of 13-mers: 62
Number of features: 260


## Logistic regression - first try

Stack X and Y vectors from all proteins together to create single train/test vectors

In [8]:
X_train = np.vstack([item['X'] for item in train])
Y_train = np.hstack([item['Y'] for item in train])

X_test = np.vstack([item['X'] for item in test])
Y_test = np.hstack([item['Y'] for item in test])

print("X_train shape: {}".format(X_train.shape))
print("Y_train shape: {}".format(Y_train.shape))

X_train shape: (1968990, 260)
Y_train shape: (1968990,)


Applying regular ```sklearn```'s ```LogisticRegression``` to the entire training set is quite time-consuming (~1 hour per one run). A workaround is mini batch logistic regression accessible via ```SGDClassifier``` class with ```loss``` parameter set to ```'log'```

In [9]:
from sklearn.linear_model import SGDClassifier

sgd = SGDClassifier(max_iter=1000, tol=1e-3, loss='log', n_jobs=4)
sgd.fit(X_train, Y_train)

SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='log', max_iter=1000, n_iter=None,
       n_jobs=4, penalty='l2', power_t=0.5, random_state=None,
       shuffle=True, tol=0.001, verbose=0, warm_start=False)

In [13]:
print("Train/test scores: {} / {}".format(sgd.score(X_train, Y_train), sgd.score(X_test, Y_test)))

Train/test scores: 0.34592405243297325 / 0.3461297356227855


Convert predictions back to angles and plot

## Optimal window size

In [14]:
for WINDOW in range(1,25,2):
    set_window(train, test, WINDOW)
    
    X_train = np.vstack([item['X'] for item in train])
    Y_train = np.hstack([item['Y'] for item in train])
    X_test = np.vstack([item['X'] for item in test])
    Y_test = np.hstack([item['Y'] for item in test])
    
    sgd = SGDClassifier(max_iter=10, tol=1e-3, loss='log', n_jobs=8)
    sgd.fit(X_train, Y_train)

    print(WINDOW, sgd.score(X_train, Y_train), sgd.score(X_test, Y_test))

1 0.3062727943973329 0.30564106414395475
3 0.3211360730852575 0.32045670789724073
5 0.33137072215464514 0.3314937737984975
7 0.3385323317422794 0.33946099216755604
9 0.3422915470758694 0.34291978788921534
11 0.3444799233775144 0.34514137873925166
13 0.34531663441662985 0.3459843735804488
15 0.345697668243787 0.34602739852355985
17 0.3452674858341814 0.345416327256725
19 0.3448993621511091 0.34478151935684775
21 0.3446404334598419 0.3446768850384879
23 0.34356287753740256 0.34338880276300554


## Hyperparameter tuning

In [18]:
set_window(train, test, 15)

In [33]:
from sklearn.model_selection import GridSearchCV

params = {'l1_ratio' : np.linspace(0.0, 1.0, 11), 'alpha' : [1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1] }

X_train = np.vstack([item['X'] for item in train[::10]])
Y_train = np.hstack([item['Y'] for item in train[::10]])

sgd = SGDClassifier(max_iter=100, tol=1e-3, loss='log', n_jobs=1)
hsearch = GridSearchCV(sgd, params, cv=3, verbose=2, n_jobs=8)


In [34]:
hsearch.fit(X_train, Y_train)


Fitting 3 folds for each of 66 candidates, totalling 198 fits
[CV] alpha=1e-05, l1_ratio=0.0 .......................................
[CV] alpha=1e-05, l1_ratio=0.0 .......................................
[CV] alpha=1e-05, l1_ratio=0.0 .......................................
[CV] alpha=1e-05, l1_ratio=0.1 .......................................
[CV] alpha=1e-05, l1_ratio=0.1 .......................................
[CV] alpha=1e-05, l1_ratio=0.1 .......................................
[CV] alpha=1e-05, l1_ratio=0.2 .......................................
[CV] alpha=1e-05, l1_ratio=0.2 .......................................
[CV] ........................ alpha=1e-05, l1_ratio=0.0, total= 1.5min
[CV] alpha=1e-05, l1_ratio=0.2 .......................................
[CV] ........................ alpha=1e-05, l1_ratio=0.2, total= 1.5min
[CV] alpha=1e-05, l1_ratio=0.30000000000000004 .......................
[CV] ........................ alpha=1e-05, l1_ratio=0.2, total= 2.1min
[CV] alpha=1e-0

[Parallel(n_jobs=8)]: Done  25 tasks      | elapsed:  8.0min


[CV] ........................ alpha=1e-05, l1_ratio=0.8, total= 2.5min
[CV] alpha=0.0001, l1_ratio=0.0 ......................................
[CV] ........................ alpha=1e-05, l1_ratio=0.8, total= 2.6min
[CV] alpha=0.0001, l1_ratio=0.0 ......................................
[CV] ........................ alpha=1e-05, l1_ratio=0.9, total= 2.5min
[CV] alpha=0.0001, l1_ratio=0.0 ......................................
[CV] ........................ alpha=1e-05, l1_ratio=0.9, total= 2.4min
[CV] alpha=0.0001, l1_ratio=0.1 ......................................
[CV] ........................ alpha=1e-05, l1_ratio=0.9, total= 2.6min
[CV] alpha=0.0001, l1_ratio=0.1 ......................................
[CV] ........................ alpha=1e-05, l1_ratio=1.0, total= 2.6min
[CV] alpha=0.0001, l1_ratio=0.1 ......................................
[CV] ....................... alpha=0.0001, l1_ratio=0.0, total= 1.1min
[CV] alpha=0.0001, l1_ratio=0.2 ......................................
[CV] .

[CV] ........................ alpha=0.001, l1_ratio=0.5, total=  39.6s
[CV] alpha=0.001, l1_ratio=0.8 .......................................
[CV] ......... alpha=0.001, l1_ratio=0.6000000000000001, total=  39.7s
[CV] alpha=0.001, l1_ratio=0.8 .......................................
[CV] ......... alpha=0.001, l1_ratio=0.6000000000000001, total=  42.0s
[CV] alpha=0.001, l1_ratio=0.9 .......................................
[CV] ......... alpha=0.001, l1_ratio=0.6000000000000001, total=  40.9s
[CV] alpha=0.001, l1_ratio=0.9 .......................................
[CV] ......... alpha=0.001, l1_ratio=0.7000000000000001, total=  40.6s
[CV] alpha=0.001, l1_ratio=0.9 .......................................
[CV] ......... alpha=0.001, l1_ratio=0.7000000000000001, total=  36.9s
[CV] alpha=0.001, l1_ratio=1.0 .......................................
[CV] ........................ alpha=0.001, l1_ratio=0.8, total=  38.4s
[CV] alpha=0.001, l1_ratio=1.0 .......................................
[CV] .

[CV] .......... alpha=0.1, l1_ratio=0.30000000000000004, total=  30.0s
[CV] alpha=0.1, l1_ratio=0.5 .........................................
[CV] .......... alpha=0.1, l1_ratio=0.30000000000000004, total=  28.6s
[CV] alpha=0.1, l1_ratio=0.6000000000000001 ..........................
[CV] .......... alpha=0.1, l1_ratio=0.30000000000000004, total=  28.6s
[CV] alpha=0.1, l1_ratio=0.6000000000000001 ..........................
[CV] .......................... alpha=0.1, l1_ratio=0.4, total=  28.8s
[CV] alpha=0.1, l1_ratio=0.6000000000000001 ..........................
[CV] .......................... alpha=0.1, l1_ratio=0.4, total=  28.9s
[CV] alpha=0.1, l1_ratio=0.7000000000000001 ..........................


[Parallel(n_jobs=8)]: Done 146 tasks      | elapsed: 20.3min


[CV] .......................... alpha=0.1, l1_ratio=0.5, total=  26.2s
[CV] alpha=0.1, l1_ratio=0.7000000000000001 ..........................
[CV] .......................... alpha=0.1, l1_ratio=0.4, total=  31.0s
[CV] alpha=0.1, l1_ratio=0.7000000000000001 ..........................
[CV] .......................... alpha=0.1, l1_ratio=0.5, total=  30.2s
[CV] alpha=0.1, l1_ratio=0.8 .........................................
[CV] .......................... alpha=0.1, l1_ratio=0.5, total=  30.2s
[CV] alpha=0.1, l1_ratio=0.8 .........................................
[CV] ........... alpha=0.1, l1_ratio=0.6000000000000001, total=  29.8s
[CV] alpha=0.1, l1_ratio=0.8 .........................................
[CV] ........... alpha=0.1, l1_ratio=0.6000000000000001, total=  29.8s
[CV] alpha=0.1, l1_ratio=0.9 .........................................
[CV] ........... alpha=0.1, l1_ratio=0.6000000000000001, total=  30.4s
[CV] alpha=0.1, l1_ratio=0.9 .........................................
[CV] .

[Parallel(n_jobs=8)]: Done 198 out of 198 | elapsed: 26.1min finished


GridSearchCV(cv=3, error_score='raise',
       estimator=SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='log', max_iter=100, n_iter=None,
       n_jobs=1, penalty='l2', power_t=0.5, random_state=None,
       shuffle=True, tol=0.001, verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=8,
       param_grid={'l1_ratio': array([0. , 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1. ]), 'alpha': [1e-05, 0.0001, 0.001, 0.01, 0.1, 1]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=2)

In [35]:
means = hsearch.cv_results_['mean_test_score']
stds = hsearch.cv_results_['std_test_score']

In [38]:
for mean, std, params in zip(means, stds, hsearch.cv_results_['params']):
       print("%0.5f (+/-%0.05f) for %r" % (mean, std * 2, params))

0.32377 (+/-0.00315) for {'alpha': 1e-05, 'l1_ratio': 0.0}
0.32377 (+/-0.00315) for {'alpha': 1e-05, 'l1_ratio': 0.1}
0.32276 (+/-0.00311) for {'alpha': 1e-05, 'l1_ratio': 0.2}
0.32161 (+/-0.01044) for {'alpha': 1e-05, 'l1_ratio': 0.30000000000000004}
0.32161 (+/-0.01044) for {'alpha': 1e-05, 'l1_ratio': 0.4}
0.32361 (+/-0.00660) for {'alpha': 1e-05, 'l1_ratio': 0.5}
0.32338 (+/-0.00599) for {'alpha': 1e-05, 'l1_ratio': 0.6000000000000001}
0.32338 (+/-0.00599) for {'alpha': 1e-05, 'l1_ratio': 0.7000000000000001}
0.32145 (+/-0.00747) for {'alpha': 1e-05, 'l1_ratio': 0.8}
0.32145 (+/-0.00747) for {'alpha': 1e-05, 'l1_ratio': 0.9}
0.32166 (+/-0.00753) for {'alpha': 1e-05, 'l1_ratio': 1.0}
0.33817 (+/-0.00348) for {'alpha': 0.0001, 'l1_ratio': 0.0}
0.33817 (+/-0.00348) for {'alpha': 0.0001, 'l1_ratio': 0.1}
0.33761 (+/-0.00172) for {'alpha': 0.0001, 'l1_ratio': 0.2}
0.33785 (+/-0.00142) for {'alpha': 0.0001, 'l1_ratio': 0.30000000000000004}
0.33761 (+/-0.00196) for {'alpha': 0.0001, 'l1_ra