## Load VALL

In [1]:
import gzip
import json
import pickle
import scipy
from sklearn.cluster import KMeans
from scipy import signal
import numpy as np

In [2]:
%%time
# Decompress the whole VALL snapshot into memory as raw bytes ...
with gzip.open('../data/vall.jul19.2011.json.gz', mode='rb') as fin:
    json_bytes = fin.read()

# ... then decode it as UTF-8 text and parse the JSON document.
json_str = str(json_bytes, encoding='utf-8')
vall = json.loads(json_str)
print("vall size: ", len(vall))

vall size:  16800
CPU times: user 1.6 s, sys: 101 ms, total: 1.7 s
Wall time: 1.71 s


## Load kmeans clusters

In [3]:
%%time
# Restore the pickled k-means model.
# NOTE: unpickling can execute arbitrary code, so only load cluster files
# that come from a trusted source.
with open('../data/phipsi_km20.pkl', mode='rb') as f:
    KM = pickle.load(f)

# One row of cluster_centers_ per cluster, so its length is the cluster count.
NCLUST = len(KM.cluster_centers_)

CPU times: user 14.6 ms, sys: 4.51 ms, total: 19.1 ms
Wall time: 94 ms




## Assign clusters to (&phi;,&psi;) pairs

In [4]:
%%time
# Identity matrix used as a one-hot lookup table: row k is the one-hot vector
# for cluster k.  It does not depend on the entry, so build it once.
one_hot = np.eye(NCLUST)

for item in vall.values():
    # NOTE(review): float16 keeps only about three significant decimal digits;
    # it is kept here so the features match the original run, but confirm it
    # matches how KM was fitted.  np.sin/np.cos take radians, so presumably
    # phi/psi are stored in radians -- verify against the data source.
    phi = np.array(item['phi'], dtype=np.float16)
    psi = np.array(item['psi'], dtype=np.float16)

    # One feature row per residue: (sin phi, cos phi, sin psi, cos psi).
    avec = np.column_stack([np.sin(phi), np.cos(phi), np.sin(psi), np.cos(psi)])

    # Fancy-indexing the lookup table with the predicted labels yields a fresh
    # (n_residues, NCLUST) one-hot matrix, stored on the entry under 'abin'.
    item['abin'] = one_hot[KM.predict(avec)]

CPU times: user 4.15 s, sys: 183 ms, total: 4.34 s
Wall time: 4.34 s


In [5]:
# one of the vall 'profiles': the one-hot cluster-membership matrix stored under
# 'abin' for a single entry -- one row per (phi, psi) pair, NCLUST columns
vall['7odcA']['abin']

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

## Searching

In [6]:
# randomly generated 'profile'; drawn from a fixed-seed generator so the query
# (and therefore every search result computed from it) is the same on each run,
# without touching NumPy's global random state
L = 100
query = np.random.RandomState(0).rand(L, NCLUST)


### One fragment

In [9]:
%%time

# search vall for one fragment: the first nine rows of the query profile
chunk = query[:9]

# 'valid' mode keeps only the offsets where the fragment lies entirely inside
# the entry's one-hot matrix, giving one score per window position.
hits = {
    name: signal.correlate2d(entry['abin'], chunk, mode='valid')
    for name, entry in vall.items()
}

print(hits['7odcA'].shape)

(379, 1)
CPU times: user 1.66 s, sys: 0 ns, total: 1.66 s
Wall time: 1.66 s


### All fragments

In [10]:
%%time

# search for 9-mers
WINDOW = 9

# Cut the query into its overlapping WINDOW-row fragments once, up front;
# the fragments do not depend on the VALL entry being scanned.
fragments = [query[shift:shift + WINDOW, :] for shift in range(L - WINDOW + 1)]

hits = {}

# enumerate() supplies the progress counter, so the built-in iter() is no
# longer shadowed by (and left rebound to) an integer in the kernel namespace.
for count, (key, item) in enumerate(vall.items(), start=1):

    # print every 1000
    if count % 1000 == 0:
        print(count, key)

    # correlate all fragments of the query against this entry; each
    # correlation yields a single column, and hstack places them side by
    # side so that column j holds the scores for fragment j
    hits[key] = np.hstack([signal.correlate2d(item['abin'], fragment, mode='valid')
                           for fragment in fragments])


1000 2boyA
2000 3civA
3000 1kz1A
4000 3hygA
5000 1nezG
6000 2fupA
7000 2cioA
8000 3m5wA
9000 2pbnA
10000 1xu2R
11000 1u46A
12000 3pqkA
13000 3h0nA
14000 1tuwA
15000 3ofgA
16000 1fyeA
CPU times: user 2min 34s, sys: 595 ms, total: 2min 35s
Wall time: 2min 36s


In [12]:
# Score matrix for one entry: one column per query fragment (L - WINDOW + 1 of
# them, stacked by np.hstack) and one row per 'valid' correlation offset.
hits['7odcA'].shape

(379, 92)