In [1]:
from scipy import sparse, io
import numpy as np

In [2]:
import os

In [3]:
# Input (kernel feature files) and output (split feature vectors) directories,
# one pair per thyme dataset. The trailing slash is kept on purpose: these
# strings are used as path prefixes.
filedir_thyme1, outfiledir_thyme1 = (
    '../data/featfiles/thymefiles1/',
    '../data/featvec/thymefiles1/',
)
filedir_thyme2, outfiledir_thyme2 = (
    '../data/featfiles/thymefiles2/',
    '../data/featvec/thymefiles2/',
)

# File-name prefixes of the kernel feature files to process
# (presumably spectrum / gappy / mismatch kernels -- TODO confirm).
file_prefixes = [
    'spec',
    'gap',
    'mism',
]

In [4]:
def save_feat_vec_files(filedir, outfile, fileprefix):
    """Split one kernel feature matrix into train/test/thyme parts and save them.

    Reads ``<fileprefix>_kern_sparsematrix.txt`` (Matrix Market format) and
    ``<fileprefix>_kern_rownames.txt`` from ``filedir``, assigns every matrix
    row to a split according to the prefix of its row name, and writes each
    split to ``<outfile>/<split>/``:

    * ``<fileprefix>mat.npz``       -- the rows of the split (scipy sparse, CSR)
    * ``<fileprefix>enz_names.txt`` -- the matching row names, one per line

    Row-name prefix to split mapping: ``enz`` -> ``train``, ``test`` -> ``test``,
    ``thyme`` -> ``thyme``.

    Parameters
    ----------
    filedir : str
        Directory holding the input files.
    outfile : str
        Root output directory; the ``train``/``test``/``thyme`` sub-directories
        are created if they do not exist yet.
    fileprefix : str
        Prefix selecting which pair of input files to read (e.g. ``'spec'``).

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If the number of row names differs from the number of matrix rows, or
        if a row name starts with none of the known prefixes.

    Side effects
    ------------
    Prints the shapes of the train, test and thyme matrices.
    """
    sp_mat_file = os.path.join(filedir, fileprefix + '_kern_sparsematrix.txt')
    enz_name_file = os.path.join(filedir, fileprefix + '_kern_rownames.txt')

    sp_mat = io.mmread(sp_mat_file).tocsr()
    # genfromtxt collapses a single-line file to a 0-d array; force 1-D so the
    # iteration and fancy indexing below work for any number of names.
    enz_names = np.atleast_1d(np.genfromtxt(enz_name_file, dtype=str))

    # Names and matrix rows are paired purely by position, so a length
    # mismatch would silently mislabel or drop rows. Fail loudly instead.
    if enz_names.ndim != 1 or enz_names.shape[0] != sp_mat.shape[0]:
        raise ValueError(
            'Row names in {} (shape {}) do not match the {} rows of {}'.format(
                enz_name_file, enz_names.shape, sp_mat.shape[0], sp_mat_file))

    # (row-name prefix, output split) pairs, checked in this order.
    prefix_to_split = (('enz', 'train'), ('test', 'test'), ('thyme', 'thyme'))
    row_idx = {split: [] for _, split in prefix_to_split}

    for idx, enz_name in enumerate(enz_names):
        for prefix, split in prefix_to_split:
            if enz_name.startswith(prefix):
                row_idx[split].append(idx)
                break
        else:
            raise ValueError(
                'Wrong Enzyme Prefix: {!r} (row {} of {})'.format(
                    str(enz_name), idx, enz_name_file))

    # Explicit integer dtype keeps empty splits valid as index arrays.
    splits = {}
    for split, idx_list in row_idx.items():
        idx_arr = np.asarray(idx_list, dtype=np.intp)
        splits[split] = (sp_mat[idx_arr, :], enz_names[idx_arr])

    print(splits['train'][0].shape, splits['test'][0].shape, splits['thyme'][0].shape)

    for split, (X_split, names_split) in splits.items():
        split_dir = os.path.join(outfile, split)
        # Create the target directory on demand so a fresh checkout works.
        os.makedirs(split_dir, exist_ok=True)
        sparse.save_npz(os.path.join(split_dir, fileprefix + 'mat.npz'), X_split)
        np.savetxt(os.path.join(split_dir, fileprefix + 'enz_names.txt'),
                   names_split, fmt='%s')

In [5]:
# Split and save the feature vectors of every kernel type for both thyme
# datasets. Order matters for the printed shapes: for each prefix, thyme1 is
# processed before thyme2.
thyme_dir_pairs = [
    (filedir_thyme1, outfiledir_thyme1),
    (filedir_thyme2, outfiledir_thyme2),
]
for fp in file_prefixes:
    for in_dir, out_dir in thyme_dir_pairs:
        save_feat_vec_files(in_dir, out_dir, fp)

(115, 45430) (9, 45430) (589, 45430)
(115, 46135) (9, 46135) (589, 46135)
(115, 124654) (9, 124654) (589, 124654)
(115, 124900) (9, 124900) (589, 124900)
(115, 9200) (9, 9200) (589, 9200)
(115, 9200) (9, 9200) (589, 9200)
