In [67]:
import numpy
from keras.datasets import imdb
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
# Fix NumPy's global random seed for reproducibility. Only numpy.random is
# seeded here; no other random number generator is touched by this cell.
numpy.random.seed(7)
# NOTE(review): the comment "load the dataset but only keep the top n words,
# zero the rest" is a leftover -- no dataset is loaded in this cell. It matches
# the imdb.load_data(num_words=...) call in the last cell of the notebook.

In [68]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from dna2vec.multi_k_model import MultiKModel
import seaborn as sns
%matplotlib inline

In [3]:
#### Read fasta ####
def read_FASTA(file_name):
    """Parse a FASTA file into a ``{record id: sequence}`` dictionary.

    A record starts at a line whose first non-blank character is ``>``. The
    record id is the header text after ``>`` up to (not including) the first
    ``|``. All following lines, up to the next header, are concatenated and
    upper-cased to form the sequence.

    The file is parsed line by line rather than by splitting the whole text on
    ``>``, so a ``>`` inside a header description does not start a new record,
    and blank lines or text before the first header do not create a bogus
    empty entry.

    Parameters
    ----------
    file_name : str
        Path of the FASTA file to read.

    Returns
    -------
    dict
        Maps record id to upper-cased sequence. If an id occurs more than
        once, the last record wins.
    """
    text_dict = {}
    header = None
    chunks = []
    with open(file_name, "r") as fn:
        for raw_line in fn:
            line = raw_line.strip()
            if not line:
                continue
            if line.startswith(">"):
                # Flush the record collected so far before starting a new one.
                if header is not None:
                    text_dict[header] = "".join(chunks).upper()
                header = line[1:].split('|')[0]
                chunks = []
            elif header is not None:
                chunks.append(line)
            # Lines before the first header belong to no record and are ignored.
    if header is not None:
        text_dict[header] = "".join(chunks).upper()
    return text_dict

In [4]:
# Relative path of the pretrained dna2vec weights file handed to MultiKModel.
# NOTE(review): the file name suggests k-mer lengths 3..8 and 100-d vectors;
# the 100-d part matches the len(...) == 100 check recorded in the next cell.
filepath = 'pretrained/dna2vec-20161219-0153-k3to8-100d-10c-29320Mbp-sliding-Xat.w2v'
# Load the pretrained model. Later cells call mk_model.vector(kmer) to look up
# the embedding vector of a k-mer string.
mk_model = MultiKModel(filepath)

In [18]:
# Sanity check: length of the embedding vector returned for one 8-mer
# (the recorded output is 100).
len(mk_model.vector("AAATGGTT"))

100

# Data

In [6]:
# Load the two training FASTA files into {record id: sequence} dictionaries.
# Records from `code` are labelled 1 and records from `noncode` are labelled 0
# when the dataset is assembled further down.
# NOTE(review): the file names suggest GENCODE transcripts vs. LNCipedia
# lncRNAs -- confirm against the data directory.
code = read_FASTA("../../data/FINAL_DATA_1/fasta_for_test/train_fasta_gencode.fasta")
noncode = read_FASTA("../../data/FINAL_DATA_1/fasta_for_test/train_fasta_lncPedia.fasta")

# Preprocessing

## making data generator

In [19]:
def make_vec(seq, kmer=8, max_length=13000, model=None):
    """Average the embedding vectors of the overlapping k-mers of ``seq``.

    Windows ``seq[i:i + kmer]`` are taken at consecutive start positions
    ``i = 0, 1, ...``. Every full-length window is considered, including the
    last one (the previous ``i < len(seq) - kmer`` bound dropped it, so a
    sequence exactly ``kmer`` long produced no vectors at all). Windows that
    contain ``'N'`` are skipped.

    Parameters
    ----------
    seq : str
        Nucleotide sequence.
    kmer : int
        Window length.
    max_length : int
        Maximum number of start positions examined (skipped windows count
        towards this limit).
    model : object, optional
        Object exposing ``vector(kmer_string)``. Defaults to the
        notebook-level ``mk_model``.

    Returns
    -------
    numpy.ndarray
        Element-wise mean of the collected vectors. If no window qualifies,
        NumPy's mean of an empty list is returned (NaN, with a RuntimeWarning).
    """
    if model is None:
        # Fall back to the pretrained model loaded at notebook level.
        model = mk_model
    # Number of start positions: all full windows, capped by max_length.
    # A negative value (sequence shorter than kmer) gives an empty range.
    n_windows = min(len(seq) - kmer + 1, max_length)
    vectors = []
    for start in range(n_windows):
        window = seq[start:start + kmer]
        if 'N' in window:
            continue
        vectors.append(model.vector(window))
    return np.mean(vectors, axis=0)

In [8]:
# `sklearn.cross_validation` was deprecated in scikit-learn 0.18 and removed in
# 0.20; `train_test_split` now lives in `sklearn.model_selection`. The old
# location is kept only as a fallback for very old installations.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split
from copy import deepcopy
from tqdm import tqdm



In [21]:
# Maximum number of sequences taken from each class.
N = 7000

# Take up to N (record id, sequence) pairs per class, in dict iteration order.
# list(...)[:N] already builds a fresh list of immutable (str, str) tuples, so
# no deep copy is needed to keep `code` / `noncode` untouched.
coding_records = list(code.items())[:N]
noncoding_records = list(noncode.items())[:N]
X_tmp = coding_records + noncoding_records

# Labels: 1 for records from `code`, 0 for records from `noncode`. They are
# sized from the records actually taken, so X_tmp and y stay aligned even if a
# FASTA file holds fewer than N sequences.
y = np.array([1] * len(coding_records) + [0] * len(noncoding_records))
assert len(X_tmp) == len(y)

len(X_tmp)

14000

## making embedding matrix

In [34]:
# Persist the embedding table as CSV.
# NOTE(review): `embedding_matrix` is only built in the following cell
# (In [33]); this cell was executed afterwards (In [34]). On a fresh
# top-to-bottom run this line raises NameError.
embedding_matrix.to_csv("../../data/FINAL_DATA_1/embedding_matrix.csv")

In [33]:
from itertools import product

# Length of the k-mers that make up the embedding vocabulary.
k_mer = 8
# Row label of the all-zero padding vector.
PAD_LABEL = '0'

# Every k-mer over the DNA alphabet, in lexicographic order
# (AAAAAAAA, AAAAAAAC, AAAAAAAG, AAAAAAAT, ...).
kmer_names = ["".join(letters) for letters in product("ACGT", repeat=k_mer)]

# The padding row is inserted FIRST so that it receives code 0, the value
# sequence.pad_sequences pads with. The previous version appended it last and
# relied on old pandas sorting dict keys when building the DataFrame; with
# insertion-ordered construction the padding row would get the last code and
# 'AAAAAAAA' would collide with the padding value 0.
embedding_dim = len(mk_model.vector(kmer_names[0]))
vectors_by_label = {PAD_LABEL: np.zeros((embedding_dim,))}
for name in kmer_names:
    vectors_by_label[name] = mk_model.vector(name)

# One row per label, one column per embedding dimension, plus an integer
# `code` column (0 .. n_rows - 1) used to encode sequences as integer ids.
embedding_matrix = pd.DataFrame(vectors_by_label).T
embedding_matrix.insert(0, 'code', range(0, len(embedding_matrix)))
assert embedding_matrix.loc[PAD_LABEL, 'code'] == 0, "padding row must have code 0"

print(embedding_matrix.shape)
embedding_matrix.head()



[A

[A[A


[A[A[A

(65537, 101)


Unnamed: 0,code,0,1,2,3,4,5,6,7,8,...,90,91,92,93,94,95,96,97,98,99
0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
AAAAAAAA,1,-0.365117,0.408025,0.064311,-0.293642,0.103023,0.089713,-0.764232,-0.003985,0.107479,...,-0.216967,-0.809641,0.370482,0.632726,0.690164,-0.406542,0.187238,-0.62623,-0.271666,0.082561
AAAAAAAC,2,-0.34543,0.288133,0.324433,-0.366527,-0.170981,-0.025718,-0.380066,0.055025,-0.016809,...,-0.2333,-0.737088,0.072006,0.639653,0.292097,-0.500181,0.094142,0.100279,-0.439565,0.160597
AAAAAAAG,3,0.000498,0.310706,-0.153957,-0.397525,0.163247,0.102393,-0.393191,-0.149867,0.093688,...,-0.132578,-0.596084,0.520723,0.664362,0.738464,-0.411244,-0.110038,-0.386498,-0.017523,-0.151797
AAAAAAAT,4,-0.276156,0.117486,0.097248,-0.301252,-0.19459,-0.149558,-0.705774,-0.121932,0.032583,...,0.003845,-0.890366,0.143492,0.22074,0.490493,0.093366,-0.143799,-0.783927,-0.192266,0.22324


In [48]:
# Optional reload of the table written by the to_csv cell (left disabled):
# embedding_matrix = pd.read_csv("../../data/FINAL_DATA_1/embedding_matrix.csv", index_col=0)
# Preview the first rows of the embedding table currently in memory.
embedding_matrix.head()

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,-0.365117,0.408025,0.064311,-0.293642,0.103023,0.089713,-0.764232,-0.003985,0.107479,-0.488348,...,-0.216967,-0.809641,0.370482,0.632726,0.690164,-0.406542,0.187238,-0.62623,-0.271666,0.082561
2,-0.34543,0.288133,0.324433,-0.366527,-0.170981,-0.025718,-0.380066,0.055025,-0.016809,-0.320237,...,-0.2333,-0.737088,0.072006,0.639653,0.292097,-0.500181,0.094142,0.100279,-0.439565,0.160597
3,0.000498,0.310706,-0.153957,-0.397525,0.163247,0.102393,-0.393191,-0.149867,0.093688,-0.231512,...,-0.132578,-0.596084,0.520723,0.664362,0.738464,-0.411244,-0.110038,-0.386498,-0.017523,-0.151797
4,-0.276156,0.117486,0.097248,-0.301252,-0.19459,-0.149558,-0.705774,-0.121932,0.032583,-0.345269,...,0.003845,-0.890366,0.143492,0.22074,0.490493,0.093366,-0.143799,-0.783927,-0.192266,0.22324


In [41]:
# Lookup table from each row label of the embedding table (a k-mer string or
# the '0' padding label) to the integer stored in that row's `code` column.
# Must run while `embedding_matrix` still has its `code` column.
encoder_dict = {
    label: code_value
    for label, code_value in zip(embedding_matrix.index, embedding_matrix["code"])
}

In [44]:
def make_kmers(seq, kmer=8, max_length=7000, encoder=None):
    """Encode ``seq`` as the list of integer codes of its overlapping k-mers.

    Windows ``seq[i:i + kmer]`` are taken at consecutive start positions
    ``i = 0, 1, ...``. Every full-length window is considered, including the
    last one (the previous ``i < len(seq) - kmer`` bound dropped it). Windows
    that contain ``'N'`` are skipped.

    Parameters
    ----------
    seq : str
        Nucleotide sequence.
    kmer : int
        Window length.
    max_length : int
        Maximum number of start positions examined (skipped windows count
        towards this limit), hence also the maximum length of the result.
    encoder : mapping, optional
        Maps a k-mer string to its integer code. Defaults to the
        notebook-level ``encoder_dict``.

    Returns
    -------
    list
        Codes of the retained windows, in sequence order. Empty when ``seq``
        is shorter than ``kmer``.

    Raises
    ------
    KeyError
        If a retained window is not a key of ``encoder``.
    """
    if encoder is None:
        # Fall back to the lookup table built at notebook level.
        encoder = encoder_dict
    # Number of start positions: all full windows, capped by max_length.
    n_windows = min(len(seq) - kmer + 1, max_length)
    res = []
    for start in range(n_windows):
        window = seq[start:start + kmer]
        if 'N' in window:
            continue
        res.append(encoder[window])
    return res

# Encode every sequence as a variable-length list of k-mer codes.
kmer_id_lists = [make_kmers(seq) for _, seq in X_tmp]

# Store the ragged lists in a 1-D object array. Calling np.array(...) directly
# on lists of unequal length raises ValueError on recent NumPy releases, which
# no longer infer dtype=object for ragged input. Filling element by element
# always gives one list per entry, even if all lists have the same length.
X = np.empty(len(kmer_id_lists), dtype=object)
for row, ids in enumerate(kmer_id_lists):
    X[row] = ids

# Hold out 33% of the samples; the fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [47]:
# Re-index the embedding table by integer code, so that row i holds the vector
# of the k-mer encoded as i, then drop the now-redundant `code` column.
# The guard makes the cell safe to re-run: once `code` has been dropped, the
# unguarded statements raise because the column no longer exists.
if 'code' in embedding_matrix.columns:
    embedding_matrix.index = embedding_matrix.code
    embedding_matrix.drop('code', axis=1, inplace=True)

In [70]:
from keras.layers import Embedding

# Padded sequence length used by the pad_sequences calls below, and the width
# of each embedding vector.
MAX_SEQUENCE_LENGTH = 7000
EMBEDDING_DIM = 100

# Frozen embedding layer initialised from the pretrained table: one row per
# integer code, EMBEDDING_DIM columns. `input_length` is not passed
# (input_length=MAX_SEQUENCE_LENGTH was left commented out), which is why the
# recorded model summaries show a sequence dimension of None.
embedding_layer = Embedding(
    input_dim=len(embedding_matrix),
    output_dim=EMBEDDING_DIM,
    weights=[np.array(embedding_matrix)],
    trainable=False,
)

In [60]:
# Pad / truncate every encoded sequence to exactly MAX_SEQUENCE_LENGTH entries
# (the recorded shapes are (9380, 7000) and (4620, 7000)).
# NOTE: this rebinds X_train / X_test; later cells rely on the padded arrays.
X_train = sequence.pad_sequences(X_train, maxlen=MAX_SEQUENCE_LENGTH)
X_test = sequence.pad_sequences(X_test, maxlen=MAX_SEQUENCE_LENGTH)

# create the model: frozen pretrained embeddings -> LSTM -> sigmoid output
model = Sequential()
model.add(embedding_layer)
model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# summary() prints the table itself and returns None, so wrapping it in
# print() only added a stray "None" line to the output.
model.summary()
# NOTE(review): the recorded run was interrupted during epoch 1 (ETA ~1344 s
# after 128 samples), so no accuracy was recorded for this model.
model.fit(X_train, y_train, epochs=3, batch_size=64)
# Final evaluation of the model
scores = model.evaluate(X_test, y_test, verbose=0)
print("Accuracy: %.2f%%" % (scores[1]*100))

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, None, 100)         6553700   
_________________________________________________________________
lstm_3 (LSTM)                (None, 100)               80400     
_________________________________________________________________
dense_5 (Dense)              (None, 1)                 101       
Total params: 6,634,201
Trainable params: 80,501
Non-trainable params: 6,553,700
_________________________________________________________________
None
Epoch 1/3
 128/9380 [..............................] - ETA: 1344s - loss: 0.7146 - acc: 0.4297

KeyboardInterrupt: 

In [69]:
from keras.layers.convolutional import Conv1D
from keras.layers.pooling import GlobalAveragePooling1D

In [61]:
# y_train_dumm = np.array(pd.get_dummies(y_train))
# y_test_dumm = np.array(pd.get_dummies(y_test))

# Shapes of the padded inputs produced by the pad_sequences cell above.
print("train: ", X_train.shape)
print("test: ", X_test.shape)

# Frozen pretrained embeddings -> Conv1D -> global average pooling -> sigmoid.
model = Sequential()
model.add(embedding_layer)
model.add(Conv1D(filters=300, kernel_size=5, padding='same', activation='relu'))
# model.add(BatchNormalization())
# model.add(MaxPooling1D(pool_size=2))
model.add(GlobalAveragePooling1D())
# model.add(LSTM(128)) #,dropout=0.2, recurrent_dropout=0.2))

model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# summary() prints the table itself and returns None, so wrapping it in
# print() only added a stray "None" line to the output.
model.summary()
# NOTE(review): the test split doubles as validation data here, so the final
# evaluate() below reports the same quantity as the last epoch's val accuracy.
model.fit(X_train, y_train, epochs=10, batch_size=80, validation_data=(X_test, y_test)) #class_weight=Counter(y_train))

scores = model.evaluate(X_test, y_test, verbose=0)
print("Accuracy: %.2f%%" % (scores[1]*100))

train:  (9380, 7000)
test:  (4620, 7000)
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, None, 100)         6553700   
_________________________________________________________________
conv1d_4 (Conv1D)            (None, None, 300)         150300    
_________________________________________________________________
global_average_pooling1d_3 ( (None, 300)               0         
_________________________________________________________________
dense_6 (Dense)              (None, 1)                 301       
Total params: 6,704,301
Trainable params: 150,601
Non-trainable params: 6,553,700
_________________________________________________________________
None
Train on 9380 samples, validate on 4620 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Accuracy: 77.36%


SyntaxError: 'return' outside function (<ipython-input-61-46d62ca5da5b>, line 22)

In [65]:
import h5py

In [72]:
# Save the weights of the current `model` to an HDF5 file in the working directory.
# NOTE(review): the recorded run failed with "ImportError: `save_weights`
# requires h5py" even though `import h5py` was executed in the cell above.
model.save_weights('conv_model_weights.h5')

ImportError: `save_weights` requires h5py.

In [None]:
# IMDB sentiment baseline (unrelated to the DNA data above).
# NOTE(review): this cell rebinds X_train, y_train, X_test, y_test and model,
# replacing the DNA data and model defined by the earlier cells.

# Vocabulary size: keep only the `top_words` most frequent words. This
# assignment was commented out although the name is used twice below, which
# made the cell fail with NameError on a fresh kernel.
top_words = 60000
max_review_length = 5000

(X_train, y_train), (X_test, y_test) = imdb.load_data(num_words=top_words)
# truncate and pad input sequences
X_train = sequence.pad_sequences(X_train, maxlen=max_review_length)
X_test = sequence.pad_sequences(X_test, maxlen=max_review_length)
# create the model
embedding_vecor_length = 100
model = Sequential()
model.add(Embedding(top_words, embedding_vecor_length, input_length=max_review_length))
model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# summary() prints the table itself and returns None; print() around it only
# adds a stray "None" line.
model.summary()
model.fit(X_train, y_train, epochs=3, batch_size=64)
# Final evaluation of the model
scores = model.evaluate(X_test, y_test, verbose=0)
print("Accuracy: %.2f%%" % (scores[1]*100))