In [1]:
import numpy as np
import pandas as pd

In [2]:
import json

In [3]:
from keras.models import Sequential, load_model
from keras.layers import Embedding, LSTM, Dense, Dropout, Flatten
# from keras.optimizers import Adam

Using TensorFlow backend.


In [4]:
def load_embedding(filename='embedding.csv'):
    """Read the embedding matrix from a header-less CSV file.

    Args:
        filename: Path of the CSV file; every line is parsed as a data row
            because no header row is expected.

    Returns:
        A pandas DataFrame whose columns are labelled 0..n-1.
    """
    embedding_table = pd.read_csv(filename, header=None)
    return embedding_table

In [5]:
# Read the embedding table from the default file ('embedding.csv') into a DataFrame.
emb = load_embedding()

In [6]:
emb.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,40,41,42,43,44,45,46,47,48,49
0,-0.363037,0.334529,-0.165212,-0.409537,-0.298907,-0.652681,0.433844,0.339051,-0.419195,0.162223,...,-0.150816,0.380193,0.763373,-0.686894,-0.139185,-0.273024,0.187815,0.011107,0.785204,-0.255765
1,-0.58402,0.39031,0.65282,-0.3403,0.19493,-0.83489,0.11929,-0.57291,-0.56844,0.72989,...,0.28586,-0.052162,-0.50818,-0.63459,0.33889,0.28416,-0.2034,-1.2338,0.46715,0.78858
2,0.25769,0.45629,-0.76974,-0.37679,0.59272,-0.063527,0.20545,-0.57385,-0.29009,-0.13662,...,0.030498,-0.39543,-0.38515,-1.0002,0.087599,-0.31009,-0.34677,-0.31438,0.75004,0.97065
3,-0.039369,1.2036,0.35401,-0.55999,-0.52078,-0.66988,-0.75417,-0.6534,-0.23246,0.58686,...,-0.60141,0.50403,-0.083316,0.20239,0.443,-0.060769,-0.42807,-0.084135,0.49164,0.085654
4,-0.2682,-0.4787,0.18099,-0.53837,-0.24021,-0.56203,0.20944,0.42358,-0.46147,0.76903,...,0.005399,-0.53344,-0.18706,0.52254,0.24361,0.051387,0.2721,-0.44433,0.019464,0.62782


In [7]:
# Number of columns in the embedding table; used below as the Embedding layer's output_dim.
emb_dim = emb.shape[1]

In [8]:
emb_dim

50

In [9]:
def load_dict(filename):
    """Load a JSON document from disk and return the decoded Python object.

    Args:
        filename: Path of the JSON file to read.

    Returns:
        Whatever ``json.load`` decodes from the file (a dict for a JSON object).
        Note that JSON object keys always come back as ``str``.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # JSON text is UTF-8; pin the encoding so non-ASCII words decode the same
    # way on every platform instead of depending on the locale default.
    with open(filename, 'r', encoding='utf-8') as f:
        return json.load(f)

def load_index_word_map(word2ind_filename='word2ind', ind2word_filename='ind2word'):
    """Load the word->index and index->word lookup tables.

    Args:
        word2ind_filename: Path of the JSON file holding the word->index map.
        ind2word_filename: Path of the JSON file holding the index->word map.

    Returns:
        A ``(word2ind, ind2word)`` tuple, each element being the object
        returned by ``load_dict`` for the corresponding file.
    """
    # Files are read in the same order as they appear in the returned tuple.
    return tuple(load_dict(path) for path in (word2ind_filename, ind2word_filename))

In [10]:
# Load both lookup tables from their default files ('word2ind' and 'ind2word').
word2ind, ind2word = load_index_word_map()

In [11]:
# Number of entries in word2ind; used below as the Embedding input_dim and as
# the width of the final softmax layer.
# NOTE(review): the weights passed to Embedding must have exactly vocab_size
# rows — confirm len(word2ind) equals emb.shape[0].
vocab_size = len(word2ind)

In [12]:
def load_training_samples(filename='train.csv'):
    """Read the training samples from a header-less CSV file.

    Args:
        filename: Path of the CSV file; every line is parsed as a data row
            because no header row is expected.

    Returns:
        A pandas DataFrame with one row per CSV line and columns labelled 0..n-1.
    """
    samples = pd.read_csv(filename, header=None)
    return samples

In [13]:
# Read the full sample table from the default file ('train.csv').
dataset = load_training_samples()

In [14]:
def train_valid_split(x, y, test_size=0.3, random_state=43):
    """Randomly split aligned feature and label arrays into train and validation parts.

    Args:
        x: NumPy array of features, indexed by sample along the first axis.
        y: NumPy array of labels with the same number of rows as ``x``.
        test_size: Fraction of samples to hold out; ``int(len(x) * test_size)``
            rows go to the validation part.
        random_state: Seed for a private random generator, so the same seed
            always produces the same split.

    Returns:
        Tuple ``(train_x, val_x, train_y, val_y)``. Training rows keep their
        original relative order; validation rows are in shuffled order.

    Raises:
        AssertionError: If ``x`` and ``y`` differ in length.
        ValueError: If ``test_size`` yields a hold-out count outside
            ``[0, len(x)]``.
    """
    assert len(x) == len(y), 'Feature and label must have same length.'
    length = len(x)
    n_val = int(length * test_size)
    if not 0 <= n_val <= length:
        raise ValueError(
            'test_size must select between 0 and len(x) samples, got %r' % (test_size,))

    # Draw from a dedicated, seeded generator. Merely constructing a
    # RandomState and discarding it leaves the global generator unseeded, which
    # made the split differ on every run regardless of random_state.
    rng = np.random.RandomState(seed=random_state)
    val_idx = rng.permutation(length)[:n_val]

    # Every row not picked for validation is training data; a boolean mask
    # avoids building Python sets over all row indices.
    train_mask = np.ones(length, dtype=bool)
    train_mask[val_idx] = False

    return x[train_mask], x[val_idx], y[train_mask], y[val_idx]

In [15]:
# Every column except the last is a feature; the last column (kept 2-D) is the label.
train_x, val_x, train_y, val_y = train_valid_split(
    x=dataset.values[:, :-1],
    y=dataset.values[:, -1:],
    test_size=0.3,
    random_state=43,
)

In [16]:
train_x.shape

(10972446, 20)

In [17]:
train_y.shape

(10972446, 1)

In [18]:
val_x.shape

(4702476, 20)

In [19]:
val_y.shape

(4702476, 1)

In [20]:
emb.shape

(81452, 50)

In [21]:
# Number of feature columns per sample; passed to the Embedding layer as input_length.
input_length = train_x.shape[1]

In [22]:
# Embedding initialised from `emb`, three stacked LSTM layers, one hidden dense
# layer and a softmax over vocab_size classes; Dropout(0.2) follows every layer
# except the output.
model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=emb_dim, input_length=input_length, weights=[emb]),
    Dropout(.2),
    # The first two LSTMs emit the full sequence so the next LSTM can consume it.
    LSTM(100, activation='relu', return_sequences=True),
    Dropout(.2),
    LSTM(100, activation='relu', return_sequences=True),
    Dropout(.2),
    # The last LSTM returns only its final output, so no Flatten layer is used.
    LSTM(100, activation='relu'),
    Dropout(.2),
    Dense(100, activation='relu'),
    Dropout(.2),
    Dense(vocab_size, activation='softmax'),
])

In [22]:
# Rebind `model` to a previously saved checkpoint; this discards the freshly
# built Sequential model from the cell above, so later fit calls continue from
# the saved weights.
model = load_model('model20180118_5.h5')

In [23]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 20, 50)            4072600   
_________________________________________________________________
dropout_1 (Dropout)          (None, 20, 50)            0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 20, 100)           60400     
_________________________________________________________________
dropout_2 (Dropout)          (None, 20, 100)           0         
_________________________________________________________________
lstm_2 (LSTM)                (None, 20, 100)           80400     
_________________________________________________________________
dropout_3 (Dropout)          (None, 20, 100)           0         
_________________________________________________________________
lstm_3 (LSTM)                (None, 100)               80400     
__________

In [24]:
# Labels are integer class ids (one column), hence the sparse cross-entropy loss.
# NOTE(review): `model` was obtained from load_model, which per the Keras docs
# already restores the saved compile configuration and optimizer state.
# Compiling again installs a fresh 'adam' optimizer — confirm that resetting the
# optimizer state is intended.
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

In [25]:
# One epoch over the training split in batches of 3000; loss/accuracy are also
# reported on the validation split.
model.fit(train_x, train_y, validation_data=(val_x, val_y), epochs=1, batch_size=3000)

Train on 10972446 samples, validate on 4702476 samples
Epoch 1/1


<keras.callbacks.History at 0x7f7b6c853198>

In [26]:
# Save the model after the first training pass to its own checkpoint file.
model.save('model20180119_1.h5')

In [27]:
# Second single-epoch pass over the same data, this time with batches of 4000.
model.fit(train_x, train_y, validation_data=(val_x, val_y), epochs=1, batch_size=4000)

Train on 10972446 samples, validate on 4702476 samples
Epoch 1/1


<keras.callbacks.History at 0x7f7b442bb2e8>

In [28]:
# Save the model after the second training pass to a separate checkpoint file.
model.save('model20180119_2.h5')

In [33]:
# Another single-epoch pass with batches of 3000.
# NOTE(review): the execution counter jumps from 28 to 33 at this cell, so it
# may have been run in a different order or session than shown — confirm the
# notebook reproduces under Restart & Run All.
model.fit(train_x, train_y, validation_data=(val_x, val_y), epochs=1, batch_size=3000)

Train on 10972446 samples, validate on 4702476 samples
Epoch 1/1


<keras.callbacks.History at 0x7fb78547c320>

In [34]:
# Save the model after the latest training pass.
# NOTE(review): this is the same path that load_model reads earlier in the
# notebook, so the starting checkpoint is overwritten, while the other saves
# use 'model20180119_N.h5' — confirm the file name is intended.
model.save('model20180118_5.h5')