In [1]:
import numpy as np
import pandas as pd

In [2]:
import json

In [3]:
from keras import backend as K
from keras.layers import Embedding, LSTM, Dense, Dropout, Flatten
from keras.models import Sequential
# from keras.optimizers import Adam

Using TensorFlow backend.


In [4]:
def load_embedding(filename='embedding.csv'):
    """Read the embedding table from a header-less CSV file.

    Parameters
    ----------
    filename : str or file-like, optional
        CSV source handed to ``pandas.read_csv``; defaults to
        ``'embedding.csv'``.

    Returns
    -------
    pandas.DataFrame
        The parsed rows. No line is consumed as a header, so the columns
        carry integer labels.
    """
    embedding_table = pd.read_csv(filename, header=None)
    return embedding_table

In [5]:
# Load the embedding table using the default path ('embedding.csv').
emb = load_embedding()

In [6]:
# Preview the first rows of the embedding table.
emb.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,40,41,42,43,44,45,46,47,48,49
0,-0.58527,0.250145,0.469886,0.247563,-0.016136,-0.2289,-0.45698,0.184238,-0.532028,0.214238,...,0.005357,0.182222,0.264911,0.145107,-0.02059,0.970097,-0.372763,-0.182743,0.757045,-0.681987
1,-0.58402,0.39031,0.65282,-0.3403,0.19493,-0.83489,0.11929,-0.57291,-0.56844,0.72989,...,0.28586,-0.052162,-0.50818,-0.63459,0.33889,0.28416,-0.2034,-1.2338,0.46715,0.78858
2,0.25769,0.45629,-0.76974,-0.37679,0.59272,-0.063527,0.20545,-0.57385,-0.29009,-0.13662,...,0.030498,-0.39543,-0.38515,-1.0002,0.087599,-0.31009,-0.34677,-0.31438,0.75004,0.97065
3,-0.039369,1.2036,0.35401,-0.55999,-0.52078,-0.66988,-0.75417,-0.6534,-0.23246,0.58686,...,-0.60141,0.50403,-0.083316,0.20239,0.443,-0.060769,-0.42807,-0.084135,0.49164,0.085654
4,-0.2682,-0.4787,0.18099,-0.53837,-0.24021,-0.56203,0.20944,0.42358,-0.46147,0.76903,...,0.005399,-0.53344,-0.18706,0.52254,0.24361,0.051387,0.2721,-0.44433,0.019464,0.62782


In [7]:
# Embedding width = number of columns in the embedding table
# (later passed to the Embedding layer as output_dim).
emb_dim = emb.shape[1]

In [8]:
# Display the embedding width (50 in the recorded run).
emb_dim

50

In [9]:
def load_dict(filename):
    """Load a JSON document from ``filename`` and return the decoded object.

    The file is opened explicitly as UTF-8 (the encoding used for JSON
    interchange) so non-ASCII entries decode the same way on every platform
    instead of depending on the locale's default encoding.

    Parameters
    ----------
    filename : str or path-like
        Path of the JSON file to read.

    Returns
    -------
    object
        Whatever ``json.load`` produces for the file's top-level value.
    """
    with open(filename, 'r', encoding='utf-8') as f:
        return json.load(f)

def load_index_word_map(word2ind_filename='word2ind', ind2word_filename='ind2word'):
    """Load the two vocabulary mappings from their JSON files.

    Parameters
    ----------
    word2ind_filename : str, optional
        Path of the JSON file read first; defaults to ``'word2ind'``.
    ind2word_filename : str, optional
        Path of the JSON file read second; defaults to ``'ind2word'``.

    Returns
    -------
    tuple
        ``(word2ind, ind2word)`` exactly as decoded by ``load_dict``.

    Notes
    -----
    JSON object keys are always strings, so if ``ind2word`` is keyed by
    index (presumably -- verify against the file), its keys come back as
    ``str`` rather than ``int``.
    """
    return load_dict(word2ind_filename), load_dict(ind2word_filename)

In [10]:
# Load both vocabulary mappings from their default files ('word2ind' and 'ind2word').
word2ind, ind2word = load_index_word_map()

In [11]:
# Vocabulary size = number of entries in word2ind; used below as the
# Embedding input_dim and as the width of the softmax output layer.
vocab_size = len(word2ind)

In [12]:
def load_training_samples(filename='train.csv'):
    """Read the training samples from a header-less CSV file.

    Parameters
    ----------
    filename : str or file-like, optional
        CSV source handed to ``pandas.read_csv``; defaults to ``'train.csv'``.

    Returns
    -------
    pandas.DataFrame
        One row per line of the file, with integer column labels because no
        line is treated as a header.
    """
    samples = pd.read_csv(filename, header=None)
    return samples

In [13]:
# Load the training table using the default path ('train.csv').
dataset = load_training_samples()

In [14]:
def train_valid_split(x, y, test_size=0.3, random_state=43):
    """Randomly split features and labels into training and validation parts.

    Parameters
    ----------
    x : numpy.ndarray
        Feature array; samples are indexed along the first axis.
    y : numpy.ndarray
        Label array with the same number of samples as ``x`` (1-D or 2-D).
    test_size : float, optional
        Fraction of samples placed in the validation part; the count is
        ``int(len(x) * test_size)``.
    random_state : int, optional
        Seed for the generator that picks the validation rows. The same seed
        always yields the same split.

    Returns
    -------
    tuple
        ``(train_x, val_x, train_y, val_y)``. Training rows are in ascending
        index order; validation rows are in the order they were drawn.

    Raises
    ------
    AssertionError
        If ``x`` and ``y`` differ in length.
    """
    assert len(x) == len(y), 'Feature and label must have same length.'
    # Draw from a dedicated, seeded generator. Constructing a RandomState and
    # discarding it leaves the global generator unseeded, which made the
    # split different on every call regardless of `random_state`.
    rng = np.random.RandomState(seed=random_state)
    length = len(x)
    val_choices = rng.choice(length, int(length * test_size), replace=False)
    # Every index not selected for validation, in ascending order.
    train_choices = np.setdiff1d(np.arange(length), val_choices)
    # Indexing only the first axis matches `[idx, :]` for 2-D arrays and also
    # accepts 1-D label vectors.
    return x[train_choices], x[val_choices], y[train_choices], y[val_choices]

In [15]:
# Every column except the last is a feature; the last column is kept as a
# 2-D (n, 1) label array. 30% of the rows are held out for validation.
train_x, val_x, train_y, val_y = train_valid_split(
    dataset.values[:, :-1],
    dataset.values[:, -1:],
    test_size=0.3,
    random_state=43,
)

In [16]:
# Sanity check on the training features: (rows, feature columns).
train_x.shape

(1946113, 10)

In [19]:
# Labels keep a trailing axis of length 1 because they were sliced with [:, -1:].
train_y.shape

(1946113, 1)

In [20]:
# Validation labels: the rows held out by the split (test_size=0.3).
val_y.shape

(834048, 1)

In [21]:
# (rows, columns) of the embedding table. It is passed as the Embedding
# layer's initial weights, so it should equal (vocab_size, emb_dim).
emb.shape

(32930, 50)

In [22]:
# Number of positions per input sample; matches the 10 feature columns
# reported by train_x.shape in the recorded run.
input_length = 10

In [23]:
# Classifier over the vocabulary: embedding lookup -> two stacked LSTMs ->
# dense layer -> softmax, with 20% dropout after every trainable stage.
model = Sequential()
# Start from the loaded embedding table. `weights` takes a list of NumPy
# arrays, so pass the DataFrame's underlying array rather than the DataFrame.
model.add(Embedding(input_dim=vocab_size, output_dim=emb_dim, input_length=input_length, weights=[emb.values]))
model.add(Dropout(.2))
# return_sequences=True keeps the per-step outputs so the second LSTM
# receives a sequence rather than a single vector.
model.add(LSTM(100, activation='relu', return_sequences=True))
model.add(Dropout(.2))
# The second LSTM returns only its final output.
model.add(LSTM(100, activation='relu'))
model.add(Dropout(.2))
model.add(Dense(100, activation='relu'))
model.add(Dropout(.2))
# One softmax unit per vocabulary entry.
model.add(Dense(vocab_size, activation='softmax'))

In [24]:
# Print the layer-by-layer output shapes and parameter counts.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 10, 50)            1646500   
_________________________________________________________________
dropout_1 (Dropout)          (None, 10, 50)            0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 10, 100)           60400     
_________________________________________________________________
dropout_2 (Dropout)          (None, 10, 100)           0         
_________________________________________________________________
lstm_2 (LSTM)                (None, 100)               80400     
_________________________________________________________________
dropout_3 (Dropout)          (None, 100)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 100)               10100     
__________

In [25]:
# The sparse variant of categorical cross-entropy takes integer class labels
# directly (presumably word indices here -- confirm against train.csv),
# so the labels need no one-hot encoding.
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [27]:
# First training pass: 10 epochs with large (10000-sample) batches,
# validating on the held-out split.
model.fit(
    x=train_x,
    y=train_y,
    batch_size=10000,
    epochs=10,
    validation_data=(val_x, val_y),
)

Train on 1946113 samples, validate on 834048 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f9f1ab104a8>

In [34]:
# Set the optimizer's learning rate to 0.01 for the next training pass.
# Update the existing backend variable in place: plain attribute assignment
# would only rebind `lr` to a Python float, which a training function that
# has already been built does not see.
K.set_value(model.optimizer.lr, 0.01)

In [35]:
# One further epoch with small (200-sample) batches.
# NOTE(review): the recorded output for this cell reports 389061 training
# samples, not the 1946113 of the first run, so it appears to come from a
# different session or dataset -- re-run from a fresh kernel to confirm.
model.fit(
    x=train_x,
    y=train_y,
    batch_size=200,
    epochs=1,
    validation_data=(val_x, val_y),
)

Train on 389061 samples, validate on 166740 samples
Epoch 1/1


<keras.callbacks.History at 0x7f536bd04dd8>

In [37]:
# Set the optimizer's learning rate to 0.001 for the next training pass.
# Update the existing backend variable in place: plain attribute assignment
# would only rebind `lr` to a Python float, which a training function that
# has already been built does not see.
K.set_value(model.optimizer.lr, 0.001)

In [38]:
# Two more epochs with small (200-sample) batches.
model.fit(
    x=train_x,
    y=train_y,
    batch_size=200,
    epochs=2,
    validation_data=(val_x, val_y),
)

Train on 389061 samples, validate on 166740 samples
Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7f536bcc14a8>

In [28]:
# Save the trained model to a date-stamped HDF5 file in the working directory.
model.save('model20171228.h5')