In [1]:
import numpy as np
import pandas as pd

In [2]:
import json

In [3]:
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense

Using TensorFlow backend.


In [5]:
def load_embedding(filename='embedding.csv'):
    """Read the embedding table from a CSV file that has no header row.

    Args:
        filename: Path of the CSV file to read.

    Returns:
        pandas.DataFrame holding the file contents, with pandas' default
        integer column labels (every line of the file is treated as data).
    """
    embedding_table = pd.read_csv(filename, header=None)
    return embedding_table

In [6]:
# Load the embedding table from the default file, 'embedding.csv'.
emb = load_embedding()

In [7]:
# Preview the first rows of the embedding table.
emb.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,40,41,42,43,44,45,46,47,48,49
0,0.012651,0.010733,0.359927,-1.030055,-0.063139,0.371061,0.366795,0.625632,0.149722,-0.107267,...,-0.149429,0.9871,-0.474118,0.594343,0.715478,-0.531737,-0.729918,-0.246298,1.12134,0.320908
1,0.006646,-0.288331,1.110851,-0.306317,-0.45782,-0.503268,0.76574,-0.141126,0.720336,-0.328514,...,1.649552,1.032799,-0.377423,-0.061797,0.644555,0.171085,1.035091,-0.391946,-0.546641,0.105512
2,-0.58402,0.39031,0.65282,-0.3403,0.19493,-0.83489,0.11929,-0.57291,-0.56844,0.72989,...,0.28586,-0.052162,-0.50818,-0.63459,0.33889,0.28416,-0.2034,-1.2338,0.46715,0.78858
3,0.25769,0.45629,-0.76974,-0.37679,0.59272,-0.063527,0.20545,-0.57385,-0.29009,-0.13662,...,0.030498,-0.39543,-0.38515,-1.0002,0.087599,-0.31009,-0.34677,-0.31438,0.75004,0.97065
4,1.672343,-0.660894,0.753561,0.97293,-0.506347,-0.444259,-0.529368,-1.117493,-0.240684,1.023932,...,-0.26553,1.153708,1.019994,0.035231,-0.603387,0.496305,0.420478,-0.401456,0.192662,-0.569737


In [8]:
# Embedding dimensionality: the number of columns in the embedding table.
emb_dim = emb.shape[1]

In [10]:
def load_dict(filename):
    """Load a JSON document from disk and return the decoded object.

    Args:
        filename: Path of the JSON file to read.

    Returns:
        The decoded JSON value (a dict when the file holds a JSON object).
        Keys of a JSON object always come back as ``str``.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # Read as UTF-8 explicitly: the platform default encoding is
    # locale-dependent and can mis-decode non-ASCII words in the vocabulary.
    with open(filename, 'r', encoding='utf-8') as f:
        return json.load(f)

def load_index_word_map(word2ind_filename='word2ind', ind2word_filename='ind2word'):
    """Load the two vocabulary lookup tables from their JSON files.

    Args:
        word2ind_filename: Path of the JSON file read first.
        ind2word_filename: Path of the JSON file read second.

    Returns:
        Tuple ``(word2ind, ind2word)`` with the decoded contents of the two
        files, in that order.
    """
    # NOTE(review): presumably word->index and index->word mappings; since
    # both come from JSON, any object keys in ind2word are strings, not ints.
    sources = (word2ind_filename, ind2word_filename)
    return tuple(load_dict(path) for path in sources)

In [11]:
# Load both lookup tables from their default files ('word2ind' and 'ind2word').
word2ind, ind2word = load_index_word_map()

In [12]:
# Vocabulary size is the number of entries in word2ind.
vocab_size = len(word2ind)

In [3]:
def load_training_samples(filename='train.csv'):
    """Read the sample table from a CSV file that has no header row.

    Args:
        filename: Path of the CSV file to read.

    Returns:
        pandas.DataFrame holding the file contents, with pandas' default
        integer column labels (every line of the file is treated as data).
    """
    samples = pd.read_csv(filename, header=None)
    return samples

In [4]:
# Load the samples from the default file, 'train.csv'.
dataset = load_training_samples()

In [5]:
def train_valid_split(x, y, test_size=0.3, random_state=43):
    """Randomly split paired feature/label arrays into train and validation parts.

    Args:
        x: 2-D array of features; rows are selected with ``x[rows, :]``.
        y: 2-D array of labels with the same number of rows as ``x``.
        test_size: Fraction of rows assigned to the validation part; exactly
            ``int(len(x) * test_size)`` rows are held out.
        random_state: Seed of the private random generator, so the same seed
            always yields the same split.

    Returns:
        Tuple ``(train_x, val_x, train_y, val_y)``. Training rows keep their
        original relative order; validation rows are in drawn order.

    Raises:
        AssertionError: If ``x`` and ``y`` differ in length.
        ValueError: If ``test_size`` selects fewer than zero or more than
            ``len(x)`` rows.
    """
    assert len(x) == len(y), 'Feature and label must have same length.'
    length = len(x)
    n_val = int(length * test_size)
    if not 0 <= n_val <= length:
        raise ValueError(
            'test_size must select between 0 and len(x) rows, got %r' % (test_size,))

    # Draw from a dedicated, seeded generator: this is what makes the split
    # reproducible, and it leaves NumPy's global random state untouched.
    rng = np.random.RandomState(seed=random_state)
    val_choices = rng.permutation(length)[:n_val]

    # Everything not held out is training data, in ascending row order.
    is_train = np.ones(length, dtype=bool)
    is_train[val_choices] = False
    train_choices = np.flatnonzero(is_train)

    return x[train_choices, :], x[val_choices, :], y[train_choices, :], y[val_choices, :]

In [7]:
# Features are every column except the last; the label is the last column,
# sliced with -1: so it stays 2-D, as train_valid_split's [rows, :] indexing needs.
# The trailing underscore marks the raw labels, which are expanded to
# vocab_size-wide rows by make_hotted_labels further down.
train_x, val_x, train_y_, val_y_ = train_valid_split(dataset.values[:, :-1], dataset.values[:, -1:], test_size=0.3, random_state=43)

In [8]:
# Check the shape of the training features.
train_x.shape

(205429, 10)

In [9]:
# Check the shape of the raw training labels.
train_y_.shape

(205429, 1)

In [15]:
def make_hotted_labels(dataset, vocab_size, dtype=np.float64):
    """Turn integer label columns into a dense multi-hot indicator matrix.

    Row ``i`` of the result has a 1 at every position named by any label in
    row ``i`` of ``dataset`` and 0 elsewhere; with a single label column this
    is ordinary one-hot encoding.

    Args:
        dataset: 2-D integer array of shape (n_samples, n_label_columns).
        vocab_size: Width of each output row (number of distinct labels).
        dtype: Element type of the result. Defaults to float64; a narrower
            type such as ``np.float32`` or ``np.uint8`` cuts the memory needed
            for the dense (n_samples, vocab_size) matrix.

    Returns:
        numpy.ndarray of shape (n_samples, vocab_size) and the given dtype.

    Raises:
        IndexError: If a label is negative or not smaller than ``vocab_size``.
    """
    labels = np.asarray(dataset)
    n_samples = labels.shape[0]
    # Negative indices would silently wrap around to the end of the
    # vocabulary, so reject them instead of encoding the wrong class.
    if labels.size and labels.min() < 0:
        raise IndexError('labels must be non-negative, got %r' % (labels.min(),))

    hotted_labels = np.zeros((n_samples, vocab_size), dtype=dtype)
    # A column vector of row numbers broadcasts against the label matrix, so
    # every label column is written in a single indexing operation.
    row_ids = np.arange(n_samples)[:, np.newaxis]
    hotted_labels[row_ids, labels] = 1
    return hotted_labels

In [16]:
# Expand the raw training labels into one indicator row of length vocab_size per sample.
train_y = make_hotted_labels(train_y_, vocab_size)

In [17]:
# Expand the raw validation labels the same way.
val_y = make_hotted_labels(val_y_, vocab_size)

In [19]:
# Check the shape of the encoded training labels (samples x vocab_size).
train_y.shape

(205429, 8762)

In [20]:
# Check the shape of the encoded validation labels (samples x vocab_size).
val_y.shape

(88041, 8762)

In [35]:
# Check the embedding table shape before it is passed to the Embedding layer as weights.
emb.shape

(8762, 50)

In [36]:
# Start with an empty Sequential model; the layers are added in a later cell.
model = Sequential()

In [37]:
# Number of token ids per input sample; matches the 10 feature columns of train_x.
input_length = 10

In [38]:
# Embedding layer initialised from the loaded embedding table. The model
# summary reports 0 non-trainable params, so these weights are updated in training.
# NOTE(review): emb is a pandas DataFrame here, not an ndarray — confirm Keras
# accepts it as-is or pass emb.values.
model.add(Embedding(input_dim=vocab_size, output_dim=emb_dim, input_length=input_length, weights=[emb]))
# First LSTM returns the full sequence so the second LSTM receives one vector per time step.
model.add(LSTM(1000, activation='relu', return_sequences=True))
# Second LSTM returns only its final output (a single 1000-wide vector per sample).
model.add(LSTM(1000, activation='relu'))
# Fully connected hidden layer.
model.add(Dense(1000, activation='relu'))
# Output layer: softmax over the whole vocabulary.
model.add(Dense(vocab_size, activation='softmax'))

In [39]:
# Print the layer-by-layer summary with output shapes and parameter counts.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_5 (Embedding)      (None, 10, 50)            438100    
_________________________________________________________________
lstm_5 (LSTM)                (None, 10, 1000)          4204000   
_________________________________________________________________
lstm_6 (LSTM)                (None, 1000)              8004000   
_________________________________________________________________
dense_5 (Dense)              (None, 1000)              1001000   
_________________________________________________________________
dense_6 (Dense)              (None, 8762)              8770762   
Total params: 22,417,862
Trainable params: 22,417,862
Non-trainable params: 0
_________________________________________________________________


In [41]:
# Configure training: Adam optimizer, categorical cross-entropy loss, accuracy as the reported metric.
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

In [44]:
# Train for 2 epochs in batches of 100, passing the held-out split as validation data.
model.fit(train_x, train_y, validation_data=(val_x, val_y), epochs=2, batch_size=100)

Train on 205429 samples, validate on 88041 samples
Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7f62c0b5ffd0>

In [45]:
# Save the fitted model to 'model20171223.h5' (a relative path, so it lands in the working directory).
model.save('model20171223.h5')