In [1]:
import numpy as np
import pandas as pd

In [2]:
import json

In [3]:
from keras import backend as K
from keras.layers import Embedding, LSTM, Dense, Dropout, Flatten
from keras.models import Sequential

Using TensorFlow backend.


In [4]:
def load_embedding(filename='embedding.csv'):
    """Read the embedding table from a CSV file that has no header row.

    Parameters
    ----------
    filename : str
        Path of the CSV file to read.

    Returns
    -------
    pandas.DataFrame
        The parsed file; columns are labelled with integer positions
        because ``header=None`` is passed to the reader.
    """
    embedding_frame = pd.read_csv(filename, header=None)
    return embedding_frame

In [5]:
emb = load_embedding()

In [6]:
emb.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,40,41,42,43,44,45,46,47,48,49
0,0.111872,-0.971596,0.700518,-1.17877,-0.453448,0.39539,0.901024,-0.84667,-1.037827,-0.752525,...,-0.873244,-1.067232,0.899962,-0.466667,-0.15507,-0.447545,1.538786,-0.614972,-0.221534,0.282051
1,-0.58402,0.39031,0.65282,-0.3403,0.19493,-0.83489,0.11929,-0.57291,-0.56844,0.72989,...,0.28586,-0.052162,-0.50818,-0.63459,0.33889,0.28416,-0.2034,-1.2338,0.46715,0.78858
2,0.25769,0.45629,-0.76974,-0.37679,0.59272,-0.063527,0.20545,-0.57385,-0.29009,-0.13662,...,0.030498,-0.39543,-0.38515,-1.0002,0.087599,-0.31009,-0.34677,-0.31438,0.75004,0.97065
3,1.135078,-0.642963,-0.347493,0.481686,-1.096505,-0.720878,0.168508,0.523792,-0.43289,0.076722,...,-0.240541,0.165332,0.166037,0.029731,-1.362086,0.432089,1.354347,-0.531961,0.53464,-0.031217
4,0.173497,0.66558,0.145836,-0.040887,-0.13273,0.414063,0.577393,-0.525591,0.298661,0.284283,...,-0.755561,0.531924,-0.926138,-0.482626,-0.362259,1.091691,-0.036795,0.488126,0.574951,0.060907


In [7]:
emb_dim = emb.shape[1]

In [8]:
emb_dim

50

In [9]:
def load_dict(filename):
    """Load a JSON document from ``filename`` and return the parsed object.

    Parameters
    ----------
    filename : str
        Path of the JSON file to read.

    Returns
    -------
    object
        Whatever ``json.load`` produces for the file (a ``dict`` for a JSON
        object). Note that JSON object keys are always decoded as ``str``.

    Raises
    ------
    OSError
        If the file cannot be opened.
    ValueError
        If the file does not contain valid JSON.
    """
    # Decode explicitly as UTF-8 rather than the locale's default encoding,
    # so vocabularies with non-ASCII words load the same on every platform.
    with open(filename, 'r', encoding='utf-8') as f:
        return json.load(f)

def load_index_word_map(word2ind_filename='word2ind', ind2word_filename='ind2word'):
    """Load the two vocabulary mappings from their JSON files.

    Parameters
    ----------
    word2ind_filename : str
        Path of the JSON file holding the word-to-index mapping.
    ind2word_filename : str
        Path of the JSON file holding the index-to-word mapping.

    Returns
    -------
    tuple
        ``(word2ind, ind2word)``, each as returned by ``load_dict``.
    """
    # Files are read in the same order the results are returned.
    return tuple(load_dict(path) for path in (word2ind_filename, ind2word_filename))

In [10]:
word2ind, ind2word = load_index_word_map()

In [11]:
vocab_size = len(word2ind)

In [12]:
def load_training_samples(filename='train.csv'):
    """Read the training samples from a CSV file that has no header row.

    Parameters
    ----------
    filename : str
        Path of the CSV file to read.

    Returns
    -------
    pandas.DataFrame
        One row per line of the file, with integer column labels.
    """
    samples_frame = pd.read_csv(filename, header=None)
    return samples_frame

In [13]:
dataset = load_training_samples()

In [14]:
def train_valid_split(x, y, test_size=0.3, random_state=43):
    """Randomly split paired feature/label arrays into train and validation sets.

    Parameters
    ----------
    x : numpy.ndarray, 2-D
        Feature rows.
    y : numpy.ndarray, 2-D
        Label rows; must have the same number of rows as ``x``.
    test_size : float
        Fraction of rows placed in the validation set. The row count is
        ``int(len(x) * test_size)``, i.e. truncated towards zero.
    random_state : int
        Seed for the shuffle. The same seed always yields the same split.

    Returns
    -------
    tuple
        ``(train_x, val_x, train_y, val_y)``.

    Raises
    ------
    AssertionError
        If ``x`` and ``y`` have different lengths.
    """
    assert len(x) == len(y), 'Feature and label must have same length.'
    # Use a private, seeded generator. The previous code built a RandomState
    # and threw it away, then sampled from the unseeded global generator, so
    # `random_state` had no effect and the split changed on every run.
    rng = np.random.RandomState(seed=random_state)
    length = len(x)
    n_val = int(length * test_size)
    shuffled = rng.permutation(length)
    val_choices = shuffled[:n_val]
    # Keep training rows in their original order for a deterministic layout.
    train_choices = np.sort(shuffled[n_val:])
    return x[train_choices, :], x[val_choices, :], y[train_choices, :], y[val_choices, :]

In [15]:
# Every column except the last is used as features; the last column is the
# label and is sliced with `-1:` so it stays 2-D.
train_x, val_x, train_y_, val_y_ = train_valid_split(dataset.values[:, :-1], dataset.values[:, -1:], test_size=0.3, random_state=43)

In [16]:
train_x.shape

(389061, 10)

In [17]:
train_y_.shape

(389061, 1)

In [18]:
def make_hotted_labels(dataset, vocab_size, dtype=np.float64):
    """Turn integer class indices into a one-hot (or multi-hot) label matrix.

    Parameters
    ----------
    dataset : array-like of shape (n_samples, n_label_columns)
        Class indices. Each column contributes one "hot" position per row;
        with a single column the result is a plain one-hot encoding.
    vocab_size : int
        Number of classes, i.e. the width of the returned matrix.
    dtype : data-type, optional
        Element type of the returned matrix. Defaults to ``float64`` (the
        previous behaviour); a narrower type such as ``np.float32`` greatly
        reduces memory for large vocabularies.

    Returns
    -------
    numpy.ndarray of shape (n_samples, vocab_size)
        Zeros everywhere except a 1 at ``[i, dataset[i, j]]`` for every j.

    Raises
    ------
    ValueError
        If ``dataset`` is not 2-D or contains non-integral values.
    IndexError
        If any label lies outside ``[0, vocab_size)``.
    """
    labels = np.asarray(dataset)
    if labels.ndim != 2:
        raise ValueError('Labels must be a 2-D array of shape (n_samples, n_label_columns).')
    if not np.issubdtype(labels.dtype, np.integer):
        # pandas may hand back float/object columns; accept them only when
        # every value is a whole number, since floats cannot be used as indices.
        as_int = labels.astype(np.intp)
        if not np.array_equal(as_int, labels):
            raise ValueError('Labels must be integer-valued.')
        labels = as_int
    # Negative indices would silently wrap around to the end of the vocabulary.
    if labels.size and (labels.min() < 0 or labels.max() >= vocab_size):
        raise IndexError('Labels must lie in the range [0, vocab_size).')
    n_samples = labels.shape[0]
    hotted_labels = np.zeros((n_samples, vocab_size), dtype=dtype)
    # Broadcast the row index against every label column in one assignment.
    hotted_labels[np.arange(n_samples)[:, None], labels] = 1
    return hotted_labels

In [19]:
train_y = make_hotted_labels(train_y_, vocab_size)

In [20]:
val_y = make_hotted_labels(val_y_, vocab_size)

In [21]:
train_y.shape

(389061, 13063)

In [22]:
val_y.shape

(166740, 13063)

In [23]:
emb.shape

(13063, 50)

In [24]:
# Number of token indices per sample fed to the Embedding layer; this matches
# the 10 feature columns of train_x.
input_length = 10

In [25]:
# Next-word classifier: embedding (initialised from `emb`) -> two stacked
# 1000-unit LSTMs that both return the full sequence -> a Dense layer applied
# at every time step -> flatten -> softmax over the whole vocabulary.
# Dropout of 0.2 follows each of the first four trainable layers.
model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=emb_dim, input_length=input_length, weights=[emb]),
    Dropout(.2),
    LSTM(1000, activation='relu', return_sequences=True),
    Dropout(.2),
    LSTM(1000, activation='relu', return_sequences=True),
    Dropout(.2),
    Dense(1000, activation='relu'),
    Dropout(.2),
    Flatten(),
    Dense(vocab_size, activation='softmax'),
])

In [26]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 10, 50)            653150    
_________________________________________________________________
dropout_1 (Dropout)          (None, 10, 50)            0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 10, 1000)          4204000   
_________________________________________________________________
dropout_2 (Dropout)          (None, 10, 1000)          0         
_________________________________________________________________
lstm_2 (LSTM)                (None, 10, 1000)          8004000   
_________________________________________________________________
dropout_3 (Dropout)          (None, 10, 1000)          0         
_________________________________________________________________
dense_1 (Dense)              (None, 10, 1000)          1001000   
__________

In [27]:
# categorical_crossentropy pairs with the one-hot targets produced by
# make_hotted_labels (train_y / val_y).
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

In [28]:
model.fit(train_x, train_y, validation_data=(val_x, val_y), epochs=1, batch_size=200)

Train on 389061 samples, validate on 166740 samples
Epoch 1/1


<keras.callbacks.History at 0x7f2f407de860>

In [45]:
model.optimizer.lr = 0.01

In [46]:
model.fit(train_x, train_y, validation_data=(val_x, val_y), epochs=1, batch_size=200)

Train on 390105 samples, validate on 167187 samples
Epoch 1/1


<keras.callbacks.History at 0x7f335022d9b0>

In [47]:
model.optimizer.lr = 0.001

In [48]:
model.fit(train_x, train_y, validation_data=(val_x, val_y), epochs=2, batch_size=200)

Train on 390105 samples, validate on 167187 samples
Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7f335022dda0>

In [54]:
# Save only the layer weights; the architecture and optimizer state are not
# written to this file.
model.save_weights('model20171227.h5')

In [29]:
# Save the full model (architecture, weights and optimizer state) to one HDF5 file.
model.save('model20171227_2.h5')