In [1]:
## Settings:
# Configuration values shared by the tokenizer, the embedding matrices and the model.
max_features = 75825 #90000 # vocabulary cap given to Tokenizer(num_words=...); also the upper bound on embedding-matrix rows
maxlen = 50 # every title is padded/truncated to this many tokens by pad_sequences


In [2]:
import os
import time
import tensorflow as tf
import numpy as np # linear algebra
import random
import os 
# Seed the random number generators used in this notebook.
# NOTE(review): PYTHONHASHSEED is read when the interpreter starts, so assigning
# it here presumably only affects child processes, not this kernel - confirm.
os.environ['PYTHONHASHSEED'] = '11'
np.random.seed(22)  # NumPy global RNG (drives the random embedding initialisation below)
random.seed(33)  # Python's built-in `random` module
tf.set_random_seed(44)  # TensorFlow graph-level seed

import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import math
from sklearn.model_selection import train_test_split
from sklearn import metrics
import gc

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, Embedding, CuDNNGRU
from keras.layers import Bidirectional, GlobalMaxPooling1D
from keras.models import Model
from keras.optimizers import Adam
from keras.initializers import glorot_uniform
from keras.callbacks import Callback
from keras.models import clone_model
import keras.backend as K

Using TensorFlow backend.


In [3]:
# Wall-clock reference point; later cells print the time elapsed since this moment.
t0 = time.time()

In [4]:
# Read the competition tables and report their dimensions as a quick sanity check.
train_df = pd.read_csv("../input/ndsc-beginner/train.csv")
test_df = pd.read_csv("../input/ndsc-beginner/test.csv")
for _label, _frame in (("Train shape : ", train_df), ("Test shape : ", test_df)):
    print(_label, _frame.shape)

Train shape :  (666615, 4)
Test shape :  (172402, 3)


In [5]:
## No validation split is made here: the full training table is used for fitting.

## Raw titles, with missing entries replaced by a placeholder token
train_titles = train_df["title"].fillna("_na_").values
test_titles = test_df["title"].fillna("_na_").values

## Build the vocabulary from the training titles only
tokenizer = Tokenizer(num_words=max_features,
                     filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n\'’“”')
tokenizer.fit_on_texts(list(train_titles))

## Encode each title as word ids, then pad / truncate to `maxlen`
## (over-long titles lose their leading tokens because truncating='pre')
trunc = 'pre'
train_X, test_X = (
    pad_sequences(tokenizer.texts_to_sequences(titles),
                  maxlen=maxlen, truncating=trunc)
    for titles in (train_titles, test_titles)
)

## Target labels, aligned row-for-row with train_X
train_y = train_df['Category'].values

**GloVe Embeddings:**

In [6]:
EMBEDDING_FILE = '../input/popular-embedding/embeddings/glove.840B.300d/glove.840B.300d.txt'

def get_coefs(word, *arr):
    """Convert one split embedding line into a (word, float32 vector) pair."""
    return word, np.asarray(arr, dtype='float32')

# Read through a context manager so the file handle is closed deterministically;
# the encoding is spelled out so parsing does not depend on the platform locale.
with open(EMBEDDING_FILE, encoding='utf8') as emb_file:
    embeddings_index = dict(get_coefs(*o.split(" ")) for o in emb_file)

# np.stack expects a real sequence; recent NumPy rejects a bare dict view.
all_embs = np.stack(list(embeddings_index.values()))
emb_mean, emb_std = all_embs.mean(), all_embs.std()
embed_size = all_embs.shape[1]
del all_embs  # only needed for the statistics above; release the memory early

word_index = tokenizer.word_index
# Tokenizer ids start at 1 (id 0 is reserved), so a vocabulary of V words needs
# V + 1 rows. Capped at max_features, which is unchanged whenever the vocabulary
# is at least that large.
nb_words = min(max_features, len(word_index) + 1)
# Words missing from the pretrained file keep this random draw, which matches the
# mean / std of the pretrained vectors.
embedding_matrix_1 = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))
for word, i in word_index.items():
    # Guard against the number of rows actually allocated (not max_features) so
    # a small vocabulary can never index past the end of the matrix.
    if i >= nb_words:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix_1[i] = embedding_vector

del embeddings_index; gc.collect()

  """


0

**Wiki News FastText Embeddings:**

In [7]:
EMBEDDING_FILE = '../input/popular-embedding/embeddings/wiki-news-300d-1M/wiki-news-300d-1M.vec'

def get_coefs(word, *arr):
    """Convert one split embedding line into a (word, float32 vector) pair."""
    return word, np.asarray(arr, dtype='float32')

# Read through a context manager so the file handle is closed deterministically.
# Lines of 100 characters or fewer are skipped, which drops the short header
# line of the .vec format.
with open(EMBEDDING_FILE, encoding='utf8') as emb_file:
    embeddings_index = dict(get_coefs(*o.split(" ")) for o in emb_file if len(o) > 100)

# np.stack expects a real sequence; recent NumPy rejects a bare dict view.
all_embs = np.stack(list(embeddings_index.values()))
emb_mean, emb_std = all_embs.mean(), all_embs.std()
embed_size = all_embs.shape[1]
del all_embs  # only needed for the statistics above; release the memory early

word_index = tokenizer.word_index
# Tokenizer ids start at 1 (id 0 is reserved), so a vocabulary of V words needs
# V + 1 rows. Capped at max_features, which is unchanged whenever the vocabulary
# is at least that large.
nb_words = min(max_features, len(word_index) + 1)
# Words missing from the pretrained file keep this random draw, which matches the
# mean / std of the pretrained vectors.
embedding_matrix_2 = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))
for word, i in word_index.items():
    # Guard against the number of rows actually allocated (not max_features) so
    # a small vocabulary can never index past the end of the matrix.
    if i >= nb_words:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix_2[i] = embedding_vector

del embeddings_index; gc.collect()

  """


0

**Paragram Embeddings:**

In [8]:
EMBEDDING_FILE = '../input/popular-embedding/embeddings/paragram_300_sl999/paragram_300_sl999.txt'

def get_coefs(word, *arr):
    """Convert one split embedding line into a (word, float32 vector) pair."""
    return word, np.asarray(arr, dtype='float32')

# Read through a context manager so the file handle is closed deterministically.
# Undecodable bytes are dropped (errors='ignore') and lines of 100 characters or
# fewer are skipped, exactly as before.
with open(EMBEDDING_FILE, encoding="utf8", errors='ignore') as emb_file:
    embeddings_index = dict(get_coefs(*o.split(" ")) for o in emb_file if len(o) > 100)

# np.stack expects a real sequence; recent NumPy rejects a bare dict view.
all_embs = np.stack(list(embeddings_index.values()))
emb_mean, emb_std = all_embs.mean(), all_embs.std()
embed_size = all_embs.shape[1]
del all_embs  # only needed for the statistics above; release the memory early

word_index = tokenizer.word_index
# Tokenizer ids start at 1 (id 0 is reserved), so a vocabulary of V words needs
# V + 1 rows. Capped at max_features, which is unchanged whenever the vocabulary
# is at least that large.
nb_words = min(max_features, len(word_index) + 1)
# Words missing from the pretrained file keep this random draw, which matches the
# mean / std of the pretrained vectors.
embedding_matrix_3 = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))
for word, i in word_index.items():
    # Guard against the number of rows actually allocated (not max_features) so
    # a small vocabulary can never index past the end of the matrix.
    if i >= nb_words:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix_3[i] = embedding_vector

del embeddings_index; gc.collect()

  """


0

**Word2vec Embeddings:**

In [9]:
# https://www.kaggle.com/strideradu/word2vec-and-gensim-go-go-go
from gensim.models import KeyedVectors

EMBEDDING_FILE = '../input/popular-embedding/embeddings/GoogleNews-vectors-negative300/GoogleNews-vectors-negative300.bin'
embeddings_index = KeyedVectors.load_word2vec_format(EMBEDDING_FILE, binary=True)

# Take the dimensionality from the loaded vectors instead of relying on whatever
# `embed_size` the previously executed cell happened to leave behind.
embed_size = embeddings_index.vector_size

word_index = tokenizer.word_index
# Tokenizer ids start at 1 (id 0 is reserved), so a vocabulary of V words needs
# V + 1 rows. Capped at max_features, which is unchanged whenever the vocabulary
# is at least that large.
nb_words = min(max_features, len(word_index) + 1)
# Words missing from the pretrained vectors keep a uniform draw from [-0.1, 0.1).
embedding_matrix_4 = (np.random.rand(nb_words, embed_size) - 0.5) / 5.0
for word, i in word_index.items():
    # Guard against the number of rows actually allocated (not max_features) so
    # a small vocabulary can never index past the end of the matrix.
    if i >= nb_words:
        continue
    if word in embeddings_index:
        embedding_matrix_4[i] = embeddings_index.get_vector(word)

del embeddings_index; gc.collect()

0

**Combine:**

In [10]:
# Lay the four per-source matrices side by side (column-wise) so that each
# vocabulary row carries all four vectors back to back, in this order.
_parts = (embedding_matrix_1, embedding_matrix_2,
          embedding_matrix_3, embedding_matrix_4)
embedding_matrix = np.hstack(_parts)

# The individual matrices are no longer needed once merged.
del _parts, embedding_matrix_1, embedding_matrix_2, embedding_matrix_3, embedding_matrix_4
gc.collect()
embedding_matrix.shape

(75825, 1200)

In [11]:
# Report the wall-clock time elapsed since `t0` was recorded.
print(f'Done preprocessing {time.time() - t0:.1f}s')

Done preprocessing 431.2s


In [12]:
class ExponentialMovingAverage(Callback):
    """Keras callback that tracks an exponential moving average (EMA) of a model's trainable weights.

    A structural clone of the model (`ema_model`) is created up front. At the
    end of every epoch the current averages are written into the clone's
    trainable weights, so `ema_model` can be used for prediction after
    `fit` returns.
    """

    def __init__(self, model, decay=0.999, mode='epoch', n=100):
        """
        model: model to clone; its current weights seed both the clone and the averages.
        decay: smoothing factor; each update moves an average by (1 - decay)
               of its distance to the live weight.
        mode : 'epoch': Do update_weights every epoch.
               'batch':                   every n batches.
        n    : batch interval between updates (only read when mode == 'batch').
        """
        super(ExponentialMovingAverage, self).__init__()
        self.decay = decay
        self.mode = mode
        self.n = n
        # Always defined, so the attribute exists whatever the mode is.
        self.cnt = 0
        self.ema_model = clone_model(model)
        self.ema_model.set_weights(model.get_weights())
        self.ema_weights = [K.get_value(w) for w in model.trainable_weights]
        self.n_weights = len(self.ema_weights)

    def on_batch_end(self, batch, logs=None):
        """In 'batch' mode, refresh the averages every `n`-th batch."""
        # Compare by value: `is` tests object identity and only happens to work
        # for interned string literals.
        if self.mode == 'batch':
            self.cnt += 1
            if self.cnt % self.n == 0:
                self.update_weights()

    def on_epoch_end(self, epoch, logs=None):
        """In 'epoch' mode refresh the averages; in every mode push them into `ema_model`."""
        if self.mode == 'epoch':
            self.update_weights()
        for var, w in zip(self.ema_model.trainable_weights, self.ema_weights):
            K.set_value(var, w)

    def update_weights(self):
        """Move every stored average a fraction (1 - decay) towards the live weight."""
        live_weights = self.model.trainable_weights
        for idx, (w_old, var_new) in enumerate(zip(self.ema_weights, live_weights)):
            # Write the result back explicitly instead of relying on in-place
            # `+=`, which would silently do nothing for immutable scalar values.
            self.ema_weights[idx] = w_old + (1 - self.decay) * (K.get_value(var_new) - w_old)

**GRU:**

In [13]:
def create_rnn_model(rnn, maxlen, embedding, max_features, embed_size,
                     rnn_dim=64, dense1_dim=100, dense2_dim=50,
                     embed_trainable=False, seed=123, n_classes=58):
    """Build and compile a bidirectional-RNN text classifier.

    Layer order: Embedding -> Dense(relu), applied to the embedded sequence
    -> Bidirectional(rnn) returning the full sequence -> GlobalMaxPooling1D
    -> Dense(relu) -> Dense(softmax).

    Parameters
    ----------
    rnn : recurrent layer class (e.g. CuDNNGRU); called as
        ``rnn(rnn_dim, return_sequences=True, kernel_initializer=...)``.
    maxlen : length of the integer input sequences.
    embedding : initial weights handed to the Embedding layer; presumably has
        to be shaped (max_features, embed_size) - verify against the caller.
    max_features : input dimension (vocabulary size) of the Embedding layer.
    embed_size : output dimension of the Embedding layer.
    rnn_dim : units per direction of the recurrent layer.
    dense1_dim : units of the Dense layer placed before the recurrent layer.
    dense2_dim : units of the Dense layer placed after pooling.
    embed_trainable : whether the embedding weights are updated during training.
    seed : seed given to every glorot_uniform kernel initializer.
    n_classes : width of the softmax output layer (default 58, the previously
        hard-coded value).

    Returns
    -------
    A Model compiled with sparse categorical cross-entropy, the 'adam'
    optimizer and the accuracy metric.
    """
    inp = Input(shape=(maxlen,))
    x = Embedding(max_features, embed_size, weights=[embedding],
                  trainable=embed_trainable)(inp)
    x = Dense(dense1_dim, activation='relu',
              kernel_initializer=glorot_uniform(seed=seed))(x)
    x = Bidirectional(rnn(rnn_dim, return_sequences=True,
                          kernel_initializer=glorot_uniform(seed=seed)))(x)
    x = GlobalMaxPooling1D()(x)
    x = Dense(dense2_dim, activation='relu',
              kernel_initializer=glorot_uniform(seed=seed))(x)
    x = Dense(n_classes, activation='softmax',
              kernel_initializer=glorot_uniform(seed=seed))(x)
    model = Model(inputs=inp, outputs=x)
    model.compile(loss='sparse_categorical_crossentropy', optimizer='adam',metrics=['accuracy'])
    return model

In [14]:

# Column ranges of the four 300-d embeddings inside `embedding_matrix`
# (the order follows the concatenation: matrix_1, matrix_2, matrix_3, matrix_4).
embed_ids = [list(range(300)), list(range(300, 600)),
             list(range(600, 900)), list(range(900, 1200))]
# Every way of picking k of the four embeddings, keyed by k.
embed_ids_dict = {1: [embed_ids[0], embed_ids[1], embed_ids[2], embed_ids[3]],
                  2: [embed_ids[0] + embed_ids[1],
                      embed_ids[0] + embed_ids[2],
                      embed_ids[0] + embed_ids[3],
                      embed_ids[1] + embed_ids[2],
                      embed_ids[1] + embed_ids[3],
                      embed_ids[2] + embed_ids[3]],
                  3: [embed_ids[0] + embed_ids[1] + embed_ids[2],
                      embed_ids[0] + embed_ids[1] + embed_ids[3],
                      embed_ids[0] + embed_ids[2] + embed_ids[3],
                      embed_ids[1] + embed_ids[2] + embed_ids[3]],
                  4: [embed_ids[0] + embed_ids[1] + embed_ids[2] + embed_ids[3]]}
embed_ids_lst = embed_ids_dict[2]
# Derived from the chosen subsets so it cannot drift out of sync with them
# (600 for pairs of 300-d embeddings).
embed_size = len(embed_ids_lst[0])

rnn = CuDNNGRU
embed_trainable = False

n_models = 6
epochs = 7
batch_size = 512
dense1_dim = rnn_dim = 128
dense2_dim = 2 * rnn_dim
n_classes = 58  # must equal the width of the model's softmax output

# Refresh the moving average roughly ten times per epoch. Clamped to 1 because a
# dataset shorter than ten batches would otherwise give 0 and make the
# callback's `cnt % n` divide by zero.
ema_n = max(1, int(len(train_y) / batch_size / 10))
decay = 0.9
scores = []

# Running sums of the per-model class probabilities; they are divided by
# n_models in the next cell.
# NOTE: despite its name, `oof_pred` holds predictions on the very rows each
# model was trained on - they are in-sample, not out-of-fold.
oof_pred = np.zeros((len(train_X), n_classes))
pred_test_avg = np.zeros((test_df.shape[0], n_classes))
for i in range(n_models):
    t1 = time.time()
    seed = 101 + 11 * i
    # Cycle through the embedding subsets so each model sees a different one.
    cols_in_use = embed_ids_lst[i % len(embed_ids_lst)]
    model = create_rnn_model(rnn, maxlen, embedding_matrix[:, cols_in_use],
                             max_features, len(cols_in_use),
                             rnn_dim=rnn_dim,
                             dense1_dim=dense1_dim,
                             dense2_dim=dense2_dim,
                             embed_trainable=embed_trainable,
                             seed=seed)
    ema = ExponentialMovingAverage(model, decay=decay, mode='batch', n=ema_n)
    model.fit(train_X, train_y, batch_size=batch_size, epochs=epochs,
              callbacks=[ema], verbose=1)
    # Predict with the weight-averaged clone rather than the raw trained model.
    m = ema.ema_model
    t_per_epoch = (time.time() - t1) / epochs
    train_pred = m.predict([train_X])
    oof_pred += train_pred
    pred_test = m.predict([test_X])
    pred_test_avg += pred_test
    print(f'  n_model:{i + 1} epoch:{epochs} ' +
          f'Time:{time.time() - t1:.1f}s  {t_per_epoch:.1f}s/epoch')


Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7
  n_model:1 epoch:7 Time:240.7s  25.7s/epoch
Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7
  n_model:2 epoch:7 Time:235.5s  24.7s/epoch
Epoch 1/7

In [15]:
# Turn the accumulated probability sums into ensemble means (in place) and
# persist them. np.save appends ".npy" to names that lack it, so the files on
# disk end in ".np.npy".
# NOTE: the division mutates the arrays, so re-running this cell without
# re-running training divides them a second time.
for _acc, _path, _msg in ((pred_test_avg, 'pred_EMA.np', 'finish saving numpy array'),
                          (oof_pred, 'oof_EMA.np', 'finished saving oof file')):
    _acc /= n_models
    np.save(_path, _acc)
    print(_msg)

# Predicted class = index of the largest averaged probability in each test row.
y_te = [np.argmax(row) for row in pred_test_avg]

# Submission file: one row per test title with its predicted category.
out_df = pd.DataFrame({"title": test_df["title"].values, "Category": y_te})
out_df.to_csv("submission.csv", index=False)

print(f'Done:{time.time() - t0:.1f}s')

finish saving numpy array
finished saving oof file
Done:1874.1s
