## A simple notebook showing how to retrain a model with an unfrozen embedding layer after first training it with a frozen embedding layer (or vice versa)

### NOTE: In this notebook I've chosen the order frozen -> unfrozen because it worked better for this extremely simple model

In [None]:
import gc
import re
import os
import pandas as pd
import numpy as np
from unidecode import unidecode
print(os.listdir("../input"))

In [None]:
from keras.models import Model, Sequential
from keras import layers
from keras.preprocessing import text, sequence
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras import backend as K
from keras.callbacks import Callback
from keras import optimizers

# F1 score metric (with 0.5 threshold)

In [None]:
def f1_score(true, pred):
    """F1 score over the given tensors, with predictions thresholded at 0.5.

    `true` holds the ground-truth labels (presumably 0/1) and `pred` the
    predicted scores. `K.epsilon()` is added to every count, numerator
    included, so none of the divisions below can have a zero denominator.
    """
    # Hard decisions: 1.0 where the prediction is strictly greater than 0.5.
    hard_pred = K.cast(K.greater(pred, 0.5), K.floatx())

    eps = K.epsilon()
    actual_positives = K.sum(true) + eps
    true_positives = K.sum(true * hard_pred) + eps
    flagged_positives = K.sum(hard_pred) + eps

    precision = true_positives / flagged_positives
    recall = true_positives / actual_positives

    return (2 * precision * recall) / (precision + recall)

In [None]:
# Load only the columns used below: the question text, plus the target label for train.
train_df = pd.read_csv('../input/train.csv', usecols=['question_text', 'target'])
test_df = pd.read_csv('../input/test.csv', usecols = ['question_text'])

# Minor cleaning of data

In [None]:
# Matches every character that is NOT a letter, one of . - ? ! , # @ % or a
# space -- so digits and all other symbols are what gets removed.
special_character_removal = re.compile(r'[^A-Za-z\.\-\?\!\,\#\@\% ]',re.IGNORECASE)

def clean_text(x):
    """Transliterate `x` with unidecode, then delete every character matched by
    `special_character_removal`."""
    return special_character_removal.sub('', unidecode(x))

In [None]:
# Coerce each entry to str first so clean_text always receives a string.
train_df['question_text'] = train_df['question_text'].apply(lambda x: clean_text(str(x)))
test_df['question_text'] = test_df['question_text'].apply(lambda x: clean_text(str(x)))

# Separate the question text and labels for train and test

In [None]:
train_sentences = train_df['question_text']
train_labels = train_df['target']
test_sentences = test_df['question_text']

In [None]:
gc.collect()

# Tokenize and process the questions 

In [None]:
max_features = 20000  # vocabulary cap, passed to the Tokenizer as num_words
maxlen = 100  # length passed to pad_sequences for every tokenized question

In [None]:
tokenizer = text.Tokenizer(num_words=max_features)

In [None]:
# The vocabulary is built from the train AND test questions together.
tokenizer.fit_on_texts(list(train_sentences) + list(test_sentences))

In [None]:
# Convert each training question to a list of word indices, then bring all of them to maxlen.
tokenized_train = tokenizer.texts_to_sequences(train_sentences)
X_train = sequence.pad_sequences(tokenized_train, maxlen=maxlen)

In [None]:
# Same index conversion and padding for the test questions.
tokenized_test = tokenizer.texts_to_sequences(test_sentences)
X_test = sequence.pad_sequences(tokenized_test, maxlen=maxlen)

# Load and process embedding matrix

In [None]:
EMBEDDING_FILE = '../input/embeddings/glove.840B.300d/glove.840B.300d.txt'

In [None]:
def get_coefs(word, *arr):
    """Return `(word, vector)`, packing the remaining arguments into a float32 array."""
    vector = np.asarray(arr, dtype='float32')
    return word, vector
# Parse the embedding text file into {word: float32 vector}. Each line is
# split on single spaces; get_coefs takes the first field as the word and the
# remaining fields as the vector components.
# The `with` block closes the file handle once the dict is built (the bare
# open() call used before never closed it), and the explicit encoding keeps
# parsing independent of the platform's default locale.
with open(EMBEDDING_FILE, encoding='utf-8') as embedding_file:
    embeddings_index = dict(get_coefs(*line.rstrip().rsplit(' ')) for line in embedding_file)

In [None]:
# np.stack requires a real sequence; a dict view is rejected by recent NumPy
# releases, so materialise the vectors into a list first.
all_embs = np.stack(list(embeddings_index.values()))
emb_mean, emb_std = all_embs.mean(), all_embs.std()
embed_size = all_embs.shape[1]

word_index = tokenizer.word_index
# Tokenizer word indices start at 1 (index 0 is reserved), so a vocabulary of
# N words needs N + 1 rows; the row count is still capped at max_features.
nb_words = min(max_features, len(word_index) + 1)
# Rows are initialised with random draws matching the mean/std of the
# pretrained vectors, so words without a pretrained vector still get a
# plausible embedding. If drawing the normal samples is too slow, use
# np.zeros((nb_words, embed_size)) instead.
embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))
for word, i in word_index.items():
    # Guard on the actual number of rows (not max_features) so a small
    # vocabulary can never index past the end of the matrix.
    if i >= nb_words:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [None]:
# Drop the intermediates that are not referenced again below, then force a collection pass.
del word_index, embeddings_index, all_embs, tokenized_test, tokenized_train, tokenizer, train_sentences, test_sentences, nb_words
gc.collect()

# Hyperparams for model

In [None]:
batch_size = 1024
epochs = 4
# NOTE(review): embed_size was already derived from the loaded vectors when the
# embedding matrix was built; this re-assignment must agree with the width of
# that matrix (glove.840B.300d is presumably 300-dimensional) -- confirm.
embed_size = 300

# Method to create your model

### Takes a boolean that sets the 'trainable' parameter of the embedding layer

In [None]:
def dnn_model(train_flag = True):
    """Build and compile the bidirectional-LSTM classifier.

    `train_flag` is forwarded to the embedding layer's `trainable` argument.
    The embedding layer is initialised from the module-level
    `embedding_matrix`. The returned model is compiled with binary
    cross-entropy, the Adam optimizer, and `f1_score` as its only metric.
    """
    tokens = layers.Input(shape=(maxlen, ))
    embedded = layers.Embedding(
        max_features,
        embed_size,
        weights=[embedding_matrix],
        trainable=train_flag,
    )(tokens)
    recurrent = layers.Bidirectional(
        layers.CuDNNLSTM(64, return_sequences = True)
    )(embedded)
    # Max over the sequence axis yields one fixed-size vector per question.
    pooled = layers.GlobalMaxPooling1D()(recurrent)
    hidden = layers.Dense(32, activation="tanh")(pooled)
    probability = layers.Dense(
        1,
        activation="sigmoid",
        kernel_initializer='glorot_normal',
    )(hidden)

    classifier = Model(inputs=tokens, outputs=probability)
    classifier.compile(
        loss='binary_crossentropy',
        optimizer='adam',
        metrics=[f1_score],
    )
    return classifier

In [None]:
# Checkpoint file shared by both training stages and by the later load_weights calls.
weight_path="early_weights.hdf5"
# Write weights to weight_path, keeping only the best val_f1_score (higher is better).
checkpoint = ModelCheckpoint(weight_path, monitor='val_f1_score', verbose=1, save_best_only=True, mode='max')
# Stop training early based on val_f1_score, with a patience of 2 epochs.
early_stopping = EarlyStopping(monitor="val_f1_score", mode="max", patience=2)
# NOTE(review): this same list (same callback instances) is passed to both fit
# calls below -- confirm whether ModelCheckpoint's tracked best score carries
# over from the first run into the second.
callbacks = [checkpoint, early_stopping]

## Initially training a model with fixed embedding layer

### Note the significantly fewer trainable parameters compared to a model with trainable embedding layer

In [None]:
# Stage 1: build the model with the embedding layer frozen (train_flag=False).
model = dnn_model(False)
model.summary()

In [None]:
# Train stage 1, holding out 20% of the training data for validation; the callbacks checkpoint to weight_path.
model.fit(X_train, train_labels, batch_size=batch_size, epochs=epochs, shuffle = True, validation_split=0.20, callbacks=callbacks)

# Training a model with trainable embedding layer

In [None]:
# Stage 2: rebuild the same architecture with the embedding layer trainable (default train_flag=True).
model = dnn_model()
model.summary()

# Load weights from previously trained model and retrain

In [None]:
# Initialise the new model from the checkpoint written during the previous fit.
model.load_weights(weight_path)

In [None]:
# Continue training with the same data, split and callbacks, now with the embedding layer trainable.
model.fit(X_train, train_labels, batch_size=batch_size, epochs=epochs, shuffle = True, validation_split=0.20, callbacks=callbacks)

In [None]:
# Reload whatever ModelCheckpoint last saved to weight_path before predicting.
model.load_weights(weight_path)

# Predict on test data and create submission csv

In [None]:
# Predict on the test set and flatten the per-row outputs into one flat list of scores.
y_pred = list(model.predict(X_test, batch_size=batch_size).ravel())

In [None]:
sample = pd.read_csv('../input/sample_submission.csv')

In [None]:
sample['prediction'] = pd.Series(y_pred)

### Using 0.5 as the prediction threshold, which may be a bit too high

In [None]:
# Binarise the scores: values of at most 0.5 become 0, everything else becomes 1.
sample['prediction'] = np.where(sample['prediction'] <= 0.5, 0, 1)

In [None]:
sample.to_csv('submission.csv', index=False)

In [None]:
sample['prediction'].value_counts()