In [5]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
import tensorflow_hub as hub

%matplotlib inline

from google.colab import drive
# Mount Google Drive at /content/gdrive. The data paths used in later cells are
# written relative to the working directory ('./gdrive/...'), so they resolve
# only while the notebook's current directory is /content.
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [0]:
def read_txt(PATH, Ln=500):
  """Read a text file and cut it into consecutive chunks of Ln characters.

  Newline characters are stripped from the text before it is chunked. Only
  complete chunks are returned: a trailing remainder shorter than Ln is
  discarded, so every returned string has exactly Ln characters.

  Args:
    PATH: path of the text file to read (opened in text mode with the
      platform default encoding).
    Ln: chunk length in characters. Defaults to 500.

  Returns:
    A list of strings; empty when the text is shorter than Ln.

  Raises:
    ZeroDivisionError: if Ln is 0.
  """
  # The context manager closes the handle deterministically, even if read()
  # raises; the previous bare open() left that to the garbage collector.
  with open(PATH, 'r') as fh:
    txt = fh.read()
  txt = txt.replace('\n', '')
  return [txt[i*Ln:(i+1)*Ln] for i in range(len(txt)//Ln)]

# Locations of the three single-author corpora on the mounted Drive.
PATH = './gdrive/My Drive/DL/Style/Nabokov-all.txt'
PATH2 = './gdrive/My Drive/DL/Style/Shakespeare-all.txt'
PATH3 = './gdrive/My Drive/DL/Style/Twain-all.txt'

# Each corpus becomes a list of fixed-length character chunks; read_txt is
# called without Ln, so its default chunk length applies.
nktxt, sptxt, twtxt = read_txt(PATH), read_txt(PATH2), read_txt(PATH3)


In [0]:
# One record per author: the list of text chunks plus a single label string
# that pandas repeats for every row when the frame is built.
dict1 = {'text': nktxt, 'author': 'nabokov'}
nk = pd.DataFrame(dict1)

dict2 = {'text': sptxt, 'author': 'shakespeare'}
sp = pd.DataFrame(dict2)

dict3 = {'text': twtxt, 'author': 'twain'}
tw = pd.DataFrame(dict3)

# Stack the three author frames into one labelled frame.
author = [nk, sp, tw]
df = pd.concat(author)

# The Kaggle training set supplies the negative class: its id column is
# dropped and every row is labelled 'other'.
spooky = (
    pd.read_csv('./gdrive/My Drive/DL/NLP/Kaggle/SAI/train.csv')
    .drop(columns='id')
    .assign(author='other')
)

data = pd.concat([df, spooky])

# data.to_csv(r'./gdrive/My Drive/DL/NLP/Kaggle/SAI/data.csv',index=False)

In [35]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

# Preprocessing limits: the vocabulary size handed to the tokenizer and the
# fixed token length of every padded sequence.
max_words = 10000
maxlen = 256

# Raw inputs: chunk text as strings, author labels as a categorical column.
X = data['text'].astype('str')
y = data['author'].astype('category')

# NOTE(review): the tokenizer is fitted on the whole corpus, i.e. before the
# train/validation/test split performed later in this cell, so the vocabulary
# also reflects held-out text — confirm this is acceptable.
tokenizer = Tokenizer(num_words=max_words, oov_token='<oov>')
tokenizer.fit_on_texts(X)
word_index = tokenizer.word_index
sequences = tokenizer.texts_to_sequences(X)

print('Found {} unique tokens.'.format(len(word_index)))

# Bring every sequence to maxlen tokens; longer ones lose their tail
# (truncating="post").
X = pad_sequences(sequences, maxlen=maxlen, truncating="post")

from sklearn.preprocessing import OneHotEncoder

# Reshape the author labels into a single-column 2-D array and expand them
# into one indicator column per distinct author (dense output).
onehot_encoder = OneHotEncoder(sparse=False)
encoded = np.asarray(y).reshape(-1, 1)
y = onehot_encoder.fit_transform(encoded)

print('Shape of data tensor: ', X.shape)
print('Shape of label tensor: ', y.shape)

# Parse the pre-trained GloVe file into a {word: vector} lookup. Each line
# holds a token followed by its coefficients, separated by whitespace.
embeddings_index = {}
gl_PATH = './gdrive/My Drive/DL/NLP/GloVe/glove.6B.200d.txt'
# The explicit encoding keeps decoding independent of the platform locale
# (GloVe text files are distributed as UTF-8), and the context manager closes
# the handle even if a malformed line raises part-way through.
with open(gl_PATH, encoding='utf-8') as f:
  for line in f:
    values = line.split()
    if not values:
      # Skip blank lines instead of failing on values[0].
      continue
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs

print('Found {} word vectors.'.format(len(embeddings_index)))

# Width of each embedding row; presumably chosen to match the 200-d GloVe
# file loaded above — keep the two in sync.
embedding_dim = 200

# Row i holds the pre-trained vector of the word whose tokenizer index is i.
# Rows for indices that are never assigned (words absent from the GloVe
# lookup) stay all-zero.
embedding_matrix = np.zeros((max_words, embedding_dim))
for word, i in word_index.items():
  if i >= max_words:
    # Index falls outside the matrix; nothing to store.
    continue
  embedding_vector = embeddings_index.get(word)
  if embedding_vector is None:
    continue
  embedding_matrix[i] = embedding_vector

      

from sklearn.model_selection import train_test_split

# First cut: 80% training, 20% held out. Second cut: the held-out part is
# halved into validation and test sets. Both cuts use the same fixed seed.
X_train, X_val, y_train, y_val = train_test_split(
    X, y,
    test_size=0.2,
    random_state=1,
)
X_val, X_test, y_val, y_test = train_test_split(
    X_val, y_val,
    test_size=0.5,
    random_state=1,
)

# Rebind the large intermediates to fresh empty lists (presumably to release
# memory before training). `tr` is not defined anywhere earlier in the visible
# cells; it is simply created here.
tr, X, y, tokenizer, sequences, word_index, embeddings_index = ([] for _ in range(7))

Using TensorFlow backend.


Found 196364 unique tokens.
Shape of data tensor:  (62195, 256)
Shape of label tensor:  (62195, 4)
Found 400000 word vectors.


In [36]:
!pip install keras-layer-normalization

import keras
from keras.models import Sequential, Model, load_model
from keras.layers import Embedding, LSTM, Dense, Input, Dropout, GRU, Conv1D, MaxPooling1D, BatchNormalization, Activation, concatenate
from keras.layers import Bidirectional, Flatten, RepeatVector, Permute, Multiply, Lambda, TimeDistributed
from keras import backend as K

from keras.regularizers import l2
from keras.optimizers import Adam
from keras.callbacks import EarlyStopping, ModelCheckpoint

from keras_layer_normalization import LayerNormalization

Collecting keras-layer-normalization
  Downloading https://files.pythonhosted.org/packages/ea/f3/a92ce51219280eea003911722046db17eaebf5f26679a73887a5c357abe4/keras-layer-normalization-0.13.0.tar.gz
Building wheels for collected packages: keras-layer-normalization
  Building wheel for keras-layer-normalization (setup.py) ... [?25l[?25hdone
  Created wheel for keras-layer-normalization: filename=keras_layer_normalization-0.13.0-cp36-none-any.whl size=5209 sha256=a8f771ef76165bb5360134a4e92f86ee6ca1b60afc611804985280b2d3e19234
  Stored in directory: /root/.cache/pip/wheels/50/2b/71/d1d06f71d78c46a9912dc89a5bb46f357cf64fa05883fadc64
Successfully built keras-layer-normalization
Installing collected packages: keras-layer-normalization
Successfully installed keras-layer-normalization-0.13.0


In [0]:
# Hyper-parameters shared by the model definition and the training stages.
units = 32       # GRU width; the convolution uses units * 2 filters
lr = 0.0005      # base Adam learning rate
patience = 5     # warm-up epoch count and early-stopping patience


# The input length follows `maxlen` (the padded sequence length) instead of a
# repeated literal, so the model cannot drift out of sync with preprocessing.
inputs = Input(shape=(maxlen,), dtype='int32')

# Keep a handle on the embedding layer so the pre-trained weights can be
# loaded into it directly rather than through a positional model.layers index.
embedding_layer = Embedding(max_words, embedding_dim, input_length=maxlen)
x = embedding_layer(inputs)

# Convolutional branch: width-7 "same" convolution -> batch norm -> ReLU.
x = Conv1D(units * 2,
           7,
           padding="same",
           kernel_regularizer=l2(0.01),
           kernel_initializer=keras.initializers.he_normal(seed=42))(x)
x = BatchNormalization()(x)
c = Activation('relu')(x)

# NOTE(review): this GRU consumes `x` (the batch-normalised tensor before the
# ReLU), not `c` — confirm that bypassing the activation is intended.
b = GRU(units,
        return_sequences=True,
        kernel_initializer=keras.initializers.Orthogonal(seed=42),
        dropout=0.2, recurrent_dropout=0.2
        )(x)
x = LayerNormalization()(b)

# Dense-style skip connection: each recurrent block sees everything before it.
c = concatenate([c, x])

b = GRU(units,
        return_sequences=True,
        kernel_initializer=keras.initializers.Orthogonal(seed=42),
        # dropout=0.2, recurrent_dropout=0.2
        )(c)
x = LayerNormalization()(b)

c = concatenate([c, x])
c = Flatten()(c)

# Four output units, one per author label (nabokov, shakespeare, twain, other).
outputs = Dense(4, activation="softmax")(c)

model = Model(inputs=inputs, outputs=outputs)

# Load the GloVe-initialised matrix and freeze it before the model is compiled.
embedding_layer.set_weights([embedding_matrix])
embedding_layer.trainable = False

# Early stopping on validation loss, shared by the two fine-tuning stages.
cb = EarlyStopping(monitor='val_loss',
                   mode='min',
                   verbose=0,
                   patience=patience,
                   restore_best_weights=True)

# Training schedule as (learning rate, max epochs, callbacks):
#   1. short warm-up at the base rate, no early stopping;
#   2. and 3. up to 99 epochs each at lr/3 and lr/6 with early stopping.
training_stages = [
  (lr, patience, None),
  (lr / 3, 99, [cb]),
  (lr / 6, 99, [cb]),
]

for stage_lr, stage_epochs, stage_callbacks in training_stages:
  # Each stage compiles with a new Adam instance at that stage's rate.
  model.compile(optimizer=Adam(lr=stage_lr),
                loss="categorical_crossentropy",
                metrics=["acc"])
  # validation_data is passed as a tuple, the form documented for Model.fit;
  # a list is not unpacked as (x, y) by newer Keras releases.
  model.fit(x=X_train,
            y=y_train,
            validation_data=(X_val, y_val),
            epochs=stage_epochs,
            batch_size=2048,
            callbacks=stage_callbacks)


print('===Evaluation===')
model.evaluate(X_test, y_test)






Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.








Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


Train on 49756 samples, validate on 6219 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Train on 49756 samples, validate on 6219 samples
Epoch 1/99
Epoch 2/99
Epoch 3/99
Epoch 4/99
Epoch 5/99
Epoch 6/99
Epoch 7/99
Epoch 8/99
Epoch 9/99
Epoch 10/99
Epoch 11/99
Epoch 12/99
Epoch 13/99
Epoch 14/99
Epoch 15/99
Epoch 16/99
Epoch 17/99
Epoch 18/99
Epoch 19/99
Epoch 20/99
Epoch 21/99

In [0]:
# Save the trained model to Drive as an .h5 file.
# NOTE(review): only the model is written here. The fitted `tokenizer` was
# rebound to an empty list at the end of the preprocessing cell, so the
# vocabulary needed to encode new text is not saved alongside it — persist it
# before that point if the model is to be reused for inference.
model.save('./gdrive/My Drive/DL/NLP/Kaggle/SAI/model_2.h5')