In [0]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
import tensorflow_hub as hub
from tf import keras

%matplotlib inline

from google.colab import drive
drive.mount('/content/gdrive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/gdrive


In [0]:
def read_txt(PATH, Ln=600):
  '''
  Read a text file and cut it into consecutive chunks of `Ln` characters.

  Newlines are replaced by single spaces before chunking. A trailing remainder
  shorter than `Ln` characters is dropped, so every returned chunk has exactly
  `Ln` characters.

  Args:
    PATH: path of the text file to read.
    Ln: chunk length in characters (default 600).

  Returns:
    A list of `len(text) // Ln` strings, in file order.
  '''

  # The context manager closes the handle even if read() raises; the original
  # left the file object open until garbage collection.
  with open(PATH, 'r') as f:
    txt = f.read()
  txt = txt.replace('\n', ' ')
  return [txt[i*Ln:(i+1)*Ln] for i in range(len(txt)//Ln)]

In [0]:
def replace_broken_words(df, author):
  '''
  Remove the broken words at the beginning and the end of each chunk.

  Each string in df['text'] is reduced to the span that starts at its first
  space (the space itself is kept) and ends just before its last space, which
  discards the partial word on either side of the chunk boundary.
  A chunk with fewer than two spaces has no complete inner word and becomes ''.

  Args:
    df: DataFrame with a 'text' column of strings.
    author: label written into the 'author' column of every returned row.

  Returns:
    A new DataFrame with columns 'text' and 'author', one row per input chunk,
    indexed 0..n-1. The input frame is not modified.
  '''

  trimmed = []
  for t in df['text']:
    first = t.find(' ')
    if first == -1:
      # No space at all: the whole chunk is a single (broken) word.
      # The original indexed an empty list here and raised IndexError.
      trimmed.append('')
      continue
    last = t.rfind(' ')
    trimmed.append(t[first:last])

  # Building from a dict keeps the 'text' column present even for empty input.
  df1 = pd.DataFrame({'text': trimmed})
  df1['author'] = author
  return df1

In [0]:
# Data Preparation
#
# Each author's corpus is read as fixed-length character chunks (read_txt),
# wrapped in a DataFrame, stripped of the partial words at the chunk borders
# (replace_broken_words) and finally stacked into one labelled frame `df`.

PATH = './gdrive/My Drive/DL/Style/Nabokov-all.txt'
natxt = read_txt(PATH)

PATH2 = './gdrive/My Drive/DL/Style/Austen-all.txt'
autxt = read_txt(PATH2)

PATH3 = './gdrive/My Drive/DL/Style/Dumas-all.txt'
dutxt = read_txt(PATH3)

dict1 = {'text': natxt, 'author': 'Nabokov'}
dict2 = {'text': autxt, 'author': 'Austen'}
# Was labelled 'Twain' although the file is the Dumas corpus; the wrong label
# was only masked because replace_broken_words() rewrites the author column.
dict3 = {'text': dutxt, 'author': 'Dumas'}


na = pd.DataFrame(dict1)
au = pd.DataFrame(dict2)
du = pd.DataFrame(dict3)

na = replace_broken_words(na, 'Nabokov')
au = replace_broken_words(au, 'Austen')
du = replace_broken_words(du, 'Dumas')

author = [na, au, du]

# ignore_index gives the combined frame a unique 0..n-1 index; without it each
# author's rows restart at 0 and label-based lookups return several rows.
df = pd.concat(author, ignore_index=True)

In [0]:
# Turn the labelled chunks into model inputs: integer token sequences padded to
# a fixed length, one-hot author labels, a GloVe-initialised embedding matrix,
# and an 80/10/10 train/validation/test split.

X = df.text.astype('str')
y = df.author.astype('category')

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

max_words = 10000  # We will keep only the 10000 most common words

# NOTE(review): the tokenizer is fitted on the full corpus before the split
# below, so the vocabulary also reflects validation/test texts.
tokenizer = Tokenizer(num_words=max_words, oov_token='<oov>')
tokenizer.fit_on_texts(X)
sequences = tokenizer.texts_to_sequences(X) # list: string - numbers(indices)
word_index = tokenizer.word_index # dict: word - number(index)

print('Found {} unique tokens.'.format(len(word_index)))

# Furthermore, we need to pad the sequences so that their lengths are the same and do not exceed a specific maximum length.
maxlen = 256
X = pad_sequences(sequences, maxlen=maxlen, truncating="post")

from sklearn.preprocessing import OneHotEncoder

# Transform the target authors to one-hot encoding
y = np.asarray(y)
onehot_encoder = OneHotEncoder(sparse=False)
encoded = y.reshape(len(y), 1)
y = onehot_encoder.fit_transform(encoded)

print('Shape of data tensor: ', X.shape)
print('Shape of label tensor: ', y.shape)

# Load the pre-trained GloVe vectors into a dict: word -> float32 vector.
# Each line of the file is "<word> <v1> <v2> ...".
embeddings_index = {}
gl_PATH = './gdrive/My Drive/DL/NLP/GloVe/glove.6B.200d.txt'
# The explicit encoding avoids a UnicodeDecodeError on platforms whose default
# encoding is not UTF-8, and the context manager closes the file even if
# parsing a line fails.
with open(gl_PATH, encoding='utf-8') as f:
  for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs

print('Found {} word vectors.'.format(len(embeddings_index)))

# Row i of the matrix holds the GloVe vector of the word with tokenizer index i;
# words without a GloVe vector (and unused rows) stay all-zero.
embedding_dim = 200
embedding_matrix = np.zeros((max_words, embedding_dim))
for word, i in word_index.items():
  if i < max_words:
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
      embedding_matrix[i] = embedding_vector


from sklearn.model_selection import train_test_split

# Split data into training, validation and test data sets.
# 20% is held out first and then halved into validation and test.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=1)
X_val, X_test, y_val, y_test = train_test_split(X_val, y_val, test_size=0.5, random_state=1)

Found 80526 unique tokens.
Shape of data tensor:  (35874, 256)
Shape of label tensor:  (35874, 3)
Found 400000 word vectors.


In [0]:
!pip install keras-layer-normalization

from tf import keras
from keras.models import Sequential, Model, load_model
from keras.layers import Embedding, LSTM, Dense, Input, Dropout, GRU, Conv1D, MaxPooling1D, BatchNormalization, Activation, concatenate
from keras.layers import Bidirectional, Flatten, RepeatVector, Permute, Multiply, Lambda, TimeDistributed
from keras import backend as K

from keras.regularizers import l2
from keras.optimizers import Adam
from keras.callbacks import EarlyStopping, ModelCheckpoint

from keras_layer_normalization import LayerNormalization

Collecting keras-layer-normalization
  Downloading https://files.pythonhosted.org/packages/a4/0e/d1078df0494bac9ce1a67954e5380b6e7569668f0f3b50a9531c62c1fc4a/keras-layer-normalization-0.14.0.tar.gz
Building wheels for collected packages: keras-layer-normalization
  Building wheel for keras-layer-normalization (setup.py) ... [?25l[?25hdone
  Created wheel for keras-layer-normalization: filename=keras_layer_normalization-0.14.0-cp36-none-any.whl size=5268 sha256=a8854a749551b37042819b7141716ffc89f8a960c9984aa5bb15d5e99503ee92
  Stored in directory: /root/.cache/pip/wheels/54/80/22/a638a7d406fd155e507aa33d703e3fa2612b9eb7bb4f4fe667
Successfully built keras-layer-normalization
Installing collected packages: keras-layer-normalization
Successfully installed keras-layer-normalization-0.14.0


In [0]:
# We build a baseline model for the style discriminator.
# We use the 200d GloVe pre-trained model as a frozen word embedding layer, followed by a 1d convolutional layer and max-pooling.
# The output of the pooling layer is fed into two stacked GRU layers.
# Furthermore, two skip connections (concatenations) are built so that lower-level feature representations
# can flow past each GRU directly into the later layers.

units = 32
lr = 0.0005
patience = 5


# Sequence length comes from `maxlen` so it cannot drift from the padding length.
inputs = Input(shape=(maxlen,), dtype='int32')
embedding_layer = Embedding(max_words, embedding_dim, input_length=maxlen)
x = embedding_layer(inputs)

x = Conv1D(units * 2,
           7,
           padding="same",
           kernel_regularizer=l2(0.01),
           kernel_initializer=keras.initializers.he_normal(seed=42))(x)
x = BatchNormalization()(x)
x = Activation('relu')(x)
c = MaxPooling1D(3)(x)

b = GRU(units,
        return_sequences=True,
        kernel_initializer=keras.initializers.Orthogonal(seed=42),
        dropout=0.2, recurrent_dropout=0.2
        )(c)
x = LayerNormalization()(b)

# Skip connection 1: pooled conv features + normalised output of the first GRU.
c = concatenate([c, x])

# The second GRU is used without dropout.
b = GRU(units,
        return_sequences=True,
        kernel_initializer=keras.initializers.Orthogonal(seed=42),
        )(c)
x = LayerNormalization()(b)

# Skip connection 2: everything fed to the second GRU + its normalised output.
c = concatenate([c, x])
c = Flatten()(c)

outputs = Dense(3, activation="softmax")(c)

model = Model(inputs=inputs, outputs=outputs)

# Load the GloVe weights and freeze them (must happen before compile()).
# The layer is addressed through its own handle instead of model.layers[1],
# which would silently point at another layer if the graph were reordered.
embedding_layer.set_weights([embedding_matrix])
embedding_layer.trainable = False

cb = EarlyStopping(monitor='val_loss',
                   mode='min',
                   verbose=0,
                   patience=patience,
                   restore_best_weights=True)

# Three training stages with a decreasing learning rate. Each stage recompiles
# the model, which also creates a fresh Adam optimizer.
#   (learning rate, max epochs, callbacks)
training_stages = [
  (lr,     patience, None),   # warm-up: fixed number of epochs, no early stopping
  (lr / 3, 99,       [cb]),
  (lr / 6, 99,       [cb]),
]

for stage_lr, stage_epochs, stage_callbacks in training_stages:
  model.compile(optimizer=Adam(lr=stage_lr),
                loss="categorical_crossentropy",
                metrics=["acc"])

  # validation_data is passed as a tuple, the form documented for Model.fit.
  model.fit(x=X_train,
            y=y_train,
            validation_data=(X_val, y_val),
            epochs=stage_epochs,
            batch_size=2048,
            callbacks=stage_callbacks)


print('===Evaluation===')
model.evaluate(X_test, y_test)







Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.








Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


Train on 28699 samples, validate on 3587 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Train on 28699 samples, validate on 3587 samples
Epoch 1/99
Epoch 2/99
Epoch 3/99
Epoch 4/99
Epoch 5/99
Epoch 6/99
Epoch 7/99
Epoch 8/99
Epoch 9/99
Epoch 10/99
Epoch 11/99
Epoch 12/99
Epoch 13/99
Epoch 14/99
Epoch 15/99
Epoch 16/99
Epoch 17/99
Epoch 18/99
Epoch 19/99
Epoch 20/99
Epoch 21/99
Epoch 22/99
Epoch 23/99
Epoch 24/99
Epoch 25/99
Epoch 26/99
Epoch 27/99
Epoch 28/99
Epoch 29/99
Epoch 30/99
Epoch 31/99
Epoch 32/99
Epoch 33/99
Train on 28699 samples, validate on 3587 samples
Epoch 1/99
Epoch 2/99
Epoch 3/99
Epoch 4/99
Epoch 5/99
Epoch 6/99
===Evaluation===


[0.4546111916817949, 0.9690635451505016]

In [0]:
# The trained baseline model is written to Drive (and made available on GitHub).
# NOTE(review): the network contains the third-party LayerNormalization layer, so
# reloading this file presumably needs it passed via `custom_objects` - confirm.
model_path = './gdrive/My Drive/DL/Style/model_base.h5'
model.save(model_path)

In [0]:
# Test demo using donor texts, results are stored on github

PATH = './gdrive/My Drive/DL/Style/donor.csv'
dn = pd.read_csv(PATH)

X_ts = dn.text.astype('str')
y_ts = dn.author.astype('category')

# Reuse the tokenizer exactly as it was fitted on the training corpus.
# Calling fit_on_texts() again would add the donor word counts and rebuild the
# frequency-ranked word_index, so words could receive different integer ids
# than the ones the embedding matrix and the trained model were built with.
sequences_ts = tokenizer.texts_to_sequences(X_ts)

X_ts = pad_sequences(sequences_ts, maxlen=maxlen, truncating="post")

# Reuse the encoder fitted on the training labels so the one-hot columns line
# up with the model's output units. A freshly fitted encoder would derive its
# columns from whichever authors happen to occur in the donor file.
y_ts = np.asarray(y_ts)
encoded = y_ts.reshape(len(y_ts), 1)
y_ts = onehot_encoder.transform(encoded)

yhat = model.predict(X_ts)

In [0]:
# Write the one-hot ground truth and the predicted class probabilities of the
# donor texts to CSV, both with the same author column headers.
# NOTE(review): the header order presumably mirrors the label encoder's
# category order - confirm against the encoder before relying on it.
author_columns = ['Austen', 'Dumas', 'Nabokov']
donor_outputs = (
  (y_ts, r'./gdrive/My Drive/DL/Style/donor_y.csv'),
  (yhat, r'./gdrive/My Drive/DL/Style/donor_yhat.csv'),
)
for values, csv_path in donor_outputs:
  pd.DataFrame(values, columns=author_columns).to_csv(csv_path, index=False)