In [0]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import urllib
import tensorflow as tf
%tensorflow_version 2.x
import tensorflow_hub as hub
from tensorflow import keras


%matplotlib inline

#from google.colab import drive
#drive.mount('/content/gdrive')

TensorFlow is already loaded. Please restart the runtime to change versions.


In [0]:
import nltk
from nltk.tag.stanford import StanfordNERTagger


!wget 'https://nlp.stanford.edu/software/stanford-ner-2018-10-16.zip'
!unzip stanford-ner-2018-10-16.zip

nltk.download('punkt')

st = StanfordNERTagger('/content/stanford-ner-2018-10-16/classifiers/english.all.3class.distsim.crf.ser.gz',
                       '/content/stanford-ner-2018-10-16/stanford-ner.jar',
                       encoding='utf-8')

--2019-11-17 07:01:49--  https://nlp.stanford.edu/software/stanford-ner-2018-10-16.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 180358328 (172M) [application/zip]
Saving to: ‘stanford-ner-2018-10-16.zip’


2019-11-17 07:02:03 (13.3 MB/s) - ‘stanford-ner-2018-10-16.zip’ saved [180358328/180358328]

Archive:  stanford-ner-2018-10-16.zip
   creating: stanford-ner-2018-10-16/
  inflating: stanford-ner-2018-10-16/README.txt  
  inflating: stanford-ner-2018-10-16/ner-gui.bat  
  inflating: stanford-ner-2018-10-16/build.xml  
  inflating: stanford-ner-2018-10-16/stanford-ner.jar  
  inflating: stanford-ner-2018-10-16/sample-conll-file.txt  
  inflating: stanford-ner-2018-10-16/sample.ner.txt  
  inflating: stanford-ner-2018-10-16/stanford-ner-3.9.2-sources.jar  
   creating: stanford-ner-2018-10-16/lib/
  inflating: stanford-ner-2018-1

The StanfordTokenizer will be deprecated in version 3.2.5.
Please use [91mnltk.tag.corenlp.CoreNLPPOSTagger[0m or [91mnltk.tag.corenlp.CoreNLPNERTagger[0m instead.
  super(StanfordNERTagger, self).__init__(*args, **kwargs)


In [0]:
from nltk.tokenize import word_tokenize

text = 'While in France, Christine Lagarde discussed short-term stimulus efforts in a recent interview with the Wall Street Journal.'

tokenized_text = word_tokenize(text)
classified_text = st.tag(tokenized_text)

In [0]:
classified_text 

[('While', 'O'),
 ('in', 'O'),
 ('France', 'LOCATION'),
 (',', 'O'),
 ('Christine', 'PERSON'),
 ('Lagarde', 'PERSON'),
 ('discussed', 'O'),
 ('short-term', 'O'),
 ('stimulus', 'O'),
 ('efforts', 'O'),
 ('in', 'O'),
 ('a', 'O'),
 ('recent', 'O'),
 ('interview', 'O'),
 ('with', 'O'),
 ('the', 'O'),
 ('Wall', 'ORGANIZATION'),
 ('Street', 'ORGANIZATION'),
 ('Journal', 'ORGANIZATION'),
 ('.', 'O')]

In [0]:
def read_txt(PATH, Ln=600):
  '''
  Cut raw texts in to chunks of 600 characters.
  '''
  txt = (open(PATH, 'r')).read()
  txt = txt.replace('\n', ' ')
  l_1 = []
  for i in range(len(txt)//Ln):
    l_1.append(txt[i*Ln:(i+1)*Ln])
  return l_1

In [0]:
def replace_broken_words(df, author):
  '''
  Broken words at the beginnig and the end of each chunk are removed.
  '''

  l1 = []
  for t in df['text']:
    l2 = []
    for i, v in enumerate(t):
      if v == ' ':
        l2.append(i)
    s = l2[0]
    e = l2[-1]
    l1.append(t[s:e])
  
  df1 = pd.DataFrame(l1)
  df1['author'] = author
  df1.rename(columns={0:'text'}, inplace=True)
  return df1

In [0]:
# Data Preparation

PATH = './gdrive/My Drive/DL/Style/Nabokov-all.txt'
natxt = read_txt(PATH)

PATH2 = './gdrive/My Drive/DL/Style/Austen-all.txt'
autxt = read_txt(PATH2)

PATH3 = './gdrive/My Drive/DL/Style/Dumas-all.txt'
dutxt = read_txt(PATH3)

dict1 = {'text': natxt, 'author': 'Nabokov'}
dict2 = {'text': autxt, 'author': 'Austen'}
dict3 = {'text': dutxt, 'author': 'Dumas'}


na = pd.DataFrame(dict1)
au = pd.DataFrame(dict2)
du = pd.DataFrame(dict3)

na = replace_broken_words(na, 'Nabokov')
au = replace_broken_words(au, 'Austen')
du = replace_broken_words(du, 'Dumas')

author = [na, au, du]
df = pd.concat(author)

na = na.sample(3000)
au = au.sample(3000)
du = du.sample(3000)

author = [na, au, du]
df3000 = pd.concat(author)

In [0]:
df.to_csv('./gdrive/My Drive/DL/Style/raw_text.csv')
df3000.to_csv('./gdrive/My Drive/DL/Style/raw_text_3000.csv')

In [0]:
from sklearn import preprocessing, decomposition, model_selection, metrics, pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from nltk import word_tokenize
from nltk.corpus import stopwords
#stop_words = stopwords.words('english')
from sklearn.metrics import roc_auc_score, classification_report
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.svm import SVC
import xgboost as xgb

In [0]:
url = 'https://raw.githubusercontent.com/fy164251/text_style_transfer/master/Datasets/raw_text.csv'
df = pd.read_csv(url)

X = df.text.values
y = df.author.astype('category')

lbl_enc = preprocessing.LabelEncoder()
y = lbl_enc.fit_transform(y.values)

xtrain, xvalid, ytrain, yvalid = train_test_split(X, y, stratify=y, random_state=1, test_size=0.2, shuffle=True)
xvalid, xtest, yvalid, ytest = train_test_split(xvalid, yvalid, test_size=0.5, random_state=1, shuffle=True)

print(xtrain.shape, xvalid.shape)

(28699,) (3587,)


In [0]:
def multiclass_logloss(actual, predicted, eps=1e-15):
    """Multi class version of Logarithmic Loss metric.
    :param actual: Array containing the actual target classes
    :param predicted: Matrix with class predictions, one probability per class
    """
    # Convert 'actual' to a binary array if it's not already:
    if len(actual.shape) == 1:
        actual2 = np.zeros((actual.shape[0], predicted.shape[1]))
        for i, val in enumerate(actual):
            actual2[i, val] = 1
        actual = actual2

    clip = np.clip(predicted, eps, 1 - eps)
    rows = actual.shape[0]
    vsota = np.sum(actual * np.log(clip))
    return -1.0 / rows * vsota

# Baselines

## TF-IDF Features + Logistic Regression

In [0]:
#TF-IDF features
tfv = TfidfVectorizer(min_df=3, max_features=None, strip_accents='unicode', analyzer='word', token_pattern=r'\w{1,}', ngram_range=(1,3), use_idf=1, smooth_idf=1, sublinear_tf=1,
                     stop_words='english')

tfv.fit(list(xtrain) + list(xvalid))
xtrain_tfv = tfv.transform(xtrain)
xvalid_tfv = tfv.transform(xvalid)

print("The size of the learnt vocabulary is ", len(tfv.vocabulary_))

def display_scores(vectorizer, tfidf_result):
    scores = zip(vectorizer.get_feature_names(), np.asarray(tfidf_result.sum(axis=0)).ravel())
    sorted_scores = sorted(scores, key=lambda x: x[1], reverse=True)
    count = 0
    for item in sorted_scores: 
        print(item[0], item[1])
        count += 1
        if count>=10:
            break

#See top 10 TF-IDF scores
print('Top 10 TF-IDF scores are --- \n')
display_scores(tfv,xvalid_tfv)



#Logistic Regression on TF-IDF
clf = LogisticRegression(C=1.0)
clf.fit(xtrain_tfv, ytrain)
predictions = clf.predict_proba(xvalid_tfv)
print("logloss for TF-IDF + LR : " + str(multiclass_logloss(yvalid, predictions)))

The size of the learnt vocabulary is  87243
Top 10 TF-IDF scores are --- 

s 65.32906399321196
said 62.012499674117485
d 36.62009665023877
did 34.57324722586879
man 34.27772053098513
little 33.23118294228048
time 32.770131950506126
artagnan 32.46868149759843
d artagnan 32.46868149759843
like 31.982415663713393




logloss : 0.15634701358774225


## Count Vectorizer + Logistic Regression

In [0]:
ctv = CountVectorizer(analyzer='word', token_pattern=r'\w{1,}', ngram_range=(1,3), stop_words='english')
ctv.fit(list(xtrain) + list(xvalid))
xtrain_ctv = ctv.transform(xtrain)
xvalid_ctv = ctv.transform(xvalid)

clf.fit(xtrain_ctv, ytrain)
predictions = clf.predict_proba(xvalid_ctv)
predictions_rounded = clf.predict(xvalid_ctv)
print("logloss for CVectorizer + LR : ", multiclass_logloss(yvalid, predictions))
print(classification_report(yvalid, predictions_rounded))

KeyboardInterrupt: ignored

## Naive Bayes + TFIDF and Naive Bayes + Count Vectorizer

In [0]:
#Multinomial Naive Bayes on TF-IDF
clf = MultinomialNB()
clf.fit(xtrain_tfv, ytrain)
predictions = clf.predict_proba(xvalid_tfv)
print("logloss for Multinomial Naive Bayes: ", multiclass_logloss(yvalid, predictions))

#Multinomial Naive Bayes on Count Vectors
clf.fit(xtrain_ctv, ytrain)
predictions = clf.predict_proba(xvalid_ctv)
print("logloss for Multinomial Naive Bayes with count vectorizer : ", multiclass_logloss(yvalid, predictions))

logloss for Multinomial Naive Bayes:  0.0812769676344614
logloss for Multinomial Naive Bayes with count vectorizer :  0.04169366813496661


## TF-IDF + SVD --> SVM

In [0]:
# SVD on TF-IDF --> SVM

svd = decomposition.TruncatedSVD(n_components=100)
svd.fit(xtrain_tfv)
xtrain_svd = svd.transform(xtrain_tfv)
xvalid_svd = svd.transform(xvalid_tfv)

scl = preprocessing.StandardScaler()
scl.fit(xtrain_svd)
xtrain_svd_scl = scl.transform(xtrain_svd)
xvalid_svd_scl = scl.transform(xvalid_svd)

clf = SVC(C=1.0, probability=True)
clf.fit(xtrain_svd_scl, ytrain)
predictions = clf.predict_proba(xvalid_svd_scl)
predictions_rounded = clf.predict(xvalid_svd_scl)
print(" SVM logloss : ", multiclass_logloss(yvalid, predictions))
print(classification_report(yavlid, predictions_rounded))

## TFIDF + SVD --> XGBOOST

In [0]:
# XGBoost on SVD features
clf = xgb.XGBClassifier()
clf.fit(xtrain_svd, ytrain)
predictions = clf.predict_proba(xvalid_svd)
predictions_rounded = clf.predict(xvalid_svd)

print ("XGBOOST logloss: %0.3f " % multiclass_logloss(yvalid, predictions))
print(classification_report(yvalid, predictions_rounded))

XGBOOST logloss: 0.077 
              precision    recall  f1-score   support

           0       0.98      0.96      0.97       677
           1       0.98      0.97      0.98      1318
           2       0.97      0.98      0.98      1592

    accuracy                           0.98      3587
   macro avg       0.98      0.97      0.97      3587
weighted avg       0.98      0.98      0.98      3587



In [0]:
url = 'https://raw.githubusercontent.com/fy164251/text_style_transfer/master/Datasets/raw_text.csv'
df = pd.read_csv(url)

X = df.text.astype('str')
y = df.author.astype('category')

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

max_words = 10000  # We will keep only the 10000 most common words

tokenizer = Tokenizer(num_words=max_words, oov_token='<oov>') 
tokenizer.fit_on_texts(X) 
sequences = tokenizer.texts_to_sequences(X) # list: string - numbers(indices)
word_index = tokenizer.word_index # dict: word - number(index)

print('Found {} unique tokens.'.format(len(word_index)))

# Furthermore, we need to pad the sequences so that their lengths are the same and do not exceed a specific maximum length.
maxlen = 256
X = pad_sequences(sequences, maxlen=maxlen, truncating="post")

from sklearn.preprocessing import OneHotEncoder

# Transform the target authors to one-hot encoding
y = np.asarray(y)
onehot_encoder = OneHotEncoder(sparse=False)
encoded = y.reshape(len(y), 1)
y = onehot_encoder.fit_transform(encoded)

print('Shape of data tensor: ', X.shape)
print('Shape of label tensor: ', y.shape)

'''
embeddings_index = {}
gl_PATH = './gdrive/My Drive/DL/NLP/GloVe/glove.6B.200d.txt' 
f = open(gl_PATH)
for line in f:
  values = line.split()
  word = values[0]
  coefs = np.asarray(values[1:], dtype='float32')
  embeddings_index[word] = coefs
f.close()

print('Found {} word vectors.'.format(len(embeddings_index)))

embedding_dim = 200
embedding_matrix = np.zeros((max_words, embedding_dim))
for word, i in word_index.items(): 
  if i < max_words:
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
      embedding_matrix[i] = embedding_vector'''
      

from sklearn.model_selection import train_test_split

# Split data into training, validation and test data sets.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=1)
X_val, X_test, y_val, y_test = train_test_split(X_val, y_val, test_size=0.5, random_state=1)

# tr, X, y, tokenizer, sequences, word_index, embeddings_index = [], [], [], [], [], [], []

In [0]:
!pip install keras-layer-normalization

from keras.models import Sequential, Model, load_model
from keras.layers import Embedding, LSTM, Dense, Input, Dropout, GRU, Conv1D, MaxPooling1D, BatchNormalization, Activation, concatenate
from keras.layers import Bidirectional, Flatten, RepeatVector, Permute, Multiply, Lambda, TimeDistributed
from keras import backend as K

from keras.regularizers import l2
from keras.optimizers import Adam
from keras.callbacks import EarlyStopping, ModelCheckpoint

from keras_layer_normalization import LayerNormalization

Collecting keras-layer-normalization
  Downloading https://files.pythonhosted.org/packages/a4/0e/d1078df0494bac9ce1a67954e5380b6e7569668f0f3b50a9531c62c1fc4a/keras-layer-normalization-0.14.0.tar.gz
Building wheels for collected packages: keras-layer-normalization
  Building wheel for keras-layer-normalization (setup.py) ... [?25l[?25hdone
  Created wheel for keras-layer-normalization: filename=keras_layer_normalization-0.14.0-cp36-none-any.whl size=5268 sha256=6bbecf33c0f6d5a02cf72b8c12f042fbbef1e4209661995ced8790320c20fbf9
  Stored in directory: /root/.cache/pip/wheels/54/80/22/a638a7d406fd155e507aa33d703e3fa2612b9eb7bb4f4fe667
Successfully built keras-layer-normalization
Installing collected packages: keras-layer-normalization
Successfully installed keras-layer-normalization-0.14.0


In [0]:
# We build a baseline model for the style discriminator.
# We use 200d GloVe pre-trained model as word embedding layer, followed by a 1d convolutional layer and max-pooling.
# The output of the pooling layer are fed into two stacked GRU layers. 
# Furthermore, two skip connections are built such that information flows from lower-level feature respresentations are allowed.

units = 32
lr = 0.0005
patience = 5


inputs = Input(shape=(256,), dtype='int32')
x = Embedding(max_words, embedding_dim, input_length=maxlen)(inputs)

x = Conv1D(units * 2, 
           7,
           padding="same",
           kernel_regularizer=l2(0.01),
           kernel_initializer=keras.initializers.he_normal(seed=42))(x)
x = BatchNormalization()(x)
x = Activation('relu')(x)
c = MaxPooling1D(3)(x)

b = GRU(units,
        return_sequences=True,        
        kernel_initializer=keras.initializers.Orthogonal(seed=42),
        dropout=0.2, recurrent_dropout=0.2
        )(c)
x = LayerNormalization()(b)

c = concatenate([c, x])

b = GRU(units,
        return_sequences=True,        
        kernel_initializer=keras.initializers.Orthogonal(seed=42),
        # dropout=0.2, recurrent_dropout=0.2
        )(c)
x = LayerNormalization()(b)

c = concatenate([c, x])
c = Flatten()(c)

outputs = Dense(3, activation="softmax")(c)

model = Model(inputs=inputs, outputs=outputs)

model.layers[1].set_weights([embedding_matrix])
model.layers[1].trainable = False

model.compile(optimizer=Adam(lr=lr),
              loss="categorical_crossentropy",
              metrics=["acc"])

model.fit(x=X_train,
          y=y_train,
          validation_data=[X_val, y_val],
          epochs=patience, 
          batch_size=2048)

cb = EarlyStopping(monitor='val_loss', 
                   mode='min', 
                   verbose=0, 
                   patience=patience,
                   restore_best_weights=True)

model.compile(optimizer=Adam(lr=lr/3),
              loss="categorical_crossentropy",
              metrics=["acc"])

model.fit(x=X_train, 
          y=y_train,
          validation_data=[X_val, y_val],
          epochs=99, 
          batch_size=2048,
          callbacks=[cb])

model.compile(optimizer=Adam(lr=lr/6),
              loss="categorical_crossentropy",
              metrics=["acc"])

model.fit(x=X_train, 
          y=y_train,
          validation_data=[X_val, y_val],
          epochs=99, 
          batch_size=2048,
          callbacks=[cb])


print('===Evaluation===')
model.evaluate(X_test, y_test)







Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.








Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


Train on 28699 samples, validate on 3587 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Train on 28699 samples, validate on 3587 samples
Epoch 1/99
Epoch 2/99
Epoch 3/99
Epoch 4/99
Epoch 5/99
Epoch 6/99
Epoch 7/99
Epoch 8/99
Epoch 9/99
Epoch 10/99
Epoch 11/99
Epoch 12/99
Epoch 13/99
Epoch 14/99
Epoch 15/99
Epoch 16/99
Epoch 17/99
Epoch 18/99
Epoch 19/99
Epoch 20/99
Epoch 21/99
Epoch 22/99
Epoch 23/99
Epoch 24/99
Epoch 25/99
Epoch 26/99
Epoch 27/99
Epoch 28/99
Epoch 29/99
Epoch 30/99
Epoch 31/99
Epoch 32/99
Epoch 33/99
Train on 28699 samples, validate on 3587 samples
Epoch 1/99
Epoch 2/99
Epoch 3/99
Epoch 4/99
Epoch 5/99
Epoch 6/99
===Evaluation===


[0.4546111916817949, 0.9690635451505016]

In [0]:
# Model are saved and made available on github
model.save('./gdrive/My Drive/DL/Style/model_base.h5')

In [0]:
# Test demo using donor texts, results are stored on github
from keras.models import load_model
model = load_model('./gdrive/My Drive/DL/Style/model_base.h5', custom_objects={'LayerNormalization': LayerNormalization})







Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.








Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where




In [0]:
PATH = './gdrive/My Drive/DL/Style/donor.csv'
dn = pd.read_csv(PATH)

X_ts = dn.text.astype('str')
y_ts = dn.author.astype('category')

tokenizer.fit_on_texts(X_ts) 
sequences = tokenizer.texts_to_sequences(X_ts) 
# word_index = tokenizer.word_index 

X_ts = pad_sequences(sequences, maxlen=maxlen, truncating="post")

y_ts = np.asarray(y_ts)
onehot_encoder = OneHotEncoder(sparse=False)
encoded = y_ts.reshape(len(y_ts), 1)
y_ts = onehot_encoder.fit_transform(encoded)

yhat = model.predict(X_ts)

In [0]:
# CS230/outputs/Rand-donor-text (original)
txt1 = ["She sat at the window of the train, her head thrown back, one leg stretched across to the empty seat before her. The window frame trembled with the speed of the motion, the pane hung over empty darkness, and dots of light slashed across the glass as luminous streaks, once in a while.", 
        "Her leg, sculptured by the tight sheen of the stocking, its long line running straight, over an arched instep, to the tip of a foot in a high-heeled pump, had a feminine elegance that seemed out of place in the dusty train car and oddly incongruous with the rest of her.", 
        "She wore a battered camel's hair coat that had been expensive, wrapped shapelessly about her slender, nervous body. The coat collar was raised to the slanting brim of her hat. A sweep of brown hair fell back, almost touching the line of her shoulders.", 
        "Her face was made of angular planes, the shape of her mouth clear-cut, a sensual mouth held closed with inflexible precision. She kept her hands in the coat pockets, her posture taut, as if she resented immobility, and unfeminine, as if she were unconscious of her own body and that it was a woman's body.", 
        "She sat listening to the music. It was a symphony of triumph. The notes flowed up, they spoke of rising and they were the rising itself, they were the essence and the form of upward motion, they seemed to embody every human act and thought that had ascent as its motive.", 
        "It was a sunburst of sound, breaking out of hiding and spreading open. It had the freedom of release and the tension of purpose. It swept space clean, and left nothing but the joy of an unobstructed effort.", 
        "Only a faint echo within the sounds spoke of that from which the music had escaped, but spoke in laughing astonishment at the discovery that there was no ugliness or pain, and there never had had to be. It was the song of an immense deliverance."]

# CS230/outputs/Rand_117M_10000_Nabokov-All-3.txt
txt2 = ["She sat at the window of the train, sighing as she did that, but then she got up and move to the seat opposite her and sat there too, reclining. The window creaked as it swung into motion, the pane of the window half opened, and there was darkness again, and dots of light slashed across the glass as luminous streaks, once in a while.", 
        "Her leg, sculptured by the tight sheen of the stocking, its long line running straight, now stretched out to an angle, above the knee, to the tip of a foot in a high-heeled pump, had a feminine elegance that seemed out of place in the dusty train car and even more so because of the comical expression on her face.",  
        "The calves of her high-heeled shoes were bare, and her bag was full of sand, wrapped shapelessly about her slender, nervous body. She put on the shoes, and the collar turned out to be long. A sweep of brown hair fell back, touching her left shoulder.",  
        "Her forehead, cheek and all of her arms were those of a babushka, blush seeping into the spectrum of her cheeks, a sensual mouth held closed with inflexible precision. She straightened, fidgeted, brushed her bare arms with her handkerchief, and then her hand reached down, as if just barely, on her knees, for she had grabbed something and her face was quivering over a woman's body.", 
        "In the familiarity of the room, she looked up, relaxed. It was a symphony of triumph. The air was warmly flowing music, notes spoke of rising and they were the rising itself, they were the essence and the form of upward motion, and the closer they came, the more distinct and overwhelming became their conviction that if they attained.",  
        "It was a sunburst of sound, breaking out of hiding and spreading open. It had the freedom of release and the tension of purpose. It swept space clean, and immediately the air, having at once filled with a tempestuous sigh.",
        "The sonorous note of the sound grew and filled with its ominous premonition, but spoke in laughing astonishment at the discovery that there was no ugliness or pain in its every pronouncement. It was the song of an immense deliverance."]


# CS230/outputs/Rand-output-ngram.txt
txt3 = ["She sat at the light of the draw, her one thrown behind, one leg stretched against to the empty seat before her. The light state trembled with the streak of the pass, the skin hung on let dark, and dots of light smashed across the stone so one streaks, once in a while.",
        "Her foot, cut by the thin sheen of the stocking, its large print running straight, else an rounded foot, to the snap of a sum in a dear-heeled up, had a affected poetry that seemed past of time in the gray head van and oddly several with the bed of her.", 
        "She wore a battered mule's little hat that had been expensive, covered shapelessly about her little, emotional one. The face collar was formed to the garbling brim of her jacket. A reflection of do little down back, almost touching the line of her shoulders.", 
        "Her face was made of angular planes, the state of her tongue clear-except, a straight mouth new close with firm care. She kept her hand in the wash pockets, her turn taut, as if she resented standing, and offensive, so if she were dead of her own head and that it was a woman's body.", 
        "She sat ear to the music. It was a brass of skin. The notes flowed knight, they step of back and they were the rising her, they were the case and the form of upward stream, they seemed to connect every hand act and thought that had current how its motive.",  
        "It was a sunburst of road, breaking apparently of destruction and spreading distributed. It had the freedom of release and the tension of reason. It swept field plain, and pink lightweight even the joy of an open crack.",  
        "Merely a dim reply within the sounds check of that out which the arrangement had escaped, without spoke in laughing astonishment at the find that there was none ugliness or spasm, and there not had had to be. It was the lay of an gigantic issue."]

def prepare_inputs(X): 
  tokenizer.fit_on_texts(X) 
  sequences = tokenizer.texts_to_sequences(X) 
  X = pad_sequences(sequences, maxlen=maxlen, truncating="post")
  return X

X1 = prepare_inputs(txt1)
X2 = prepare_inputs(txt2)
X3 = prepare_inputs(txt3)

In [0]:
yhat1 = model.predict(X1)
yhat2 = model.predict(X2)
yhat3 = model.predict(X3)

# The Rand text looks extremely Nabokov-like for the model.
print(yhat1)
print('\n', yhat2)
print('\n', yhat3)

# We should expect a possitive number if the text style is tranferred to be more like Nabokov.
print('\n', yhat1[:, -1] - yhat2[:, -1])
print('\n', yhat1[:, -1] - yhat3[:, -1])


 [[1.9874492e-06 1.5040224e-03 9.9849403e-01]
 [6.0623637e-07 2.0041092e-05 9.9997938e-01]
 [8.2889194e-08 1.6244162e-06 9.9999833e-01]
 [8.3698222e-04 1.6383591e-04 9.9899918e-01]
 [5.3102599e-04 2.1514185e-02 9.7795480e-01]
 [6.2312851e-05 1.2646959e-04 9.9981123e-01]
 [7.2156760e-04 4.0704547e-03 9.9520797e-01]]

 [[2.08121864e-03 2.50375215e-02 9.72881198e-01]
 [2.57652991e-05 1.79159618e-03 9.98182654e-01]
 [1.22583215e-05 7.13620238e-06 9.99980569e-01]
 [5.20651869e-04 6.49833382e-05 9.99414325e-01]
 [3.17843743e-02 7.82148913e-02 8.90000761e-01]
 [9.02135980e-06 5.02097129e-04 9.99488831e-01]
 [1.75791024e-06 1.64208643e-04 9.99834061e-01]]

 [[9.77837626e-05 2.66789142e-02 9.73223329e-01]
 [1.11950875e-07 4.16828925e-06 9.99995708e-01]
 [7.34423338e-07 3.48073104e-06 9.99995828e-01]
 [6.22700536e-05 1.05061314e-04 9.99832630e-01]
 [6.21210262e-02 5.43152587e-03 9.32447493e-01]
 [1.73683584e-05 2.58376695e-05 9.99956846e-01]
 [1.91121435e-04 2.90205772e-03 9.96906817e-01]]

 [ 

In [0]:
pd.DataFrame(y_ts, columns=['Austen', 'Dumas', 'Nabokov']).to_csv(r'./gdrive/My Drive/DL/Style/donor_y.csv', index=False)
pd.DataFrame(yhat, columns=['Austen', 'Dumas', 'Nabokov']).to_csv(r'./gdrive/My Drive/DL/Style/donor_yhat.csv', index=False)

In [0]:
!pip install transformers



In [0]:
# DistilBert
# from transformers import BertTokenizer, TFBertModel
from transformers import TFDistilBertModel, DistilBertTokenizer

tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model = TFDistilBertModel.from_pretrained('distilbert-base-uncased')

100%|██████████| 363423424/363423424 [00:14<00:00, 25576232.17B/s]


NameError: ignored

In [0]:
X = df.text.astype('str')
y = df.author.astype('category')

tokenizer.tokenize(X) 
# sequences = tokenizer.texts_to_sequences(X) # list: string - numbers(indices)
# word_index = tokenizer.word_index # dict: word - number(index)

# # Furthermore, we need to pad the sequences so that their lengths are the same and do not exceed a specific maximum length.
# maxlen = 256
# X = pad_sequences(sequences, maxlen=maxlen, truncating="post")

# from sklearn.preprocessing import OneHotEncoder

# # Transform the target authors to one-hot encoding
# y = np.asarray(y)
# onehot_encoder = OneHotEncoder(sparse=False)
# encoded = y.reshape(len(y), 1)
# y = onehot_encoder.fit_transform(encoded)

# # Prepare training: Compile tf.keras model with optimizer, loss and learning rate schedule 
# optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08, clipnorm=1.0)
# loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
# metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
# model.compile(optimizer=optimizer, loss=loss, metrics=[metric])

# # Train and evaluate using tf.keras.Model.fit()
# history = model.fit(train_dataset, epochs=2, steps_per_epoch=115,
#                     validation_data=valid_dataset, validation_steps=7)

ValueError: ignored