<a href="https://colab.research.google.com/github/dungwoong/NN/blob/main/EminemRNN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# starter code from ageron/handson-ml2

# Python ≥3.5 is required
import sys
assert sys.version_info >= (3, 5)

# Is this notebook running on Colab or Kaggle?
IS_COLAB = "google.colab" in sys.modules
IS_KAGGLE = "kaggle_secrets" in sys.modules

if IS_COLAB:
    %pip install -q -U tensorflow-addons
    %pip install -q -U transformers

# Scikit-Learn ≥0.20 is required
import sklearn
assert sklearn.__version__ >= "0.20"

# TensorFlow ≥2.0 is required
import tensorflow as tf
from tensorflow import keras
assert tf.__version__ >= "2.0"

if not tf.config.list_physical_devices('GPU'):
    print("No GPU was detected. LSTMs and CNNs can be very slow without a GPU.")
    if IS_COLAB:
        print("Go to Runtime > Change runtime and select a GPU hardware accelerator.")
    if IS_KAGGLE:
        print("Go to Settings > Accelerator and select GPU.")

# Common imports
import numpy as np
import os

# to make this notebook's output stable across runs
np.random.seed(42)
tf.random.set_seed(42)

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

# Where to save the figures
PROJECT_ROOT_DIR = "."
CHAPTER_ID = "nlp"
IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, "images", CHAPTER_ID)
os.makedirs(IMAGES_PATH, exist_ok=True)

def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    path = os.path.join(IMAGES_PATH, fig_id + "." + fig_extension)
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(path, format=fig_extension, dpi=resolution)

[K     |████████████████████████████████| 1.1 MB 8.4 MB/s 
[K     |████████████████████████████████| 4.2 MB 7.4 MB/s 
[K     |████████████████████████████████| 596 kB 10.1 MB/s 
[K     |████████████████████████████████| 86 kB 4.9 MB/s 
[K     |████████████████████████████████| 6.6 MB 5.8 MB/s 
[?25h

In [2]:
eminem_url = "https://raw.githubusercontent.com/dungwoong/NN/main/eminem/MASTER.txt"
filepath = keras.utils.get_file("eminem.txt", eminem_url)
with open(filepath) as f:
  eminem_text = f.read()

Downloading data from https://raw.githubusercontent.com/dungwoong/NN/main/eminem/MASTER.txt


We have to deal with \u2005, \u200b, \u2060, 'á', 'ó', 'е'

In [3]:
list_of_bad_chars = ['\u2005', '\u200b', '\u2060', 'á', 'ó', 'е']
for char in list_of_bad_chars:
  print(char, "---------------")
  idx = eminem_text.find(char)
  print(eminem_text[idx-5:idx], "HERE:", eminem_text[idx:idx+5])
  print("------------")

  ---------------
g
But HERE:  we'r
------------
​ ---------------
.A.T. HERE: ​
Her
------------
⁠ ---------------
etter HERE: ⁠—*gu
------------
á ---------------
up)
C HERE: állat
------------
ó ---------------
, adi HERE: ós
I 
------------
е ---------------
ut ev HERE: еr si
------------


In [4]:
# ok so I think first one is a space, second is a \n, third one is a space...?
eminem_text = eminem_text.replace('\u2005', ' ')
eminem_text = eminem_text.replace('\u200b', '\n')
eminem_text = eminem_text.replace('\u2060', ' ')
eminem_text = eminem_text.replace('á', 'a')
eminem_text = eminem_text.replace('ó', 'o')
eminem_text = eminem_text.replace('е', 'e')
eminem_text = eminem_text.replace('’', "'")

# Tokenization

In [5]:
tokenizer = keras.preprocessing.text.Tokenizer(char_level=True)
tokenizer.fit_on_texts(eminem_text)

In [6]:
print(tokenizer.texts_to_sequences(["First", "Second"]))
print(tokenizer.sequences_to_texts([[25, 4, 10, 9, 3], [9, 2, 17, 6, 7, 14]]))

[[25, 4, 10, 9, 3], [9, 2, 17, 6, 7, 14]]
['f i r s t', 's e c o n d']


In [7]:
max_id = len(tokenizer.word_index) # number of distinct characters
dataset_size = tokenizer.document_count # total number of characters

In [8]:
print(max_id, dataset_size)

57 96447


In [9]:
complete_text = tokenizer.sequences_to_texts([np.arange(1, max_id, step=1)])
print(sorted(complete_text[0].split(" ")))

['', '', '\n', '!', '"', '$', '&', "'", '(', ')', '*', ',', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '?', '[', ']', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '—']


shakespeare dataset size was 1115394 so it was legit 100 times larger

In [10]:
# note that u put in list, cuz there's a list for documents, but we'll only have 1 document
[encoded] = np.array(tokenizer.texts_to_sequences([eminem_text])) - 1

# Stateful RNNs

I forgot my shakespeare model was tokenized differently than the eminem one so I can't do transfer learning :((((

This is how you'd do it tho

```weights = old_model_layer.get_weights()```

```new_model_layer.set_weights(weights)```

In [11]:
train_size = dataset_size * 90 // 100
batch_size = 1
n_steps = 100
window_length = n_steps + 1

In [12]:
dataset = tf.data.Dataset.from_tensor_slices(encoded[:train_size])
dataset = dataset.window(window_length, shift=n_steps, drop_remainder=True)
dataset = dataset.flat_map(lambda window: window.batch(window_length))
dataset = dataset.batch(1)
dataset = dataset.map(lambda windows: (windows[:, :-1], windows[:, 1:]))
dataset = dataset.map(
    lambda X_batch, Y_batch: (tf.one_hot(X_batch, depth=max_id), Y_batch))
dataset = dataset.prefetch(1)

In [13]:
# if you put recurrent dropout you can't use GPU
model = keras.models.Sequential([
                                 keras.layers.GRU(128, return_sequences=True, stateful=True, dropout=0.2, batch_input_shape=[batch_size, None, max_id]),
                                 keras.layers.GRU(128, return_sequences=True, stateful=True, dropout=0.2),
                                 keras.layers.TimeDistributed(keras.layers.Dense(max_id, activation="softmax"))
])

class ResetStatesCallback(keras.callbacks.Callback):
  def on_epoch_begin(self, epoch, logs):
    self.model.reset_states()

cb_checkpoint = keras.callbacks.ModelCheckpoint('model.h5')

In [14]:
model.compile(loss="sparse_categorical_crossentropy", optimizer="adam")
history = model.fit(dataset, epochs=50, callbacks=[ResetStatesCallback(), cb_checkpoint])

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [15]:
def preprocess(texts):
  X = np.array(tokenizer.texts_to_sequences(texts)) - 1
  return tf.one_hot(X, max_id)

In [16]:
X_new = preprocess(["How are yo"])
#Y_pred = model.predict_classes(X_new)
Y_pred = np.argmax(model(X_new), axis=-1)
tokenizer.sequences_to_texts(Y_pred + 1)[0][-1] # 1st sentence, last char

'u'

WTF IS DIS BRAH

In [17]:
def next_char(text, temperature=1):
    X_new = preprocess([text])
    y_proba = model(X_new)[0, -1:, :] # last set of probabilities
    rescaled_logits = tf.math.log(y_proba) / temperature
    char_id = tf.random.categorical(rescaled_logits, num_samples=1) + 1
    return tokenizer.sequences_to_texts(char_id.numpy())[0]

# generates text I think
def complete_text(text, n_chars=50, temperature=1):
    for _ in range(n_chars):
        text += next_char(text, temperature)
    return text

In [34]:
for temperature in [0.2, 0.4, 0.6, 0.8, 1, 2]:
  print("TEMPERATURE:", temperature, "--------------")
  print(complete_text("t", n_chars=400, temperature=temperature))

TEMPERATURE: 0.2 --------------
t it in the motherfuckin' been (woo!)
that's how much we have in common (woo!)
but i got a stop and i made in common (yah!)
that's how much we have in common (woo!)
i said, "i don't know is everyone
fack, fack on everyone
fack, fack on everyone
fack, fack on everyone
fack, fack on everyone
fack, fack on everyone
fack, fack on everyone
fack, fack on everyone
fack, fack on everyone
fack, fack on ever
TEMPERATURE: 0.4 --------------
ty in one (yeah)
i'm the tayin' the fuck to the fuck out with no stop (yeah)

[chorus]
the coupless i got a fuckin' deck to stuck into everyone
fack, fack on everyone
fack, fack on everyone
fack, fack on everyone
fack, fack on everyone
fack, fack on everyone
fack, fack on everyone
fack, fack on everyone
fack, fack on everyone
fack, fack on everyone
fack, fack on everyone
fack, fack, fack on everyon
TEMPERATURE: 0.6 --------------
ter, i had a suckin' ind it on the record black in the moct the scause i was nothin' through how the

Fack on everyone is the chorus of Kamikaze. Definitely need better processing. However, the input data is really small, and it makes sense that choruses are showing up(because they have such a high probability of occurring compared to everything else)

If I set temperature too high then I get random stuff and it's not even words.

In [41]:
print(complete_text("t", n_chars=800, temperature=0.6))

traver i have in common (woo!)
i'm a fuckin' bijaes (yeah)

[chorus]
now your anders, but i'm gone wond to spit the back of the corner
treats in a strayg when i was a better fuckin' dictic
but in the windows the some hit
to greaters to me and down
cheat to go f-oe and i been gonit to cymborsic, i'm fuckin' weing
i have no stuck of anyrody cangs and i ain't got your i find record
but i'm a tige cugs out and some moon and i've not to be coles
but like i'm the more and the treater back to mure to the world you only the trippin' like the coater
and i'm a side a motherfuckin' been in a digher called and heart and the controin (woo!)
that's how much we have in common (what?)
i'm not snap (i'm through ind the controm me me way the motherfuckers
in a couple of the letter une and i say, "damn, chask


In [40]:
print(complete_text("t", n_chars=1200, temperature=0.7))

the word (yeah!)
no some money out it a punk (fack)
i'm on a right are at reap you
shreaked the kimikaze, gonna
shoulda might and a vock lyating to pen site right's endin' is one
the wellin' a sale on the care is up like a pillar rips
i can't gotta tool and you with a windred here in common (i'm a domp)
may, then i got a fuckin' dezike into everyone
fack, fack on everyone
fack, fidd 'cause i got a scourply for me one bagk
at might and it's colia then i'm a (yeah!)
that's how much we have in common (woo!)
now you wrote it and i'm s uncompon'ly these call you slawly
i hell the bules like a concome, real every time i'm gonna
god me turn in common (yeah)
i'm better be go night the sleefin'
like a lough the decks abrun chome
i don't even got a greated 'em undered up on my norgligais
so you frockin' like i rush me
want i wanna time up on the want rimb bulated up like a brog, hope in my tool, i know as explobumb on the plan (what?)
i been gonna botton a punk and she's a becks
i can sand in my

In [44]:
print(complete_text("t", n_chars=300, temperature=0.5))

ter in the crade in common (woo!!)
but i don't kame and i told me for the moner
i'm a killer but if you can't get for me on your hundred in the recorts
i heard the one the conce from her scints a could straught off of me with me rope in the becked in the wand to put the track in the dect
i'm fuckin' 


In [45]:
print(complete_text("t", n_chars=300, temperature=0.2))

tracked in the motherfuckin' record be the controls
i don't take a tropped in the controls and i say i say i'm a (what?)
i don't know what i stick in the mic (yeah)
that's how much we have in common (woa!)
i feel like a fuckin' dick and i'm a kamikaze, kamikaze, kamikaze (kamikaze, kamikaze, kamikaze
