In [1]:
import numpy as np
import pandas as pd

In [2]:
# READ DATA
movies_raw_df = pd.read_csv(r'C:\Users\Julia Chang\Downloads\archive.zip')
movies_raw_df.head()

Unnamed: 0,Release Year,Title,Origin/Ethnicity,Director,Cast,Genre,Wiki Page,Plot
0,1901,Kansas Saloon Smashers,American,Unknown,,unknown,https://en.wikipedia.org/wiki/Kansas_Saloon_Sm...,"A bartender is working at a saloon, serving dr..."
1,1901,Love by the Light of the Moon,American,Unknown,,unknown,https://en.wikipedia.org/wiki/Love_by_the_Ligh...,"The moon, painted with a smiling face hangs ov..."
2,1901,The Martyred Presidents,American,Unknown,,unknown,https://en.wikipedia.org/wiki/The_Martyred_Pre...,"The film, just over a minute long, is composed..."
3,1901,"Terrible Teddy, the Grizzly King",American,Unknown,,unknown,"https://en.wikipedia.org/wiki/Terrible_Teddy,_...",Lasting just 61 seconds and consisting of two ...
4,1902,Jack and the Beanstalk,American,"George S. Fleming, Edwin S. Porter",,unknown,https://en.wikipedia.org/wiki/Jack_and_the_Bea...,The earliest known adaptation of the classic f...


In [3]:
# FILTER DATA
movies_to_select = ((movies_raw_df['Genre'] == 'horror') &
                   # Restrict to American movies.
                   (movies_raw_df['Origin/Ethnicity'] == 'American') &
                   # Only movies from 2000.
                   (movies_raw_df['Release Year'] > 1999))

In [4]:
horror_df = movies_raw_df[movies_to_select]['Plot']
horror_df.head()

13617    In November 1999, tourists and fans of The Bla...
13640    Matthew Van Helsing, the alleged descendant of...
13681    A small group of fervent Roman Catholics belie...
13731    Cotton Weary, now living in Los Angeles and th...
13763    Amy Mayfield, a student at a prestigious film ...
Name: Plot, dtype: object

In [5]:
horror_df.shape

(260,)

In [6]:
# PREPROCESS DATA
# Join all plots into a string.
horror_str = horror_df.str.cat(sep=' ')

In [7]:
import spacy

# Load language model.
nlp = spacy.load('en_core_web_sm', disable = ['parser', 'tagger', 'ner'])

# Extract tokens (words).
def get_tokens(doc_text):
    # This pattern is a modification of the defaul filter from the
    # Tokenizer() object in keras.preprocessing.text. 
    # It just indicates which patters no skip.
    skip_pattern = '\r\n \n\n \n\n\n!"-#$%&()--.*+,-./:;<=>?@[\\]^_`{|}~\t\n\r '
    
    tokens = [token.text.lower() for token in nlp(doc_text) if token.text not in skip_pattern]
    
    return tokens

# Get tokens.
tokens = get_tokens(horror_str)
# Let us see the first tokens.
tokens[0:9]



['in', 'november', '1999', 'tourists', 'and', 'fans', 'of', 'the', 'blair']

In [8]:
len(tokens)

165891

In [9]:
# FEATURE EXTRACTION
len_0 = 25
tokens[0:len_0] # features
tokens[len_0:len_0+1] # target

['resident']

In [10]:
# Generate sequences.
train_len = len_0 + 1
text_sequences = []
for i in range(train_len, len(tokens)):
    # Construct sequence.
    seq = tokens[i - train_len: i]
    # Append.
    text_sequences.append(seq)

In [11]:
# First five sequences
for i in range(0,5):
    print(' '.join(text_sequences[i]))
    print('-----')

in november 1999 tourists and fans of the blair witch project descend on the small town of burkittsville maryland where the film is set local resident
-----
november 1999 tourists and fans of the blair witch project descend on the small town of burkittsville maryland where the film is set local resident jeff
-----
1999 tourists and fans of the blair witch project descend on the small town of burkittsville maryland where the film is set local resident jeff a
-----
tourists and fans of the blair witch project descend on the small town of burkittsville maryland where the film is set local resident jeff a former
-----
and fans of the blair witch project descend on the small town of burkittsville maryland where the film is set local resident jeff a former psychiatric
-----


In [12]:
from tensorflow.keras.preprocessing.text import Tokenizer

In [13]:
# VECTORIZATION
# Encode character sequences as numerical features
tokenizer = Tokenizer()
tokenizer.fit_on_texts(text_sequences) # creates dictionary with numbers
# Get numeric sequences
sequences = tokenizer.texts_to_sequences(text_sequences) # each sentence uses a dictionary so there's a list of numbers
sequences[0]

[8,
 12576,
 12575,
 2397,
 2,
 12574,
 5,
 1,
 5560,
 630,
 2195,
 2932,
 20,
 1,
 452,
 157,
 5,
 12573,
 7486,
 42,
 1,
 117,
 7,
 362,
 231,
 2933]

In [14]:
vocabulary_size = len(tokenizer.word_counts) # vocabulary size = # unique tokens
sequences = np.array(sequences) # store sequences in a numpy array

In [15]:
# X-Y SPLIT
from tensorflow.keras.utils import to_categorical
X = sequences[:,:-1] # take all the rows, take all the columns except the last one
X

array([[    8, 12576, 12575, ...,     7,   362,   231],
       [12576, 12575,  2397, ...,   362,   231,  2933],
       [12575,  2397,     2, ...,   231,  2933,   297],
       ...,
       [   20,     4,  1551, ...,    22,     1,    59],
       [    4,  1551,  1684, ...,     1,    59,     5],
       [ 1551,  1684,    22, ...,    59,     5,     6]])

In [16]:
X.shape

(165865, 25)

In [17]:
seq_len = X.shape[1]
y = sequences[:,-1] # select all last row indices
y

array([2933,  297,    4, ...,    5,    6,  169])

In [18]:
# Convert to categorical (we add +1 because Keras needs a placeholder.)
y = to_categorical(y, num_classes=(vocabulary_size + 1))
y

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [26]:
# MODEL DEFINITION
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Embedding

def create_model(vocabulary_size, seq_len):
    model = Sequential()
    model.add(Embedding(input_dim=vocabulary_size,
                       output_dim=seq_len))
    model.add(LSTM(units=50, return_sequences=True))
    model.add(LSTM(units=50))
    model.add(Dense(units=50, activation='relu'))
    model.add(Dense(units=vocabulary_size, activation='softmax'))
    
    model.compile(loss='categorical_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])
    
    # Build the model by providing an input shape
    model.build(input_shape=(None, seq_len)) # model needed to be explicitly built before calling model.summary()
    
    model.summary()
    
    return model

In [27]:
import tensorflow as tf
print(tf.__version__)

2.17.0


In [30]:
# Create model and see summary
model = create_model(vocabulary_size+1, seq_len)

In [31]:
# MODEL FIT
model.fit(x=X, y=y, batch_size=128, epochs=10, verbose=1)
# get model metrics.
loss, accuracy = model.evaluate(x=X, y=y)
print(f'Loss: {loss}\nAccuracy: {accuracy}')

Epoch 1/10
[1m1296/1296[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m56s[0m 39ms/step - accuracy: 0.0647 - loss: 7.2953
Epoch 2/10
[1m1296/1296[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m48s[0m 37ms/step - accuracy: 0.0771 - loss: 6.5872
Epoch 3/10
[1m1296/1296[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m46s[0m 35ms/step - accuracy: 0.0929 - loss: 6.3643
Epoch 4/10
[1m1296/1296[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m52s[0m 40ms/step - accuracy: 0.1156 - loss: 6.1360
Epoch 5/10
[1m1296/1296[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m45s[0m 35ms/step - accuracy: 0.1233 - loss: 5.9537
Epoch 6/10
[1m1296/1296[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m48s[0m 37ms/step - accuracy: 0.1276 - loss: 5.8355
Epoch 7/10
[1m1296/1296[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m51s[0m 39ms/step - accuracy: 0.1348 - loss: 5.7235
Epoch 8/10
[1m1296/1296[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m47s[0m 36ms/step - accuracy: 0.1385 - loss: 5.6290
Epoch 9/

In [32]:
# Model Fit 2
model.fit(x=X, y=y, batch_size=200, epochs=10, verbose=1)
loss, accuracy = model.evaluate(x=X, y=y)
print(f'Loss: {loss}\nAccuracy: {accuracy}')

Epoch 1/10
[1m830/830[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 50ms/step - accuracy: 0.1506 - loss: 5.3500
Epoch 2/10
[1m830/830[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m35s[0m 42ms/step - accuracy: 0.1519 - loss: 5.3052
Epoch 3/10
[1m830/830[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 46ms/step - accuracy: 0.1534 - loss: 5.2540
Epoch 4/10
[1m830/830[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 48ms/step - accuracy: 0.1572 - loss: 5.1929
Epoch 5/10
[1m830/830[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m38s[0m 45ms/step - accuracy: 0.1581 - loss: 5.1482
Epoch 6/10
[1m830/830[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m44s[0m 53ms/step - accuracy: 0.1595 - loss: 5.0935
Epoch 7/10
[1m830/830[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m43s[0m 52ms/step - accuracy: 0.1638 - loss: 5.0409
Epoch 8/10
[1m830/830[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 48ms/step - accuracy: 0.1627 - loss: 4.9971
Epoch 9/10
[1m830/830[

In [33]:
# Model Fit 3
model.fit(x=X, y=y, batch_size=500, epochs=10, verbose=1)
loss, accuracy = model.evaluate(x=X, y=y)
print(f'Loss: {loss}\nAccuracy: {accuracy}')

Epoch 1/10
[1m332/332[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 117ms/step - accuracy: 0.1733 - loss: 4.8144
Epoch 2/10
[1m332/332[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m38s[0m 113ms/step - accuracy: 0.1755 - loss: 4.7853
Epoch 3/10
[1m332/332[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 128ms/step - accuracy: 0.1782 - loss: 4.7605
Epoch 4/10
[1m332/332[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m44s[0m 133ms/step - accuracy: 0.1770 - loss: 4.7527
Epoch 5/10
[1m332/332[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m43s[0m 129ms/step - accuracy: 0.1773 - loss: 4.7329
Epoch 6/10
[1m332/332[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m37s[0m 112ms/step - accuracy: 0.1791 - loss: 4.7219
Epoch 7/10
[1m332/332[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m48s[0m 143ms/step - accuracy: 0.1803 - loss: 4.6896
Epoch 8/10
[1m332/332[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m46s[0m 138ms/step - accuracy: 0.1827 - loss: 4.6734
Epoch 9/10
[1m3

In [36]:
# Model Fit 4
model.fit(x=X, y=y, batch_size=500, epochs=100, verbose=1)
loss, accuracy = model.evaluate(x=X, y=y)
print(f'Loss: {loss}\nAccuracy: {accuracy}')

Epoch 1/100
[1m332/332[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 73ms/step - accuracy: 0.2008 - loss: 4.4791
Epoch 2/100
[1m332/332[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 73ms/step - accuracy: 0.1982 - loss: 4.4756
Epoch 3/100
[1m332/332[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 73ms/step - accuracy: 0.2034 - loss: 4.4381
Epoch 4/100
[1m332/332[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 74ms/step - accuracy: 0.2067 - loss: 4.4027
Epoch 5/100
[1m332/332[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 74ms/step - accuracy: 0.2067 - loss: 4.3867
Epoch 6/100
[1m332/332[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 74ms/step - accuracy: 0.2115 - loss: 4.3637
Epoch 7/100
[1m332/332[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 74ms/step - accuracy: 0.2124 - loss: 4.3486
Epoch 8/100
[1m332/332[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 73ms/step - accuracy: 0.2133 - loss: 4.3335
Epoch 9/100
[1m

In [37]:
# Model Fit 5
model.fit(x=X, y=y, batch_size=500, epochs=700, verbose=1)
loss, accuracy = model.evaluate(x=X, y=y)
print(f'Loss: {loss}\nAccuracy: {accuracy}')

Epoch 1/700
[1m332/332[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 73ms/step - accuracy: 0.3287 - loss: 3.3711
Epoch 2/700
[1m332/332[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 75ms/step - accuracy: 0.3288 - loss: 3.3751
Epoch 3/700
[1m332/332[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 73ms/step - accuracy: 0.3292 - loss: 3.3682
Epoch 4/700
[1m332/332[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 73ms/step - accuracy: 0.3320 - loss: 3.3529
Epoch 5/700
[1m332/332[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 73ms/step - accuracy: 0.3333 - loss: 3.3457
Epoch 6/700
[1m332/332[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 74ms/step - accuracy: 0.3335 - loss: 3.3401
Epoch 7/700
[1m332/332[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 74ms/step - accuracy: 0.3366 - loss: 3.3255
Epoch 8/700
[1m332/332[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 74ms/step - accuracy: 0.3354 - loss: 3.3271
Epoch 9/700
[1m

[1m332/332[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m31s[0m 93ms/step - accuracy: 0.4219 - loss: 2.7622
Epoch 133/700
[1m332/332[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 90ms/step - accuracy: 0.4207 - loss: 2.7688
Epoch 134/700
[1m332/332[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m35s[0m 106ms/step - accuracy: 0.4242 - loss: 2.7589
Epoch 135/700
[1m332/332[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m31s[0m 93ms/step - accuracy: 0.4197 - loss: 2.7727
Epoch 136/700
[1m332/332[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m37s[0m 111ms/step - accuracy: 0.4245 - loss: 2.7482
Epoch 137/700
[1m332/332[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 84ms/step - accuracy: 0.4235 - loss: 2.7535
Epoch 138/700
[1m332/332[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 85ms/step - accuracy: 0.4255 - loss: 2.7400
Epoch 139/700
[1m332/332[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 85ms/step - accuracy: 0.4256 - loss: 2.7398
Epoch 140/70

[1m332/332[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m71s[0m 212ms/step - accuracy: 0.4686 - loss: 2.4716
Epoch 263/700
[1m332/332[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m63s[0m 189ms/step - accuracy: 0.4710 - loss: 2.4626
Epoch 264/700
[1m332/332[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m64s[0m 191ms/step - accuracy: 0.4755 - loss: 2.4346
Epoch 265/700
[1m332/332[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m60s[0m 180ms/step - accuracy: 0.4753 - loss: 2.4406
Epoch 266/700
[1m332/332[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m50s[0m 150ms/step - accuracy: 0.4766 - loss: 2.4369
Epoch 267/700
[1m332/332[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m54s[0m 162ms/step - accuracy: 0.4716 - loss: 2.4475
Epoch 268/700
[1m332/332[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m60s[0m 179ms/step - accuracy: 0.4749 - loss: 2.4349
Epoch 269/700
[1m332/332[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m59s[0m 175ms/step - accuracy: 0.4746 - loss: 2.4375
Epoch 

[1m332/332[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 118ms/step - accuracy: 0.5028 - loss: 2.2772
Epoch 393/700
[1m332/332[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 120ms/step - accuracy: 0.5010 - loss: 2.2885
Epoch 394/700
[1m332/332[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 118ms/step - accuracy: 0.5006 - loss: 2.2892
Epoch 395/700
[1m332/332[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 118ms/step - accuracy: 0.5074 - loss: 2.2646
Epoch 396/700
[1m332/332[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 118ms/step - accuracy: 0.5078 - loss: 2.2655
Epoch 397/700
[1m332/332[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 120ms/step - accuracy: 0.5080 - loss: 2.2601
Epoch 398/700
[1m332/332[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 119ms/step - accuracy: 0.5074 - loss: 2.2600
Epoch 399/700
[1m332/332[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 118ms/step - accuracy: 0.5081 - loss: 2.2547
Epoch 

[1m332/332[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m35s[0m 105ms/step - accuracy: 0.5272 - loss: 2.1470
Epoch 523/700
[1m332/332[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m35s[0m 106ms/step - accuracy: 0.5263 - loss: 2.1588
Epoch 524/700
[1m332/332[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m38s[0m 114ms/step - accuracy: 0.5255 - loss: 2.1567
Epoch 525/700
[1m332/332[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m38s[0m 115ms/step - accuracy: 0.5253 - loss: 2.1545
Epoch 526/700
[1m332/332[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m36s[0m 109ms/step - accuracy: 0.5286 - loss: 2.1455
Epoch 527/700
[1m332/332[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m35s[0m 104ms/step - accuracy: 0.5291 - loss: 2.1426
Epoch 528/700
[1m332/332[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m36s[0m 107ms/step - accuracy: 0.5258 - loss: 2.1559
Epoch 529/700
[1m332/332[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m37s[0m 111ms/step - accuracy: 0.5294 - loss: 2.1389
Epoch 

[1m332/332[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m51s[0m 152ms/step - accuracy: 0.5467 - loss: 2.0489
Epoch 653/700
[1m332/332[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m45s[0m 136ms/step - accuracy: 0.5482 - loss: 2.0450
Epoch 654/700
[1m332/332[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 123ms/step - accuracy: 0.5497 - loss: 2.0354
Epoch 655/700
[1m332/332[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m43s[0m 128ms/step - accuracy: 0.5451 - loss: 2.0535
Epoch 656/700
[1m332/332[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m48s[0m 143ms/step - accuracy: 0.5363 - loss: 2.0976
Epoch 657/700
[1m332/332[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m43s[0m 129ms/step - accuracy: 0.5398 - loss: 2.0728
Epoch 658/700
[1m332/332[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m44s[0m 133ms/step - accuracy: 0.5418 - loss: 2.0702
Epoch 659/700
[1m332/332[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 123ms/step - accuracy: 0.5437 - loss: 2.0602
Epoch 

In [38]:
# SAVE MODEL
from pickle import dump
dump(tokenizer, open('tokenizer','wb'))
model.save('model.h5')



In [40]:
from tensorflow.keras.models import load_model
model = load_model('model.h5') # reloading the model



In [53]:
# GENERATE NEW TEXT
from keras.preprocessing.sequence import pad_sequences
def generate_text(model, tokenizer, seq_len, seed_text, num_gen_words):
    # Set seed_text as input_text.
    input_text = seed_text
    output_text = input_text.split() # initialize output_text with the seed_text split into words
    
    for i in range(num_gen_words):
        # Encode input text.
        encoded_text = tokenizer.texts_to_sequences([input_text])[0]
        pad_encoded = pad_sequences([encoded_text], maxlen=seq_len, truncating='pre')
        pred_probs = model.predict(pad_encoded, verbose=0)
        pred_word_ind = np.argmax(pred_probs, axis=-1)[0]
        pred_word = tokenizer.index_word[pred_word_ind]
        input_text += ' ' + pred_word
        output_text.append(pred_word)
    return ' '.join(output_text)

In [50]:
# EXAMPLE 1
sample_text = horror_df.iloc[100][:383]
print(sample_text)

Officer Frank Williams (Steven Vidler) and his partner Blaine investigate an abandoned house, where they find a young woman with her eyes ripped out. A large figure with an axe then murders Blaine and Frank has his arm chopped off before he is able to shoot the attacker in the head. Afterwards, detectives find seven bodies in the house, all of which have had their eyes ripped out.


In [51]:
seed_text = sample_text[:190]
print(seed_text)

Officer Frank Williams (Steven Vidler) and his partner Blaine investigate an abandoned house, where they find a young woman with her eyes ripped out. A large figure with an axe then murders 


In [55]:
generated_text = generate_text(model, tokenizer, seq_len, seed_text, 40)
print(seed_text + ' ' + generated_text + '...')

Officer Frank Williams (Steven Vidler) and his partner Blaine investigate an abandoned house, where they find a young woman with her eyes ripped out. A large figure with an axe then murders  Officer Frank Williams (Steven Vidler) and his partner Blaine investigate an abandoned house, where they find a young woman with her eyes ripped out. A large figure with an axe then murders blaine and frank is expelled and to move back to the hospital but is unable to handle the cellphone chips and notices a campsite and runs to talk to the bathroom and destroys his restraints as being disarmed doing with...


In [56]:
# EXAMPLE 2
seed_text2 = 'the film starts in a dark house where a group of teenagers friends meet to spend the weekend when they suddenly hear'

In [57]:
generated_text2 = generate_text(model, tokenizer, seq_len, seed_text, 80)
print(seed_text2 + ' ' + generated_text2 + '...')

the film starts in a dark house where a group of teenagers friends meet to spend the weekend when they suddenly hear Officer Frank Williams (Steven Vidler) and his partner Blaine investigate an abandoned house, where they find a young woman with her eyes ripped out. A large figure with an axe then murders blaine and frank is expelled and to move back to the hospital but is unable to handle the cellphone chips and notices a campsite and runs to talk to the bathroom and destroys his restraints as being disarmed doing with a way to accept that bession was driven drugs by the cabin but survives the truth about foot as a miscarriage and the group are cut the gun and the two militia members are seen to the ground in the...


In [59]:
# EXAMPLE 3
seed_text3 = movies_raw_df[movies_raw_df['Genre'] == 'comedy']['Plot'].iloc[330]
print(seed_text3)

Cocky college football star Francis Finnegan has his eye on the attractive Gloria van Dayham, as does his rival, Larry Stacey.
Francis gets a job in a department store owned by Stacey's father, where salesgirl June Cort develops an attraction to him. Finnegan proposes that Stacey's store sponsor a football team, which causes rival shop owner Whimple to do likewise. The team's head cheerleader, Mimi, falls for team mascot Joe, meanwhile, and everybody pairs off with the perfect partner after the big game.


In [60]:
generated_text3 = generate_text(model, tokenizer, seq_len, seed_text3, 90)
print(seed_text3 + ' ' + generated_text3 + '...')

Cocky college football star Francis Finnegan has his eye on the attractive Gloria van Dayham, as does his rival, Larry Stacey.
Francis gets a job in a department store owned by Stacey's father, where salesgirl June Cort develops an attraction to him. Finnegan proposes that Stacey's store sponsor a football team, which causes rival shop owner Whimple to do likewise. The team's head cheerleader, Mimi, falls for team mascot Joe, meanwhile, and everybody pairs off with the perfect partner after the big game. Cocky college football star Francis Finnegan has his eye on the attractive Gloria van Dayham, as does his rival, Larry Stacey. Francis gets a job in a department store owned by Stacey's father, where salesgirl June Cort develops an attraction to him. Finnegan proposes that Stacey's store sponsor a football team, which causes rival shop owner Whimple to do likewise. The team's head cheerleader, Mimi, falls for team mascot Joe, meanwhile, and everybody pairs off with the perfect partner

In [62]:
# ADDING TEMPERATURE PARAMETER

def generate_text_temp(model, tokenizer, seq_len, seed_text, num_gen_words, temperature):
    output_text = []
    input_text = seed_text
    for i in range(num_gen_words):
        # Encode input text.
        encoded_text = tokenizer.texts_to_sequences([input_text])[0]
        # Add if the input text does not have length len_0.
        pad_encoded = pad_sequences([encoded_text], maxlen=seq_len, truncating='pre')
        # Get learned distribution.
        pred_distribution = model.predict(pad_encoded, verbose=0)[0]
        
        # Apply temperature transformation.
        new_pred_distribution = np.power(pred_distribution, (1/temperature))
        new_pred_distribution = new_pred_distribution / new_pred_distribution.sum()
        
        # Sample from modified distribution.
        choices = range(new_pred_distribution.size)
        
        pred_word_ind = np.random.choice(a=choices, p=new_pred_distribution)
        
        # Convert from numeric to word.
        pred_word = tokenizer.index_word[pred_word_ind]
        # Attach predicted word.
        input_text += ' ' + pred_word
        # Append new word to the list.
        output_text.append(pred_word)
        
    return ' '.join(output_text)        

In [64]:
generated_text4 = generate_text_temp(model, tokenizer, seq_len, seed_text2, 82, 0.9)
print(seed_text2 + ' ' + generated_text4 + '...')

the film starts in a dark house where a group of teenagers friends meet to spend the weekend when they suddenly hear frankie who once she had been discharged revealing the officers pick winston who raised when mathias uses david 's fingers into a blood of killing their bodies while grady rolls henry she ordered to suffer from didi destroying bed of his wrist supposed on newly confused and 1953 ziegler out of a miscarriage and attacked he her own fault ashley crenna and candles is responsible and lived attacking the mistress 's key into the living family but she is murdered he and...


In [67]:
generated_text5 = generate_text_temp(model, tokenizer, seq_len, seed_text2, 82, 0.5)
print(seed_text2 + ' ' + generated_text5 + '...')

the film starts in a dark house where a group of teenagers friends meet to spend the weekend when they suddenly hear frankie who are made by the monster are attracting bends in washington sets progresses the bye rage but jumps heavily ends and the curse is unable to deny 100 jake appears to battle as their torsos as she slowly stops princess reaper and destroy the firefly residence as a protective measure from the infirmary wydell spent exasperated commits but gone and interrogated with the stairs to cooperate to get out to rescue lisa 's crafted bullets eleanor pauline wakes through hades africa...


In [68]:
generated_text6 = generate_text_temp(model, tokenizer, seq_len, seed_text2, 82, 0.1)
print(seed_text2 + ' ' + generated_text6 + '...')

the film starts in a dark house where a group of teenagers friends meet to spend the weekend when they suddenly hear frankie who are made by the fbi seven test is heard in a wheelchair while finishing markus clippings deer of the nanny and is maimed by smoking out of the advice kreeg finds the burnt of then buds filled with used at 1408 to 18 unhinged jason in a patrol officer in the hospital where she is strapped to the car stock manley her wound in the insistence mental monster reports the paint alive wants to visit carl scales the group saying...
