In [4]:
import tensorflow as tf
import string
import requests
import pandas as pd
import os
import numpy as np
import matplotlib.pyplot as plt
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Embedding, Dropout, Conv1D, MaxPooling1D
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import load_model

In [5]:
import re

url_pat = re.compile(r'(http\S+|www\.\S+)', flags=re.IGNORECASE)
num_pat = re.compile(r'\d+')

def remove_urls_nums(text):
    text = num_pat.sub('', text)       # remove numbers
    text = url_pat.sub('', text)       # remove URLs
    text = re.sub(r'[^A-Za-z0-9\s]', '', text) # remove all non-alphabet or numeric values
    return text.strip()

In [6]:
import os

def split_file(input, output, max_size = 6 * 1024**2):
    os.makedirs(output, exist_ok=True)
    base = os.path.basename(input)
    part = 0
    current_size = 0
    out = None

    with open(input, "rb") as f:
        for line in f:
            if out is None or current_size + len(line) > max_size:
                if out:
                    out.close()
                part += 1
                out_path = os.path.join(output, f"chunk.part{part:d}.txt")
                out = open(out_path, "wb")
                current_size = 0
            out.write(line)
            current_size += len(line)
    if out:
        print(f"Split {input} into {part} parts")
        out.close()
    return part

In [4]:
#split_troll = split_file("combined_gen.txt", "chunks_gen", max_size = 6 * 1024**2 ) 

In [7]:
with open("chunk.part19.txt", "r", encoding = "utf-8") as t:
    text = t.read()
    
data = text.splitlines()
data = [remove_urls_nums(line) for line in data]
data = [dt for dt in data if dt.strip() != ""]

In [29]:
print(data[:900])

['Baltimorebased', 'division', 'of', 'All', 'Inventions', 'of', 'a', 'Generation', 'Think', 'about', 'what', 'you', 'do', 'in', 'the', 'morning', 'every', 'procedure', 'you', 'take', 'part', 'in', 'includes', 'an', 'Larger', 'role', 'for', 'Chinese', 'Indian', 'inventors', 'in', 'US', 'This', 'is', 'brought', 'out', 'in', 'a', 'new', 'study', 'published', 'by', 'the', 'Natio', 'No', 'snow', 'job', 'For', 'serial', 'inventor', 'Arra', 'David', 'the', 'product', 'category', 'was', 'ripe', 'for', 'a', 'makeover', 'He', 'invented', 'the', 'Rebo', 'City', 'firm', 'sues', 'gaming', 'giants', 'Eleven', 'contends', 'the', 'wireless', 'controllers', 'for', 'the', 'Nintendo', 'Wii', 'Sony', 'Playstation', 'Four', 'Shows', 'in', 'the', 'Next', 'Two', 'Weeks', 'Sure', 'I', 'Can', 'Do', 'That', 'Atelier', 'Highlandtown', 'Holiday', 'Craft', 'Show', 'Sunday', 'w', 'WikiAnswers', 'Did', 'the', 'invention', 'of', 'the', 'cell', 'phone', 'inspire', 'other', 'Mobile', 'Phones', 'question', 'Did', 'the',

In [6]:
#model = load_model("troll_model.keras")

  saveable.load_own_variables(weights_store.get(inner_path))


#### LSTM model and train test preparation

In [8]:
tokenizer = Tokenizer(num_words=84294, oov_token="<UNK>")
tokenizer.fit_on_texts(data) #it's going to fit on the data in the forms of lines.

In [9]:
encoded_text=tokenizer.texts_to_sequences(data)

In [10]:
vocab_size = len(tokenizer.word_index) + 1

### Prepare data for training

In [11]:
data_list=[]
for i in encoded_text:
    if len(i)>1:
        for j in range(2,len(i)+1):
            data_list.append(i[:j])
#             print(i[:j]) # if you want to check data

#### Paddding

In [12]:
max_length=25
#max length of line is 25 token per line

In [13]:
sequences=pad_sequences(data_list,maxlen=max_length,padding="pre") # we set the lenght size equal to max_length

In [14]:
X=sequences[:,:-1]
y=sequences[:,-1].astype('int32')

In [15]:
X.shape, y.shape

((30735, 24), (30735,))

In [16]:
seq_length=X.shape[1]

#### Build Model
- We will build a simple LSTM model

In [17]:
model=Sequential()
model.add(Embedding(vocab_size,50)) 
#The first layer is the Embedded layer that uses 50-length vectors
#return_sequences=True because we add another LSTM
model.add(LSTM(100,return_sequences=True))
model.add(LSTM(100))
model.add(Dense(100,activation="relu"))
model.add(Dense(vocab_size,activation="softmax")) 

In [18]:
model.build(input_shape=(None, seq_length))

In [19]:
model.summary()

In [20]:
model.compile(loss="sparse_categorical_crossentropy",optimizer="adam",metrics=["accuracy"])

In [47]:
model.fit(X,y,batch_size=32,epochs=200)

Epoch 1/200
[1m961/961[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 31ms/step - accuracy: 0.0320 - loss: 7.5488
Epoch 2/200
[1m961/961[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 31ms/step - accuracy: 0.0389 - loss: 7.2329
Epoch 3/200
[1m961/961[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m33s[0m 34ms/step - accuracy: 0.0467 - loss: 6.9745
Epoch 4/200
[1m961/961[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m35s[0m 37ms/step - accuracy: 0.0487 - loss: 6.7755
Epoch 5/200
[1m961/961[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m31s[0m 33ms/step - accuracy: 0.0527 - loss: 6.6142
Epoch 6/200
[1m961/961[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m37s[0m 39ms/step - accuracy: 0.0580 - loss: 6.4400
Epoch 7/200
[1m961/961[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m31s[0m 32ms/step - accuracy: 0.0623 - loss: 6.2310
Epoch 8/200
[1m961/961[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m36s[0m 37ms/step - accuracy: 0.0712 - loss: 6.0164
Epoch 9/200
[1m

<keras.src.callbacks.history.History at 0x243c60502e0>

### Text Generation

In [21]:
text_length = 15

def generate_text(input_text, no_lines):
    general_text = []
    original_input = input_text  # keep the original prefix

    for i in range(no_lines):
        text = []
        for _ in range(text_length):
            encoded = tokenizer.texts_to_sequences([input_text])
            encoded = pad_sequences(encoded, maxlen=seq_length, padding="pre")
            y_pred = np.argmax(model.predict(encoded), axis=-1)

            predicted_word = ""
            for word, index in tokenizer.word_index.items():
                if index == y_pred:
                    predicted_word = word
                    break

            input_text = input_text + ' ' + predicted_word
            text.append(predicted_word)

        line = original_input + " " + " ".join(text)
        general_text.append(line)

        input_text = text[-1]

    return general_text

In [79]:
input_text="There are"
text_produced=generate_text(input_text,3)
text_produced

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 262ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 150ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 177ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 150ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 114ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 147ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 95ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 167ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 93ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 104ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 158ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 83ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 149ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1

['There are million ants for every person in the world is a street smh people need to',
 'There are overcome fear act as if it wer impossible to fail and it shall be easy',
 'There are button as gould raises the most spirit in life is a minute or am slowly']

In [81]:
input_text="How do"
text_produced=generate_text(input_text,3)
text_produced

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 135ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 101ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 101ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 82ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 94ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 84ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 96ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 85ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 87ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 82ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 84ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 81ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 179ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0

['How do i check my stats for fantasy football i forget the website hhahahahahaha man no happy',
 'How do thanksgiving donniewahlberg i hope your results with legs to see his realskipbayless colts to stay',
 'How do with alzheimers retirement revolution the new reality the twins weekly reunion nothing only drink credible']