In [2]:
import tensorflow as tf
import string
import requests
import pandas as pd
import os
import numpy as np
import matplotlib.pyplot as plt
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Embedding, Dropout, Conv1D, MaxPooling1D
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [3]:
import re

url_pat = re.compile(r'(http\S+|www\.\S+)', flags=re.IGNORECASE)
num_pat = re.compile(r'\d+')

def remove_urls_nums(text):
    text = num_pat.sub('', text)       # remove numbers
    text = url_pat.sub('', text)       # remove URLs
    text = re.sub(r'[^A-Za-z0-9\s]', '', text) # remove all non-alphabet or numeric values
    return text.strip()                        # some data had numbers attatched to letters and num_pat couldn't remove them

In [4]:
import os

def split_file(input, output, max_size = 6 * 1024**2):
    os.makedirs(output, exist_ok=True)
    base = os.path.basename(input)
    part = 0
    current_size = 0
    out = None

    with open(input, "rb") as f:
        for line in f:
            if out is None or current_size + len(line) > max_size:
                if out:
                    out.close()
                part += 1
                out_path = os.path.join(output, f"chunk.part{part:d}.txt")
                out = open(out_path, "wb")
                current_size = 0
            out.write(line)
            current_size += len(line)
    if out:
        print(f"Split {input} into {part} parts")
        out.close()
    return part

In [5]:
split_troll = split_file("combined_troll.txt", "chunks", max_size = 6 * 1024**2 ) 

Split combined_troll.txt into 49 parts


In [5]:
url_pat = re.compile(r'http\S+', flags = re.IGNORECASE)

with open("chunk.part20.txt", "r", encoding = "utf-8") as t:
    text = t.read()
    
data = text.splitlines()
data = [remove_urls_nums(line) for line in data]

#### LSTM model and train test preparation

In [6]:
tokenizer=Tokenizer()
tokenizer.fit_on_texts(data) #it's going to fit on the data in the forms of lines.

In [7]:
encoded_text=tokenizer.texts_to_sequences(data)

In [8]:
vocab_size=len(tokenizer.word_counts)+1 # always add plus one in tensorflow

### Prepare data for training

In [9]:
data_list=[]
for i in encoded_text:
    if len(i)>1:
        for j in range(2,len(i)+1):
            data_list.append(i[:j])
# put each word in the array

#### Paddding

In [10]:
max_length=40
#max length of line is 40 tokens per line in each tweet

In [11]:
sequences=pad_sequences(data_list,maxlen=max_length,padding="pre")

In [12]:
X=sequences[:,:-1]
y=sequences[:,-1].astype('int32')

In [13]:
X.shape, y.shape

((452326, 39), (452326,))

In [14]:
seq_length=X.shape[1]

#### Build Model
- We will build a simple LSTM model

In [15]:
model=Sequential()
model.add(Embedding(vocab_size,50)) 
model.add(LSTM(100,return_sequences=True))
model.add(LSTM(100))
model.add(Dense(100,activation="relu"))
model.add(Dense(vocab_size,activation="softmax"))

In [16]:
model.build(input_shape=(None, seq_length))

In [17]:
model.summary()

In [18]:
model.compile(loss="sparse_categorical_crossentropy",optimizer="adam",metrics=["accuracy"])

In [24]:
history = model.fit(X,y,batch_size=256,epochs=150)

Epoch 1/150
[1m1767/1767[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m698s[0m 395ms/step - accuracy: 0.0234 - loss: 9.5607
Epoch 2/150
[1m1767/1767[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m713s[0m 404ms/step - accuracy: 0.0405 - loss: 8.9632
Epoch 3/150
[1m1767/1767[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m722s[0m 408ms/step - accuracy: 0.0519 - loss: 8.5205
Epoch 4/150
[1m1767/1767[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m721s[0m 408ms/step - accuracy: 0.0583 - loss: 8.1945
Epoch 5/150
[1m1767/1767[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m723s[0m 409ms/step - accuracy: 0.0627 - loss: 7.9082
Epoch 6/150
[1m1767/1767[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m736s[0m 417ms/step - accuracy: 0.0670 - loss: 7.6299
Epoch 7/150
[1m1767/1767[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m749s[0m 424ms/step - accuracy: 0.0710 - loss: 7.3398
Epoch 8/150
[1m1767/1767[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m737s[0m 417ms/step - accuracy: 0.0753

### Text Generation

In [19]:
text_length = 15

def generate_text(input_text, no_lines):
    general_text = []
    original_input = input_text  # keep the original prefix

    for i in range(no_lines):
        text = []
        for _ in range(text_length):
            encoded = tokenizer.texts_to_sequences([input_text])
            encoded = pad_sequences(encoded, maxlen=seq_length, padding="pre")
            y_pred = np.argmax(model.predict(encoded), axis=-1)

            predicted_word = ""
            for word, index in tokenizer.word_index.items():
                if index == y_pred:
                    predicted_word = word
                    break

            input_text = input_text + ' ' + predicted_word
            text.append(predicted_word)

        line = original_input + " " + " ".join(text)
        general_text.append(line)

        input_text = text[-1]

    return general_text

In [31]:
input_text="There are"
text_produced=generate_text(input_text,3)
text_produced

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 48ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 43ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 44ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 44ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 41ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 52ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 43ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 45ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 44ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 44ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 44ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 47ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 43

 'There are good workout is the best relief gaboriks kenssnapback talinatalie shanpanda jonathonaalders iamchiefsosa freshaziceblue andrewmajano karkarj',
 'There are gotherpeepin nick ovoxojulio taylor megankrings the jennabeadles jo trilljo rob satiuqul su ade yoshie cathleenking']

In [32]:
input_text="How do"
text_produced=generate_text(input_text,3)
text_produced

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 49ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 43ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 41ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 44ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 42ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 45ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 43ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 43ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 45ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 42ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 44ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 54ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 39ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 50

['How do you workout with me lillianolivia imjesssayinx denissemorales gelleesh sheiskandi laurenderise sfgamerbabe avyzmprotect runnarghhh runnarghhh runnarghhh',
 'How do woosang knight knightkingbaal earth astraeanixie unthinkable livelaughlynn abbie starr steven wellness hayden billy devin hughes',
 'How do knuckles demoalpha sillyman mady madybaker jaylon ayeejaysimp fj medeya bruno vallencourt max jones musgrove raven']

In [20]:
input_text="When will"
text_produced=generate_text(input_text,3)
text_produced

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 761ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 47ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 59ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 77ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 53ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 41ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 43ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 45ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 58ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 54ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 56ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 42ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 46ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4

['When will heartcatchr inkscrblr inkscrblr inkscrblr chloeravioli anglebre austinfortner medinax medinax ronsouthwick ronsouthwick ibleedcamo vodkahelps simbra simbra',
 'When will metalmoccha heartcatchr inkscrblr inkscrblr inkscrblr inkscrblr inkscrblr ddubsnyrangel ddubsnyrangel medinax medinax joeadamo tharealcorygunz twinrose twinrose',
 'When will metalmoccha jeremyjdaguilar jeremyjdaguilar jeremyjdaguilar tim farrizjusoffe trentwilliams synergyblitz inkscrblr synergyblitz ddubsnyrangel ddubsnyrangel selfemploydking synergyblitz misslouisekay']