In [1]:
import pandas as pd
import numpy as np
# Load the tab-separated dataset into a DataFrame.
DATA_PATH = 'GFM_data.csv'
df = pd.read_csv(DATA_PATH, sep='\t')

In [2]:
import tensorflow as tf # build model
import string # get set of punctuations
import requests # get data file in notebook

In [3]:
#!pip install nltk
import nltk
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
#nltk.download('punkt')
#nltk.download('averaged_perceptron_tagger')
#nltk.download('maxent_ne_chunker')
#nltk.download('words')
#nltk.download('wordnet')
#nltk.download('stopwords')
from nltk.corpus import wordnet

# Raw string literals keep regex escapes such as \[ and \| from being parsed as
# (invalid) Python string escapes; the compiled patterns themselves are unchanged.
# Separator-like punctuation that should become a space.
REPLACE_BY_SPACE_RE = re.compile(r'[/(){}\[\]\|@,;]')
# Any character outside this whitelist (digits, lowercase letters, space, #, +, _).
BAD_SYMBOLS_RE = re.compile(r'[^0-9a-z #+_]')
# Matches dotted-quad tokens made of four groups of 1-3 digits.
REPLACE_IP_ADDRESS = re.compile(r'\b(?:[0-9]{1,3}\.){3}[0-9]{1,3}\b')
# English stop words from NLTK, held in a set for fast membership tests.
STOPWORDS = {word for word in stopwords.words('english')}
def extract_entities(text):
    """Mask named entities detected by NLTK with the placeholder ``'NLP'``.

    The text is split into sentences; each sentence is word-tokenized, POS-tagged
    and passed to ``nltk.ne_chunk``. Every chunk that carries a ``label``
    attribute is treated as an entity, and its leaf tokens joined by single
    spaces form the string to mask.

    Parameters
    ----------
    text : str
        Raw input text.

    Returns
    -------
    str
        ``text`` with each whole-word occurrence of a detected entity replaced
        by ``'NLP'``. The input is returned unchanged when nothing is detected.
    """
    names = set()
    for sent in nltk.sent_tokenize(text):
        for chunk in nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(sent))):
            # Only entity chunks expose a `label` attribute.
            if hasattr(chunk, 'label'):
                names.add(' '.join(c[0] for c in chunk.leaves()))
    names.discard('')
    if not names:
        return text

    # Longest alternatives first so a multi-word entity wins over a shorter
    # entity it contains; the secondary key only makes the order deterministic.
    ordered = sorted(names, key=lambda name: (-len(name), name))
    alternatives = '|'.join(re.escape(name) for name in ordered)
    # The lookarounds restrict matches to whole words, so an entity such as "Al"
    # no longer corrupts unrelated words such as "Also".
    pattern = re.compile(r'(?<!\w)(?:' + alternatives + r')(?!\w)')
    return pattern.sub('NLP', text)

def clean_text(x):
    """Normalize one raw document into a list of lowercase, stop-word-free tokens.

    Processing order:

    1. mask named entities via ``extract_entities``;
    2. lowercase and strip surrounding whitespace;
    3. remove URLs, ``(ddd) ddd-dddd`` phone numbers and dotted-quad IP addresses;
    4. turn whitespace runs and ``REPLACE_BY_SPACE_RE`` punctuation into spaces;
    5. delete every remaining character that is not ``a-z``, ``0-9``, space or ``#``;
    6. split on whitespace and drop tokens found in ``STOPWORDS``.

    The pattern-based removals (step 3) and the punctuation-to-space pass
    (step 4) run *before* the generic symbol stripping; once the dots and
    separators are gone those patterns can no longer match.

    Parameters
    ----------
    x : str
        Raw document text.

    Returns
    -------
    list of str
        The cleaned tokens, in their original order.
    """
    # Entity detection runs first, on the original (un-lowercased) text.
    x = extract_entities(x)
    x = x.lower().strip()

    # Remove URLs.
    x = re.sub(r'(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:\'".,<>?«»“”‘’]))', '', x)
    # Remove phone numbers of the form (123) 456-7890.
    x = re.sub(r'\([0-9]{3}\)\s*[0-9]{3}-[0-9]{4}', '', x)
    # Remove IP addresses while their dots are still present.
    x = REPLACE_IP_ADDRESS.sub('', x)

    # Newlines, tabs and separator punctuation become word boundaries instead of
    # silently gluing neighbouring words together.
    x = re.sub(r'\s+', ' ', x)
    x = REPLACE_BY_SPACE_RE.sub(' ', x)

    # Keep only lowercase letters, digits, spaces and '#'. The text is already
    # lowercase here, so no uppercase range is needed.
    x = re.sub(r'[^a-z0-9 #]', '', x)

    # split() with no argument also collapses the repeated spaces left above.
    return [w for w in x.split() if w not in STOPWORDS]

In [None]:
# Drop every row that has a missing value in any column. The explicit .copy()
# makes no_nan an independent frame, so the column assignment below cannot
# trigger pandas' SettingWithCopyWarning.
no_nan = df.dropna().copy()
# Replace each raw Text entry with its list of cleaned tokens.
no_nan['Text'] = no_nan['Text'].apply(clean_text)
# Array holding one token list per remaining row.
text = no_nan['Text'].to_numpy()

In [None]:
#text # check what's inside

In [None]:
# Flatten the per-document token lists into one list.
tokens = [word for doc_tokens in text for word in doc_tokens]
len(tokens)

In [None]:
#text = ' '.join(text) # join all array element
#new_text = clean_text(text)
#new_text
length = 51  # 50 context words plus the one word to predict
sufficiency = 200_000  # no further windows are built once the end index exceeds this
# NOTE(review): this rebinds `tokens` to the first document's tokens only,
# replacing the combined list built in the previous cell - confirm this is intended.
tokens = text[0]
# Every training line is a window of `length` consecutive tokens joined by spaces.
# Window end indices run from `length` up to, but excluding, len(tokens); the
# window whose end index is the first one above `sufficiency` is the last one kept.
last_end = min(len(tokens), sufficiency + 2)
lines = [' '.join(tokens[end - length:end]) for end in range(length, last_end)]

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import LambdaCallback

In [None]:
# Words that were not seen while fitting are encoded as the '<OOV>' placeholder
# token rather than being dropped.
tokenizer = Tokenizer(oov_token='<OOV>')
# Build the word -> integer index from the training lines.
tokenizer.fit_on_texts(lines)
# Encode every line as a list of word indices.
sequences = tokenizer.texts_to_sequences(lines)

In [None]:
# The Tokenizer's default filters discard some characters that the cleaning step
# keeps (e.g. '#'), so an encoded line can come out shorter than the others.
# pad_sequences left-pads such lines with 0 up to the longest line, which
# guarantees a rectangular integer array; equal-length input is left as is.
sequences = pad_sequences(sequences)
# All but the last column are the context words (X); the last column is the target (y).
X, y = sequences[:, :-1], sequences[:, -1]

In [None]:
# Word indices start at 1, so one extra slot is needed for index 0.
vocab_size = 1 + len(tokenizer.word_index)
# One-hot encode the target word indices (one column per vocabulary slot).
y = to_categorical(y, num_classes=vocab_size)

In [None]:
# Number of context words per sample (columns of X).
sequence_length = X.shape[1]

# Next-word model: embedding -> two stacked LSTMs -> dense head over the vocabulary.
model = Sequential([
    # Maps each of the vocab_size word indices to a 50-dimensional vector.
    Embedding(vocab_size, 50, input_length=sequence_length),
    # 100 units; returns the full sequence so the second LSTM receives every time step.
    LSTM(100, return_sequences=True),
    # 100 units; returns only the final state.
    LSTM(100),
    Dense(100, activation='relu'),
    # One softmax probability per vocabulary entry.
    Dense(vocab_size, activation='softmax'),
])

In [None]:
# Print the model's layer-by-layer summary as a sanity check before training.
model.summary()

In [None]:
def generate_text_seq(model, tokenizer, text_seq_length, seed_text, n_words):
    """Greedily generate ``n_words`` words that continue ``seed_text``.

    On every step the running text is encoded with ``tokenizer``, cut or
    left-padded to the last ``text_seq_length`` indices, and fed to ``model``;
    the highest-scoring vocabulary index is converted back to a word and
    appended to the running text.

    Note: a later cell of this notebook defines a function with the same name
    that additionally prints progress; once that cell has run it replaces this one.

    Parameters
    ----------
    model : object exposing ``predict``
        Model whose output holds one score per vocabulary index.
    tokenizer : object exposing ``texts_to_sequences`` and ``index_word``
        The tokenizer used to encode the training data.
    text_seq_length : int
        Number of context indices passed to the model.
    seed_text : str
        Text to continue.
    n_words : int
        Number of words to generate.

    Returns
    -------
    str
        The generated words joined by single spaces (the seed is not included).
    """
    generated = []

    for _ in range(n_words):
        encoded = tokenizer.texts_to_sequences([seed_text])[0]
        # truncating='pre' keeps the most recent words when the text is too long.
        encoded = pad_sequences([encoded], maxlen=text_seq_length, truncating='pre')

        # predict_classes() was removed from Keras models in TensorFlow 2.6;
        # taking the arg-max of predict() is the equivalent.
        scores = model.predict(encoded, verbose=0)
        predicted_index = int(np.argmax(scores, axis=-1)[0])

        # index_word is the tokenizer's reverse vocabulary; an index without an
        # entry (such as the padding index 0) yields an empty string.
        predicted_word = tokenizer.index_word.get(predicted_index, '')
        seed_text = seed_text + ' ' + predicted_word
        generated.append(predicted_word)
    return ' '.join(generated)

In [None]:
# Concatenate every Text entry into one string, separated by '. ', then lowercase it.
joined_text = df['Text'].str.cat(sep='. ')
alltext = joined_text.lower()
#alltext

In [None]:
# Configure training: Adam optimiser, categorical cross-entropy loss against the
# one-hot targets, accuracy reported as a metric.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Train on the (context, next-word) pairs: 100 passes over the data in batches of 256.
model.fit(
    X,
    y,
    batch_size=256,
    epochs=100,
)

In [None]:
# The seed is the start of the combined text: as many characters as the Text
# entry labelled 0 contains, plus 16 more.
first_text_len = len(df['Text'][0])
seed_text = alltext[:first_text_len + 16]
# Context width used for generation: the number of entries in one row of X.
seq_length = len(X[0])
#seed_text

In [None]:
def generate_text_seq(model, tokenizer, text_seq_length, seed_text, n_words):
    """Greedily generate ``n_words`` words that continue ``seed_text``, printing each step.

    On every step the running text is encoded with ``tokenizer``, cut or
    left-padded to the last ``text_seq_length`` indices, and fed to ``model``;
    the highest-scoring vocabulary index is converted back to a word. The
    current seed and the predicted word are printed before the word is
    appended to the running text.

    Parameters
    ----------
    model : object exposing ``predict``
        Model whose output holds one score per vocabulary index.
    tokenizer : object exposing ``texts_to_sequences`` and ``index_word``
        The tokenizer used to encode the training data.
    text_seq_length : int
        Number of context indices passed to the model.
    seed_text : str
        Text to continue.
    n_words : int
        Number of words to generate.

    Returns
    -------
    str
        The generated words joined by single spaces (the seed is not included).
    """
    generated = []

    for _ in range(n_words):
        encoded = tokenizer.texts_to_sequences([seed_text])[0]
        # truncating='pre' keeps the most recent words when the text is too long.
        encoded = pad_sequences([encoded], maxlen=text_seq_length, truncating='pre')

        # predict_classes() was removed from Keras models in TensorFlow 2.6;
        # taking the arg-max of predict() is the equivalent.
        scores = model.predict(encoded, verbose=0)
        predicted_index = int(np.argmax(scores, axis=-1)[0])

        # index_word is the tokenizer's reverse vocabulary; an index without an
        # entry (such as the padding index 0) yields an empty string.
        predicted_word = tokenizer.index_word.get(predicted_index, '')

        print('--SEED TEXT--')
        print(seed_text)
        seed_text = seed_text + ' ' + predicted_word
        print('--PREDICTED WORDS--', predicted_word)
        print('----------------------------------------------------------')
        generated.append(predicted_word)
    return ' '.join(generated)

In [None]:
# Generate 100 words continuing seed_text; the returned string is the cell's output.
generate_text_seq(model, tokenizer, seq_length, seed_text, 100)

https://kgptalkie.com/text-generation-using-tensorflow-keras-and-lstm/

https://www.youtube.com/watch?v=VAMKuRAh2nc&t=1607s&ab_channel=KGPTalkie