Firstly, import the necessary modules in order to create the LSTM and process the data.

In [1]:
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Embedding, LSTM, Dense, Dropout
from keras.preprocessing.text import Tokenizer
from keras.callbacks import EarlyStopping
from keras.models import Sequential
import keras.utils as ku 
import tensorflow
import keras as keras

# set seeds for reproducability
from numpy.random import seed
tensorflow.random.set_seed(2)
seed(1)

import pandas as pd
import numpy as np
import string, os 

Then create a an array which contains all of the comments from our CSV.

In [18]:
NagoyaComments = pd.read_csv('NagoyaComments.csv')
Comments = []

for x in NagoyaComments.Comments:
    Comments.append(x)






Next we have to clean the data in our CSV so it can be used to to train the model, so we remove punctuation and convert it all to  lower case.

In [19]:
def clean_text(txt):
    txt = "".join(v for v in txt if v not in string.punctuation).lower()
    txt = txt.encode("utf8").decode("ascii",'ignore')
    return txt 

corpus = [clean_text(x) for x in Comments]
corpus[:10]

['i made this ',
 'whoever is reading this i hope you will get everything you dream of in 2021',
 'listening to chill lofi beats really helps in studying\n albert einstein probably',
 'tbh you could put anything uncle iroh says put it over a lofi beat and the songs quality would go up by 10000',
 'the easy south america naively ski because pantry reversely worry athwart a careful brake fearful fearless superficial passbook',
 'this might be a cringy heart opening comment but this year i was supposed to go to nagoya for a couple of months with an scholarship since i was a teenager it was my dream to live in japan for a while and i worked super hard for it but the pandemic came and my country and pretty much the whole world was closed a week before i was going to take the planei have listened to japan inspired mixes for a couple of years now longing for the moment of going there now they hit in a different wayedit since this commment is getting a lot of attention i think its fair to give

Next we tokenise the entries in our array.

In [20]:
tokenizer = Tokenizer()

def get_sequence_of_tokens(corpus):
    ## tokenization
    tokenizer.fit_on_texts(corpus)
    total_words = len(tokenizer.word_index) + 1
    
    ## convert data to sequence of tokens 
    input_sequences = []
    for line in corpus:
        token_list = tokenizer.texts_to_sequences([line])[0]
        for i in range(1, len(token_list)):
            n_gram_sequence = token_list[:i+1]
            input_sequences.append(n_gram_sequence)
    return input_sequences, total_words

inp_sequences, total_words = get_sequence_of_tokens(corpus)
inp_sequences[:10]



[[7, 574],
 [7, 574, 12],
 [541, 13],
 [541, 13, 234],
 [541, 13, 234, 12],
 [541, 13, 234, 12, 7],
 [541, 13, 234, 12, 7, 106],
 [541, 13, 234, 12, 7, 106, 3],
 [541, 13, 234, 12, 7, 106, 3, 35],
 [541, 13, 234, 12, 7, 106, 3, 35, 60]]

Pad out the sequences.

In [21]:
def generate_padded_sequences(input_sequences):
    max_sequence_len = max([len(x) for x in input_sequences])
    input_sequences = np.array(pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre'))
    
    predictors, label = input_sequences[:,:-1],input_sequences[:,-1]
    label = ku.to_categorical(label, num_classes=total_words)
    return predictors, label, max_sequence_len

predictors, label, max_sequence_len = generate_padded_sequences(inp_sequences)

Create and train the model.

In [None]:
def create_model(max_sequence_len, total_words):
    input_len = max_sequence_len - 1
    model = Sequential()
    
    # Add Input Embedding Layer
    model.add(Embedding(total_words, 10, input_length=input_len))
    
    # Add Hidden Layer 1 - LSTM Layer
    model.add(LSTM(100))
    model.add(Dropout(0.1))
    
    # Add Output Layer
    model.add(Dense(total_words, activation='softmax'))

    model.compile(loss='categorical_crossentropy', optimizer='adam')
    
    return model

In [None]:
model.fit(predictors, label, epochs=100, verbose=5)

In [22]:
def generate_text(seed_text, next_words, model, max_sequence_len):
    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
        predicted = model.predict_classes(token_list, verbose=0)
        
        output_word = ""
        for word,index in tokenizer.word_index.items():
            if index == predicted:
                output_word = word
                break
        seed_text += " "+output_word
    return seed_text.title()

In [67]:
print (generate_text("wow", 6, model, max_sequence_len))
print (generate_text("I don't", 16, model, max_sequence_len))
print (generate_text("Heres", 20, model, max_sequence_len))
print (generate_text("Really", 5, model, max_sequence_len))
print (generate_text("Sometimes", 12, model, max_sequence_len))

Wow I Just Ordered Pokemon Cards From
I Don'T Was Listening To This While Struggling With My Homework When All Of A Sudden I Heard
Heres Fact You Are Someday I Heard Uncle Iroh Tt Going To Cry That It Way That This Hits So Chill
Really Does Anybody Love The Comments
Sometimes You For This Is Now I Have A Million Qestions Rn Here
