In [None]:
import tensorflow as tf
import string
import requests
import csv
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [None]:
# Read every CSV row; only the first column of each row is used below.
with open('judgesSentencesOnly.csv', newline='') as file:
    sentences = list(csv.reader(file, delimiter=','))
# csv.reader yields an empty list for a blank line, so check the row is
# non-empty before indexing it. Keep first fields longer than 2 characters.
sents = [sent[0] for sent in sentences if sent and len(sent[0]) > 2]
print(sents[:10])

In [None]:
# Collapse the list of sentences into one space-separated string.
# Guarded so that re-running this cell is a no-op: joining an already-joined
# string would insert a space between every single character.
if not isinstance(sents, str):
    sents = " ".join(sents)
sents[:30]

In [None]:
def clean_text(data):
  """Return the alphabetic words of ``data``, lowercased, in original order.

  The text is split on whitespace. Every character in ``string.punctuation``
  is removed from each token, and tokens that are not purely alphabetic
  afterwards (including tokens that became empty) are discarded.
  """
  strip_punct = str.maketrans('', '', string.punctuation)
  cleaned = []
  for token in data.split():
    bare = token.translate(strip_punct)
    if bare.isalpha():
      cleaned.append(bare.lower())
  return cleaned

# Clean the joined corpus and preview the first ten resulting words.
cleanSents = clean_text(data=sents)
cleanSents[0:10]

In [None]:
# Number of words per training window.
predLength = 15
# Slide a window of predLength words over the corpus; each window becomes one
# space-joined string. The range stops at len(cleanSents) + 1 so the final
# window, which ends on the last word of the corpus, is included as well.
groups = [
    ' '.join(cleanSents[end - predLength:end])
    for end in range(predLength, len(cleanSents) + 1)
]
print(len(groups))
groups[0]

In [None]:
# Fit a Keras Tokenizer on the word windows and encode each window with it.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(groups)
sequences = tokenizer.texts_to_sequences(groups)
# Show the container type before the NumPy conversion...
print(type(sequences))
# ...then rebind `sequences` to an ndarray so it can be sliced by column later.
# NOTE(review): assumes every window encodes to the same number of ids,
# otherwise the result is not a regular 2-D array - confirm.
sequences = np.array(sequences)
print(type(sequences))

In [None]:
# Split each encoded window: every column but the last is the model input,
# the last column is the target.
print(sequences.shape)
X, y = sequences[:, :-1], sequences[:, -1]
print(X[0])
print(y[0])

In [None]:
# Vocabulary size used for the embedding and output layers: one more than the
# number of words the tokenizer indexed.
# NOTE(review): the +1 presumably accounts for index 0 being unused by word_index - confirm.
uniqueWords = len(tokenizer.word_index) + 1
uniqueWords

In [None]:
# One-hot encode the targets. Encode from the last column of `sequences`
# (the same values `y` was sliced from) rather than from `y` itself, so
# re-running this cell does not one-hot encode an already one-hot array.
y = to_categorical(sequences[:, -1], num_classes=uniqueWords)

In [None]:
# Next-word model: embedding -> two stacked LSTMs -> two dense ReLU layers ->
# softmax over the vocabulary.
# input_length is read from X instead of being hard-coded to 14, so the model
# keeps matching the training data if predLength is changed.
rnnModel = tf.keras.models.Sequential([
    tf.keras.layers.Embedding(uniqueWords, 14, input_length=X.shape[1]),
    # The first LSTM returns the full sequence so the second LSTM can consume it.
    tf.keras.layers.LSTM(256, return_sequences=True, dropout=0.3, recurrent_dropout=0.2),
    tf.keras.layers.LSTM(256, return_sequences=False, dropout=0.3, recurrent_dropout=0.2),
    tf.keras.layers.Dense(units=256, activation='relu'),
    tf.keras.layers.Dense(units=256, activation='relu'),
    # One output unit per vocabulary index.
    tf.keras.layers.Dense(units=uniqueWords, activation='softmax'),
])
rnnModel.summary()

In [None]:
# Sanity-check the shapes of the inputs and targets, one per line.
print(X.shape, y.shape, sep='\n')

In [None]:
# Configure the model for training with categorical cross-entropy, then fit it.
rnnModel.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)
rnnModel.fit(x=X, y=y, epochs=50, batch_size=128)

In [None]:
def generate_text_seq(model, tokenizer, text_seq_length, seed_text, numWords):
  """Generate ``numWords`` words by repeatedly predicting the next word.

  Args:
    model: object whose ``predict(batch)`` returns per-class scores with the
      class axis last.
    tokenizer: object providing ``texts_to_sequences`` and ``word_index``.
    text_seq_length: number of trailing tokens fed to the model at each step.
    seed_text: starting text; each predicted word is appended to it.
    numWords: how many words to generate.

  Returns:
    The generated words joined by single spaces (the seed is not included).
    A predicted index with no entry in ``word_index`` contributes an empty string.
  """
  # Build the index -> word lookup once instead of scanning word_index on
  # every generated word.
  index_to_word = {index: word for word, index in tokenizer.word_index.items()}
  predText = []
  for _ in range(numWords):
    encoded = tokenizer.texts_to_sequences([seed_text])[0]
    # Keep only the last text_seq_length tokens of the running text.
    encoded = pad_sequences([encoded], maxlen=text_seq_length, truncating='pre')
    # predict_classes was removed from Keras; take the argmax of the predicted
    # scores instead and reduce the single-row batch to a plain int.
    scores = model.predict(encoded, verbose=0)
    y_pred = int(np.argmax(scores, axis=-1)[0])
    predWord = index_to_word.get(y_pred, '')
    seed_text = seed_text + ' ' + predWord
    predText.append(predWord)
  return ' '.join(predText)
    

In [None]:
# Seed generation with one of the training windows and produce 15 new words.
seed_text = groups[200]
generate_text_seq(
    model=rnnModel,
    tokenizer=tokenizer,
    text_seq_length=14,
    seed_text=seed_text,
    numWords=15,
)