In [1]:
import tensorflow as tf
import string
import requests
import csv
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [2]:
sentences = []
with open('judgesSentencesOnly.csv', newline='') as file:
    reader = csv.reader(file, delimiter=',')
    for row in reader:
        sentences.append(row)
sents = [sent[0] for sent in sentences if len(sent[0]) > 2]
print(sents[:10])

['\ufeffSentence', ' 1 September 23, 1932, the Secretary of Agriculture, acting under the Packers and Stockyards Act, 1921,1 ordered an inquiry and gave notice of a hearing to determine the reasonableness of rates charged by market agencies doing business at the Union Stockyards in Chicago', ' After protracted hearings and argument, he made findings of fact, announced his conclusion that the existing rates were unreasonable, and fixed new maximum rates', ' The appellants, who conduct market agencies, petitioned for rehearing', ' This the secretary denied, but by a supplemental order he increased some rates', ' An amended petition for rehearing was dismissed and the appellants then filed their bill in the District Court seeking an injunction against enforcement of the original and supplemental orders', ' The case was heard by three judges, who granted an interlocutory injunction', " At final hearing, the appellants offered in evidence the record of the proceedings before the secretary a

In [3]:
sents = " ".join(sents)
sents[:30]

'\ufeffSentence  1 September 23, 193'

In [4]:
def clean_text(data):
  words = data.split()
  table = str.maketrans('', '', string.punctuation)
  words = [word.translate(table) for word in words]
  words = [word for word in words if word.isalpha() ]
  words = [word.lower() for word in words]
  return words

cleanSents = clean_text(sents)
cleanSents[:10]

['september',
 'the',
 'secretary',
 'of',
 'agriculture',
 'acting',
 'under',
 'the',
 'packers',
 'and']

In [5]:
predLength = 15
groups = []
for i in range(predLength, len(cleanSents)):
    part = cleanSents[i - predLength: i]
    grouping = ' '.join(part)
    groups.append(grouping)
print(len(groups))
groups[0]

81522


'september the secretary of agriculture acting under the packers and stockyards act ordered an inquiry'

In [6]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(groups)
sequences = tokenizer.texts_to_sequences(groups)
print(type(sequences))
sequences = np.array(sequences)
print(type(sequences))

<class 'list'>
<class 'numpy.ndarray'>


In [7]:
print(np.shape(sequences))
X = sequences[:, :-1]
y = sequences[:, -1]
print(X[0])
print(y[0])

(81522, 15)
[ 663    1  123    2 1186 2739   48    1 5652    4 2738   22  579   24]
1657


In [8]:
uniqueWords = 1 + len(tokenizer.word_index)
uniqueWords

5653

In [9]:
y = to_categorical(y, num_classes=uniqueWords)

In [10]:
rnnModel = tf.keras.models.Sequential()
rnnModel.add(tf.keras.layers.Embedding(uniqueWords, 14, input_length=14))
rnnModel.add(tf.keras.layers.LSTM(256, return_sequences=True, dropout=0.3, recurrent_dropout=0.2))
rnnModel.add(tf.keras.layers.LSTM(256, return_sequences=False, dropout=0.3, recurrent_dropout=0.2))
rnnModel.add(tf.keras.layers.Dense(units = 256, activation='relu'))
rnnModel.add(tf.keras.layers.Dense(units = 256, activation='relu'))
rnnModel.add(tf.keras.layers.Dense(units = uniqueWords, activation='softmax'))
rnnModel.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 14, 14)            79142     
_________________________________________________________________
lstm (LSTM)                  (None, 14, 256)           277504    
_________________________________________________________________
lstm_1 (LSTM)                (None, 256)               525312    
_________________________________________________________________
dense (Dense)                (None, 256)               65792     
_________________________________________________________________
dense_1 (Dense)              (None, 256)               65792     
_________________________________________________________________
dense_2 (Dense)              (None, 5653)              1452821   
Total params: 2,466,363
Trainable params: 2,466,363
Non-trainable params: 0
______________________________________________

In [11]:
print(X.shape)
print(y.shape)

(81522, 14)
(81522, 5653)


In [12]:
rnnModel.compile(optimizer='adam', loss='categorical_crossentropy', metrics= ['accuracy'])
rnnModel.fit(X, y, batch_size=64, epochs=50)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<tensorflow.python.keras.callbacks.History at 0x7fe820f467d0>

In [13]:
def generate_text_seq(model, tokenizer, text_seq_length, seed_text, numWords):
  predText = []
  for _ in range(numWords):
    encoded = tokenizer.texts_to_sequences([seed_text])[0]
    encoded = pad_sequences([encoded], maxlen=text_seq_length, truncating='pre')
    y_pred = model.predict_classes(encoded)
    predWord = ''
    for word, index in tokenizer.word_index.items():
      if index == y_pred:
        predWord = word
        break
    seed_text = seed_text + ' ' + predWord
    predText.append(predWord)
  return ' '.join(predText)

In [15]:
seed_text = groups[200]
#generate_text_seq(rnnModel, tokenizer, 14, seed_text, 15)