**Legal Text Generation using Recurrent Neural Networks**

**Dushyant Pathak**

**201701062**

**A slightly modified, uncustomized approach.**

1.   Used inbuilt one hot function to compare the results
2.   Smaller dataset.



In [1]:
from google.colab import drive
drive.mount('/content/drive/')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive/


In [0]:
from __future__ import print_function
import os, io, sys, random
import numpy as np
import tensorflow as tf
from tensorflow import one_hot
from tensorflow.keras.callbacks import LambdaCallback
from tensorflow.keras.models import Model, Sequential, load_model
from tensorflow.keras.layers import Dense, LSTM, Activation, Dropout, Input, Lambda, Reshape, Masking
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import CategoricalCrossentropy
from tensorflow.keras.utils import to_categorical, plot_model

In [3]:
  ## Data Fetch and Clean

import numpy as np
import io

path_to_file = "/content/drive/My Drive/data_1.txt"
path_to_test = "/content/drive/My Drive/test.txt"

with io.open(path_to_file, encoding='utf-8') as corpus:
    text = corpus.read()

## Total number of chars in the vocabulary - with repetition
LENGTH = len(text)


Tx = 11 # length of each example in characters 

vocab = sorted(set(list(text))) # Set of all the characters in the corpus - to remove redundancies

## Mapping all characters to indices by enumerating all chars present
char_to_indices = dict((ch, idx) for idx, ch in enumerate(vocab))
## Mapping chars to indices 
index_to_char = dict((idx, ch) for idx, ch in enumerate(vocab))

sentences = [] # X
mapped_chars = [] # Y

step = 3

for i in range(0, LENGTH - Tx, step):
    temp_text = text[i: i+Tx]
    sentences.append(temp_text[:-1])
    mapped_chars.append(temp_text[-1])

m = len(sentences)

X = np.zeros((m, Tx - 1, len(vocab)))
Y = np.zeros((m, len(vocab)))

## One hot encoding the chars, per example(sentence wise)
for i, example in enumerate(sentences):
    X[i, :, :] = one_hot([char_to_indices[ch] for ch in example], depth=len(vocab))
    Y[i, :] = one_hot(char_to_indices[mapped_chars[i]], depth=len(vocab))

# Conversion to Numpy array for ease of use
X = np.asarray(X)
Y = np.asarray(Y)

#==============printing data dimesions=========================================
print(f"Length of corpus: {LENGTH}")
print(f"X.shape = {X.shape}")
print(f"Y.shape = {Y.shape}")
print(f"Number of examples: {m}") 

Length of corpus: 415756
X.shape = (138582, 10, 91)
Y.shape = (138582, 91)
Number of examples: 138582


In [0]:
def get_example(index = None):
    
    ## retrieves the example at index position in X is index is passed, otherwise random example is obtained
    ## :param index: index of example desired to be retrieved
    ## :return: string of text
    

    if index is None:
        index = np.random.randint(low=0, high=m)

    curr_x = [index_to_char[idx] for idx in np.argmax(X[index, :, :], axis=1)]
    curr_y = index_to_char[np.argmax(Y[index, :])]

    x_y = (''.join(curr_x), curr_y)

    return x_y

In [5]:
"""
testing a single example and the time it took to retrieve it
"""

import time

start = time.process_time()
example = get_example()
end = time.process_time()

print(f"Sample X: {example[0]}\nCorresponding Y: {example[1]}")
print(f"\nTime taken for acquiring this example: {end - start} seconds")

Sample X:  on object
Corresponding Y: i

Time taken for acquiring this example: 0.0004414549999864903 seconds


In [0]:
"""
network architecture creation
model creation
plot_model allows me to see what my neural network looks like
"""

def lstm_model(Tx, vocab, output_length):
  # network architecture LSTM -> Dropout -> Reshape -> LSTM -> Dropout -> Dense

  # define the initial hidden state a0 and initial cell state c0
  a0 = Input(shape=(output_length,), name='a0')
  c0 = Input(shape=(output_length,), name='c0')
  a = a0
  c = c0

  X = Input(shape=(Tx, len(vocab)), name='X')
  
  a, _, c = LSTM(units=output_length, activation='tanh', return_state=True, dtype='float32', name=f'lstm_1')(X, [a, c])
  a = Dropout(rate=0.2, name=f'dropout_1')(a)
  a = Reshape((1, output_length), name='reshape_1')(a) # needed after a dropout layer for another LSTM layer
  a = LSTM(units=output_length, activation='tanh', dtype='float32', name=f'lstm_2')(a)
  a = Dropout(rate=0.2, name=f'dropout_2')(a)
  out = Dense(units=len(vocab), activation='softmax', name=f'dense')(a)
    
  model = Model(inputs=[X, a0, c0], outputs=out, name='lstm_model')

  return model

In [7]:
"""
creating the model and the summary of it
"""
#====================Creating important variables===============================
n_a = 256 # number of hidden state dimensions for each LSTM cell

a0 = np.zeros((m, n_a))
c0 = np.zeros((m, n_a))
#===============================================================================

model = lstm_model(Tx=Tx - 1, vocab=vocab, output_length=n_a)

plot_model(model, to_file='/content/drive/My Drive/nn_graph.png')

model.summary()

Model: "lstm_model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
X (InputLayer)                  [(None, 10, 91)]     0                                            
__________________________________________________________________________________________________
a0 (InputLayer)                 [(None, 256)]        0                                            
__________________________________________________________________________________________________
c0 (InputLayer)                 [(None, 256)]        0                                            
__________________________________________________________________________________________________
lstm_1 (LSTM)                   [(None, 256), (None, 356352      X[0][0]                          
                                                                 a0[0][0]                

In [8]:
learning_rate = 0.01
learning_rate_decay = 0.001

optimizer = Adam(learning_rate=learning_rate, beta_1=0.9, beta_2=0.999, decay=learning_rate_decay)

model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['accuracy'])

batch_size = 100

model.fit([X, a0, c0], Y, batch_size=batch_size, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7fac201f2048>

In [0]:
## Shows the model

filepath = '/content/drive/My Drive/dkp_model.h5'

In [0]:
model.save('/content/drive/My Drive/dkp_model.h5')

In [0]:
lstm_model = load_model(filepath)

In [12]:
accuracy = lstm_model.evaluate([X, a0, c0], Y, verbose=0)[1]
print(f"Accuracy on the training set: {round(accuracy*100, 4)}%")

Accuracy on the training set: 82.4537%


In [13]:
def sample(preds, temperature=1.0):
    preds = np.asarray(preds).astype('float64')
    preds = np.log(preds) / temperature
    exp_preds = np.exp(preds) 
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)


def generate_output():
      diversity = random.choice([0.2, 0.5, 0.7, 1.0, 1.2, 1.4])
      diversity = 0.2
      print(f'Diversity: {diversity}')
      generated = ''
      sentence = 'The government proposes'
      generated += sentence
      if len(sentence) > 10:
        sentence = sentence[:10]
      elif len(sentence) < 10:
        rem = 10 - len(sentence)
        sentence += ' ' * rem
      sys.stdout.write(generated + ' ')
      a0 = np.zeros((1, n_a))
      c0 = np.zeros((1, n_a))
      sys.stdout.write(sentence)
      for i in range(1000):
          x_pred = np.zeros((1, Tx-1, len(vocab)))
          for t, char in enumerate(sentence):
              if char != '0':
                  x_pred[0, t, char_to_indices[char]] = 1.
          preds = lstm_model.predict([x_pred, a0, c0], verbose=0)[0]
          next_index = sample(preds, temperature = 1.0)
          next_char = index_to_char[next_index]

          generated += next_char
          sentence = sentence[1:] + next_char

          sys.stdout.write(next_char)
          sys.stdout.flush()

          if next_char == '\n':
              print('\n')
          elif next_char == '\t':
              print('\t')

generate_output()

Diversity: 0.2
The government proposes The governmental authority of the Other Party shall not 


be made to the applicable laws and regulations. The measure or licences the former Party or they to promoting 


good through a means Xhat increased import 


to adopts or maintain a Tribunal shall be made in the part shall, in its Schedule in Annex 7(M) effort Deter prior to arbitration procedures, do not used by any produced or but and the administrative vaw of the equity to an international law berelate in agriculture. 











2. The Parties. Each Party shall grounds in 


resultions on     (vii) the term “strIgtar rights of inconsimed in Part 2 of the service supplier requirements specific commitment, on the rate laws and regulations, a goods for the objective rew insurational international legal 


measure Disputing such in directive, nothondings do not carry ad leasure commercial creditted shall be applied. 











(b) 		
with respect to Japan, investment app)icated 


2 or  