In [2]:
import numpy as np 
import pandas as pd 

import nltk
import json

from numpy import array
from numpy import asarray
from numpy import zeros

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers import Embedding
from sklearn.model_selection import StratifiedKFold
from keras.layers import Activation, Dropout
from keras.layers import Conv1D, MaxPooling1D, GlobalMaxPooling1D 
from keras import regularizers
from keras import optimizers

from tqdm import tnrange, tqdm_notebook

import gensim
from gensim.models import Word2Vec

Using TensorFlow backend.


In [3]:
path = "/Users/conorosully/Documents/Legal-Case-Prediction/data/clean/{}.json"

with open(path.format('train_test_split'), 'r') as readfile:
    train_test_split = json.load(readfile)
    readfile.close()

In [4]:
#Open cases
path = "/Users/conorosully/Documents/Legal-Case-Prediction/data/clean/{}.json"

with open(path.format('text_none'), 'r') as readfile:
    text_none = json.load(readfile)
    readfile.close()
    
path = "/Users/conorosully/Documents/Legal-Case-Prediction/data/clean/{}.json"

with open(path.format('text_english'), 'r') as readfile:
    text_english = json.load(readfile)
    readfile.close()

In [5]:
# Load embeddings
path = "/Users/conorosully/Documents/Legal-Case-Prediction/data/embeddings/{}"
echr_100 = Word2Vec.load(path.format('echt2vec_100.txt'))
echr_200 = Word2Vec.load(path.format('echt2vec_200.txt'))
print("ECHR")

law_100 = gensim.models.KeyedVectors.load_word2vec_format(path.format('Law2Vec.100d.txt'), binary=False)
law_200 = gensim.models.KeyedVectors.load_word2vec_format(path.format('Law2Vec.200d.txt'), binary=False)
print("LAW")
word_100 = dict()
f = open(path.format('glove.6B.100d.txt'))
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    word_100[word] = coefs
f.close()


word_200 = dict()
f = open(path.format('glove.6B.200d.txt'))
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    word_200[word] = coefs
f.close()
print("GLOVE")

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL
  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


ECHR


  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


LAW
GLOVE


In [6]:
# Fucntions
def getPart(ID,part,stopwords):
    """
    Returns clean case part
    """
    
    if stopwords == 'none':
        doc = text_none[ID][part]
    elif stopwords == 'english':
        doc = text_english[ID][part]
    return doc


def getCorpus(article,part,stopwords):
    """
    returns the train test corpus along with target
    """
    split = train_test_split[article]
    
    train_corpus = []
    test_corpus = []
    
    for ID in split['v_train'] + split['nv_train']:
        doc = getPart(ID,part,stopwords)
        train_corpus.append(doc)
        
    for ID in split['v_test']+ split['nv_test']:
        doc = getPart(ID,part,stopwords)
        test_corpus.append(doc)
        
    train_target = [1]*len(split['v_train']) + [0]*len(split['nv_train'])
    test_target = [1]*len(split['v_test']) + [0]*len(split['nv_test'])
    
    
    return train_corpus, test_corpus, train_target, test_target
    
    
train_corpus, test_corpus, train_target, test_target = getCorpus('6','procedure','none')
print(len(train_corpus),len(test_corpus),len(train_target),sum(train_target),len(test_target),sum(test_target))

1008 447 1008 504 447 391


## Train Models

In [12]:
def fitCNN(article,part,stopwords,embedding,dimension,batch_size,num_epochs,num_filters,kernel_size):
    
    #training params
    batch_size = batch_size
    num_epochs = num_epochs

    #model parameters
    num_filters = num_filters
    kernel_size = kernel_size
    weight_decay = 1e-4

    train_corpus, test_corpus, train_target, test_target = getCorpus('6','procedure','none')
    docs,labels = train_corpus,train_target
    
    # prepare tokenizer
    t = Tokenizer()
    t.fit_on_texts(docs)
    vocab_size = len(t.word_index) + 1
    # integer encode the documents
    encoded_docs = t.texts_to_sequences(docs)
    doc_length = [len(x) for x in encoded_docs]
    #print(encoded_docs)
    # pad documents to a max length of 4 words
    max_length = max(doc_length)
    print(max_length)
    padded_docs = pad_sequences(encoded_docs, maxlen=max_length, padding='post')
    #print(padded_docs)

    embeddings_index = embedding
    

    # create a weight matrix for words in training docs
    embedding_matrix = zeros((vocab_size, dimension))
    for word, i in t.word_index.items():
        try:
            embedding_vector = embeddings_index[word]
        except:
            embedding_vector = None
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
    print(len(embedding_matrix))
    
    
    X = np.array(padded_docs)
    Y = np.array(labels)

    kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=90)
    cvscores = []
    for train, test in kfold.split(X, Y):
      # create model
        embedding_layer = Embedding(vocab_size, 
                                    dimension, 
                                    weights=[embedding_matrix], 
                                    input_length=max_length, 
                                    trainable=False)
          
        model = Sequential()
        model.add(embedding_layer)
        model.add(Conv1D(num_filters, 3, activation='relu', padding='same'))
        model.add(Conv1D(num_filters, 4, activation='relu', padding='same'))
        model.add(Conv1D(num_filters, 5, activation='relu', padding='same'))
        model.add(GlobalMaxPooling1D())
        """model.add(Conv1D(num_filters, 7, activation='relu', padding='same'))
        model.add(GlobalMaxPooling1D())"""
        model.add(Dropout(0.5))
        model.add(Dense(1, activation='sigmoid'))

        ada = optimizers.Adadelta(lr=1.0, rho=0.95, epsilon=None, decay=0.0)
        model.compile(loss='binary_crossentropy', optimizer=ada, metrics=['accuracy'])
        
        model.fit(X[train], Y[train], epochs=10, verbose=0,batch_size=128)
        
            # evaluate the model
        scores = model.evaluate(X[test], Y[test], verbose=0)
        print("%s: %.2f%%" % (model.metrics_names[1], scores[1]*100))
        cvscores.append(scores[1] * 100)
        
    print("%.2f%% (+/- %.2f%%)" % (np.mean(cvscores), np.std(cvscores)))

echr	100	Article 6	both	english	0.755863


In [13]:
batch_size = 50
num_epochs =10
num_filters = 100
kernel_size = 3

fitCNN('6','both','english',echr_200,200,batch_size,num_epochs,num_filters,kernel_size)

796
7631




acc: 70.79%
acc: 65.35%
acc: 70.30%
acc: 70.79%
acc: 68.00%
69.05% (+/- 2.12%)


# epochs

In [73]:
batch_size = 50
num_epochs =5
num_filters = 100
kernel_size = 5

fitCNN('6','both','english',echr_200,200,batch_size,num_epochs,num_filters,kernel_size)

796
7631




acc: 71.29%
acc: 70.79%
acc: 70.30%
acc: 73.27%
acc: 71.50%
71.43% (+/- 1.01%)


In [74]:
batch_size = 50
num_epochs =15
num_filters = 100
kernel_size = 5

fitCNN('6','both','english',echr_200,200,batch_size,num_epochs,num_filters,kernel_size)

796
7631




acc: 68.81%
acc: 70.30%
acc: 69.80%
acc: 71.29%
acc: 74.50%
70.94% (+/- 1.95%)


## filters

In [None]:
batch_size = 50
num_epochs =10
num_filters = 50
kernel_size = 5

fitCNN('6','both','english',echr_200,200,batch_size,num_epochs,num_filters,kernel_size)

796
7631




acc: 69.80%
acc: 66.83%
acc: 69.31%
acc: 74.75%
acc: 72.50%
70.64% (+/- 2.73%)


In [None]:
batch_size = 50
num_epochs =10
num_filters = 150
kernel_size = 5

fitCNN('6','both','english',echr_200,200,batch_size,num_epochs,num_filters,kernel_size)

796
7631




acc: 67.82%
acc: 67.33%
acc: 70.79%
acc: 72.28%
acc: 73.50%
70.34% (+/- 2.42%)


# kernel

In [None]:
batch_size = 50
num_epochs =10
num_filters = 100
kernel_size = 10

fitCNN('6','both','english',echr_200,200,batch_size,num_epochs,num_filters,kernel_size)

796
7631




acc: 71.78%
acc: 68.32%
acc: 71.78%
acc: 78.71%
acc: 78.50%
73.82% (+/- 4.11%)


In [None]:
batch_size = 50
num_epochs =10
num_filters = 100
kernel_size = 20

fitCNN('6','both','english',echr_200,200,batch_size,num_epochs,num_filters,kernel_size)

796
7631




acc: 70.30%
acc: 65.35%
acc: 71.78%
acc: 73.76%
acc: 72.50%
70.74% (+/- 2.92%)


In [None]:
batch_size = 50
num_epochs =10
num_filters = 100
kernel_size = 30

fitCNN('6','both','english',echr_200,200,batch_size,num_epochs,num_filters,kernel_size)

796
7631




acc: 66.83%
acc: 63.86%
acc: 72.77%
acc: 68.81%
acc: 69.50%
68.36% (+/- 2.95%)


In [8]:
batch_size = 50
num_epochs =10
num_filters = 100
kernel_size = 50

fitCNN('6','both','english',echr_200,200,batch_size,num_epochs,num_filters,kernel_size)

796
7631




acc: 71.78%
acc: 65.35%
acc: 67.82%


KeyboardInterrupt: 