In [1]:
import numpy as np 
import pandas as pd 

import nltk
import json

from numpy import array
from numpy import asarray
from numpy import zeros
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers import Embedding
from sklearn.model_selection import StratifiedKFold
from keras.layers import Activation, Dropout
from keras.layers import Conv1D, MaxPooling1D, GlobalMaxPooling1D 
from keras import regularizers
from keras import optimizers

import gensim
from gensim.models import Word2Vec

from tqdm import tqdm
tqdm.pandas(desc="progress-bar")

Using TensorFlow backend.


In [2]:
# Read the target table from CSV, report how many rows it has, then preview it.
target = pd.read_csv(
    '/Users/conorosully/Documents/Legal-Case-Prediction/data/clean/target.csv'
)
print(target.shape[0])
target.head()

9703


Unnamed: 0,id,date,2,3,5,6,7,8,9,10,11,13,14,18
0,HUDOC-ECHR-1982-001-57417,1982-03-26,-1,-1,-1,0,-1,-1,-1,-1,-1,-1,-1,-1
1,HUDOC-ECHR-1982-001-57580,1982-09-23,-1,-1,-1,1,-1,-1,-1,-1,-1,-1,0,-1
2,HUDOC-ECHR-1983-001-57554,1983-04-25,-1,-1,-1,1,-1,-1,-1,-1,-1,-1,-1,-1
3,HUDOC-ECHR-1983-001-57591,1983-11-23,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,0,-1
4,HUDOC-ECHR-1984-001-57465,1984-10-26,-1,-1,-1,1,-1,-1,-1,-1,-1,-1,-1,-1


In [4]:
#Open cases
path = "/Users/conorosully/Documents/Legal-Case-Prediction/data/clean/{}.json"

# The `with` block closes the file on exit, so no explicit close() is needed.
# The encoding is pinned so the load does not depend on the platform default.
with open(path.format('text_none'), 'r', encoding='utf-8') as readfile:
    text = json.load(readfile)

# Sanity checks: number of cases, the first case id, and the number of
# entries stored under one known case id.
print(len(text.keys()))
print(list(text.keys())[0])
print(len(text['HUDOC-ECHR-2012-001-110881']))

9703
HUDOC-ECHR-1982-001-57417
7


In [82]:
embedding = 'glove.6B.100d.txt'

# load the whole embedding into memory
embed_path = "/Users/conorosully/Documents/Legal-Case-Prediction/data/embeddings/{}"

# Each line of the file is "<word> <coef> <coef> ...": the first token is the
# word, the remaining tokens are parsed as its float32 vector.
embeddings_index = dict()
# `with` guarantees the file is closed even if a line fails to parse.
with open(embed_path.format(embedding), encoding='utf-8', errors='ignore') as f:
    for line in f:
        values = line.split()
        if not values:
            # skip blank lines instead of failing on values[0]
            continue
        word = values[0]
        coefs = asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
print('Loaded %s word vectors.' % len(embeddings_index))


Loaded 400000 word vectors.


In [10]:
# Load embeddings
# NOTE: this rebinds `path`, which the "Open cases" cell set to the clean-data
# JSON template; after this cell it points at the embeddings directory.
path = "/Users/conorosully/Documents/Legal-Case-Prediction/data/embeddings/{}"


def _load_glove(glove_path):
    """
    Read a GloVe-style text file into a dict mapping word -> float32 vector.

    Each non-blank line is split on whitespace: the first token is the word and
    the remaining tokens are parsed as the vector coefficients.
    """
    vectors = dict()
    # Same decoding options as the single-file GloVe load elsewhere in this
    # notebook; `with` closes the file even if a line fails to parse.
    with open(glove_path, encoding='utf-8', errors='ignore') as f:
        for line in f:
            values = line.split()
            if not values:
                continue
            vectors[values[0]] = np.asarray(values[1:], dtype='float32')
    return vectors


# Saved gensim Word2Vec models
echr_100 = Word2Vec.load(path.format('echt2vec_100.txt'))
echr_200 = Word2Vec.load(path.format('echt2vec_200.txt'))
print("ECHR")

# Vectors stored in word2vec text format
law_100 = gensim.models.KeyedVectors.load_word2vec_format(path.format('Law2Vec.100d.txt'), binary=False)
law_200 = gensim.models.KeyedVectors.load_word2vec_format(path.format('Law2Vec.200d.txt'), binary=False)
print("LAW")

# GloVe vectors as plain dicts
word_100 = _load_glove(path.format('glove.6B.100d.txt'))
word_200 = _load_glove(path.format('glove.6B.200d.txt'))
print("GLOVE")

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


ECHR


  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


LAW
GLOVE


In [20]:
def featureSet(article,part,stop_words = None,random_state = None):
    """
    Returns the feature set corresponding to article and part.

    Rows of the global `target` frame are shuffled, then an equal number of
    rows labelled 0 and rows labelled 1 in column `article` are kept (as many
    as the smaller class has). Rows with any other value are ignored. For each
    kept id, `text[id][part]` is collected from the global `text` dict.

    article      -- name of the column in `target` holding the label
    part         -- key selecting which piece of each case's text to use
    stop_words   -- accepted for compatibility with existing callers; unused
    random_state -- seed forwarded to DataFrame.sample; the default (None)
                    keeps the original unseeded shuffle

    Returns (corpus, labels): corpus is a list with all label-0 documents
    followed by all label-1 documents, labels is the matching numpy array of
    0s and 1s. Also prints the corpus size.
    """
    df = target[['id',article]].sample(frac=1, random_state=random_state)

    #Get balanced dataset: filter each class once, then cut both to the
    #size of the smaller class
    nvAll = df.loc[df[article] == 0, 'id']
    vAll = df.loc[df[article] == 1, 'id']
    minLen = min(len(nvAll), len(vAll))

    nvCorpus = [text[ID][part] for ID in nvAll.iloc[:minLen]]
    vCorpus = [text[ID][part] for ID in vAll.iloc[:minLen]]

    corpus = nvCorpus + vCorpus
    targets = [0]*minLen + [1]*minLen
    print("corpus",len(corpus))

    return corpus, array(targets)

In [21]:
def fitCNN(article,part,max_length,embedding,dimension):
    """
    Cross-validate a 1D-CNN text classifier on top of a frozen embedding.

    Builds a balanced corpus with featureSet(article, part), tokenizes and pads
    it, fills an embedding matrix from `embedding`, then trains and evaluates a
    fresh model on each of 5 stratified folds. Per-fold accuracy and the
    mean (+/- standard deviation) are printed; nothing is returned.

    article    -- label column name, passed to featureSet
    part       -- text part key, passed to featureSet
    max_length -- `maxlen` given to pad_sequences and the Embedding input length
    embedding  -- word -> vector lookup: a dict, a gensim KeyedVectors, or a
                  gensim model that exposes its vectors as `.wv`
    dimension  -- length of each embedding vector (width of the weight matrix)
    """
    #training params (the values actually passed to model.fit below)
    batch_size = 128
    num_epochs = 10

    #model parameters
    num_filters = 64
    weight_decay = 1e-4

    docs,labels = featureSet(article,part,[])

    # prepare tokenizer
    t = Tokenizer()
    t.fit_on_texts(docs)
    # +1 because Tokenizer word indices start at 1; row 0 is left for padding
    vocab_size = len(t.word_index) + 1
    # integer encode the documents
    encoded_docs = t.texts_to_sequences(docs)
    # pad every document (at the end) to max_length tokens
    padded_docs = pad_sequences(encoded_docs, maxlen=max_length, padding='post')

    # gensim models keep their vectors under `.wv`; dicts and KeyedVectors
    # are indexed directly
    embeddings_index = getattr(embedding, 'wv', embedding)

    # create a weight matrix for words in training docs
    embedding_matrix = zeros((vocab_size, dimension))
    for word, i in tqdm(t.word_index.items()):
        try:
            embedding_vector = embeddings_index[word]
        except KeyError:
            # out-of-vocabulary word: keep its all-zero row. Only KeyError is
            # caught so other failures are not silently turned into zeros.
            continue
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector

    X = padded_docs
    Y = labels

    kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=95)
    cvscores = []
    for train, test in kfold.split(X, Y):
        # create model: a new, untrained network for every fold
        embedding_layer = Embedding(vocab_size,
                                    dimension,
                                    weights=[embedding_matrix],
                                    input_length=max_length,
                                    trainable=False)

        model = Sequential()
        model.add(embedding_layer)
        model.add(Conv1D(num_filters, 7, activation='relu', padding='same'))
        model.add(MaxPooling1D(2))
        model.add(Conv1D(num_filters, 7, activation='relu', padding='same'))
        model.add(GlobalMaxPooling1D())
        model.add(Dropout(0.5))
        model.add(Dense(32, activation='relu', kernel_regularizer=regularizers.l2(weight_decay)))
        model.add(Dense(1, activation='sigmoid'))  # single sigmoid unit for the 0/1 label

        adam = optimizers.Adam(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.0)
        model.compile(loss='binary_crossentropy', optimizer=adam, metrics=['accuracy'])

        model.fit(X[train], Y[train], epochs=num_epochs, verbose=0,batch_size=batch_size)

        # evaluate the model on the held-out fold; scores[1] is the accuracy metric
        scores = model.evaluate(X[test], Y[test], verbose=0)
        print("%s: %.2f%%" % (model.metrics_names[1], scores[1]*100))
        cvscores.append(scores[1] * 100)

    print("%.2f%% (+/- %.2f%%)" % (np.mean(cvscores), np.std(cvscores)))

In [22]:
# Article '6', 'procedure' text, 2000-token sequences, 100-d Law2Vec vectors.
fitCNN(article='6', part='procedure', max_length=2000, embedding=law_100, dimension=100)

corpus 1120


100%|██████████| 8118/8118 [00:00<00:00, 222203.54it/s]


acc: 76.34%
acc: 71.43%
acc: 69.64%
acc: 67.41%
acc: 65.18%
70.00% (+/- 3.80%)


In [None]:
def fitModel(article,part,max_length,embedding,dimension):
    """
    Cross-validate a dense (Flatten + Dense) classifier on a frozen embedding.

    Builds a balanced corpus with featureSet(article, part), tokenizes and pads
    it, fills an embedding matrix from `embedding`, then trains and evaluates a
    fresh model on each of 5 stratified folds. Per-fold accuracy and the
    mean (+/- standard deviation) are printed; nothing is returned.

    article    -- label column name, passed to featureSet
    part       -- text part key, passed to featureSet
    max_length -- `maxlen` given to pad_sequences and the Embedding input length
    embedding  -- either the file name of an embedding inside the embeddings
                  directory (loaded here), or an already-loaded word -> vector
                  lookup (dict, gensim KeyedVectors, or gensim model with `.wv`)
    dimension  -- length of each embedding vector (width of the weight matrix)
    """
    docs,labels = featureSet(article,part,[])

    # prepare tokenizer
    t = Tokenizer()
    t.fit_on_texts(docs)
    # +1 because Tokenizer word indices start at 1; row 0 is left for padding
    vocab_size = len(t.word_index) + 1
    # integer encode the documents
    encoded_docs = t.texts_to_sequences(docs)
    # pad every document (at the end) to max_length tokens
    padded_docs = pad_sequences(encoded_docs, maxlen=max_length, padding='post')

    if isinstance(embedding, str):
        embed_path = "/Users/conorosully/Documents/Legal-Case-Prediction/data/embeddings/{}"
        try:
            embeddings_index = Word2Vec.load(embed_path.format(embedding))
        except Exception:
            # Not loadable as a saved gensim model: fall back to the word2vec
            # text format. `Exception` (not a bare except) so Ctrl-C still works.
            embeddings_index = gensim.models.KeyedVectors.load_word2vec_format(embed_path.format(embedding), binary=False)
        # Per the gensim docs, replace=True keeps only the normalized vectors.
        embeddings_index.init_sims(replace=True)
    else:
        # Pre-loaded lookup: used as given, without mutating the caller's object
        embeddings_index = embedding

    # gensim models keep their vectors under `.wv`; dicts and KeyedVectors
    # are indexed directly
    embeddings_index = getattr(embeddings_index, 'wv', embeddings_index)

    # create a weight matrix for words in training docs
    embedding_matrix = zeros((vocab_size, dimension))
    for word, i in tqdm(t.word_index.items()):
        try:
            embedding_vector = embeddings_index[word]
        except KeyError:
            # out-of-vocabulary word: keep its all-zero row. Only KeyError is
            # caught so other failures are not silently turned into zeros.
            continue
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector

    X = padded_docs
    Y = labels

    kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=90)
    cvscores = []
    for train, test in kfold.split(X, Y):
        # create model: a new, untrained network for every fold
        model = Sequential()
        e = Embedding(vocab_size, dimension, weights=[embedding_matrix], input_length=max_length, trainable=False)
        model.add(e)
        model.add(Flatten())
        model.add(Dense(100, activation='relu'))
        model.add(Dense(1, activation='sigmoid'))
        # Compile model
        model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])

        # Fit the model
        model.fit(X[train], Y[train], epochs=50, verbose=0,batch_size=20)
        # evaluate the model on the held-out fold; scores[1] is the accuracy metric
        scores = model.evaluate(X[test], Y[test], verbose=0)
        print("%s: %.2f%%" % (model.metrics_names[1], scores[1]*100))
        cvscores.append(scores[1] * 100)
    print("%.2f%% (+/- %.2f%%)" % (np.mean(cvscores), np.std(cvscores)))