Used to create embedding feature sets from various word embeddings

The same process as for n-gram features initially. i.e. get a balanced dataset.
<br>
Then stopwords are removed from the documents. This is done for three variations of stopwords as with n-gram feature selection. 
<br>
Then for each document a mean embedding vector is created using the various differnet embeddings. This is done by cycling through the document word by word adding the vector associated to that word to a list. Then after all the words have been cycled throught the average by column is taken for all the vectors. 

<br>
These features are then used in the same way as the n-gram features to fit models. 
<br> 
The embeddings included are:
<ul>
    <li> law2vec 100 dimensions 
    <li> law2vec 200 dimensions
    <li> various wiki 
    <li> custom echt2vec 100,200 and 500 
</ul>

In [2]:
import numpy as np 
import pandas as pd 

import nltk
import json


import gensim
from gensim.models import Word2Vec

In [21]:
embed_path = "/Users/conorosully/Documents/Legal-Case-Prediction/data/embeddings/{}"
#embeddings = gensim.models.KeyedVectors.load_word2vec_format(embed_path.format("echt2vec_100.txt"), binary=False)

embeddings = Word2Vec.load(embed_path.format('echt2vec_500.txt'))
embeddings.init_sims(replace=True)

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [5]:
target = pd.read_csv('/Users/conorosully/Documents/Legal-Case-Prediction/data/clean/target.csv')
print(len(target))
target.head()

8703


Unnamed: 0,id,date,2,3,5,6,7,8,9,10,...,19,34,35,37,41,46,P1,P4,P12,P7
0,HUDOC-ECHR-1999-001-58225,1999-03-25,-1,-1,0,-1,-1,-1,-1,-1,...,-1,-1,-1,-1,2,-1,-1,-1,-1,-1
1,HUDOC-ECHR-1999-001-58226,1999-03-25,-1,-1,-1,0,-1,-1,-1,-1,...,-1,-1,-1,-1,2,-1,-1,-1,-1,-1
2,HUDOC-ECHR-1999-001-58227,1999-03-25,-1,-1,-1,2,-1,2,-1,-1,...,-1,-1,2,-1,2,-1,2,-1,-1,-1
3,HUDOC-ECHR-1999-001-58239,1999-04-29,-1,-1,0,-1,-1,-1,-1,-1,...,-1,-1,2,-1,2,-1,-1,-1,-1,-1
4,HUDOC-ECHR-1999-001-58251,1999-05-20,0,-1,-1,-1,-1,-1,-1,-1,...,-1,-1,2,-1,2,-1,-1,-1,-1,-1


In [6]:
#Open cases
path = "/Users/conorosully/Documents/Legal-Case-Prediction/data/clean/{}.json"

with open(path.format('text_alpha'), 'r') as readfile:
    text = json.load(readfile)
    readfile.close()
    
print(len(text.keys()))
print(list(text.keys())[0])
print(len(text['HUDOC-ECHR-2012-001-110881']))


8703
HUDOC-ECHR-1999-001-58225
7


In [7]:
def featureSet(article,part,stop_words = None):
    """
    returns the feature set corrisponding to article and part 
    """
    df = target[['id',article]]
    
    #Get balanced dataset
    nvLen = len(df[df[article] == 0]) 
    vLen = len(df[df[article] == 1])
    minLen = min(nvLen,vLen)

    nvID = df[df[article] == 0][0:minLen]['id']
    vID = df[df[article] == 1][0:minLen]['id']
    
    nvCorpus = []
    vCorpus = []
    for ID in nvID:
        nvCorpus.append(text[ID][part])

    for ID in vID:
        vCorpus.append(text[ID][part])
        
    corpus = nvCorpus + vCorpus
    targets = ['nonviolation']*minLen + ['violation']*minLen
    
    
    docs_vectors = pd.DataFrame() # creating empty final dataframe
    #stopwords = nltk.corpus.stopwords.words('english') # removing stop words
    for doc in corpus:
        temp = pd.DataFrame()  # creating a temporary dataframe(store value for 1st doc & for 2nd doc remove the details of 1st & proced through 2nd and so on..)
        for word in doc.split(' '): # looping through each word of a single document and spliting through space
            if word not in stop_words: # if word is not present in stopwords then (try)
                try:
                    word_vec = embeddings[word] # if word is present in embeddings(goole provides weights associate with words(300)) then proceed
                    temp = temp.append(pd.Series(word_vec), ignore_index = True) # if word is present then append it to temporary dataframe
                except:
                    pass
        doc_vector = temp.mean() # take the average of each column(w0, w1, w2,........w300)
        docs_vectors = docs_vectors.append(doc_vector, ignore_index = True) # append each document value to the final dataframe
    
    docs_vectors['target'] = targets
        
    return docs_vectors    

In [25]:
article = '6'
part = 'procedure'
features = featureSet(article,part,[])



In [23]:
features.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,491,492,493,494,495,496,497,498,499,target
0,0.011321,0.01446,-0.016181,0.001387,-0.019687,-0.016117,-0.036738,0.025446,-0.013823,0.003356,...,0.020419,0.021468,0.00725,-0.00032,0.012819,0.018111,0.028388,0.027493,0.003604,nonviolation
1,0.009993,0.006624,-0.014365,0.000621,-0.016161,-0.016264,-0.036446,0.028414,-0.012915,-0.001069,...,0.025133,0.03153,0.010492,0.002114,0.014982,0.014008,0.027591,0.024884,-0.003666,nonviolation
2,0.010176,0.015237,-0.01358,0.001344,-0.01526,-0.018593,-0.036681,0.02462,-0.016559,0.002768,...,0.019496,0.020421,0.009352,0.00078,0.011334,0.016373,0.025633,0.024931,0.002685,nonviolation
3,0.012798,0.017465,-0.016626,0.001322,-0.01692,-0.018701,-0.036673,0.027618,-0.01454,0.003669,...,0.019341,0.020002,0.008825,-0.000664,0.009478,0.022386,0.028831,0.028553,0.006616,nonviolation
4,0.006456,0.004688,-0.008301,0.002175,-0.015984,-0.013255,-0.031683,0.024989,-0.013704,-0.003329,...,0.019678,0.024906,0.004739,0.003573,0.01665,0.012128,0.020167,0.023561,-0.002069,nonviolation


### Fit models

In [10]:
#Imports
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

#import autosklearn.classification

from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

import json

import functions 
classif = functions.classification()

In [11]:
def trainTest(features):
    """Return train test split from features"""
    #print(features['target'])
    X = features.drop(columns='target')
    X=(X-X.mean())/X.std() #Normalise
    y = features['target']
    return train_test_split(X, y, test_size=0.10, random_state=90)

def k_fold(X_train, y_train,C,cv = 10,kernel='linear'):
    """
    Returns k_fold accuracy for given C parameter
    """
    model = SVC(kernel=kernel, C=C)
    scores = cross_val_score(model, X_train, y_train, cv=cv)
    accuracy = scores.mean()
    return accuracy

In [12]:
#Article 3 - ECHR 100
X_train, X_test, y_train, y_test = trainTest(features)
hyper_c = [0.0001,0.001,0.01,0.1,1,10]
for C in hyper_c:
    accuracy = k_fold(X_train, y_train,C)
    print(C, accuracy)

0.0001 0.5066666666666666
0.001 0.728818620992534
0.01 0.766453667105841
0.1 0.7865107597716292
1 0.7536144049187528
10 0.7179995608256476


In [18]:
#Article 3 - ECHR 200
X_train, X_test, y_train, y_test = trainTest(features)
hyper_c = [0.0001,0.001,0.01,0.1,1,10]
for C in hyper_c:
    accuracy = k_fold(X_train, y_train,C)
    print(C, accuracy)

0.0001 0.6403293807641633
0.001 0.7643785682916117
0.01 0.7930764163372859
0.1 0.7822134387351778
1 0.7643302591128677
10 0.7688735177865613


In [24]:
#Article 3 - ECHR 500
X_train, X_test, y_train, y_test = trainTest(features)
hyper_c = [0.0001,0.001,0.01,0.1,1,10]
for C in hyper_c:
    accuracy = k_fold(X_train, y_train,C)
    print(C, accuracy)

0.0001 0.722538427755819
0.001 0.7929776021080369
0.01 0.817584541062802
0.1 0.7490206411945542
1 0.7179468599033817
10 0.7179468599033817


In [48]:
#Artcile 3 - 100
X_train, X_test, y_train, y_test = trainTest(features)
hyper_c = [0.0001,0.001,0.01,0.1,1,10]
for C in hyper_c:
    accuracy = k_fold(X_train, y_train,C)
    print(C, accuracy)

0.0001 0.5044444444444445
0.001 0.6957180500658762
0.01 0.7198243302591129
0.1 0.7094005270092227
1 0.7291106719367588
10 0.7225340360122969


In [55]:
#Artcile 3 - 200
X_train, X_test, y_train, y_test = trainTest(features)
hyper_c = [0.0001,0.001,0.01,0.1,1,10]
for C in hyper_c:
    accuracy = k_fold(X_train, y_train,C)
    print(C, accuracy)

0.0001 0.5199538866930171
0.001 0.7528173034694774
0.01 0.7798858146684233
0.1 0.7669433465085639
1 0.7510430390865173
10 0.7533135704874836


In [15]:
#Article 6 - ECHR 100
X_train, X_test, y_train, y_test = trainTest(features)
hyper_c = [0.0001,0.001,0.01,0.1,1,10]
for C in hyper_c:
    accuracy = k_fold(X_train, y_train,C)
    print(C, accuracy)

0.0001 0.7372532587441001
0.001 0.8293370410277061
0.01 0.8414597867531552
0.1 0.8367444484351136
1 0.8221235438592671
10 0.8148208008184499


In [20]:
#Article 6 - ECHR 200
X_train, X_test, y_train, y_test = trainTest(features)
hyper_c = [0.0001,0.001,0.01,0.1,1,10]
for C in hyper_c:
    accuracy = k_fold(X_train, y_train,C)
    print(C, accuracy)

0.0001 0.795410004970197
0.001 0.8498924333286171
0.01 0.8595607361696107
0.1 0.8329070573168916
1 0.8099563565916783
10 0.8063720101725783


In [26]:
#Article 6 - ECHR 500
X_train, X_test, y_train, y_test = trainTest(features)
hyper_c = [0.0001,0.001,0.01,0.1,1,10]
for C in hyper_c:
    accuracy = k_fold(X_train, y_train,C)
    print(C, accuracy)

0.0001 0.8353607746251491
0.001 0.8655844697670538
0.01 0.8632343284610892
0.1 0.83177606541795
1 0.8269571510976155
10 0.8269571510976155


In [45]:
#Article 6 - 100
X_train, X_test, y_train, y_test = trainTest(features)
hyper_c = [0.0001,0.001,0.01,0.1,1,10]
for C in hyper_c:
    accuracy = k_fold(X_train, y_train,C)
    print(C, accuracy)

0.0001 0.774883998505313
0.001 0.8256333373240026
0.01 0.8390641155407538
0.1 0.8282802392950301
1 0.8221378739891817
10 0.8075915223677003


In [52]:
#Article 6 - 200
X_train, X_test, y_train, y_test = trainTest(features)
hyper_c = [0.0001,0.001,0.01,0.1,1,10]
for C in hyper_c:
    accuracy = k_fold(X_train, y_train,C)
    print(C, accuracy)

0.0001 0.808810671774723
0.001 0.8451326534684357
0.01 0.8559459155501863
0.1 0.8486876140515086
1 0.7991563362755449
10 0.7771595868569128
