# Create feature sets
Builds n-gram count feature sets (one CSV per article, case part and stop-word setting).

In [1]:
#Imports
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

import json
import pickle

import functions 
# Instantiate `classification` from the project-local `functions` module.
# NOTE(review): `classif` is not referenced in the cells visible here - confirm it is used further down.
classif = functions.classification()

In [2]:
#Open cases
# Template for the cleaned JSON files; the placeholder is filled with the file stem.
path = "/Users/conorosully/Documents/Legal-Case-Prediction/data/clean/{}.json"

# The with-statement closes the file on exit, so no explicit close() is needed.
with open(path.format('text_alpha'), 'r') as readfile:
    text = json.load(readfile)

# Sanity checks: number of cases, the first case ID, and the number of
# entries stored for one known case.
print(len(text))
print(list(text)[0])
print(len(text['HUDOC-ECHR-2012-001-110881']))

8703
HUDOC-ECHR-1999-001-58225
7


In [3]:
#Check target
# Read the target table from CSV, print its row count and preview the first rows.
target = pd.read_csv(
    '/Users/conorosully/Documents/Legal-Case-Prediction/data/clean/target.csv'
)
print(target.shape[0])
target.head()

8703


Unnamed: 0,id,date,2,3,5,6,7,8,9,10,...,19,34,35,37,41,46,P1,P4,P12,P7
0,HUDOC-ECHR-1999-001-58225,1999-03-25,-1,-1,0,-1,-1,-1,-1,-1,...,-1,-1,-1,-1,2,-1,-1,-1,-1,-1
1,HUDOC-ECHR-1999-001-58226,1999-03-25,-1,-1,-1,0,-1,-1,-1,-1,...,-1,-1,-1,-1,2,-1,-1,-1,-1,-1
2,HUDOC-ECHR-1999-001-58227,1999-03-25,-1,-1,-1,2,-1,2,-1,-1,...,-1,-1,2,-1,2,-1,2,-1,-1,-1
3,HUDOC-ECHR-1999-001-58239,1999-04-29,-1,-1,0,-1,-1,-1,-1,-1,...,-1,-1,2,-1,2,-1,-1,-1,-1,-1
4,HUDOC-ECHR-1999-001-58251,1999-05-20,0,-1,-1,-1,-1,-1,-1,-1,...,-1,-1,2,-1,2,-1,-1,-1,-1,-1


In [4]:
# Load the pickled legal stop-word collection; the with-statement makes sure
# the file handle is closed (the bare open() call used before never closed it).
# NOTE(review): pickle.load can execute arbitrary code - only load this file
# if it comes from a trusted source.
with open("/Users/conorosully/Documents/Legal-Case-Prediction/data/clean/stopwords.pickle", "rb") as stopwords_file:
    legal_stopwords = pickle.load(stopwords_file)
len(legal_stopwords)

215

In [5]:
def featureSet(article,part,max_features,ngram_range,stop_words):
    """
    Return a class-balanced n-gram count feature set for one article and part.

    Reads two module-level objects: the ``target`` DataFrame (an ``id`` column
    plus one column per article) and the ``text`` mapping, indexed as
    ``text[case_id][part]``.

    Parameters
    ----------
    article : str
        Name of the column in ``target`` to balance on. Rows where it equals 0
        are labelled 'nonviolation', rows where it equals 1 'violation'.
    part : str
        Key of the case section whose text is vectorised.
    max_features : int or None
        Passed to ``CountVectorizer(max_features=...)``.
    ngram_range : tuple of (int, int)
        Passed to ``CountVectorizer(ngram_range=...)``.
    stop_words : 'english', collection of str, or None
        Passed to ``CountVectorizer(stop_words=...)``. Collections that are not
        lists (e.g. a set) are converted to a list first.

    Returns
    -------
    pandas.DataFrame
        One row per selected case (all non-violations first, then all
        violations), one count column per n-gram and a 'target' column
        holding the class label.

    Raises
    ------
    ValueError
        If ``article`` has no rows equal to 0 or no rows equal to 1.
    """
    df = target[['id',article]]

    #Get balanced dataset: keep the first minLen rows of each class, in frame order
    nonviolations = df[df[article] == 0]
    violations = df[df[article] == 1]
    minLen = min(len(nonviolations), len(violations))
    if minLen == 0:
        # Fail early with a clear message instead of the vectoriser's
        # "empty vocabulary" error on an empty corpus.
        raise ValueError(
            "No balanced dataset for article {!r}: {} non-violation and {} violation cases".format(
                article, len(nonviolations), len(violations)))

    nvID = nonviolations['id'].head(minLen)
    vID = violations['id'].head(minLen)

    corpus = [text[case_id][part] for case_id in nvID] + [text[case_id][part] for case_id in vID]
    targets = ['nonviolation']*minLen + ['violation']*minLen

    #Vectorise
    # Recent scikit-learn releases only accept 'english', a list or None here.
    if stop_words is not None and not isinstance(stop_words, (str, list)):
        stop_words = list(stop_words)
    vectorizer = CountVectorizer(max_features=max_features, ngram_range=ngram_range, stop_words=stop_words)
    X = vectorizer.fit_transform(corpus)

    #Features
    # get_feature_names() was removed in scikit-learn 1.2; prefer the
    # replacement and fall back for older installations.
    get_names = getattr(vectorizer, 'get_feature_names_out', None)
    if get_names is None:
        get_names = vectorizer.get_feature_names
    # Densify exactly once (the matrix can be large).
    features = pd.DataFrame(data=X.toarray(), columns=get_names())
    # NOTE(review): an n-gram that is literally 'target' would be overwritten
    # by the label column below.
    features['target'] = targets

    return features
    
# Example settings for building a single feature set by hand.
article, part = '6', 'facts'
max_features, ngram_range = 2000, (1,4)


# NOTE: the call below predates the stop_words argument of featureSet and
# would need it added before being re-enabled.
#features = featureSet(article,part,max_features,ngram_range)

#features.to_csv()
#features.head()

### English stopwords

In [6]:
# Feature sets built with the vectoriser's built-in 'english' stop-word list.
file = 'english'
stop_words = 'english'

# Fill in the stop-word folder now; the two re-inserted '{}' placeholders are
# filled with article and part when each file is written.
path = "/Users/conorosully/Documents/Legal-Case-Prediction/data/features/{}/article{}_{}.csv".format(file,'{}','{}')
print(path)

# Vectoriser settings and the article/part grid to export.
max_features, ngram_range = 2000, (1,4)
articles = ['3','6','8']
parts = ["full","procedure","facts","circumstances","relevant","law","other"]

# Write one CSV per (article, part) pair, printing progress as we go.
for article in articles:
    print(article)
    for part in parts:
        print(part)
        features = featureSet(article,part,max_features,ngram_range,stop_words)
        features.to_csv(path.format(article,part),index=False)

/Users/conorosully/Documents/Legal-Case-Prediction/data/features/english/article{}_{}.csv
3
full
procedure
facts
circumstances
relevant
law
other
6
full
procedure
facts
circumstances
relevant
law
other
8
full
procedure
facts
circumstances
relevant
law
other


### Legal stopwords

In [7]:
# Feature sets built with the legal stop-word collection loaded earlier.
file = 'legal'
stop_words = legal_stopwords

# Fill in the stop-word folder now; the two re-inserted '{}' placeholders are
# filled with article and part when each file is written.
path = "/Users/conorosully/Documents/Legal-Case-Prediction/data/features/{}/article{}_{}.csv".format(file,'{}','{}')
print(path)

# Vectoriser settings and the article/part grid to export.
max_features, ngram_range = 2000, (1,4)
articles = ['3','6','8']
parts = ["full","procedure","facts","circumstances","relevant","law","other"]

# Write one CSV per (article, part) pair, printing progress as we go.
for article in articles:
    print(article)
    for part in parts:
        print(part)
        features = featureSet(article,part,max_features,ngram_range,stop_words)
        features.to_csv(path.format(article,part),index=False)

/Users/conorosully/Documents/Legal-Case-Prediction/data/features/legal/article{}_{}.csv
3
full
procedure
facts
circumstances
relevant
law
other
6
full
procedure
facts
circumstances
relevant
law
other
8
full
procedure
facts
circumstances
relevant
law
other


### No stopwords

In [8]:
# Feature sets built without any stop-word filtering.
file = 'none'
stop_words = None

# Fill in the stop-word folder now; the two re-inserted '{}' placeholders are
# filled with article and part when each file is written.
path = "/Users/conorosully/Documents/Legal-Case-Prediction/data/features/{}/article{}_{}.csv".format(file,'{}','{}')
print(path)

# Vectoriser settings and the article/part grid to export.
max_features, ngram_range = 2000, (1,4)
articles = ['3','6','8']
parts = ["full","procedure","facts","circumstances","relevant","law","other"]

# Write one CSV per (article, part) pair, printing progress as we go.
for article in articles:
    print(article)
    for part in parts:
        print(part)
        features = featureSet(article,part,max_features,ngram_range,stop_words)
        features.to_csv(path.format(article,part),index=False)

/Users/conorosully/Documents/Legal-Case-Prediction/data/features/none/article{}_{}.csv
3
full
procedure
facts
circumstances
relevant
law
other
6
full
procedure
facts
circumstances
relevant
law
other
8
full
procedure
facts
circumstances
relevant
law
other


### Get number of documents