In [None]:
import re
import warnings

import joblib
import lime
import numpy as np
import pandas as pd
import sklearn.ensemble
from imblearn.under_sampling import RandomUnderSampler
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split,StratifiedKFold
from sklearn.model_selection import RandomizedSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from xgboost import XGBClassifier

warnings.filterwarnings("ignore")
%matplotlib inline


In [None]:
import html,nltk
from nltk.corpus import wordnet 
from collections import Counter 
from string import digits

# Lazily populated cache so the NLTK corpus is not re-read for every document.
_DUTCH_STOPWORDS = None


def _dutch_stopwords():
    """Return the NLTK Dutch stop words as a frozenset, loading them only once."""
    global _DUTCH_STOPWORDS
    if _DUTCH_STOPWORDS is None:
        _DUTCH_STOPWORDS = frozenset(stopwords.words('dutch'))
    return _DUTCH_STOPWORDS


def text_cleaning(text, escape_list=None, stop=None):
    """
    Normalise a raw text string into a space-separated string of tokens.

    Steps: decode HTML entities, lower-case, replace every run of characters
    outside A-Z/a-z with a single space, tokenize with ``nltk.word_tokenize``
    and drop Dutch stop words plus any extra words given in ``stop``.

    Parameters
    ----------
    text : str
        Raw document text.
    escape_list : iterable, optional
        Currently unused; kept so existing callers keep working.
    stop : iterable of str, optional
        Additional stop words to remove on top of the NLTK Dutch list.

    Returns
    -------
    str
        The remaining tokens joined by single spaces.
    """
    # Decode entities *before* lower-casing: lower-casing first corrupts
    # case-sensitive entity names and lets letters decoded from numeric
    # entities (e.g. "&#65;" -> "A") slip through in upper case.
    text = html.unescape(text).lower()

    # NOTE(review): this also strips accented letters (e.g. "ë", "é"), which
    # splits some Dutch words in two -- confirm that this is intended.
    # It already removes '/', '?', ',' and "'", so no further replaces are needed.
    text = re.sub('[^A-Za-z]+', ' ', text)

    # A set gives O(1) membership tests instead of scanning a list per token.
    custom_stop = _dutch_stopwords()
    if stop:
        custom_stop = custom_stop | set(stop)

    tokens = nltk.word_tokenize(text)
    return ' '.join(token for token in tokens if token not in custom_stop)


In [None]:
#Load data
df = pd.read_csv("../data/court_cases.csv", lineterminator='\n', index_col=0)
# Concatenate both text parts; a missing value in either column yields NaN,
# and those rows are dropped on the next line.
df['Full Text'] = df['process'] + ' ' + df['considerations']
df = df.dropna(subset=['Full Text'])
# Shuffle with a fixed seed so the row order (and therefore the undersampling
# and cross-validation folds further down) is reproducible across runs.
df = df.sample(frac=1, random_state=42).reset_index(drop=True)
df['Full Text'] = df['Full Text'].apply(text_cleaning)

In [None]:
# Resample the complete dataset with a seeded RandomUnderSampler and store
# the result, with the outcome column re-attached, as `df_full`.
rus = RandomUnderSampler(random_state=42)
X = df[['process', 'considerations', 'instance', 'Full Text']]
y = df[['outcome']]
X_rus, y_rus = rus.fit_resample(X, y)
X_rus['outcome'] = y_rus
df_full = X_rus

#### Full dataset

In [None]:
# Model input is the cleaned text; the target is the outcome column.
X = df_full['Full Text']
y = df_full['outcome']

In [None]:
# Search space for the TF-IDF + LinearSVC pipeline (reused by the later SVM cells).
params = {
    'tfidf__max_df': (0.25, 0.5, 0.75),
    'tfidf__ngram_range': [(1, 1), (1, 2), (1, 3)],
    'svm__C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
}

tfidf = TfidfVectorizer(sublinear_tf=True, norm='l2')
svm = LinearSVC(random_state=42)
pipeline = Pipeline(steps=[('tfidf', tfidf), ('svm', svm)])
# shuffle=True is needed for random_state to have any effect; recent
# scikit-learn versions raise a ValueError when it is set without shuffling.
skf_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
rsc = RandomizedSearchCV(pipeline, params, cv=skf_cv, scoring='accuracy', random_state=42, n_jobs=-1)
rsc.fit(X, y)
print(f'Best CrossValidated accuracy achieved via SVM is : {round(rsc.best_score_*100,2)} %')
# Best params for SVM is:
rsc.best_params_

In [None]:
# Search space for the TF-IDF + XGBoost pipeline (reused by the later XGBoost cells).
params_xgb = {
    'tfidf__max_df': (0.25, 0.5, 0.75),
    'tfidf__ngram_range': [(1, 1), (1, 2), (1, 3)],
    'xgb__eta': [0.01, 0.015, 0.025, 0.05, 0.1],
    'xgb__gamma': [0.05, .1, .3, .5, .7, .9, 1],
    'xgb__max_depth': [3, 5, 7, 9, 12, 15, 17, 25, 50, 100],
    'xgb__min_child_weight': [1, 3, 5, 7],
    'xgb__subsample': [0.6, .7, .8, .9, 1],
    'xgb__colsample_bytree': [.6, .7, .8, .9, 1],
    'xgb__lambda': [0.01, .1, 1],
    'xgb__alpha': [0, .1, .5, 1]
}

tfidf = TfidfVectorizer(sublinear_tf=True, norm='l2')
# XGBoost parameter names are case-sensitive: the previous `TREE_METHOD`
# keyword was not recognised, so the GPU tree method was never applied.
# NOTE(review): 'gpu_hist' needs a CUDA-enabled XGBoost build; use 'hist' on
# CPU-only machines (newer releases spell this tree_method='hist', device='cuda').
xgb = XGBClassifier(tree_method='gpu_hist', random_state=42)
pipeline_xgb = Pipeline(steps=[('tfidf', tfidf), ('xgb', xgb)])
# shuffle=True is needed for random_state to have any effect; recent
# scikit-learn versions raise a ValueError when it is set without shuffling.
skf_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
rsc_xgb = RandomizedSearchCV(pipeline_xgb, params_xgb, cv=skf_cv, scoring='accuracy', random_state=42, n_jobs=-1)
rsc_xgb.fit(X, y)
print(f'Best CrossValidated accuracy achieved via XGBoost is : {round(rsc_xgb.best_score_*100,2)} %\n')
# Best params for XGBoost is:
# (previously displayed `rsc.best_params_`, i.e. the SVM search result)
rsc_xgb.best_params_

#### Only court cases at Rechtbank Den Haag

In [None]:
# Keep only the rows whose `instance` equals "['Rechtbank Den Haag']", then
# resample that subset with the shared RandomUnderSampler `rus`.
df_lower_court = df.loc[df['instance'] == "['Rechtbank Den Haag']"]
X = df_lower_court[['process', 'considerations', 'instance', 'Full Text']]
y = df_lower_court[['outcome']]
X_rus, y_rus = rus.fit_resample(X, y)
X_rus['outcome'] = y_rus
df_lower_court = X_rus

In [None]:
# Shuffle the resampled rows with a fixed seed so the row order, and with it
# the cross-validation folds below, is reproducible.
df_lower_court = df_lower_court.sample(frac=1, random_state=42).reset_index(drop=True)

In [None]:
# Model input is the cleaned text; the target is the outcome column.
X = df_lower_court['Full Text']
y = df_lower_court['outcome']

In [None]:
# TF-IDF + LinearSVC search on the lower-court subset, reusing the `params`
# search space defined in the full-dataset SVM cell.
tfidf = TfidfVectorizer(sublinear_tf=True, norm='l2')
svm = LinearSVC(random_state=42)
pipeline = Pipeline(steps=[('tfidf', tfidf), ('svm', svm)])
# shuffle=True is needed for random_state to have any effect; recent
# scikit-learn versions raise a ValueError when it is set without shuffling.
skf_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
rsc = RandomizedSearchCV(pipeline, params, cv=skf_cv, scoring='accuracy', random_state=42, n_jobs=-1)
rsc.fit(X, y)
print(f'Best CrossValidated accuracy achieved via SVM is : {round(rsc.best_score_*100,2)} %')
# Best params for SVM is:
rsc.best_params_

In [None]:
# TF-IDF + XGBoost search on the lower-court subset, reusing the `params_xgb`
# search space defined in the full-dataset XGBoost cell.
tfidf = TfidfVectorizer(sublinear_tf=True, norm='l2')
# XGBoost parameter names are case-sensitive: the previous `TREE_METHOD`
# keyword was not recognised, so the GPU tree method was never applied.
# NOTE(review): 'gpu_hist' needs a CUDA-enabled XGBoost build; use 'hist' on
# CPU-only machines (newer releases spell this tree_method='hist', device='cuda').
xgb = XGBClassifier(tree_method='gpu_hist', random_state=42)
pipeline_xgb = Pipeline(steps=[('tfidf', tfidf), ('xgb', xgb)])
# shuffle=True is needed for random_state to have any effect; recent
# scikit-learn versions raise a ValueError when it is set without shuffling.
skf_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
rsc_xgb = RandomizedSearchCV(pipeline_xgb, params_xgb, cv=skf_cv, scoring='accuracy', random_state=42, n_jobs=-1)
rsc_xgb.fit(X, y)
print(f'Best CrossValidated accuracy achieved via XGBoost is : {round(rsc_xgb.best_score_*100,2)} %\n')
# Best params for XGBoost is:
# (previously displayed `rsc.best_params_`, i.e. the SVM search result)
rsc_xgb.best_params_

#### Only court cases at Raad van State

In [None]:
# Keep only the rows whose `instance` equals "['Raad van State']", then
# resample that subset with the shared RandomUnderSampler `rus`.
df_higher_court = df.loc[df['instance'] == "['Raad van State']"]
X = df_higher_court[['process', 'considerations', 'instance', 'Full Text']]
y = df_higher_court[['outcome']]
X_rus, y_rus = rus.fit_resample(X, y)
X_rus['outcome'] = y_rus
df_higher_court = X_rus

In [None]:
# Shuffle the resampled rows with a fixed seed so the row order, and with it
# the cross-validation folds below, is reproducible.
df_higher_court = df_higher_court.sample(frac=1, random_state=42).reset_index(drop=True)

In [None]:
# Model input is the cleaned text; the target is the outcome column.
X = df_higher_court['Full Text']
y = df_higher_court['outcome']

In [None]:
# TF-IDF + LinearSVC search on the higher-court subset, reusing the `params`
# search space defined in the full-dataset SVM cell.
tfidf = TfidfVectorizer(sublinear_tf=True, norm='l2')
svm = LinearSVC(random_state=42)
pipeline = Pipeline(steps=[('tfidf', tfidf), ('svm', svm)])
# shuffle=True is needed for random_state to have any effect; recent
# scikit-learn versions raise a ValueError when it is set without shuffling.
skf_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
rsc = RandomizedSearchCV(pipeline, params, cv=skf_cv, scoring='accuracy', random_state=42, n_jobs=-1)
rsc.fit(X, y)
print(f'Best CrossValidated accuracy achieved via SVM is : {round(rsc.best_score_*100,2)} %')
# Best params for SVM is:
rsc.best_params_

In [None]:
# TF-IDF + XGBoost search on the higher-court subset, reusing the `params_xgb`
# search space defined in the full-dataset XGBoost cell.
tfidf = TfidfVectorizer(sublinear_tf=True, norm='l2')
# XGBoost parameter names are case-sensitive: the previous `TREE_METHOD`
# keyword was not recognised, so the GPU tree method was never applied.
# NOTE(review): 'gpu_hist' needs a CUDA-enabled XGBoost build; use 'hist' on
# CPU-only machines (newer releases spell this tree_method='hist', device='cuda').
xgb = XGBClassifier(tree_method='gpu_hist', random_state=42)
pipeline_xgb = Pipeline(steps=[('tfidf', tfidf), ('xgb', xgb)])
# shuffle=True is needed for random_state to have any effect; recent
# scikit-learn versions raise a ValueError when it is set without shuffling.
skf_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
rsc_xgb = RandomizedSearchCV(pipeline_xgb, params_xgb, cv=skf_cv, scoring='accuracy', random_state=42, n_jobs=-1)
rsc_xgb.fit(X, y)
print(f'Best CrossValidated accuracy achieved via XGBoost is : {round(rsc_xgb.best_score_*100,2)} %\n')
# Best params for XGBoost is:
# (previously displayed `rsc.best_params_`, i.e. the SVM search result)
rsc_xgb.best_params_