In [None]:
import pandas as pd
import numpy as np
import sklearn.ensemble
from sklearn.metrics import confusion_matrix,classification_report,accuracy_score
import re
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split,StratifiedKFold
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.pipeline import make_pipeline
from sklearn.calibration import CalibratedClassifierCV
from sklearn.model_selection import RandomizedSearchCV
import lime
from lime import lime_text
from lime.lime_text import LimeTextExplainer
from sklearn.pipeline import make_pipeline,Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
import matplotlib.pyplot as plt
from sklearn.model_selection import cross_val_score
import seaborn as sns
import warnings
from imblearn.under_sampling import RandomUnderSampler
import joblib
from nltk import word_tokenize
warnings.filterwarnings("ignore")
%matplotlib inline

In [None]:
import html,nltk
from nltk.corpus import wordnet 
from collections import Counter 
from string import digits

_DUTCH_STOPWORDS = None  # lazily-filled cache so the corpus is read once, not once per document


def text_cleaning(text, escape_list=None, stop=None):
    """
    Normalise one raw document into a space-separated string of tokens.

    Steps, in order: decode HTML entities, lower-case, replace every run of
    non ASCII-letter characters with a single space, tokenise with NLTK and
    drop Dutch stop words (plus any caller-supplied ones).

    Parameters
    ----------
    text : str
        Raw document text.
    escape_list : list, optional
        Accepted for backward compatibility; not used by this function.
    stop : iterable of str, optional
        Extra (lower-case) stop words to remove on top of NLTK's Dutch list.

    Returns
    -------
    str
        The remaining tokens joined by single spaces.
    """
    global _DUTCH_STOPWORDS
    if _DUTCH_STOPWORDS is None:
        _DUTCH_STOPWORDS = frozenset(stopwords.words('dutch'))

    # A set gives O(1) membership tests; the original list was scanned per token.
    custom_stop = _DUTCH_STOPWORDS.union(stop) if stop else _DUTCH_STOPWORDS

    # Unescape BEFORE lower-casing: entity names are case-sensitive, and letters
    # produced by numeric entities (e.g. "&#65;") must be lower-cased as well.
    text = html.unescape(text).lower()

    # Only ASCII letters survive, so no further punctuation stripping is needed.
    # NOTE(review): accented letters are dropped too, which splits words that
    # contain them — confirm this is intended for the Dutch corpus.
    text = re.sub('[^A-Za-z]+', ' ', text)

    tokens = nltk.word_tokenize(text)
    return ' '.join(token for token in tokens if token not in custom_stop)

In [None]:
# Load the raw court cases and build the text column used by every model below.
df = pd.read_csv("../data/court_cases.csv", lineterminator='\n', index_col=0)

# String concatenation yields NaN when either part is missing, so the dropna
# below removes rows lacking 'process' or 'considerations'.
df['Full Text'] = df['process'] + ' ' + df['considerations']

# Seeded shuffle: without random_state the row order (and therefore the rows
# picked by the seeded undersampler further down) changes on every run.
df = (
    df.dropna(subset=['Full Text'])
      .sample(frac=1, random_state=42)
      .reset_index(drop=True)
)

df['Full Text'] = df['Full Text'].apply(text_cleaning)
# Keep only the last 500 tokens of each cleaned document (the "500_words"
# suffix of the exported feature files refers to this cut-off).
df['Full Text'] = df['Full Text'].apply(lambda x: ' '.join(word_tokenize(x)[-500:]))

In [None]:
# Balance the classes by randomly undersampling (seeded for reproducibility).
rus = RandomUnderSampler(random_state=42)
X = df[['process', 'considerations', 'instance', 'Full Text']]
y = df[['outcome']]
X_rus, y_rus = rus.fit_resample(X, y)

# fit_resample returns X and y row-aligned, so the labels can be attached
# positionally. The index is reset so that the balanced frame has a clean
# 0..n-1 index: later cells address documents by position.
df = X_rus.assign(outcome=np.ravel(y_rus)).reset_index(drop=True)

In [None]:
# `df_full` is never defined in this notebook; the balanced frame built above
# is `df` (the SHAP section further down also reads from `df`).
X, y = df['Full Text'], df['outcome']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

#### Training SVM

In [None]:
# Hyper-parameter space explored by the randomized search.
params = {
    'tfidf__max_df': (0.25, 0.5, 0.75),
    'svm__C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
}

tfidf = TfidfVectorizer(sublinear_tf=True, norm='l2', ngram_range=(1, 1))
svm = LinearSVC(random_state=42)
pipeline = Pipeline(steps=[('tfidf', tfidf), ('svm', svm)])

# random_state only has an effect (and is only accepted by recent
# scikit-learn releases) when shuffle=True.
skf_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Fixed the `c v=` typo, which was a syntax error.
rsc = RandomizedSearchCV(pipeline, params, cv=skf_cv, scoring='accuracy', random_state=42, n_jobs=-1)
rsc.fit(X, y)
print(f'Best CrossValidated accuracy achieved via SVM is : {round(rsc.best_score_*100,2)} %')
# Best hyper-parameters found for the SVM pipeline:
rsc.best_params_

In [None]:
# Best hyper-parameters found by the SVM randomized search (shown as the cell output).
rsc.best_params_

In [None]:
# Built-in importance of the linear SVM: one coefficient per TF-IDF term.
best_tfidf = rsc.best_estimator_.named_steps["tfidf"]
# get_feature_names() was removed in scikit-learn 1.2; prefer the replacement
# when available and fall back for older installations.
if hasattr(best_tfidf, "get_feature_names_out"):
    feature_names = best_tfidf.get_feature_names_out()
else:
    feature_names = best_tfidf.get_feature_names()
feature_importance = rsc.best_estimator_.named_steps["svm"].coef_.flatten()
# Sorted ascending by signed coefficient (most negative first).
fi = pd.DataFrame({'FeatureNames':feature_names,'FeatureImportance':feature_importance}).sort_values('FeatureImportance')

In [None]:
#fi['color'] = fi.FeatureImportance.apply(lambda x:'Positive' if x>=0 else 'Negative')

In [None]:
def plot_features_of_explained_model(imp, top_n=50):
    """
    Plot the features with the largest absolute importance as a horizontal bar chart.

    Parameters
    ----------
    imp : pandas.DataFrame
        Frame with a 'FeatureNames' column and a signed 'FeatureImportance'
        column. It is not modified.
    top_n : int, default 50
        Number of features (ranked by absolute importance) to draw.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.
    """
    # set_index returns a new frame, so the caller's data stays untouched.
    top = imp.set_index('FeatureNames')
    # Remember the sign for colouring before it is discarded by abs().
    top['color'] = np.where(top['FeatureImportance'] >= 0, 'Positive', 'Negative')
    top['FeatureImportance'] = top['FeatureImportance'].abs()
    top = top.sort_values('FeatureImportance', ascending=False).head(top_n)

    # NOTE: sns.set changes the global seaborn/matplotlib style for later plots too.
    sns.set(rc={'figure.figsize':(12,12)})
    palette = ["#55a868","#c44e52"]  # green = positive, red = negative
    ax = sns.barplot(
        x=top['FeatureImportance'],
        y=top.index,
        hue=top['color'],
        palette=palette,
        dodge=False,
        hue_order=["Positive", "Negative"],
    )
    ax.set_title('Feature Importance :', fontsize=15)
    plt.show()

In [None]:
# Draw the strongest SVM coefficients (by absolute value).
plot_features_of_explained_model(imp=fi)

In [None]:
# Final SVM pipeline used by LIME and SHAP. LinearSVC has no predict_proba,
# so it is wrapped in an isotonic CalibratedClassifierCV to get probabilities.
# NOTE(review): max_df=.5 and C=.1 are hard-coded — presumably copied from
# rsc.best_params_; confirm after re-running the search.
pipeline_svm = Pipeline(steps = [('tfidf',TfidfVectorizer(sublinear_tf=True, norm='l2',ngram_range=(1,1),max_df=.5)),
                        # random_state matches the LinearSVC used in the search, for reproducible fits
                        ('svm',CalibratedClassifierCV(LinearSVC(C=.1, random_state=42),method='isotonic'))])

pipeline_svm.fit(X,y)

In [None]:
# Saving the features (absolute coefficient, largest first).
# Work on a derived frame: overwriting `fi` in place would throw away the sign
# that the plotting cell needs if it is re-run afterwards.
(
    fi.assign(FeatureImportance=fi['FeatureImportance'].abs())
      .sort_values('FeatureImportance', ascending=False)
      .to_csv('../features/svm_1_1_builtin_500_words.csv', index=False)
)

#### LIME

In [None]:
def get_feature_importance_for_full_document_parallel(X,y,idx,pipeline,num_features):
    """
    Explain one document with LIME and return its word importances.

    Parameters
    ----------
    X : str
        The single document to explain.
    y : object
        True label of the document; stored unchanged in the output.
    idx : int
        Identifier of the document; stored in the 'ID' column.
    pipeline : fitted estimator
        Must accept raw text and expose ``predict`` and ``predict_proba``.
    num_features : int
        Maximum number of words LIME reports for the document.

    Returns
    -------
    pandas.DataFrame
        One row per explained word with columns 'word', 'importance'
        (LIME weight for label 1), 'ID', 'Real Class', 'Predicted Class'
        and 'Predicted Class Probability'.
    """
    # Seed the explainer: LIME samples random perturbations of the text, so
    # an unseeded explainer gives different weights on every run.
    explainer = LimeTextExplainer(class_names=[0,1], random_state=42)
    exp = explainer.explain_instance(X, pipeline.predict_proba, num_features=num_features, labels=[0,1])
    imp = pd.DataFrame(exp.as_list(label=1),columns=['word','importance'])
    imp['ID'] = idx
    imp['Real Class'] = y
    # predict / predict_proba expect a batch, hence the one-element list.
    imp['Predicted Class'] = pipeline.predict([X]).reshape(1,-1)[0,0]
    imp['Predicted Class Probability'] = pipeline.predict_proba([X]).max()
    return imp

In [None]:
#to speed up lime with multiprocessing

import multiprocessing
import time
from tqdm import tqdm

def get_line_fi_multiprocessing(X, y, pipeline, num_features=500, processes=1):
    """
    Run the LIME explanation for every document through a process pool.

    Parameters
    ----------
    X : iterable of str
        Documents to explain.
    y : iterable
        True labels, in the same order as ``X``.
    pipeline : fitted estimator
        Passed to ``get_feature_importance_for_full_document_parallel``.
    num_features : int, default 500
        Maximum number of words LIME reports per document.
    processes : int, default 1
        Number of worker processes. The default keeps the previous behaviour
        (a single worker); raise it to actually parallelise the job.

    Returns
    -------
    pandas.DataFrame
        Concatenation of the per-document explanation frames.
    """
    start = time.time()

    # Pair documents and labels by position. The previous X[idx] / y[idx]
    # lookups were label-based on a pandas Series and fail (or pick the wrong
    # row) whenever the index is not exactly 0..n-1.
    tasks = [
        (text, label, idx, pipeline, num_features)
        for idx, (text, label) in enumerate(zip(X, y))
    ]

    with multiprocessing.Pool(processes=processes) as pool:
        results = pool.starmap(get_feature_importance_for_full_document_parallel, tasks)

    full_df = pd.concat(results)
    total_time = time.time() - start

    print('Time :',total_time)
    # Round to whole minutes first, then split, so the message can never
    # read "N hours 60 minutes".
    hours, minutes = divmod(int(round(total_time / 60)), 60)
    print(f'With multiprocessing the job was finished in {hours} hours {minutes} minutes.')

    return full_df

In [None]:
# LIME explanations of the calibrated SVM for every document in X.
lime_svm_fi = get_line_fi_multiprocessing(X=X, y=y, pipeline=pipeline_svm)

In [None]:
# Persist the SVM LIME explanations as a gzip-compressed CSV.
lime_svm_fi.to_csv(
    '../features/svm_1_1_lime_500_words.csv.gz',
    compression='gzip',
    index=False,
)

#### SHAP

In [None]:
import shap

In [None]:
# Features / labels for the SHAP section; 30 % of the documents are held out
# and later used as the set to explain.
X = df['Full Text']
y = df.outcome
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=3, test_size=0.3
)

def get_shap_values_full(pipeline_svm,X_test,classifier_name):
    """
    Compute SHAP word importances for ``X_test`` in batches of 20 documents.

    Parameters
    ----------
    pipeline_svm : fitted Pipeline
        Must contain a 'tfidf' step and a classifier step (named by
        ``classifier_name``) that exposes ``predict_proba``.
    X_test : iterable of str
        Documents to explain.
    classifier_name : str
        Name of the classifier step inside the pipeline.

    Returns
    -------
    pandas.DataFrame
        Columns 'word' and 'importance'; one row per (batch, word) with a
        non-zero importance. Batches appear in reverse processing order
        (last batch first), as before.
    """
    vectorizer = pipeline_svm.named_steps['tfidf']
    X_tfidf_tfidf = vectorizer.transform(X_test)

    # Background data for KernelExplainer: 10 k-means centroids of the
    # TF-IDF matrix of the documents being explained.
    background = shap.kmeans(X_tfidf_tfidf, 10)
    explainer = shap.KernelExplainer(pipeline_svm.named_steps[classifier_name].predict_proba, background)

    # get_feature_names() was removed in scikit-learn 1.2.
    if hasattr(vectorizer, 'get_feature_names_out'):
        tfidf_features_svm = vectorizer.get_feature_names_out()
    else:
        tfidf_features_svm = vectorizer.get_feature_names()

    per_loop_count = 20
    n_docs = X_tfidf_tfidf.shape[0]
    batch_frames = []
    for start in range(0, n_docs, per_loop_count):
        end = min(start + per_loop_count, n_docs)
        print(end)  # progress: number of documents handled after this batch
        X_tfidf = X_tfidf_tfidf[start:end].copy()
        loop_df = get_shap_feature_importance(explainer, X_tfidf, tfidf_features_svm)
        # Drop zero rows immediately so the collected frames stay small.
        batch_frames.append(loop_df.query('importance!=0'))

    # One concat at the end instead of re-copying a growing frame per batch;
    # reversed to keep the original "newest batch first" row order.
    return pd.concat(batch_frames[::-1])

def get_shap_feature_importance(explainer, X_tfidf,tfidf_features_svm):
    """
    Turn the SHAP values of one batch into a per-word importance table.

    The absolute SHAP values are averaged over their first axis and the
    result is then summed over its (new) first axis with the builtin ``sum``;
    the resulting vector is paired with ``tfidf_features_svm``.

    Returns a DataFrame with columns 'word' and 'importance'.
    """
    # NOTE(review): which axes these are (classes vs. samples vs. features)
    # depends on what explainer.shap_values returns for the installed shap
    # version — confirm against the version in use.
    abs_shap = np.abs(explainer.shap_values(X_tfidf))
    averaged = abs_shap.mean(0)
    per_word = sum(averaged)
    rows = list(zip(tfidf_features_svm, per_word))
    return pd.DataFrame(rows, columns=['word','importance'])


In [None]:
# SHAP importances of the calibrated SVM on the held-out documents.
final_df_svm = get_shap_values_full(pipeline_svm, X_test, classifier_name='svm')

In [None]:
# Persist the SVM SHAP importances as a gzip-compressed CSV.
final_df_svm.to_csv(
    '../features/svm_1_1_shap_500_words.csv.gz',
    compression='gzip',
    index=False,
)

#### Training XGBoost

In [None]:
from pprint import pprint

from xgboost import XGBClassifier

# Hyper-parameter space explored by the randomized search.
params_xgb = {
    'tfidf__max_df': (0.25, 0.5, 0.75),
    'xgb__eta': [0.01,0.015,0.025,0.05, 0.1],
    'xgb__gamma':[0.05,.1,.3,.5,.7,.9,1],
    'xgb__max_depth' : [3,5,7,9,12,15,17,25,50,100],
    'xgb__min_child_weight' : [1,3,5,7],
    'xgb__subsample' : [0.6,.7,.8,.9,1],
    'xgb__colsample_bytree' : [.6,.7,.8,.9,1],
    'xgb__lambda' : [0.01,.1,1],
    'xgb__alpha': [0,.1,.5,1]
}

tfidf = TfidfVectorizer(sublinear_tf=True, norm='l2',ngram_range=(1,1))
# The former `TREE_METHOD='gpu_hist'` keyword was misspelled (XGBoost parameter
# names are lower-case), so it never selected the GPU algorithm. It is removed
# to keep the effective behaviour; pass tree_method='gpu_hist' explicitly if a
# GPU build of XGBoost is available and GPU training is wanted.
xgb = XGBClassifier(random_state=42)
pipeline_xgb = Pipeline(steps = [('tfidf',tfidf),('xgb',xgb)])
# random_state only has an effect (and is only accepted by recent
# scikit-learn releases) when shuffle=True.
skf_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
rsc_xgb = RandomizedSearchCV(pipeline_xgb,params_xgb,cv= skf_cv,scoring='accuracy', random_state=0,n_jobs=-1)
rsc_xgb.fit(X,y)
# Message previously said "SVM" although this cell tunes XGBoost.
print(f'Best CrossValidated accuracy achieved via XGBoost is : {round(rsc_xgb.best_score_*100,2)} %\n')
print('XGB Best params : \n')
pprint(rsc_xgb.best_params_)


In [None]:
# Gain-based importance reported by the booster of the best XGBoost pipeline.
feature_xgb = (
    rsc_xgb.best_estimator_
    .named_steps["xgb"]
    .get_booster()
    .get_score(importance_type="gain")
)
# TF-IDF vocabulary maps term -> column index; invert it to index -> term.
vocab = rsc_xgb.best_estimator_.named_steps["tfidf"].vocabulary_
key_to_vocab = {column_idx: term for term, column_idx in vocab.items()}

In [None]:
# Booster keys are parsed as one prefix character followed by the column
# index; translate each key to its vocabulary term.
feature_importance_dict_xgb = {
    key_to_vocab[int(feature_id[1:])]: gain
    for feature_id, gain in feature_xgb.items()
}

In [None]:
# Two-column table (term, gain). Passing the dict itself together with
# columns=['Features','Importance'] selected dict keys of those names, which
# do not exist, and therefore produced an all-NaN frame.
feature_importance_df_xgb = pd.DataFrame(
    list(feature_importance_dict_xgb.items()),
    columns=['Features', 'Importance'],
)

In [None]:
# One 'importance' column indexed by term, largest gain first. Built directly
# from the dict; the previous version materialised an n x n frame only to read
# its first column.
xgb_feature_importance = (
    pd.Series(feature_importance_dict_xgb, dtype=float)
      .to_frame('importance')
      .sort_values('importance', ascending=False)
)

In [None]:
# Keep only the 50 highest-ranked features.
xgb_feature_importance = xgb_feature_importance.head(50)

In [None]:
# Horizontal bar chart of the retained XGBoost feature importances.
sns.barplot(
    y=xgb_feature_importance.index,
    x=xgb_feature_importance['importance'],
    dodge=False,
    palette=["#55a868"],
)

In [None]:
# Move the terms from the index into a regular column.
xgb_feature_importance = xgb_feature_importance.reset_index()

In [None]:
# Use the same column names as the SVM feature table ('FeatureNames', 'FeatureImportance').
xgb_feature_importance.columns = ['FeatureNames', 'FeatureImportance']

In [None]:
# Save the XGBoost built-in importances.
# NOTE(review): at this point the frame was already truncated to its first 50
# rows by an earlier cell, so only those are written, whereas the SVM export
# writes every feature — confirm this difference is intended.
xgb_feature_importance.to_csv('../features/xgb_1_1_builtin_500_words.csv',index=False)

In [None]:
# Best hyper-parameters found by the XGBoost randomized search (shown as the cell output).
rsc_xgb.best_params_

In [None]:
# Final XGBoost pipeline (fixed hyper-parameters) used by LIME and SHAP below.
# NOTE(review): the values are hard-coded — presumably taken from
# rsc_xgb.best_params_; confirm after re-running the search.
pipeline_xgb = Pipeline(steps=[
    (
        'tfidf',
        TfidfVectorizer(sublinear_tf=True, norm='l2', ngram_range=(1, 1), max_df=.5),
    ),
    (
        'xgb',
        XGBClassifier(
            subsample=.7,
            min_child_weight=3,
            max_depth=17,
            gamma=.3,
            eta=.01,
            colsample_bytree=.6,
            alpha=1,
        ),
    ),
])

pipeline_xgb.fit(X, y)

#### LIME

In [None]:
from tqdm import tqdm

# LIME explanations of the XGBoost pipeline, one document at a time.
# Documents and labels are paired by position: the previous X[idx] / y[idx]
# lookups were label-based on a pandas Series and fail (or pick the wrong
# row) whenever the index is not exactly 0..n-1.
xgb_fis = []
for idx, (text, label) in enumerate(tqdm(zip(X, y), total=len(X))):
    loop_fi = get_feature_importance_for_full_document_parallel(text, label, idx, pipeline_xgb, 500)
    xgb_fis.append(loop_fi)
lime_xgb_fi = pd.concat(xgb_fis)

In [None]:
# Persist the XGBoost LIME explanations as a gzip-compressed CSV.
lime_xgb_fi.to_csv(
    '../features/xgb_1_1_lime_500_words.csv.gz',
    compression='gzip',
    index=False,
)

#### SHAP

In [None]:
# SHAP importances of the XGBoost pipeline on the held-out documents,
# written straight to a gzip-compressed CSV.
final_df_xgb = get_shap_values_full(pipeline_xgb, X_test, classifier_name='xgb')
final_df_xgb.to_csv(
    '../features/xgb_1_1_shap_500_words.csv.gz',
    compression='gzip',
    index=False,
)