In [1]:
import pandas as pd
import numpy as np
import re
import time
import requests
import xmltodict
import ast 
from nltk.tokenize import sent_tokenize
from IPython.display import clear_output
import googletrans
from googletrans import Translator
import math
import sklearn.ensemble
from sklearn.metrics import confusion_matrix,classification_report,accuracy_score
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.pipeline import make_pipeline
from sklearn.calibration import CalibratedClassifierCV

from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
import matplotlib.pyplot as plt
from sklearn.model_selection import cross_val_score
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")
%matplotlib inline
import html,nltk
from nltk.corpus import wordnet 
from collections import Counter 
from string import digits

In [None]:
# Case-level dataset; `get_sentences` looks cases up in this notebook-level `df`.
# NOTE(review): the name `df` is rebound to a different CSV in the training section
# further down, so `get_sentences` only sees this data while this cell was the last
# one to assign `df`.
df = pd.read_csv("df_outcome_procedure_instantie.csv", lineterminator='\n', index_col=0)

In [None]:
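# One-off step that presumably produced 'sample_to_label.pickle'; left commented out.
# NOTE(review): as written, the second line would pickle the whole `df` and bind
# `sample_to_label` to None (`to_pickle` returns None); presumably
# `sample_to_label.to_pickle('sample_to_label.pickle')` was intended - confirm.
# sample_to_label = df.sample(300)
# sample_to_label = df.to_pickle('sample_to_label.pickle')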

In [None]:
# Load the cases selected for manual labelling and collect their identifiers.
# NOTE: unpickling can execute arbitrary code - only load pickles you created yourself.
sample_to_label = pd.read_pickle('sample_to_label.pickle')
list_ECLI = sample_to_label['case'].tolist()


In [None]:
# One row per case to label. `sentences` is only a NaN placeholder here;
# `label` passes `row.sentences` to `create_file`, so the column has to be
# filled (e.g. with `get_sentences`) before labelling - that step is not
# visible in this notebook.
samples_dataframe = pd.DataFrame({'ECLI': list_ECLI, 'sentences' : np.nan})

In [None]:
def get_sentences(ECLI):
    """Return the considerations text of one case split into sentences.

    The case is looked up in the notebook-level ``df`` by its ``case``
    column; ``.item()`` raises ``ValueError`` unless exactly one row
    matches. ``sent_tokenize`` is called without a language argument.
    """
    considerations = df.loc[df['case'] == ECLI, 'overwegingen']
    return sent_tokenize(considerations.item())

In [None]:
def create_file(sentences, ECLI):
    """Write a fresh, unlabeled sentence table for one case and return its path.

    The table has a ``sentence`` column and an all-NaN ``reveals_decision``
    column. It is pickled to ``labeled_sentences_<ECLI>.pickle`` in the
    working directory, replacing any file already stored under that name.
    """
    name_of_file = 'labeled_sentences_{}.pickle'.format(ECLI)
    unlabeled = pd.DataFrame({'sentence': sentences}).assign(reveals_decision=np.nan)
    unlabeled.to_pickle(name_of_file)
    return name_of_file

    
def manually_label(pickle_file):
    """Interactively label the sentences stored in ``pickle_file``.

    Rows are visited last-to-first (``df[::-1]``); rows that already carry a
    label are skipped. For every unlabeled row the sentence and the output of
    ``translator.translate`` are printed and one line is read from stdin:

    * ``'1'``  -> ``reveals_decision = 1``, continue
    * ``''``   -> ``reveals_decision = 0``, continue
    * ``'0'``  -> ``reveals_decision = 0`` and stop; remaining rows stay NaN
    * anything else -> the row is left unlabeled and the loop moves on

    The frame is written back to ``pickle_file`` after every answer, so an
    interrupted session loses at most the current sentence.

    Parameters
    ----------
    pickle_file : str
        Path of a pickle holding a frame with ``sentence`` and
        ``reveals_decision`` columns (as written by ``create_file``).
    """
    translator = Translator()
    print('Does this sentence reveal the decision? Type 1 if yes. \n')
    df = pd.read_pickle(pickle_file)
    for index, row in df[::-1].iterrows():
        if not pd.isnull(row.reveals_decision):
            continue
        print(row.sentence)
        # The translation is only a reading aid; a failing translation request
        # must not abort the labelling session.
        try:
            translation = translator.translate(row.sentence).text
        except Exception as exc:
            translation = '[translation unavailable: {}]'.format(exc)
        print("\n")
        print("The translation is: ")
        print(translation)
        answer = input()
        if answer == '1':
            df.loc[index, 'reveals_decision'] = 1
        elif answer in ('', '0'):
            df.loc[index, 'reveals_decision'] = 0
        clear_output()
        df.to_pickle(pickle_file)
        if answer == '0':
            # '0' means "no, and nothing further up reveals the decision either".
            break

    print('No more labels to classify!')


def label(df):
    """Run the manual labelling session for every case in ``df``.

    ``df`` needs an ``ECLI`` and a ``sentences`` column. For each row a new
    sentence file is created with ``create_file`` and then handed to
    ``manually_label``.
    """
    for position, case_row in df.iterrows():
        print("We are labelling: {}".format(position))
        ecli = case_row.ECLI
        print("https://uitspraken.rechtspraak.nl/inziendocument?id={}".format(ecli))
        print("The ECLI of the file we are labelling is: {}".format(ecli))
        manually_label(create_file(case_row.sentences, ecli))
        
        
def append_df(new_file, df_pickle):
    """Append the rows pickled in ``new_file`` to the frame pickled in ``df_pickle``.

    The combined frame gets a fresh 0..n-1 index and is written back to
    ``df_pickle``; ``new_file`` is left untouched.

    Parameters
    ----------
    new_file : str
        Path of the pickle whose rows are added.
    df_pickle : str
        Path of the pickle that is extended in place.
    """
    accumulated = pd.read_pickle(df_pickle)
    incoming = pd.read_pickle(new_file)
    # DataFrame.append was removed in pandas 2.0; concat with ignore_index=True
    # is the equivalent of append + reset_index(drop=True).
    combined = pd.concat([accumulated, incoming], ignore_index=True)
    combined.to_pickle(df_pickle)

def reset_labels(df_pickle, label_col='reveals_decision'):
    """Clear all labels in a pickled sentence table.

    Sets every value of ``label_col`` to NaN and writes the frame back to
    ``df_pickle``, so ``manually_label`` treats every row as unlabeled again.

    Parameters
    ----------
    df_pickle : str
        Path of the pickled frame to reset.
    label_col : str, default 'reveals_decision'
        Column to clear. The labelling helpers in this notebook store their
        labels in ``reveals_decision``; pass ``'label'`` to get the column
        this function used to write.
    """
    table = pd.read_pickle(df_pickle)
    table[label_col] = np.nan
    table.to_pickle(df_pickle)
    


### Training the model and removing decision-related sentences


In [None]:
# Dataset whose `overwegingen` text is filtered by the trained classifier at the
# end of the notebook.
# NOTE(review): this rebinds the global `df` that `get_sentences` reads; after
# this cell runs, `get_sentences` looks cases up in this CSV instead.
df = pd.read_csv("final_dataset_november.csv", lineterminator='\n', index_col=0)

In [2]:
# Hand-labelled sentence tables, one pickle per labelled case.
labeled_pickles = (
    'labeled_sentences_ECLI:NL:RVS:2015:417.pickle',
    'labeled_sentences_ECLI:NL:RBSGR:2003:AH8572.pickle',
    'labeled_sentences_ECLI:NL:RBSGR:2010:BO1705.pickle',
)
df1, df2, df3 = map(pd.read_pickle, labeled_pickles)

In [None]:
#Merging the dataframes together
labeled_sentences_df = df1.append(df2)
labeled_sentences_df = labeled_sentences_df.append(df3)
labeled_sentences_df = labeled_sentences_df.fillna(0)
labeled_sentences_df = labeled_sentences_df.reset_index(drop = True)

In [None]:
def undersample(df, target_col, r=1, random_state=None):
    """Randomly drop rows of the over-represented class until trues/falses == r.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    target_col : str
        Binary label column (0/1 or False/True).
    r : float, default 1
        Desired ratio ``trues / falses`` after undersampling.
    random_state : int or None, default None
        Forwarded to ``DataFrame.sample`` so the selection can be reproduced.

    Returns
    -------
    pandas.DataFrame
        A new frame with the surplus rows removed (original index labels kept).

    Raises
    ------
    ValueError
        If one of the two classes is absent, since no ratio can be reached.
    """
    is_true = df[target_col] == True
    is_false = df[target_col] == False
    trues = int(is_true.sum())
    falses = int(is_false.sum())
    if trues == 0 or falses == 0:
        raise ValueError(
            "undersample needs both classes in '{}' (found {} true, {} false)".format(
                target_col, trues, falses))

    if trues >= r * falses:
        # Too many positives: keep r * falses of them.
        surplus = df[is_true]
        drop_size = int(trues - r * falses)
    else:
        # Too many negatives: keep trues / r of them (r > 0 is implied by the
        # branch condition, so the division is safe).
        surplus = df[is_false]
        drop_size = int(falses - trues / r)

    dropped = surplus.sample(drop_size, random_state=random_state)
    return df.drop(labels=dropped.index, axis=0)

def text_cleaning(text, escape_list=[], stop=[]):
    """
    Normalise one sentence for the bag-of-words models.

    Lower-cases ``text``, decodes HTML entities, replaces every run of
    characters outside ``A-Za-z`` with a single space, tokenizes with
    ``nltk.word_tokenize`` and removes the NLTK Dutch stop words plus any
    words given in ``stop``.

    Parameters
    ----------
    text : str
        Sentence to clean.
    escape_list : list, optional
        Not used by the function; kept so existing calls keep working.
    stop : list of str, optional
        Extra stop words to remove.

    Returns
    -------
    str
        The remaining tokens joined by single spaces.
    """
    text = html.unescape(text.lower())
    # A set gives constant-time membership tests; neither default list is mutated.
    custom_stop = set(stopwords.words('dutch')).union(stop)
    # Everything that is not an ASCII letter becomes a space. This also removes
    # digits, punctuation and accented letters, so no further character
    # replacement is needed afterwards.
    text = re.sub('[^A-Za-z]+', ' ', text)
    tokens = nltk.word_tokenize(text)
    return ' '.join(token for token in tokens if token not in custom_stop)


def do_cross_validation(models,features,labels,CV=5):
    """Cross-validate every model on the same features and plot the accuracies.

    Parameters
    ----------
    models : iterable of estimators
        Each one is scored with ``cross_val_score(scoring='accuracy')``.
    features, labels
        Passed unchanged to ``cross_val_score``.
    CV : default 5
        Passed as ``cv`` to ``cross_val_score``.

    Returns
    -------
    pandas.DataFrame
        One row per (model, fold) with columns ``model_name``, ``fold_idx``
        and ``accuracy``. The same frame is drawn with ``plot_cv_result``.
    """
    # The result frame is built once from the collected rows; the previous
    # throw-away pre-allocation needed an integer CV and a sized `models`.
    entries = [
        (model.__class__.__name__, fold_idx, accuracy)
        for model in models
        for fold_idx, accuracy in enumerate(
            cross_val_score(model, features, labels, scoring='accuracy', cv=CV, n_jobs=-1)
        )
    ]
    cv_df = pd.DataFrame(entries, columns=['model_name', 'fold_idx', 'accuracy'])
    plot_cv_result(cv_df)
    return cv_df

def plot_cv_result(cv_df):
    """Show per-model accuracy as a box plot with the individual fold scores on top.

    ``cv_df`` needs the ``model_name`` and ``accuracy`` columns produced by
    ``do_cross_validation``.
    """
    shared = dict(x='model_name', y='accuracy', data=cv_df)
    sns.boxplot(**shared)
    sns.stripplot(size=8, jitter=True, edgecolor="gray", linewidth=2, **shared)
    plt.show()


def _report_split(model, X, y, split_name):
    """Print accuracy, confusion matrix and classification report for one split."""
    y_pred = model.predict(X)
    conf_mat = confusion_matrix(y, y_pred)
    print(f'Accuracy for {split_name} Set is  : {accuracy_score(y, y_pred)}')
    print(f'Confusion Matrix for {split_name} Set :\n {conf_mat} \n\n Classification Report for {split_name} Set: \n')
    print(classification_report(y, y_pred))


def get_prediction(model,X_train,y_train,X_test,y_test):
    """Print evaluation reports of a fitted model on the training and test split.

    Parameters
    ----------
    model
        Fitted estimator exposing ``predict``.
    X_train, y_train
        Training inputs and true labels.
    X_test, y_test
        Held-out inputs and true labels.

    Returns
    -------
    None
        Everything is written to stdout.
    """
    _report_split(model, X_train, y_train, 'Training')
    print('--'*50)
    # The test-set header previously ended in a literal "/n" instead of a newline.
    _report_split(model, X_test, y_test, 'Testing')

    
def remove_decision_sentences(text, pipeline, clean=True):
    """Drop the sentences of ``text`` that ``pipeline`` classifies as revealing the decision.

    Parameters
    ----------
    text : str
        Document to filter. Non-string values (e.g. NaN cells) are returned
        unchanged.
    pipeline
        Fitted classifier exposing ``predict``; sentences predicted as ``0``
        are kept.
    clean : bool, default True
        Run each sentence through ``text_cleaning`` before prediction. In this
        notebook the pipeline is fitted on sentences that went through
        ``text_cleaning``, so prediction should see the same preprocessing.
        Pass ``False`` to classify the raw sentences.

    Returns
    -------
    str
        The kept sentences, in original (uncleaned) form, joined by spaces.
    """
    if not isinstance(text, str):
        return text
    sentences = sent_tokenize(text)
    if not sentences:
        return ''
    model_input = [text_cleaning(sentence) for sentence in sentences] if clean else sentences
    # One batched predict call per document instead of one per sentence.
    predictions = pipeline.predict(pd.Series(model_input))
    return ' '.join(
        sentence for sentence, predicted in zip(sentences, predictions) if predicted == 0
    )
    


In [None]:
# Balance the two classes; the rows to drop are sampled randomly inside `undersample`.
undersampled_df = undersample(labeled_sentences_df, target_col='reveals_decision')

In [None]:
# Class balance after undersampling.
undersampled_df['reveals_decision'].value_counts()

In [None]:
# Replace every sentence by its cleaned form (overwrites the column in place).
undersampled_df['sentence'] = undersampled_df['sentence'].map(text_cleaning)

In [None]:
# Candidate classifiers compared in the cross-validation below.
random_forest = RandomForestClassifier(n_estimators=200, max_depth=3, random_state=0)
linear_svc = LinearSVC()
naive_bayes = MultinomialNB()
logistic_regression = LogisticRegression(random_state=0)

models = [random_forest, linear_svc, naive_bayes, logistic_regression]

In [None]:
sns.set(rc={'figure.figsize': (16, 6)})

# TF-IDF features over unigrams and bigrams. The vectorizer is fitted on all
# sentences before the folds are split, so vocabulary and IDF statistics are
# shared across folds.
tfidf = TfidfVectorizer(sublinear_tf=True, min_df=5, norm='l2', ngram_range=(1, 2))
labels = undersampled_df['reveals_decision']
features = tfidf.fit_transform(undersampled_df['sentence'])
print(features.shape)

cv_df_tfidf = do_cross_validation(models, features, labels, CV=5)

In [None]:
# NOTE(review): redundant - `tfidf` is re-created with the same settings in the
# pipeline cell further down before it is used again, so this cell has no effect
# on the results.
tfidf = TfidfVectorizer(sublinear_tf=True, min_df=5, norm='l2',ngram_range=(1,2))

In [None]:
# Mean cross-validated accuracy per model.
cv_df_tfidf.groupby('model_name')[['accuracy']].mean()

In [None]:
# 70/30 hold-out split of the cleaned, balanced sentences.
X_train, X_test, y_train, y_test = train_test_split(
    undersampled_df['sentence'],
    undersampled_df['reveals_decision'],
    test_size=0.3,
    random_state=40,
)

In [None]:
# Final model: TF-IDF features fed into an isotonic-calibrated linear SVM,
# fitted on the training split and reported on both splits.
tfidf = TfidfVectorizer(sublinear_tf=True, min_df=5, norm='l2', ngram_range=(1, 2))
calibrated_svm = CalibratedClassifierCV(LinearSVC(), method='isotonic')
pipeline = make_pipeline(tfidf, calibrated_svm)
pipeline.fit(X_train, y_train)
get_prediction(pipeline, X_train, y_train, X_test, y_test)

In [None]:
# Strip decision-revealing sentences from every case text. This overwrites the
# column, so re-running the cell filters the already filtered text again.
df['overwegingen'] = df['overwegingen'].apply(remove_decision_sentences, args=(pipeline,))

In [None]:
# Persist the dataset whose `overwegingen` column was filtered with
# `remove_decision_sentences` in the previous cell.
df.to_csv('alien_cases_dataset.csv')