In [None]:
import pandas as pd
import numpy as np
import re
import time
import requests
import xmltodict
import ast 
from nltk.tokenize import sent_tokenize
from IPython.display import clear_output
import googletrans
from googletrans import Translator



In [None]:
#pd.set_option('display.max_colwidth', -1)

In [None]:
# Load asylum_cases_structure.csv; its first column becomes the row index.
cases_df = pd.read_csv(
    "asylum_cases_structure.csv",
    index_col=0,
)

In [None]:
# Load both subsection tables; in each, the first CSV column becomes the row index.
unstructured_cases_df = pd.read_csv(
    "unstructured_with_subsections.csv",
    index_col=0,
)
# For the structured file only "\n" is treated as a line terminator.
structured_cases_df = pd.read_csv(
    "structured_with_subsections.csv",
    index_col=0,
    lineterminator='\n',
)

In [None]:
# Stack the unstructured and structured tables into one frame.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; pd.concat
# with ignore_index=True gives the same rows, renumbered 0..n-1.
combined_df = pd.concat(
    [unstructured_cases_df, structured_cases_df],
    ignore_index=True,
)

In [None]:
# Join the two tables on the shared 'case' column (default inner join:
# only cases present in both tables are kept).
df = cases_df.merge(combined_df, on='case')

In [None]:
# Regular expressions used to extract the decision from the 'beslissing' text.
# All four patterns are matched case-sensitively and are made of capture groups
# only, so re.findall returns one tuple per match; the classification loop
# below only uses the number of matches.

# Pattern for 'gegrond' decisions. Three alternatives:
#   1. "verklaart ... het/de (hoger) beroep|beroepen|verzet|verzoek|bezwaar ... gegrond",
#      where the word "gegrond" must be preceded by a space (so "ongegrond" does
#      not satisfy it) and followed by ".", ",", ";", " en" or a space. The
#      "(.*)" in the middle is greedy and may span arbitrary text.
#   2. "vernietigt het/de bestreden besluit".
#   3. "wijst het/de (gevraagde) verzoek|verzoeken|voorziening ... toe".
gegrond = (r"((verklaart)( )?(eiser)?( )?(het|de)( )(hoger)?( )?(beroep|beroepen|verzet|verzoek|bezwaar)( )?"
           r"(gericht)?( )?(voor het overige)?( )?(om een voorlopige voorziening)?( )?(is)?( )?(kennelijk)?( )?"
           r"(van eisers|van eiser|van eiseressen|van eiseres)?( )?(tegen|van)?(.*)( )(gegrond)(\.|\,|\;| en|( )))" 
           r"|((vernietigt)( )(het|de)( )(bestreden( )(besluit)))"
           r"|((wijst( )(het|de)( )?(gevraagde)?( )(verzoek|verzoeken|voorziening)(.*)( )(toe)(\.|\,|\;| en|( ))))")

# Pattern for 'ongegrond' decisions. Three alternatives:
#   1. "verklaart ... het/de (hoger) beroep|... ... ongegrond|niet-ontvankelijk|
#      niet ontvankelijk|vervallen", followed by ".", ",", ";", " en" or a space.
#   2. "bevestigt het/de aangevallen uitspraak".
#   3. "wijst het/de verzoek|verzoeken|herzieningsverzoek ... af".
ongegrond = (r"((verklaart)( )?(eiser)?( )?(het|de)( )?(hoger)?( )?(beroep|beroepen|verzet|verzoek|bezwaar)( )?"
             r"(gericht)?( )?(voor het overige)?( )?(om een voorlopige voorziening)?( )?(is)?( )?(kennelijk)?"
             r"( )?(van eisers|van eiser|van eiseressen|van eiseres)?( )?(tegen|van)?(.*)"
             r"( )(ongegrond|niet-ontvankelijk|niet ontvankelijk|vervallen)(\.|\,|\;| en|( )))"\
             r"|((bevestigt)( )(het|de)( )(aangevallen)( )(uitspraak))"
             r"|((wijst)( )(het|de)( )(verzoek|verzoeken|herzieningsverzoek)(.*)( )(af)(\.|\,|\;| en|( )))")



# Stricter variants without the "(.*)" wildcard: the outcome word has to follow
# the noun (beroep, verzet, ...) after exactly one space. The classification
# loop falls back on these when the broad patterns above do not give a single
# unambiguous hit.
gegrond_hoger_beroep = "(verklaart)( )(eiser)?( )?(het|de)( )(hoger)?( )?(beroep|beroepen|verzet|verzoek|bezwaar)( )(gegrond)"
ongegrond_hoger_beroep = "(verklaart)( )(eiser)?( )?(het|de)( )(hoger)?( )?(beroep|beroepen|verzet|verzoek|bezwaar)( )(ongegrond|niet-ontvankelijk|niet ontvankelijk|vervallen)"



In [None]:
def removespace(text):
    """Collapse every run of whitespace in ``text`` into a single space.

    Leading and trailing whitespace disappears as well, because ``str.split()``
    without arguments never yields empty fields.
    """
    words = text.split()
    return ' '.join(words)

# Normalise whitespace in every decision text before the regexes are applied.
df['beslissing'] = df['beslissing'].map(removespace)

In [None]:
# Cases with multiple decisions are completely removed.

# One bucket of row indices per category; for the two outcome classes the
# ECLI code (df['case']) is recorded as well.
gegrond_cases, ecli_gegrond_cases = [], []
ongegrond_cases, ecli_ongegrond_cases = [], []
no_match = []
remaining = []
unique_structure = []
test = []  # never filled in this cell; kept so the name stays defined

for index, row in df.iterrows():
    beslissing = row.beslissing

    # (number of 'gegrond' hits, number of 'ongegrond' hits) for the broad
    # patterns and for the stricter fallback patterns.
    broad = (
        len(re.findall(gegrond, beslissing)),
        len(re.findall(ongegrond, beslissing)),
    )
    strict = (
        len(re.findall(gegrond_hoger_beroep, beslissing)),
        len(re.findall(ongegrond_hoger_beroep, beslissing)),
    )

    # Rows whose sections were not split as expected are set aside first.
    if (
        'overwegingen1.' in beslissing
        or not isinstance(row.procesverloop, str)
        or not isinstance(row.overwegingen, str)
    ):
        unique_structure.append(index)
        continue

    # No hit at all for the broad patterns.
    if broad == (0, 0):
        no_match.append(index)
        continue

    # The broad counts decide when they show exactly one hit on one side;
    # otherwise the strict counts get the same chance.
    decisive = next(
        (counts for counts in (broad, strict) if counts in ((1, 0), (0, 1))),
        None,
    )

    if decisive is None:
        remaining.append(index)
    elif decisive == (1, 0):
        gegrond_cases.append(index)
        ecli_gegrond_cases.append(row.case)
    else:
        ongegrond_cases.append(index)
        ecli_ongegrond_cases.append(row.case)

In [None]:
# Report how many rows ended up in each outcome class, and the total row count.
print("gegrond cases: ", len(gegrond_cases), " ongegrond cases: ", len(ongegrond_cases), sep="")
print("The number of cases are: ", len(df), sep="")

In [None]:
# Drop the rows that are 'in between' or have no specific 'gegrond' or
# 'ongegrond' decision. The collected integers are looked up by position in
# df.index to obtain the labels that are dropped.
drop_row_indices = sorted(unique_structure + no_match + remaining)
df.drop(index=df.index[drop_row_indices], inplace=True)

In [None]:
# Renumber the remaining rows 0..n-1 in place; drop=True discards the old index
# instead of keeping it as a column.
df.reset_index(drop=True, inplace=True)

In [None]:
def getoutcome(code):
    """Return the outcome label recorded for an ECLI code.

    Returns 'gegrond' if ``code`` is in ``ecli_gegrond_cases``, 'ongegrond' if
    it is in ``ecli_ongegrond_cases`` (checked in that order), otherwise None.
    """
    labelled_lists = (
        ('gegrond', ecli_gegrond_cases),
        ('ongegrond', ecli_ongegrond_cases),
    )
    for label, codes in labelled_lists:
        if code in codes:
            return label
    return None

def remove_procesverloop(text):
    """Strip the leading "procesverloop" heading from a section text.

    The heading is only removed when it occurs within the first 25 characters.
    Only that first occurrence is deleted; later mentions of the word in the
    body are left alone (an unbounded ``str.replace`` would remove every
    occurrence and mangle the text).

    Parameters
    ----------
    text : str
        Section text, possibly starting with the heading.

    Returns
    -------
    str
        The text without its heading, or unchanged if no heading was found.
    """
    if "procesverloop" in text[:25]:
        # count=1: the first occurrence is the heading found by the check above.
        text = text.replace("procesverloop", "", 1)
    return text

def remove_overwegingen(text):
    """Strip the leading "overwegingen1" / "overwegingen" heading from a text.

    The heading is only removed when it occurs within the first 25 characters;
    "overwegingen1" is tried before the plain "overwegingen". Only that first
    occurrence is deleted; later mentions of the word in the body are left
    alone (an unbounded ``str.replace`` would remove every occurrence).

    Parameters
    ----------
    text : str
        Section text, possibly starting with the heading.

    Returns
    -------
    str
        The text without its heading, or unchanged if no heading was found.
    """
    for heading in ("overwegingen1", "overwegingen"):
        if heading in text[:25]:
            # count=1: the first occurrence is the heading found by the check.
            return text.replace(heading, "", 1)
    return text

In [None]:
# Add the outcome column: 'gegrond' / 'ongegrond' looked up per ECLI code
# (None for codes found in neither list).
df['outcome'] = df['case'].map(getoutcome)

In [None]:
def get_length(text):
    """Return ``len(text)``, i.e. the number of characters for a string."""
    size = len(text)
    return size

In [None]:
# Character count of every decision text.
df['length_beslissing'] = df['beslissing'].map(get_length)

In [None]:
# Character count of every 'procesverloop' text.
df['length_processverloop'] = df['procesverloop'].map(get_length)

In [None]:
# Character count of every 'overwegingen' text.
df['length_overwegingen'] = df['overwegingen'].map(get_length)

In [None]:
# Remove the section headings from the two text columns.
df['procesverloop'] = df['procesverloop'].map(remove_procesverloop)
df['overwegingen'] = df['overwegingen'].map(remove_overwegingen)

In [None]:
# ECLI codes of rows whose 'overwegingen' text is shorter than 100 characters ...
too_short = df.loc[df['length_overwegingen'] < 100, 'case'].tolist()
# ... or exactly as long as the 'beslissing' text.
same_length = df.loc[df['length_overwegingen'] == df['length_beslissing'], 'case'].tolist()
non_standard_cases = too_short + same_length

In [None]:
# Further non-standard cases: the decision text itself starts with a section
# heading ('procesverloop' within the first 30 characters, or 'overwe' within
# the first 20).
remove = [
    case
    for case, beslissing in zip(df['case'], df['beslissing'])
    if 'procesverloop' in beslissing[:30] or 'overwe' in beslissing[:20]
]

In [None]:
# Merge both lists of ECLI codes into a new list (codes may appear twice).
non_standard_cases = [*non_standard_cases, *remove]

In [None]:
# Index labels of all rows whose ECLI code is listed in non_standard_cases.
# A single isin() mask replaces one full-frame scan per code, and it also
# copes with a code that occurs on more than one row (the former
# `.index.item()` lookup raised ValueError in that situation).
indices_to_drop = df.index[df['case'].isin(non_standard_cases)].tolist()

In [None]:
# Drop the non-standard cases (the integers in indices_to_drop are looked up
# by position in df.index), then renumber the remaining rows 0..n-1.
df.drop(index=df.index[indices_to_drop], inplace=True)
df.reset_index(inplace=True, drop=True)


In [None]:
# The helper length columns are no longer needed.
df.drop(
    labels=["length_overwegingen", "length_processverloop", "length_beslissing"],
    axis=1,
    inplace=True,
)

# Add Procedure and Instantie

In [None]:
# URL template of the rechtspraak.nl content endpoint; "{}" receives the ECLI code.
DETAILED_LINK_URL = "https://data.rechtspraak.nl/uitspraken/content?id={}"

def get_ecli_from_detailed_link(detailed_link):
    """Return the part of ``detailed_link`` after its last "=".

    A string without "=" is returned unchanged, so a bare ECLI code passes
    straight through.

    Parameters
    ----------
    detailed_link : str
        A link ending in "...id=<ECLI>", or a bare ECLI code.

    Returns
    -------
    str or None
        The extracted code, or None when ``detailed_link`` cannot be split
        with a str separator (e.g. None, NaN, bytes).
    """
    try:
        return detailed_link.split("=")[-1]
    # Only the errors .split("=") can raise on a wrong input type; a bare
    # `except:` would also swallow KeyboardInterrupt/SystemExit.
    except (AttributeError, TypeError):
        return None

def get_procedure_from_called_api(called_api_page):
    """Extract the "psi:procedure" text value(s) from an API response.

    ``called_api_page.text`` is parsed with xmltodict and the first
    "rdf:Description" entry is read. The result is always a list of the
    "#text" values: one item for a single node, one item per node otherwise.

    Raises KeyError / IndexError when the expected path is missing from the
    parsed document.
    """
    parsed = xmltodict.parse(called_api_page.text)
    description = parsed["open-rechtspraak"]["rdf:RDF"]["rdf:Description"][0]
    procedure_node = description["psi:procedure"]

    if isinstance(procedure_node, list):
        return [entry["#text"] for entry in procedure_node]
    return [procedure_node["#text"]]
    
    
def get_instantie_from_called_api(called_api_page):
    """Extract the "dcterms:creator" text value(s) from an API response.

    ``called_api_page.text`` is parsed with xmltodict and the first
    "rdf:Description" entry is read. The result is always a list of the
    "#text" values: one item for a single node, one item per node otherwise.

    Raises KeyError / IndexError when the expected path is missing from the
    parsed document.
    """
    parsed = xmltodict.parse(called_api_page.text)
    description = parsed['open-rechtspraak']['rdf:RDF']['rdf:Description'][0]
    creator_node = description['dcterms:creator']

    if isinstance(creator_node, list):
        return [entry["#text"] for entry in creator_node]
    return [creator_node["#text"]]


def check_multiple_zaaknummers(case):
    """Return True when the case's zaaknummer value contains 'en' or ','.

    Fetches the case from the rechtspraak.nl content endpoint and inspects the
    value returned by ``get_zaaknummer_from_called_api``.

    Parameters
    ----------
    case : str
        ECLI code, or a link ending in "...id=<ECLI>".

    Returns
    -------
    bool

    Raises
    ------
    requests.exceptions.Timeout
        When the server does not answer within 15 seconds.
    """
    ecli_temp = get_ecli_from_detailed_link(case)
    # Same 15 s timeout as get_procedure / get_instantie: without one,
    # requests can wait indefinitely on an unresponsive server.
    detailed_sentence = requests.get(
        DETAILED_LINK_URL.format(ecli_temp), timeout=15)

    # NOTE(review): get_zaaknummer_from_called_api is not defined in the visible
    # part of this notebook - confirm it exists before calling this function.
    zaaknummer_temp = get_zaaknummer_from_called_api(detailed_sentence)

    # `in` is a substring test if the helper returns a str and a membership
    # test if it returns a list - TODO confirm which one is intended.
    return 'en' in zaaknummer_temp or ',' in zaaknummer_temp

def get_procedure(ecli_code):
    """Fetch the procedure label(s) of a case from the rechtspraak.nl API.

    Parameters
    ----------
    ecli_code : str
        ECLI code, or a link ending in "...id=<ECLI>".

    Returns
    -------
    list of str
        The procedure values, or ``["no procedure"]`` when the response lacks
        the expected fields or the request times out.
    """
    # Very short pause between consecutive API calls.
    time.sleep(0.0001)

    ecli_temp = get_ecli_from_detailed_link(ecli_code)

    try:
        # The request is made inside the try block so that the timeout handler
        # below can actually fire; it used to sit before the `try`, which left
        # the handler unreachable and let a timeout abort the whole .apply().
        detailed_sentence = requests.get(
            DETAILED_LINK_URL.format(ecli_temp), timeout=15)
        kenmerken = get_procedure_from_called_api(detailed_sentence)

    except (IndexError, KeyError):
        # Expected path not present in the parsed response.
        kenmerken = ["no procedure"]

    except requests.exceptions.Timeout:
        # Covers read timeouts as well as connect timeouts.
        kenmerken = ["no procedure"]

    return kenmerken

def get_instantie(ecli_code):
    """Fetch the instantie (creator) label(s) of a case from the rechtspraak.nl API.

    Parameters
    ----------
    ecli_code : str
        ECLI code, or a link ending in "...id=<ECLI>".

    Returns
    -------
    list of str
        The creator values, or ``["no instantie"]`` when the response lacks
        the expected fields or the request times out.
    """
    # Very short pause between consecutive API calls.
    time.sleep(0.0001)

    ecli_temp = get_ecli_from_detailed_link(ecli_code)

    try:
        # The request is made inside the try block so that the timeout handler
        # below can actually fire; it used to sit before the `try`, which left
        # the handler unreachable and let a timeout abort the whole .apply().
        detailed_sentence = requests.get(
            DETAILED_LINK_URL.format(ecli_temp), timeout=15)
        kenmerken = get_instantie_from_called_api(detailed_sentence)

    except (IndexError, KeyError):
        # Expected path not present in the parsed response.
        kenmerken = ["no instantie"]

    except requests.exceptions.Timeout:
        # Covers read timeouts as well as connect timeouts.
        kenmerken = ["no instantie"]

    return kenmerken


In [None]:
# One API request per row: list of procedure labels for every case.
df['procedure'] = df['case'].map(get_procedure)

In [None]:
# One API request per row: list of instantie (creator) labels for every case.
df['instantie'] = df['case'].map(get_instantie)

# Add Case Types

In [None]:
# Remove cases with multiple higher appeals: collect the index of every row
# whose verdict text mentions 'hoger beroepen'.
# (Despite its name, the list holds row indices, not ECLI codes.)
ecli_multiple_hoger_beroep = [
    index
    for index, verdict in df['verdict'].items()
    if 'hoger beroepen' in verdict
]

In [None]:
# Drop the collected rows (looked up by position in df.index) and renumber 0..n-1.
df.drop(index=df.index[ecli_multiple_hoger_beroep], inplace=True)
df.reset_index(inplace=True, drop=True)

In [None]:
# Remove cases with multiple claimants: collect the index of every row whose
# verdict text mentions 'eiseres 1' or 'eiser 1'.
# (Despite its name, the list holds row indices, not ECLI codes.)
ecli_multiple_eisers = [
    index
    for index, verdict in df['verdict'].items()
    if 'eiseres 1' in verdict or 'eiser 1' in verdict
]

In [None]:
# Drop the collected rows (looked up by position in df.index) and renumber 0..n-1.
df.drop(index=df.index[ecli_multiple_eisers], inplace=True)
df.reset_index(inplace=True, drop=True)

In [None]:
# Remove the 'structured' and 'type' columns, one after the other.
for column_name in ('structured', 'type'):
    df.drop(columns=[column_name], inplace=True)

In [None]:
def is_hoger_beroep(text):
    """Return 1 if 'Hoger beroep' is in ``text``, else 0.

    ``in`` is an element test for a list of labels and a substring test for a
    plain string.
    """
    return int('Hoger beroep' in text)

def is_bodem_zaak(text):
    """Return 1 if 'Bodemzaak' or 'bodemzaak' is in ``text``, else 0.

    ``in`` is an element test for a list of labels and a substring test for a
    plain string.
    """
    spellings = ('Bodemzaak', 'bodemzaak')
    return 1 if any(label in text for label in spellings) else 0
    
def is_eerste_aanleg_enkelvoudig(text):
    """Return 1 if 'Eerste aanleg - enkelvoudig' is in ``text``, else 0.

    ``in`` is an element test for a list of labels and a substring test for a
    plain string.
    """
    found = 'Eerste aanleg - enkelvoudig' in text
    return int(found)
    
def is_eerste_aanleg_meervoudig(text):
    """Return 1 if 'Eerste aanleg - meervoudig' is in ``text``, else 0.

    ``in`` is an element test for a list of labels and a substring test for a
    plain string.
    """
    return 1 if 'Eerste aanleg - meervoudig' in text else 0
    
def is_vorloopige_vooziening(text):
    """Return 1 if 'Voorlopige voorziening' is in ``text``, else 0.

    ``in`` is an element test for a list of labels and a substring test for a
    plain string.
    """
    return int('Voorlopige voorziening' in text)
    
def is_mondelinge_uitspraak(text):
    """Return 1 if 'Mondelinge uitspraak' is in ``text``, else 0.

    ``in`` is an element test for a list of labels and a substring test for a
    plain string.
    """
    found = 'Mondelinge uitspraak' in text
    return 1 if found else 0
    


In [None]:
# 0/1 flag: procedure contains 'Hoger beroep'.
df['hoger_beroep'] = df['procedure'].map(is_hoger_beroep)

In [None]:
# 0/1 flag: procedure contains 'Bodemzaak' / 'bodemzaak'.
df['bodem_zaak'] = df['procedure'].map(is_bodem_zaak)

In [None]:
# 0/1 flag: procedure contains 'Eerste aanleg - enkelvoudig'.
df['eerste_aanleg_enkelvoudig'] = df['procedure'].map(is_eerste_aanleg_enkelvoudig)

In [None]:
# 0/1 flag: procedure contains 'Eerste aanleg - meervoudig'.
df['eerste_aanleg_meervoudig'] = df['procedure'].map(is_eerste_aanleg_meervoudig)

In [None]:
# 0/1 flag: procedure contains 'Voorlopige voorziening'.
df['vorloopige_vooziening'] = df['procedure'].map(is_vorloopige_vooziening)

In [None]:
# 0/1 flag: procedure contains 'Mondelinge uitspraak'.
df['mondelinge_uitspraak'] = df['procedure'].map(is_mondelinge_uitspraak)

In [None]:
# Rows flagged as 'voorlopige voorziening' AND as one of the other case types:
# the sum of two 0/1 flags exceeds 1 only when both are set.
_voorziening_flag = df['vorloopige_vooziening']
l1, l2, l3, l4, l5 = (
    df[(df[other_flag] + _voorziening_flag) > 1].index.tolist()
    for other_flag in (
        'hoger_beroep',
        'bodem_zaak',
        'eerste_aanleg_enkelvoudig',
        'eerste_aanleg_meervoudig',
        'mondelinge_uitspraak',
    )
)

# A row index can occur several times when more than one combination applies.
cases_multiple = l1 + l2 + l3 + l4 + l5

In [None]:
# Drop the rows with combined case types (looked up by position in df.index)
# and renumber the remaining rows 0..n-1.
df.drop(index=df.index[cases_multiple], inplace=True)
df.reset_index(inplace=True, drop=True)

In [None]:
#df.to_csv("df_outcome_procedure_instantie.csv")