In [None]:
import pandas as pd
import numpy as np
import re
import requests
import xmltodict
import time
import pickle
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from lxml import html
from bs4 import BeautifulSoup
import random 
from ipynb.fs.full.Preprocessing_Methods import get_detailed_sentence
from ipynb.fs.full.Preprocessing_Methods import get_html_from_detailed_link
from ipynb.fs.full.Preprocessing_Methods import get_ecli_from_detailed_link
from ipynb.fs.full.Preprocessing_Methods import find_index
from ipynb.fs.full.Preprocessing_Methods import get_index


In [None]:
# Never truncate cell contents when a DataFrame is displayed.
# `None` is the supported "no limit" value; the old `-1` sentinel was deprecated
# in pandas 1.0 and is rejected by current pandas releases.
pd.set_option('display.max_colwidth', None)

In [None]:
"""Structured cases are those that have HTML tags defining the subsections.
structured == 0.0 indicates that the case is unstructured, and structured == 1.0
indicates that the case is structured."""

# The first CSV column is used as the DataFrame index.
cases_df = pd.read_csv(filepath_or_buffer="asylum_cases_structure.csv", index_col=0)


In [None]:
# Split the cases on the `structured` flag and keep each group's `case` values as a list.
structured_cases = cases_df.loc[cases_df["structured"].eq(1.0)]
ECLI_structured_cases = structured_cases.loc[:, "case"].tolist()
unstructured_cases = cases_df.loc[cases_df["structured"].eq(0.0)]
ECLI_unstructured_cases = unstructured_cases.loc[:, "case"].tolist()

In [None]:
# Column order of the DataFrame returned by generate_subsections.
_SUBSECTION_COLUMNS = ["case", "procesverloop", "overwegingen", "beslissing"]

# Sentinel that the results of find_index / get_index are compared against.
_NO_INDEX = "no_index_found"

# All heading regexes share this skeleton: a heading sits alone between two
# "\r\n" line breaks, optionally preceded by a number / roman numeral and the
# article "de"/"het", and optionally followed by "en", ":" or ".".
# The backslash before "." is doubled so the literal holds no invalid escape
# sequence; the resulting pattern text is unchanged.
_HEADING_PREFIX = "\r\n[ ]{0,}([0-9]*)?(\\.)?([0-9]*)?([i|v|x]*)?[ ]{0,}(de)?(het)?[ ]{0,}(\\.)?[ ]{0,}"
_HEADING_SUFFIX = "(en)?(:)?[ ]{0,}(\\.)?[ ]{0,}\r\n"


def _heading_regex(*titles):
    """Build the heading regex that accepts any of the given (lower-case) titles."""
    return _HEADING_PREFIX + "(" + "|".join(titles) + ")" + _HEADING_SUFFIX


_REGEX_PROCESVERLOOP = _heading_regex(
    "procesverloop",
    "ontstaan en loop van het geding",
    "zitting",
    "gegevens inzake het geding",
    "verloop van de procedure",
    "ontstaan en loop van het geschil",
    "inleiding",
    "ontstaan en loop van de gedingen",
)
_REGEX_BESLISSING = _heading_regex("beslissing", "uitspraak")
_REGEX_OVERWEGINGEN = _heading_regex(
    "rechtsoverwegingen", "overwegingen", "motivering", "gronden", "beoordeling", "feiten"
)

# Markup fragments handed to get_index to locate each <section> of a structured
# case. They are compared against lower-cased markup, so they must be lower-case
# themselves (the former '<title>Motivering' entry could never match).
_MARKERS_BESLISSING = (
    '<section role="beslissing">',
    '<emphasis role="bold">beslissing',
    "<title>beslissing",
    '<bridgehead role="bold">beslissing',
    '<para>beslissing',
)
_MARKERS_OVERWEGINGEN = (
    '<section role="overwegingen">',
    '<emphasis role="bold">overwegingen',
    "<title>overwegingen",
    '<?linebreak?>overwegingen',
    '<title>de beoordeling',
    '<title>motivering',
)
_MARKERS_PROCESVERLOOP = (
    '<section role="procesverloop">',
    '<title>zitting',
    '<title>procesverloop',
)


def _split_unstructured(detailed_sentence):
    """Split the plain text of an unstructured case with the heading regexes.

    Returns a (procesverloop, overwegingen, beslissing) tuple of strings, or
    None when one of the three headings cannot be located.
    """
    document = get_html_from_detailed_link(detailed_sentence)
    # Only the text of the last child of the returned element is used.
    content = document.getchildren()[-1].text_content().lower().strip()
    content = content.replace('\xa0', '').replace('\t', '')

    # Cheap pre-check: without a "beslissing" heading the case cannot be split.
    if re.search(_REGEX_BESLISSING, content) is None:
        return None

    # NOTE(review): find_index comes from Preprocessing_Methods; it is assumed to
    # return either _NO_INDEX or an indexable pair whose [0] / [1] are usable as
    # slice start / slice end -- confirm against the helper. The meaning of the
    # third argument is not visible here either.
    index_procesverloop = find_index(_REGEX_PROCESVERLOOP, content, False)
    index_beslissing = find_index(_REGEX_BESLISSING, content)
    index_overwegingen = find_index(_REGEX_OVERWEGINGEN, content, False)

    indexes = (index_beslissing, index_overwegingen, index_procesverloop)
    if any(index == _NO_INDEX for index in indexes):
        return None

    beslissing = content[index_beslissing[0]:]
    overwegingen = content[index_overwegingen[0]:index_beslissing[1]]
    procesverloop = content[index_procesverloop[0]:index_overwegingen[1]]
    return tuple(
        part.replace("\r\n", "") for part in (procesverloop, overwegingen, beslissing)
    )


def _clean_section_text(section, heading):
    """Return the section's lower-cased text without newlines/tabs and without the first `heading`."""
    text = section.get_text().lower()
    text = text.replace("\n", "").replace("\t", "")
    return text.replace(heading, "", 1)


def _split_structured(detailed_sentence):
    """Split a structured case using its <section> elements.

    Returns a (procesverloop, overwegingen, beslissing) tuple of strings, or
    None when get_index cannot locate one of the three sections.
    """
    soup = BeautifulSoup(detailed_sentence.content, 'html.parser')
    # Query the sections once; the indexes returned by get_index refer to this list.
    sections = soup.find_all('section')
    element_list = [str(section).lower() for section in sections]

    index_beslissing = get_index(list(_MARKERS_BESLISSING), element_list)
    index_overwegingen = get_index(list(_MARKERS_OVERWEGINGEN), element_list)
    index_procesverloop = get_index(list(_MARKERS_PROCESVERLOOP), element_list)

    indexes = (index_beslissing, index_overwegingen, index_procesverloop)
    if any(index == _NO_INDEX for index in indexes):
        return None

    return (
        _clean_section_text(sections[index_procesverloop], "procesverloop"),
        _clean_section_text(sections[index_overwegingen], "overwegingen"),
        _clean_section_text(sections[index_beslissing], "beslissing"),
    )


def generate_subsections(ECLI_list, unstructured = True):
    """Fetch every case and split it into procesverloop / overwegingen / beslissing.

    Parameters
    ----------
    ECLI_list : iterable
        Case references; each item is passed to ``get_ecli_from_detailed_link``
        and, when the case cannot be split, stored unchanged in ``discard``.
    unstructured : bool, default True
        When truthy, the plain text is split with heading regexes; otherwise the
        ``<section>`` elements of the markup are used.

    Returns
    -------
    (text_df, discard) : tuple
        ``text_df`` is a DataFrame with the columns ``case``, ``procesverloop``,
        ``overwegingen`` and ``beslissing`` (one row per successfully split
        case); ``discard`` is the list of items from ``ECLI_list`` for which at
        least one subsection could not be located. Unstructured cases without
        any "beslissing" heading are also reported in ``discard`` instead of
        being dropped silently.
    """
    split_case = _split_unstructured if unstructured else _split_structured

    # Rows are collected in a list and turned into a DataFrame once:
    # DataFrame.append no longer exists in pandas 2.x and was quadratic in a loop.
    records = []
    discard = []

    for counter, detailed_link in enumerate(ECLI_list, start=1):
        # Progress indicator that overwrites itself on a single line.
        print(counter, end="\r", flush=True)
        time.sleep(0.001)

        ecli_temp = get_ecli_from_detailed_link(detailed_link)
        detailed_sentence = get_detailed_sentence(ecli_temp)

        parts = split_case(detailed_sentence)
        if parts is None:
            discard.append(detailed_link)
            continue

        procesverloop, overwegingen, beslissing = parts
        records.append({
            "case": ecli_temp,
            "procesverloop": procesverloop,
            "overwegingen": overwegingen,
            "beslissing": beslissing,
        })

    text_df = pd.DataFrame(records, columns=_SUBSECTION_COLUMNS)
    return text_df, discard




In [None]:
# Result is a (text_df, discard) pair for the unstructured cases.
subsections = generate_subsections(ECLI_unstructured_cases, unstructured=True)

In [None]:
# Element 0 of the result is the DataFrame of split cases.
subsections[0].to_csv(path_or_buf="unstructured_with_subsections.csv")

In [None]:
# `pickle` is already imported in the notebook's first cell.
# Element 1 of the result is the list of cases that could not be split.
with open("unstructured_list.txt", mode="wb") as discard_file:
    pickle.dump(subsections[1], discard_file)