In [259]:
from bs4 import BeautifulSoup
import pandas as pd
import re
import os
import dateparser

In [260]:
def Loop_Document(source='./Document/'):
    """Run Inspect_Document over every ``.html`` file in a folder and tabulate the results.

    Parameters
    ----------
    source : str, optional
        Folder scanned (non-recursively) for entries whose name ends in
        ``.html``. Defaults to ``'./Document/'``, the folder this notebook
        previously hard-coded, so existing zero-argument calls are unaffected.

    Returns
    -------
    pandas.DataFrame
        One row per matching file, in the order ``os.scandir`` yields them.
        ``File`` is the file name with every ``.html`` occurrence removed;
        the other columns hold whatever ``Inspect_Document`` returned for
        that file. With no matching files the frame is empty but still has
        all columns.
    """
    columns = [
        'File',
        'Respondent_Roles', 'Respondent_Names',
        'Appellant_Roles', 'Appellant_Names',
        'Claimant_Roles', 'Claimant_Names',
        'Applicant_Roles', 'Applicant_Names',
        'Representation', 'Event', 'Event_Date', 'Translate',
        'Judgment', 'Coram', 'Final_Role', 'Court',
    ]

    with os.scandir(source) as entries:
        html_names = [entry.name for entry in entries if entry.name.endswith('.html')]

    # Collect plain dicts and build the frame once. DataFrame.append was
    # removed in pandas 2.0, and it copied the whole frame on every call.
    records = []
    for name in html_names:
        record = {"File": name.replace('.html', "")}
        record.update(Inspect_Document(os.path.join(source, name)))
        records.append(record)

    # Passing `columns` pins the column order and keeps the schema for an empty folder.
    return pd.DataFrame(records, columns=columns)

In [261]:
def Inspect_Document(Path):
    """Parse one HTML document and merge the output of every Extract_* function.

    Parameters
    ----------
    Path : str
        Path of the HTML file. It is opened in text mode with the platform's
        default encoding.

    Returns
    -------
    dict
        The dictionaries returned by the eight extractors merged into one
        (keys in the order Parties, Representation, Dates, Translate,
        Judgment, Coram, Final_Role, Court). If a ``UnicodeDecodeError`` is
        raised, a message is printed and every expected key maps to an empty
        list instead.
    """
    try:
        # The context manager closes the handle even if read() raises;
        # previously the file object was never closed.
        with open(Path, "r") as File:
            Html_code = File.read()

        Doc_Soup = BeautifulSoup(Html_code, 'html.parser')

        # Order matters only for the key order of the merged dictionary.
        extractors = (
            Extract_Parties,
            Extract_Representation,
            Extract_Dates,
            Extract_Translate,
            Extract_Judgment,
            Extract_Coram,
            Extract_Final_Role,
            Extract_Court,
        )
        Dict = {}
        for extract in extractors:
            Dict.update(extract(Doc_Soup))
        return Dict
    except UnicodeDecodeError:
        print(Path + "   UniCode")
        empty_keys = (
            'Respondent_Roles', 'Respondent_Names',
            'Appellant_Roles', 'Appellant_Names',
            'Claimant_Roles', 'Claimant_Names',
            'Applicant_Roles', 'Applicant_Names',
            'Representation', 'Event', 'Event_Date', 'Translate',
            'Judgment', 'Coram', 'Final_Role', 'Court',
        )
        # A fresh list per key so callers can never share one list object.
        return {key: [] for key in empty_keys}

In [262]:
def _clean_party_text(text):
    """Trim the text, turn newlines into spaces, drop non-breaking spaces and collapse double spaces (one pass)."""
    return text.strip().replace('\n', " ").replace('\xa0', "").replace("  ", " ")


def _collect_party(parties_tag, role):
    """Return ``(roles, names)`` for every ``<td>`` under ``parties_tag`` whose text contains ``role``."""
    role_cells = parties_tag.find_all(lambda tag: tag.name == "td" and role in tag.text)
    roles = []
    names = []
    for cell in role_cells:
        roles.append(_clean_party_text(cell.text))
        # The name is read from the <td> that precedes the role cell.
        # NOTE(review): presumably the name cell always sits right before the
        # role cell - confirm against the source documents.
        name_cell = cell.find_previous('td')
        # Keep both lists the same length even when there is no earlier <td>.
        names.append(_clean_party_text(name_cell.text) if name_cell is not None else "")
    return roles, names


def Extract_Parties(Doc_Soup):
    """Collect role labels and party names from the ``<parties>`` element.

    Parameters
    ----------
    Doc_Soup : object
        Parsed document; only ``find('parties')`` is called on it.

    Returns
    -------
    dict
        ``<Role>_Roles`` and ``<Role>_Names`` lists for Respondent, Appellant,
        Claimant and Applicant. Every entry is cleaned the same way (the
        Claimant roles used to be stored raw). When the document has no
        ``<parties>`` element each list is ``['empty (no tag)']``.
    """
    Parties = Doc_Soup.find('parties')
    result = {}
    for role in ("Respondent", "Appellant", "Claimant", "Applicant"):
        if Parties is None:
            roles, names = ['empty (no tag)'], ['empty (no tag)']
        else:
            roles, names = _collect_party(Parties, role)
        result[role + "_Roles"] = roles
        result[role + "_Names"] = names
    return result

In [263]:
def _parse_labelled_date_cells(cells):
    """Split each cell into a ``Date...:`` label and the text left once the label is removed.

    Returns ``(events, dates)``. A cell whose markup holds no ``Date...:``
    label replaces both lists with ``['empty']``; labelled cells that follow
    are appended after that placeholder.
    """
    events, dates = [], []
    for cell in cells:
        # Searched on the markup (str), not just the text, as before.
        match = re.search(r"(Date.*\:)", str(cell))
        if match is None:
            events, dates = ['empty'], ['empty']
            continue
        label = match.group().strip().replace('\n', " ").replace('\xa0', "").replace('amp;', "")
        events.append(label)
        dates.append(cell.text.replace(label, "").strip())
    return events, dates


def Extract_Dates(Doc_Soup):
    """Extract event labels and their dates from the ``<date>`` element.

    Lookup order inside ``<date>``:

    1. ``<td>`` cells whose string matches the ``Date...:`` pattern. If any
       exist, the values are read from ``<td class="auto-style1">`` cells,
       else from ``<td valign="top">`` cells, else from the ``<td>`` that
       follows each label cell.
    2. Otherwise ``<p>`` elements matching the same pattern.

    Returns
    -------
    dict
        ``{"Event": [...], "Event_Date": [...]}``. Two cases return bare
        strings rather than lists, as they always have: ``'empty (no tag)'``
        when there is no ``<date>`` element and ``'empty'`` when neither
        lookup finds anything.
        NOTE(review): the other extractors wrap such placeholders in a list -
        confirm whether consumers of the CSV rely on the bare strings.
    """
    Dates_Tag = Doc_Soup.find(lambda tag: tag.name == "date")
    if Dates_Tag is None:
        return {"Event": 'empty (no tag)', "Event_Date": 'empty (no tag)'}

    # Raw strings: "\:" in a normal literal is an invalid escape sequence and
    # triggers a warning on current Python. The compiled pattern is unchanged.
    label_cells = Dates_Tag.find_all('td', text=re.compile(r"^Date.*\:|："))

    if not label_cells:
        paragraphs = Dates_Tag.find_all('p', text=re.compile(r"(^Date.*\:|：)"))
        if not paragraphs:
            return {"Event": 'empty', "Event_Date": 'empty'}
        Event, Event_Date = [], []
        for paragraph in paragraphs:
            match = re.search(r"(Date.*\:)", str(paragraph))
            if match is None:
                # Same reset-to-placeholder behaviour as the table branches.
                Event, Event_Date = ['empty'], ['empty']
                continue
            label = match.group().strip()
            Event.append(label)
            Event_Date.append(paragraph.string.replace(label, "").strip())
        return {"Event": Event, "Event_Date": Event_Date}

    # The valign lookup only runs when the class lookup finds nothing.
    value_cells = (
        Dates_Tag.find_all('td', attrs={'class': 'auto-style1'})
        or Dates_Tag.find_all('td', attrs={'valign': 'top'})
    )
    if value_cells:
        Event, Event_Date = _parse_labelled_date_cells(value_cells)
    else:
        Event, Event_Date = [], []
        for label_cell in label_cells:
            Event.append(label_cell.text.strip())
            following = label_cell.find_next('td')
            # A label in the last cell has no value cell; record an empty date
            # instead of failing on None.
            Event_Date.append(following.text.strip() if following is not None else "")
    return {"Event": Event, "Event_Date": Event_Date}

In [264]:
def Extract_Representation(Doc_Soup):
    """Return the representation entries found under the ``<representation>`` element.

    ``<p>`` elements are preferred: each one whose text has no non-breaking
    space is stripped and has its newlines turned into spaces. Only when the
    element holds no ``<p>`` at all are the stripped texts of its ``<td>``
    cells used instead.

    Returns
    -------
    dict
        ``{'Representation': [...]}``. The list is ``['empty']`` when the
        element is missing or holds neither ``<p>`` nor ``<td>``.
    """
    section = Doc_Soup.find('representation')
    if section is None:
        return {'Representation': ['empty']}

    paragraphs = section.find_all('p')
    if paragraphs:
        entries = [
            paragraph.text.strip().replace('\n', " ")
            for paragraph in paragraphs
            if '\xa0' not in paragraph.text
        ]
        return {"Representation": entries}

    cells = section.find_all('td')
    if not cells:
        return {'Representation': ['empty']}
    return {"Representation": [cell.text.strip() for cell in cells]}

In [265]:
def Extract_Translate(Doc_Soup):
    """Return the translation / vetting note of the document.

    The first ``<p>`` whose string starts with ``Translated`` wins; failing
    that, the first ``<p>`` whose string contains ``Vetted``.

    Returns
    -------
    dict
        ``{'Translate': [<stripped paragraph text>]}``, or
        ``{'Translate': ['empty']}`` when neither paragraph exists.
    """
    # The second pattern used to be "\b*Vetted" in a non-raw literal, where
    # "\b" is a backspace character, so it effectively searched for "Vetted"
    # anywhere in the string. r"Vetted" says the same thing plainly.
    for pattern in (r"^Translated", r"Vetted"):
        paragraph = Doc_Soup.find('p', text=re.compile(pattern))
        if paragraph is not None:
            return {"Translate": [paragraph.text.strip()]}
    return {'Translate': ['empty']}

In [266]:
def Extract_Judgment(Doc_Soup):
    """Return the text of the paragraph that follows the "Judgment" heading.

    The heading is the closest ``<p class="heading">`` before the
    ``<representation>`` element whose string contains ``Judgment``.

    Returns
    -------
    dict
        ``{'Judgment': [<text>]}`` with non-breaking spaces turned into
        spaces; ``['empty (no Tag)']`` when there is no ``<representation>``
        element; ``['empty']`` when the heading, or a ``<p>`` after it,
        cannot be found.
    """
    Representation_Tag = Doc_Soup.find('representation')
    if Representation_Tag is None:
        return {'Judgment': ['empty (no Tag)']}

    # The pattern used to be "\b*Judgment" in a non-raw literal, where "\b"
    # is a backspace character; it effectively searched for "Judgment".
    heading = Representation_Tag.find_previous(
        'p', attrs={"class": "heading"}, text=re.compile(r"Judgment"))
    if heading is None:
        return {'Judgment': ['empty']}

    body = heading.find_next('p')
    if body is None:
        # A heading with nothing after it used to fail on None.
        return {'Judgment': ['empty']}
    return {"Judgment": [body.text.strip().replace('\xa0', " ")]}

In [267]:
def Extract_Coram(Doc_Soup):
    """Return the bench composition read from the ``<coram>`` element.

    The text comes from the whole ``<coram>`` element, except when no
    ``<td>`` matches ``Before...:`` but a ``<p>`` starting with ``Before:``
    exists - then only that paragraph is used. The text is stripped, newlines
    become spaces, non-breaking spaces and the ``Before: `` prefix are
    removed, and double spaces are collapsed in a single pass.

    Returns
    -------
    dict
        ``{'Coram': [<text>]}``, or ``{'Coram': ['empty']}`` when the
        document has no ``<coram>`` element.
    """
    Coram_Tag = Doc_Soup.find('coram')
    if Coram_Tag is None:
        return {'Coram': ['empty']}

    # Raw strings: "\:" and "\s" in normal literals are invalid escape
    # sequences. The compiled patterns are unchanged.
    text_source = Coram_Tag
    if Coram_Tag.find('td', text=re.compile(r"^Before.*\:")) is None:
        paragraph = Coram_Tag.find('p', text=re.compile(r"(^Before:\s.*)"))
        if paragraph is not None:
            text_source = paragraph

    Coram_Info = (
        text_source.get_text()
        .strip()
        .replace('\n', " ")
        .replace('\xa0', "")
        .replace("Before: ", "")
        .replace("  ", " ")
        .strip()
    )
    return {'Coram': [Coram_Info]}

In [268]:
def Extract_Final_Role(Doc_Soup):
    """Return the cell texts of the last ``<table>`` before ``<representation>``.

    Every ``<td>`` text is stripped, then newlines, underscores,
    non-breaking spaces and double spaces are removed.

    Returns
    -------
    dict
        ``{'Final_Role': [rows]}`` where ``rows`` is a list with one list of
        cleaned cell strings per ``<tr>``; ``['empty']`` when no table
        precedes the element; ``['empty (no tag)']`` when the document has no
        ``<representation>`` element.
    """
    Representation_Tag = Doc_Soup.find('representation')
    if Representation_Tag is None:
        return {'Final_Role': ['empty (no tag)']}

    table = Representation_Tag.find_previous('table')
    if table is None:
        return {'Final_Role': ['empty']}

    # The whole-table get_text() result was never used, so it is not computed.
    data = [
        [
            cell.get_text().strip().replace('\n', "").replace('_', "").replace('\xa0', "").replace("  ", "")
            for cell in row.find_all('td')
        ]
        for row in table.find_all('tr')
    ]
    return {'Final_Role': [data]}

In [269]:
def Extract_Court(Doc_Soup):
    """Return the court heading built from the first centred paragraphs.

    Up to three ``<p style="text-align:center">`` elements are read. Each
    text is stripped, newlines become spaces and non-breaking spaces are
    removed; the pieces are then concatenated without a separator.
    NOTE(review): joining without a space can run words together - confirm
    that is intended.

    Returns
    -------
    dict
        ``{'Court': [<joined text>]}``, or ``{'Court': ['empty']}`` when no
        such paragraph exists.
    """
    Center_Tag = Doc_Soup.find_all('p', attrs={'style': 'text-align:center'}, limit=3)

    # find_all returns an empty result, never None, so the old `!= None` test
    # could not reach the 'empty' placeholder. Test for emptiness instead.
    if not Center_Tag:
        return {'Court': ['empty']}

    Court_Info = "".join(
        paragraph.text.strip().replace('\n', " ").replace('\xa0', "")
        for paragraph in Center_Tag
    )
    return {'Court': [Court_Info]}

In [270]:
# Run the batch extraction over the .html files in ./Document/ and keep the resulting table.
Result = Loop_Document()

In [271]:
#Result

In [272]:
# Write the extracted table to Batch_Process.csv without the DataFrame index column.
Result.to_csv('Batch_Process.csv', index = False)