In [None]:
# import xml.dom.minidom as minidom
import os
import pandas as pd
from natsort import os_sorted
from markdownify import markdownify as md
import xml.etree.ElementTree as ET
import copy

import warnings
warnings.filterwarnings('ignore')

In [None]:

def get_coverPage_data(akn, coverPage, preface):
    """Collect the document-level metadata of one resolution into a dict.

    Parameters
    ----------
    akn : str
        Namespace prefix in ElementTree ``{uri}`` notation, prepended to
        every tag name that is searched for.
    coverPage : xml.etree.ElementTree.Element
        Element searched for the publisher, issuing body, symbol, status,
        date, session, agenda item and document title.
    preface : xml.etree.ElementTree.Element
        Element whose first ``docTitle`` descendant becomes the resolution title.

    Returns
    -------
    dict
        Keys, in order: ``publisher``, ``issuingBody``, ``symbol``,
        ``docStatus``, ``date``, ``session``, ``agendaItem``,
        ``agendaItemDesc``, ``docTitle``, ``resolutionTitle``.

    Raises
    ------
    AttributeError
        If any cover-page element other than the agenda item name is absent
        (``find`` returns ``None`` and the attribute access fails).
    """

    def cover_el(selector):
        # First descendant of the cover page matching the namespaced selector.
        return coverPage.find(".//" + akn + selector)

    fields = {
        'publisher': cover_el("docAuthority[@refersTo='#publisher']").text,
        'issuingBody': cover_el("docAuthority[@refersTo='#issuingBody']").text,
        'symbol': cover_el("docNumber[@refersTo='#symbol']").text,
        'docStatus': cover_el("docStatus").text,
        # Date and session are read from attributes, not from element text.
        'date': cover_el("docDate").attrib['date'],
        'session': cover_el("session").attrib['value'],
        'agendaItem': cover_el("inline[@name='agenda item']").text,
    }

    # The agenda item name is the only field allowed to be missing.
    agenda_name = cover_el("inline[@name='agenda item name']")
    fields['agendaItemDesc'] = agenda_name.text if agenda_name is not None else ''

    fields['docTitle'] = cover_el("docTitle").text

    # The resolution title keeps its inline markup: the element is serialised
    # to XML and converted to Markdown; the XML declaration text that
    # tostring(encoding='utf8') prepends is removed from the result.
    title_el = preface.find(".//" + akn + "docTitle")
    title_xml = ET.tostring(title_el, encoding='utf8', method='xml').decode("utf8")
    fields['resolutionTitle'] = md(title_xml).replace("xml version='1.0' encoding='utf8'?", "").strip()

    return fields

In [None]:
def _element_to_markdown(element):
    """Serialise ``element`` to XML and return its Markdown conversion, stripped."""
    xml_text = ET.tostring(element, encoding='utf8', method='xml').decode("utf8")
    # tostring(encoding='utf8') prefixes an XML declaration; its text is removed
    # from the converted output before the surrounding whitespace is trimmed.
    return md(xml_text).replace("xml version='1.0' encoding='utf8'?", "").strip()


def get_resolution_data(path,file, keywords):
    """Parse one Akoma Ntoso resolution file into a DataFrame of text fragments.

    One row is produced for every preamble ``container`` that holds a ``p``
    element, and for every ``heading`` and every ``content`` element found
    under each ``paragraph`` of the main body (headings first, then contents,
    paragraph by paragraph).

    Parameters
    ----------
    path : str
        Directory that contains the document.
    file : str
        File name of the XML document inside ``path``.
    keywords : iterable of str
        Substrings looked up in the lower-cased text of every row; a keyword
        containing upper-case characters therefore never matches.

    Returns
    -------
    pandas.DataFrame
        Columns: the fields returned by ``get_coverPage_data``, then
        ``forumula`` (text of the preamble formula span, ``''`` if absent),
        ``eId``, ``p_num`` (``''`` for preamble rows and unnumbered
        paragraphs), ``text`` (Markdown), ``select`` (True when at least one
        keyword matched) and ``keywords`` (matched keywords joined by ``', '``).
        A document that yields no fragments gives an empty frame that only has
        the ``select`` and ``keywords`` columns.
    """
    tree = ET.parse(os.path.join(path, file))

    akn = "{http://docs.oasis-open.org/legaldocml/ns/akn/3.0}"
    coverPage = tree.find(".//"+akn+"coverPage")
    preamble = tree.find(".//"+akn+"preamble")
    mainBody = tree.find(".//"+akn+"mainBody")
    # ".//" rather than "//": ElementTree rewrites an absolute path to the
    # relative form anyway, but emits a FutureWarning while doing so.
    preface = tree.find(".//"+akn+"preface")

    formula = ''
    fragments = []  # (eId, p_num, element) in document order

    if preamble is not None:
        formula_el = preamble.find(".//"+akn+"formula")
        if formula_el is not None:
            formula_span = formula_el.find(".//"+akn+"span")
            if formula_span is not None:
                formula = formula_span.text

        for container in preamble.findall(".//"+akn+"container"):
            preamble_text = container.find(".//"+akn+"p")
            # Must compare with None: an Element without child elements is
            # falsy, so a truth test would drop every plain-text paragraph.
            if preamble_text is not None:
                fragments.append((container.attrib['eId'], '', preamble_text))

    if mainBody is not None:
        for paragraph in mainBody.findall(".//"+akn+"paragraph"):
            num_el = paragraph.find(".//"+akn+"num")
            p_num = num_el.text if num_el is not None else ''

            for heading in paragraph.findall(".//"+akn+"heading"):
                fragments.append((paragraph.attrib['eId'], p_num, heading))

            for content in paragraph.findall(".//"+akn+"content"):
                fragments.append((paragraph.attrib['eId'], p_num, content))

    records = []
    if fragments:
        # The cover-page metadata is identical for every row of a document,
        # so it is extracted once and copied into each record.
        cover_data = get_coverPage_data(akn, coverPage, preface)
        for eId, p_num, element in fragments:
            record = dict(cover_data)
            # NOTE: 'forumula' [sic] is kept because it is the name of an
            # output column that existing spreadsheets may rely on.
            record['forumula'] = formula
            record['eId'] = eId
            record['p_num'] = p_num
            record['text'] = _element_to_markdown(element)
            records.append(record)

    # Keywords found in each row, in the order given by the caller.
    matched = [[k for k in keywords if k in record['text'].lower()] for record in records]

    resolution_frame = pd.DataFrame(records)
    resolution_frame['select'] = False
    resolution_frame['keywords'] = ''
    if records:
        resolution_frame['select'] = [bool(found) for found in matched]
        resolution_frame['keywords'] = [', '.join(found) for found in matched]

    return resolution_frame


In [None]:
def get_ga_data(path, keywords):
    """Run ``get_resolution_data`` on every entry of a directory and stack the results.

    Parameters
    ----------
    path : str
        Directory whose entries are processed in ``os_sorted`` order. Every
        entry returned by ``os.listdir`` is passed on, so the directory is
        expected to contain only resolution XML files.
    keywords : iterable of str
        Forwarded unchanged to ``get_resolution_data``.

    Returns
    -------
    pandas.DataFrame
        The per-file frames concatenated in file order. Row labels are not
        reset, so they restart for every file. An empty directory yields an
        empty DataFrame.
    """
    ga_files = os_sorted(os.listdir(path))
    frames = [get_resolution_data(path, file, keywords) for file in ga_files]
    if not frames:
        # pd.concat raises ValueError when given nothing to concatenate.
        return pd.DataFrame()
    return pd.concat(frames)


In [None]:
# Input directories, relative to the notebook's working directory.
# Each path ends with "/" and points at one English-language folder.

# General Assembly resolutions, sessions 76 and 77.
GA_76_path, GA_77_path = (
    "../UNxml/GAresolutions-main/76session/English/",
    "../UNxml/GAresolutions-main/77session/English/",
)

# ECOSOC resolutions, 2021 and 2022.
ECOSOC_2021_path, ECOSOC_2022_path = (
    "../UNxml/ECOSOCresolutions-main/2021/English/",
    "../UNxml/ECOSOCresolutions-main/2022/English/",
)


In [None]:
# Lower-case substrings searched for in the text of every fragment
# ('statistic' also matches 'statistics', 'statistical', ...).
keywords = ['statistic', 'data', 'geospatial', 'indicator']

# One frame per source directory; get_ga_data is used for the ECOSOC
# folders as well as for the General Assembly ones.
GA_76_data, GA_77_data, ECOSOC_2021_data, ECOSOC_2022_data = (
    get_ga_data(source_path, keywords)
    for source_path in (GA_76_path, GA_77_path, ECOSOC_2021_path, ECOSOC_2022_path)
)

all_ga_data = pd.concat([GA_76_data, GA_77_data, ECOSOC_2021_data, ECOSOC_2022_data])

output_file = "../output/analysis_all_2022.xlsx"
# Create the output folder up front so the export cannot fail on a missing
# directory after all documents have been parsed.
os.makedirs(os.path.dirname(output_file), exist_ok=True)
all_ga_data.to_excel(output_file)