In [149]:
import pdftotext
import requests
import io
import bs4
import re
import pandas as pd
import numpy as np

In [78]:
# Download one sample resolution PDF (A/RES/75/265) to try the text extraction on.
# Note: despite its name, `url` holds the HTTP response object, not a URL string.
url = requests.get(
    'https://digitallibrary.un.org/record/3904095/files/A_RES_75_265-EN.pdf'
)

In [79]:
# Wrap the downloaded bytes in an in-memory binary stream and hand it to
# pdftotext; `pdf` is then indexed per page in the next cell.
with io.BytesIO(url.content) as f:
    pdf = pdftotext.PDF(f)

In [83]:
# Sanity check: show the extracted text of the page at index 1.
# print() is used so the page's line breaks and column layout render as text.
print(pdf[1])

                                                                               Cooperation between the United Nations and the
A/RES/75/265                                                              Organisation for the Prohibition of Chemical Weapons

                        4.    Welcomes the effective and ongoing cooperation between the United
                   Nations and the Organisation for the Prohibition of Chemical Weapons as set out in
                   the Agreement concerning the Relationship between the United Nations and the
                   Organisation for the Prohibition of Chemical Weapons; 3
                         5.   Recalls the report of the Conference of the States Parties to the Convention
                   on the Prohibition of the Development, Production, Stockpiling and Use of Chemical
                   Weapons and on Their Destruction on its fourth special session, held in The Hague
                   on 26 and 27 June 2018, and also recalls the dec

In [241]:
def get_resolution_urls(start_year=1946, end_year=2021):
    '''
    Crawls through the UN Digital Library and fetches the URLs for each resolution's
    profile between the given start and end years.
    Input:
        start_year (int): the first year to scrape (inclusive)
        end_year (int): the last year to scrape (inclusive)
    Output:
        List of strings: the href of every "Detailed record" link found, with
            duplicates removed and in the order they were first seen. The hrefs
            are returned exactly as they appear in the page (get_metadata()
            prepends the host name to them).
    '''
    search_url = 'https://digitallibrary.un.org/search'
    page_size = 200
    # Two result pages are requested per year (records 1-200 and 201-400), so
    # at most 2 * page_size records per year are collected.
    page_offsets = (1, 1 + page_size)

    all_urls = []
    for year in range(start_year, end_year + 1):
        for offset in page_offsets:
            # Let requests build and encode the query string. The previous
            # hand-written URL used a backslash continuation inside the string
            # literal, which embedded the next line's indentation right after
            # "rg=200", and it repeated the fct__2 filter.
            params = {
                'ln': 'en',
                'c': 'Voting Data',
                'rg': page_size,
                'jrec': offset,
                'fct__3': year,
                'fct__2': 'General Assembly',
                'cc': 'Voting Data',
            }
            req = requests.get(search_url, params=params)
            soup = bs4.BeautifulSoup(req.text, 'html.parser')
            as_ = soup.find_all('a', class_='moreinfo', text='Detailed record')
            all_urls += [a['href'] for a in as_]
    # dict.fromkeys() de-duplicates while keeping first-seen order, so repeated
    # runs return the URLs in the same order (list(set(...)) did not).
    return list(dict.fromkeys(all_urls))

In [242]:
# Collect the record links for the default year range (1946-2021).
# This issues two HTTP requests per year, so the cell is slow to re-run.
d = get_resolution_urls()

In [243]:
def get_metadata(urls):
    '''
    Given the URLs of the UNGA resolutions, fetches the metadata associated with
    each resolution.
    Input:
        urls (list): list of UNGA resolution URLs, relative to
            https://digitallibrary.un.org
            (output of the get_resolution_urls() function)
    Output:
        List of dicts: one dict per input URL. Each dict holds:
            'url': the absolute URL that was fetched
            '<label>': the text of every label/value metadata row on the page
            '<label>_url': the href of the last link found inside that row
            'Votes', 'Votes_url': only when the 'Note' row equals 'RECORDED';
                'Votes' maps Yes / No / Abstentions / Non-voting / Total to ints
    '''
    base_url = 'https://digitallibrary.un.org'
    decisions = ['Yes', 'No', 'Abstentions', 'Non-voting', 'Total']

    metadata = []
    for url in (base_url + path for path in urls):
        req = requests.get(url)
        soup = bs4.BeautifulSoup(req.text, 'html.parser')
        dic = {'url': url}
        for div in soup.find_all('div', class_='metadata-row'):
            spans = div.find_all('span')
            if len(spans) < 2:
                # Not a label/value row; nothing to record.
                continue
            k = spans[0].text.strip()
            dic[k] = spans[1].text.strip()
            for a in div.find_all('a'):
                href = a.get('href')
                if href:
                    dic[k + '_url'] = href
        # Not every record has a 'Note' row, so look it up without raising.
        if dic.get('Note') == 'RECORDED':
            # Each count is the token after a colon; a missing count shows up
            # as the '|' separator and is treated as zero.
            tokens = re.findall(r':\s+(\S+)', dic.get('Vote summary', ''))
            votes = [0 if token == '|' else int(token) for token in tokens]
            dic['Votes'] = dict(zip(decisions, votes))
            dic['Votes_url'] = url.replace('?ln=en', '/export/xm')
        metadata.append(dic)
    return metadata

In [None]:
def get_voting_data(metadata):
    '''
    Given the metadata of UNGA resolutions, fetches the voting records for each resolution.
    Input:
        metadata (list of dicts): contains the metadata of all UNGA resolutions
            (output of the get_metadata() function)
    Output:
        Dict: maps each resolution ID (the record's 'Resolution' value) to a
            list of dicts, one per country, with the keys 'Code', 'Country'
            and 'Vote'. A key is left out when the corresponding subfield is
            absent from the record. Records that have no 'Votes_url' or no
            'Resolution' entry are skipped without being fetched.
    '''
    # Subfield code in the exported record -> key in the output dict.
    subfield_keys = (('c', 'Code'), ('e', 'Country'), ('d', 'Vote'))

    voting_data = {}
    for res in metadata:
        votes_url = res.get('Votes_url')
        resolution_id = res.get('Resolution')
        if votes_url is None or resolution_id is None:
            # No export to fetch, or nothing to file the votes under.
            continue
        req = requests.get(votes_url)
        soup = bs4.BeautifulSoup(req.text, 'html.parser')
        records = []
        for field in soup.find_all('datafield', tag='967'):
            votes = {}
            for code, key in subfield_keys:
                subfield = field.find('subfield', code=code)
                if subfield is not None:
                    votes[key] = subfield.text
            records.append(votes)
        voting_data[resolution_id] = records
    return voting_data

In [None]:
# Fetch the per-country votes for every resolution in `metadata_full`.
# NOTE(review): `metadata_full` is not assigned in any cell visible here;
# presumably it is the output of get_metadata() - confirm and add the defining
# cell so the notebook survives Restart & Run All.
voting_data_full = get_voting_data(metadata_full)

In [209]:
# Reshape the vote records into a single-row frame: one column per country,
# with the one remaining row holding that country's 'Vote' value.
# NOTE(review): `all_votes` is not assigned in any cell visible here;
# presumably it is one resolution's list of vote dicts - confirm.
pd.DataFrame(all_votes).set_index('Country').loc[:, ['Vote']].T

Country,AFGHANISTAN,ALBANIA,ALGERIA,ANDORRA,ANGOLA,ANTIGUA AND BARBUDA,ARGENTINA,ARMENIA,AUSTRALIA,AUSTRIA,...,UNITED REPUBLIC OF TANZANIA,UNITED STATES,URUGUAY,UZBEKISTAN,VANUATU,VENEZUELA (BOLIVARIAN REPUBLIC OF),VIET NAM,YEMEN,ZAMBIA,ZIMBABWE
Vote,Y,Y,Y,Y,Y,Y,Y,Y,Y,Y,...,Y,Y,Y,Y,,,Y,Y,Y,A


In [None]:
def get_resolutions(metadata):
    '''
    Requests the page behind each resolution's 'Resolution_url'.
    Input:
        metadata (list of dicts): resolution metadata; each dict is read for
            its 'Resolution_url' and 'Resolution' keys
    Output:
        None as written here. Records without a 'Resolution_url' are stored as
        NaN in the local `resolutions` dict, keyed by their 'Resolution' value.
        NOTE(review): the function looks unfinished - the response is fetched
        but not used yet, and `resolutions` is not returned.
    '''
    resolutions = {}
    for res in metadata:
        # Keep the try narrow: only the dictionary lookup can raise the
        # KeyError we want to handle, not the HTTP request.
        try:
            resolution_url = res['Resolution_url']
        except KeyError:
            # np.nan instead of np.NaN: the capitalised alias was removed in
            # NumPy 2.0 and raises AttributeError there.
            resolutions[res['Resolution']] = np.nan
            continue
        req = requests.get(resolution_url)
    
    