In [396]:
import pdftotext
import requests
import io
import bs4
import re
import pandas as pd
import numpy as np
import json
from mpi4py import MPI

In [361]:
def get_resolution_urls(start_year=1946, end_year=2021):
    '''
    Crawls through the UN Digital Library and fetches the URLs for each resolution's
    profile between the given start and end years.
    Input:
        start_year (int): the first year to scrape (inclusive)
        end_year (int): the last year to scrape (inclusive)
    Output:
        List of strings: de-duplicated 'href' values of every "Detailed record"
            link found, in first-seen order
    '''
    # Adjacent string literals are joined at compile time, so the URL carries no
    # stray whitespace. (A backslash continuation *inside* a single literal keeps
    # the next line's indentation as part of the string.)
    search_url = ('https://digitallibrary.un.org/search?ln=en&c=Voting+Data&rg=200'
                  '&jrec={}&fct__3={}&fct__2=General+Assembly'
                  '&fct__2=General+Assembly&cc=Voting+Data')
    all_urls = []
    for year in range(start_year, end_year + 1):
        # Two requests per year. Presumably rg is the page size and jrec the index
        # of the first record, so 1 and 201 would cover the first 400 hits -
        # TODO confirm against the site's search API.
        for first_record in (1, 201):
            req = requests.get(search_url.format(first_record, year))
            soup = bs4.BeautifulSoup(req.text, 'html.parser')
            links = soup.find_all('a', class_='moreinfo', text='Detailed record')
            all_urls += [a['href'] for a in links]
    # dict.fromkeys removes duplicates while keeping a reproducible order
    # (a plain set() would return the URLs in arbitrary order).
    return list(dict.fromkeys(all_urls))

In [362]:
def get_metadata(urls):
    '''
    Given the URLs of the UNGA resolutions, fetches the metadata associated with
    each resolution.
    Input:
        urls (list): list of UNGA resolution URLs, relative to
            https://digitallibrary.un.org
            (output of the get_resolution_urls() function)
    Output:
        List of dicts: one dict per URL. Each dict holds 'url', one entry per
            'metadata-row' label found on the page, '<label>_url' for rows that
            contain a link and, when the 'Note' entry starts with 'RECORDED',
            'Votes' (dict of counts) and 'Votes_url'.
    '''
    decisions = ['Yes', 'No', 'Abstentions', 'Non-voting', 'Total']
    metadata = []
    for url in ('https://digitallibrary.un.org' + path for path in urls):
        req = requests.get(url)
        soup = bs4.BeautifulSoup(req.text, 'html.parser')
        dic = {'url': url}
        for div in soup.find_all('div', class_='metadata-row'):
            spans = div.find_all('span')
            if len(spans) < 2:
                # Without both a label span and a value span there is nothing to store.
                continue
            k = spans[0].text.strip()
            dic[k] = spans[1].text.strip()
            # If the row holds several links, the last one wins.
            for a in div.find_all('a'):
                dic[k + '_url'] = a['href']
        # .get() keeps a page without a 'Note' row from raising KeyError.
        if dic.get('Note', '').startswith('RECORDED'):
            votes = []
            for raw in re.findall(r':(\s+\S+)', dic.get('Vote summary', '')):
                token = raw.strip()
                # A bare '|' after the colon means the count was left blank.
                votes.append(0 if token == '|' else int(token))
            dic['Votes'] = dict(zip(decisions, votes))
            dic['Votes_url'] = url.replace('?ln=en', '/export/xm')
        metadata.append(dic)
    return metadata

In [363]:
def get_voting_data(metadata):
    '''
    Given the metadata of UNGA resolutions, fetches the per-country voting
    record of every resolution that has a 'Votes_url' entry.
    Input:
        metadata (list of dicts): contains the metadata of all UNGA resolutions
            (output of the get_metadata() function)
    Output:
        Dict: maps each resolution ID (the 'Resolution' entry) to a list of
            dicts with the keys 'Code', 'Country' and, when present, 'Vote'.
    '''
    records_by_resolution = {}
    for resolution in metadata:
        try:
            response = requests.get(resolution['Votes_url'])
        except KeyError:
            # Entries without a 'Votes_url' key are skipped.
            continue
        country_votes = []
        records_by_resolution[resolution['Resolution']] = country_votes
        parsed = bs4.BeautifulSoup(response.text, 'html.parser')
        # One datafield tagged '967' is read per country.
        for field in parsed.find_all('datafield', tag='967'):
            entry = {
                'Code': field.find_all('subfield', code='c')[0].text,
                'Country': field.find_all('subfield', code='e')[0].text,
            }
            # Subfield 'd' carries the vote; it may be missing, in which case
            # the country is still listed, just without a 'Vote' key.
            vote_cells = field.find_all('subfield', code='d')
            if vote_cells:
                entry['Vote'] = vote_cells[0].text
            country_votes.append(entry)
    return records_by_resolution

In [364]:
def get_pdf_urls(metadata):
    '''
    Given the metadata of UNGA resolutions, looks up the English PDF link of
    every resolution that has a 'Resolution_url' entry.
    Input:
        metadata (list of dicts): contains the metadata of all UNGA resolutions
            (output of the get_metadata() function)
    Output:
        Dict: maps each resolution ID (the 'Resolution' entry) to the first
            'u' subfield of its export page whose text ends with '-EN.pdf'.
            Resolutions without such a subfield are left out.
    '''
    english_pdfs = {}
    for record in metadata:
        try:
            export_url = record['Resolution_url'].replace('?ln=en', '/export/xm')
            response = requests.get(export_url)
        except KeyError:
            # Entries without a 'Resolution_url' key are skipped.
            continue
        parsed = bs4.BeautifulSoup(response.text, 'html.parser')
        # Take the first matching link only.
        english = next(
            (sf.text for sf in parsed.find_all('subfield', code='u')
             if sf.text.endswith('-EN.pdf')),
            None,
        )
        if english is not None:
            english_pdfs[record['Resolution']] = english
    return english_pdfs

In [78]:
# Download one sample English resolution PDF. Note that `url` ends up holding
# the Response object, not a URL string.
url=requests.get('https://digitallibrary.un.org/record/3904095/files/A_RES_75_265-EN.pdf')

In [79]:
# Wrap the downloaded bytes in an in-memory file object and hand it to
# pdftotext, so the PDF is never written to disk.
with io.BytesIO(url.content) as f:
    pdf = pdftotext.PDF(f)

TEST (scratch cells: reload and inspect the saved scraper outputs)

In [374]:
# Read urls.txt (JSON content despite the .txt extension) into urls_read.
with open('urls.txt') as json_file:
    urls_read = json.load(json_file)

In [377]:
# Read metadata.txt (JSON content despite the .txt extension) into metadata_read.
with open('metadata.txt') as json_file:
    metadata_read = json.load(json_file)

In [380]:
# Read voting_data.txt (JSON content despite the .txt extension) into voting_data_read.
with open('voting_data.txt') as json_file:
    voting_data_read = json.load(json_file)

In [None]:
# NOTE(review): same statements as the preceding voting_data.txt cell, and this
# copy has no execution count - it looks like a leftover duplicate; confirm
# before removing.
with open('voting_data.txt') as json_file:
    voting_data_read = json.load(json_file)

In [385]:
# Read pdf_urls.txt (JSON content despite the .txt extension) into pdf_urls_read.
with open('pdf_urls.txt') as json_file:
    pdf_urls_read = json.load(json_file)

In [397]:
# Scratch arithmetic: 2021 and 1946 are the default end_year / start_year of
# get_resolution_urls().
2021 - 1946

75

In [398]:
# Scratch arithmetic: 75 years split four ways - presumably one share per
# parallel worker; confirm.
75/4

18.75

In [402]:
# Years from 1946 up to (not including) 2022 in steps of 18 - presumably
# candidate start years for splitting the scrape; confirm.
np.arange(1946, 2022, 18)

array([1946, 1964, 1982, 2000, 2018])

In [405]:
# Peek at the first three loaded URLs.
urls_read[:3]

['/record/3904088?ln=en', '/record/3906587?ln=en', '/record/3902234?ln=en']

In [406]:
# Peek at the fourth to sixth loaded URLs.
urls_read[3:6]

['/record/3906585?ln=en', '/record/3904148?ln=en', '/record/3904090?ln=en']

In [410]:
# Write `urls` to urls.txt as JSON, overwriting the file.
# NOTE(review): `urls` is not assigned in any visible cell, so this relies on
# existing kernel state - confirm where it comes from.
with open('urls.txt', 'w') as outfile:
    json.dump(urls, outfile)

In [412]:
# Display `urls` as a NumPy array; the result is not stored.
np.array(urls)

array(['/record/657115?ln=en', '/record/563022?ln=en',
       '/record/281942?ln=en', ..., '/record/620159?ln=en',
       '/record/561482?ln=en', '/record/645467?ln=en'], dtype='<U21')

In [457]:
# Split `urls` into at most 17 contiguous chunks of (near-)equal size.
# The ceiling division is done on integers so the cell does not depend on
# `math`, which this notebook only imports in a later cell. max(1, ...) keeps
# the chunk size non-zero, so an empty list gives [] rather than a
# ZeroDivisionError.
n = max(1, -(-len(urls) // 17))
final = [urls[start:start + n] for start in range(0, len(urls), n)]

In [430]:
# NOTE(review): floor-divides by 15, while the chunking cell shown just before
# this one divides by 17. That cell's execution count (In [457]) is higher than
# this one's (In [430]), so this looks like an earlier experiment - confirm
# before relying on `n`.
n = len(urls) // 15

In [458]:
# Number of chunks in `final`.
len(final)

17

In [442]:
# Load unga-analysis/pdf_urls0.txt .. pdf_urls3.txt (JSON content) and keep one
# list entry per file.
pdfs = []
for i in range(4):    
    with open('unga-analysis/pdf_urls{}.txt'.format(i)) as json_file:
        pdfs.append(json.load(json_file))

In [444]:
# Print how many entries each loaded file contributed.
for i in pdfs:
    print(len(i))

5
5
5
3


In [447]:
# Load unga-analysis/metadata0.txt .. metadata3.txt (JSON content) and merge
# them into one flat list: += extends the list instead of nesting one list per
# file (assumes each file holds a JSON list - confirm).
metadatas = []
for i in range(4):    
    with open('unga-analysis/metadata{}.txt'.format(i)) as json_file:
        metadatas += json.load(json_file)

In [449]:
# Print the number of merged records, then one line per record: its resolution
# ID, its 'Resolution_url' (or 'not found' when the key is absent) and its own URL.
print(len(metadatas))
for i in metadatas:
    print(i['Resolution'] + ': ', i.get('Resolution_url', 'not found'), i['url'])

20
A/RES/2537(XXIV)[C]:  https://digitallibrary.un.org/record/645837?ln=en https://digitallibrary.un.org/record/657115?ln=en
A/RES/60/98:  https://digitallibrary.un.org/record/562730?ln=en https://digitallibrary.un.org/record/563022?ln=en
A/RES/44/154:  https://digitallibrary.un.org/record/82474?ln=en https://digitallibrary.un.org/record/281942?ln=en
A/RES/69/219:  https://digitallibrary.un.org/record/786724?ln=en https://digitallibrary.un.org/record/786774?ln=en
A/RES/52/108:  https://digitallibrary.un.org/record/251370?ln=en https://digitallibrary.un.org/record/285020?ln=en
A/RES/45/54:  https://digitallibrary.un.org/record/105310?ln=en https://digitallibrary.un.org/record/282107?ln=en
A/RES/74/102:  https://digitallibrary.un.org/record/3846809?ln=en https://digitallibrary.un.org/record/3847639?ln=en
A/RES/51/48:  https://digitallibrary.un.org/record/230524?ln=en https://digitallibrary.un.org/record/284533?ln=en
A/RES/73/160:  https://digitallibrary.un.org/record/1660233?ln=en https:

In [450]:
import math

In [451]:
# NOTE(review): displays the whole `urls` list; a slice such as urls[:5] would
# keep the output short. `urls` is not assigned in any visible cell.
urls

['/record/657115?ln=en',
 '/record/563022?ln=en',
 '/record/281942?ln=en',
 '/record/786774?ln=en',
 '/record/285020?ln=en',
 '/record/282107?ln=en',
 '/record/3847639?ln=en',
 '/record/284533?ln=en',
 '/record/1660621?ln=en',
 '/record/662589?ln=en',
 '/record/281109?ln=en',
 '/record/698778?ln=en',
 '/record/657685?ln=en',
 '/record/281300?ln=en',
 '/record/283437?ln=en',
 '/record/279049?ln=en',
 '/record/283639?ln=en',
 '/record/283197?ln=en',
 '/record/284524?ln=en',
 '/record/643117?ln=en',
 '/record/430760?ln=en',
 '/record/281462?ln=en',
 '/record/765961?ln=en',
 '/record/671993?ln=en',
 '/record/643255?ln=en',
 '/record/671779?ln=en',
 '/record/644977?ln=en',
 '/record/637555?ln=en',
 '/record/429142?ln=en',
 '/record/673009?ln=en',
 '/record/643854?ln=en',
 '/record/673619?ln=en',
 '/record/284268?ln=en',
 '/record/635779?ln=en',
 '/record/1325770?ln=en',
 '/record/1482074?ln=en',
 '/record/602583?ln=en',
 '/record/454779?ln=en',
 '/record/279727?ln=en',
 '/record/482741?ln=e