In [37]:
import pandas as pd
import json
import re
import os

# Shorthand alias for os.path.dirname
dirname = os.path.dirname

# Every location below is resolved relative to the project root, two levels up
ROOT_DIR = os.path.join('..', '..')
_DATA_DIR = os.path.join(ROOT_DIR, 'data')

# Data directories: initial -> inter(mediate) -> final
INIT_DATA_PATH = os.path.join(_DATA_DIR, 'initial')
INTER_DATA_PATH = os.path.join(_DATA_DIR, 'inter')
FINAL_DATA_PATH = os.path.join(_DATA_DIR, 'final')

# Directory for error logs
ERROR_LOG = os.path.join(ROOT_DIR, 'error-logs')

## Clean Metadata

In [49]:
def clean_metadata(source_path=None, target_path=None):
    """Flatten the raw resolution metadata and save it as JSON.

    Keeps the descriptive columns, expands the per-resolution ``Votes``
    mapping into one column per tally, replaces ``Votes`` with a boolean
    flag telling whether a tally was present, and writes the result with
    ``DataFrame.to_json``.

    Parameters
    ----------
    source_path : str, optional
        JSON file to read. Defaults to ``metadata.txt`` in ``INIT_DATA_PATH``.
    target_path : str, optional
        File to write. Defaults to ``metadata.json`` in ``FINAL_DATA_PATH``.

    Raises
    ------
    KeyError
        If one of the descriptive columns is absent from the input.
    """
    if source_path is None:
        source_path = os.path.join(INIT_DATA_PATH, 'metadata.txt')
    if target_path is None:
        target_path = os.path.join(FINAL_DATA_PATH, 'metadata.json')

    kept_columns = ['Resolution', 'Vote date', 'Title', 'Votes', 'url']
    tally_columns = ['Yes', 'No', 'Abstentions', 'Non-voting', 'Total']

    with open(source_path, encoding='utf-8') as f:
        metadata = pd.DataFrame(json.load(f))
    metadata = metadata.loc[:, kept_columns]

    # Build the tally columns straight from the dicts. Rows without a tally
    # (anything that is not a dict, e.g. NaN) become all-NaN rows, so there is
    # no spurious column 0 to drop and nothing breaks when every row has votes.
    tallies = pd.DataFrame(
        [v if isinstance(v, dict) else {} for v in metadata['Votes']],
        index=metadata.index,
    ).reindex(columns=tally_columns)

    has_votes = metadata['Votes'].notna()
    metadata = pd.concat([metadata, tallies], axis=1)
    metadata['Votes'] = has_votes
    metadata.to_json(target_path)

In [54]:
# Run the cleaning step, then reload the saved file and print its shape as a sanity check
clean_metadata()
cleaned_metadata_path = os.path.join(FINAL_DATA_PATH, 'metadata.json')
print(pd.read_json(cleaned_metadata_path).shape)

(19428, 10)


## Clean Voting Data

In [58]:
# Load the raw voting data
voting_data_path = os.path.join(INIT_DATA_PATH, 'voting_data.txt')
with open(voting_data_path) as f:
    votes = json.load(f)

# A record without a 'Vote' entry means the country did not vote: mark it 'X'
for ballots in votes.values():
    for ballot in ballots:
        ballot.setdefault('Vote', 'X')

In [59]:
# Map every country code to the set of country names recorded with it.
# A record that lacks a country code is printed instead of being added.
d = {}
for ballots in votes.values():
    for ballot in ballots:
        try:
            d.setdefault(ballot['Code'], set()).add(ballot['Country'])
        except KeyError:
            print(ballot)

{'Country': 'UNITED ARAB REPUBLIC', 'Vote': 'Y'}
{'Country': 'TANGANYIKA', 'Vote': 'Y'}
{'Country': 'UNITED ARAB REPUBLIC', 'Vote': 'A'}
{'Country': 'TANGANYIKA', 'Vote': 'Y'}
{'Country': 'UNITED ARAB REPUBLIC', 'Vote': 'A'}
{'Country': 'UNITED ARAB REPUBLIC', 'Vote': 'N'}
{'Country': 'TANGANYIKA', 'Vote': 'Y'}
{'Country': 'UNITED ARAB REPUBLIC', 'Vote': 'A'}
{'Country': 'UNITED ARAB REPUBLIC', 'Vote': 'Y'}
{'Country': 'UNITED ARAB REPUBLIC', 'Vote': 'Y'}
{'Country': 'UNITED ARAB REPUBLIC', 'Vote': 'Y'}
{'Country': 'UNITED ARAB REPUBLIC', 'Vote': 'Y'}
{'Country': 'UNITED ARAB REPUBLIC', 'Vote': 'Y'}


In [63]:
# handle cases where one code is associated with more than one name
# choose a name by typing the index shown next to it
mult_names = {}
for code, names in d.items():
    if len(names) <= 1:
        continue
    # a set has no stable order; sort so the same index always means the same name
    options = sorted(names, key=str)
    menu = ', '.join('{}={!r}'.format(pos, name) for pos, name in enumerate(options))
    while code not in mult_names:
        answer = input('{}: {} '.format(code, menu))
        try:
            choice = int(answer)
        except ValueError:
            choice = -1
        if 0 <= choice < len(options):
            mult_names[code] = options[choice]
        else:
            # ask again instead of aborting the cell half-way through the loop
            print('Enter a number between 0 and {}'.format(len(options) - 1))

In [67]:
# Give every record whose code had several names the single name chosen for that code
for ballots in votes.values():
    for ballot in ballots:
        # records without a country code are left untouched
        if 'Code' in ballot and ballot['Code'] in mult_names:
            ballot['Country'] = mult_names[ballot['Code']]

In [68]:
# Map every country name to the set of country codes recorded with it
d = {}
for ballots in votes.values():
    for ballot in ballots:
        try:
            d.setdefault(ballot['Country'], set()).add(ballot['Code'])
        except KeyError:
            # a record without 'Country' is skipped; one missing only 'Code'
            # still registers its country name, without adding a code
            continue

In [72]:
# handle cases where one name is associated with more than one code
# choose a code by typing the index shown next to it
mult_codes = {}
for country, codes in d.items():
    if len(codes) <= 1:
        continue
    # a set has no stable order; sort so the same index always means the same code
    options = sorted(codes, key=str)
    menu = ', '.join('{}={!r}'.format(pos, code) for pos, code in enumerate(options))
    while country not in mult_codes:
        answer = input('{}: {} '.format(country, menu))
        try:
            choice = int(answer)
        except ValueError:
            choice = -1
        if 0 <= choice < len(options):
            mult_codes[country] = options[choice]
        else:
            # ask again instead of aborting the cell half-way through the loop
            print('Enter a number between 0 and {}'.format(len(options) - 1))

In [74]:
# Give every record whose country name had several codes the single code chosen for that name
for ballots in votes.values():
    for ballot in ballots:
        # records without a country name are left untouched
        if 'Country' in ballot and ballot['Country'] in mult_codes:
            ballot['Code'] = mult_codes[ballot['Country']]

In [91]:
# Build the vote table: one single-row frame per resolution (columns are the
# countries, the index is the resolution id), stacked into one DataFrame
records = [
    pd.DataFrame(ballots)[['Country', 'Vote']]
    .set_index('Country')
    .T
    .assign(Resolution=res)
    .set_index('Resolution')
    for res, ballots in votes.items()
]
organized_votes = pd.concat(records)

In [92]:
def _unique_cell_values(frame):
    """Return the set of distinct cell values found across all columns of `frame`."""
    found = set()
    for name in frame.columns:
        found = found.union(frame[name].unique())
    return found


# print unique values of votes
values = _unique_cell_values(organized_votes)
print('unique votes before cleaning:', values)

# change to appropriate values: every string vote is reduced to its first
# upper-case letter, non-string cells are kept as they are
#   X: did not vote
#   A: abstained
#   Y: voted yes
#   N: voted no
#   nan: not a member at the time of vote
for name in organized_votes.columns:
    organized_votes[name] = organized_votes[name].apply(
        lambda x: x.upper().strip()[0] if type(x) is str else x
    )

# print unique values after cleaning
values = _unique_cell_values(organized_votes)
print('unique votes after cleaning:', values)

unique votes before cleaning: {nan, 'y', 'N ', 'A ', 'a', 'A', 'n', 'X', 'Y', 'Aa', 'N', 'AY'}
unique votes after cleaning: {nan, 'X', 'Y', 'N', 'A'}


In [93]:
# merge EGYPT and UNITED ARAB REPUBLIC into the single EGYPT column
# combine_first keeps EGYPT's vote where there is one and falls back to the
# UNITED ARAB REPUBLIC vote otherwise, so a cell stays NaN only when both are
# missing and two votes are never glued together into an invalid value.
# The membership check lets the cell be re-run after the column is gone.
if 'UNITED ARAB REPUBLIC' in organized_votes.columns:
    organized_votes['EGYPT'] = organized_votes['EGYPT'].combine_first(
        organized_votes['UNITED ARAB REPUBLIC']
    )
    organized_votes = organized_votes.drop(columns='UNITED ARAB REPUBLIC')

In [96]:
# Write the cleaned vote table to the final data directory as JSON
votes_json_path = os.path.join(FINAL_DATA_PATH, 'votes.json')
organized_votes.to_json(votes_json_path)

## Clean Corpus

In [222]:
# Load the complete corpus (corpus after missing resolutions are added with OCR)
corpus_path = os.path.join(INTER_DATA_PATH, 'unga_corpus_complete.json')
corpus = pd.read_json(corpus_path)
print(corpus.shape)

# Keep only the rows that carry text: neither missing nor an empty string
has_text = corpus['Text'].notna() & (corpus['Text'] != '')
corpus = corpus[has_text].reset_index(drop=True)
print(corpus.shape)

corpus.head()

(17231, 3)
(17226, 3)


Unnamed: 0,Resolution,url,Text
0,A/RES/32/149,http://digitallibrary.un.org/record/187841/fil...,Ik. Resolutions adopted without reference to a...
1,A/RES/63/51,http://digitallibrary.un.org/record/642704/fil...,United Nations\n\nGeneral Assembly\n\nA/RES/63...
2,A/RES/68/186,http://digitallibrary.un.org/record/765761/fil...,A/RES/68/186\n\nUnited Nations\n\nDistr.: Gene...
3,A/RES/70/102[B-VIII],http://digitallibrary.un.org/record/815818/fil...,A/RES/70/102 A-B\n\nUnited Nations\n\nDistr.: ...
4,A/RES/46/158,http://digitallibrary.un.org/record/136146/fil...,V. Resolutions adopted on the reports of the S...


I will use a language-detection library (`langdetect`) to keep only the resolutions in English. Some older resolutions have both English and French versions in the same PDF; for those, I will try to keep only the English version and remove the French parts.

Also, in a few instances, probably due to an error in the UN's data management and/or website, the all-language PDF links direct the user to German resolutions, making the English resolutions inaccessible. I will drop those resolutions too.

In [223]:
from langdetect import detect, DetectorFactory
from langdetect import lang_detect_exception

# import nltk
from nltk.tokenize import sent_tokenize
# nltk.download('punkt')


# Seed the detector so that language detection is repeatable between runs
DetectorFactory.seed = 0

# example language detection:
sample_text = '今一はお前さん'
print(detect(sample_text))

ja


In [235]:
# get languages for each sentence and create a dict lang:sentence_list
def get_sentence_lang(res):
    """Group the sentences of a text by their detected language.

    Parameters
    ----------
    res : str
        Text to split into sentences.

    Returns
    -------
    dict
        Maps each detected language code to the list of sentences detected
        as that language, in their order of appearance. Sentences for which
        detection fails are left out.
    """
    lang_dict = {}
    for sentence in sent_tokenize(res):
        try:
            lang = detect(sentence)
        except lang_detect_exception.LangDetectException:
            # the detector could not classify this sentence; skip it, but let
            # any other error (and KeyboardInterrupt) surface
            continue
        lang_dict.setdefault(lang, []).append(sentence)
    return lang_dict

# One language -> sentences dict per resolution text
sentence_langs = corpus['Text'].apply(get_sentence_lang)
corpus['lang_dict'] = sentence_langs

In [236]:
# get the share of English sentences vs. others.
def get_en_share(d):
    """Return the fraction of sentences filed under 'en' in a language dict.

    `d` maps language codes to lists of sentences. The result is the number
    of 'en' sentences divided by the number of sentences over all languages,
    or None when there are no sentences at all.
    """
    total = sum(len(sentences) for sentences in d.values())
    if total == 0:
        return None
    return len(d.get('en', [])) / total

# Share of English sentences for every resolution
en_shares = corpus['lang_dict'].apply(get_en_share)
corpus['en_share'] = en_shares

In [268]:
# Resolutions whose share of English sentences is at or below this value are
# dropped; the threshold gets rid of the German resolutions
MIN_EN_SHARE = .2

# manually dropping some resolutions that were not read properly and contain too much French
drops = ['A/RES/193(III)[A]', 'A/RES/193(III)[B]', 'A/RES/181(II)[A]', 'A/RES/193(III)[C]']

# A missing share compares False, so rows without a computed share are removed
# here as well, together with rows that have no English sentence at all.
is_english = corpus['en_share'] > MIN_EN_SHARE
is_dropped = corpus['Resolution'].isin(drops)

# assign() works on a new frame, so no value is written into a slice of `corpus`.
# The new Text stitches the English sentences together and removes '-\n'
# (a hyphen followed by a line break).
organized_corpus = corpus[is_english & ~is_dropped].assign(
    Text=lambda frame: frame['lang_dict'].apply(
        lambda x: ' '.join(x['en']).replace('-\n', '')
    )
)

res_lang_dict = organized_corpus[['Resolution', 'lang_dict', 'en_share']]
organized_corpus = organized_corpus[['Resolution', 'url', 'Text']]

print(organized_corpus.shape)

(17199, 5)


In [274]:
# Keep the per-resolution language dictionary in the intermediate data, in case it is needed
lang_dict_path = os.path.join(INTER_DATA_PATH, 'lang_dict.json')
res_lang_dict.to_json(lang_dict_path)

# Write the cleaned corpus to the final data path
clean_corpus_path = os.path.join(FINAL_DATA_PATH, 'unga_corpus_clean.json')
organized_corpus.to_json(clean_corpus_path)