# PDF/Docx scraping the COP 2017 participation records using Python
#### Stefan-Cristan Roata
#### Yale-NUS College

## Part 1: Collecting the organization name and entity type

In [1]:
# !pip install python-docx

In [23]:
from docx import Document
# Open the COP 2017 .docx file.
path = '/Users/stefan/Desktop/IAD_blockchain/COP2017/COP2017.docx'
document = Document(path)

# Collect, in document order, the text of every run whose `bold` attribute is truthy.
# NOTE(review): python-docx reports `run.bold` as None when boldness is inherited
# from a style rather than set on the run itself, so such runs are skipped here --
# possibly why some headings are missing from the result; confirm against the file.
organization_names = [
    run.text
    for para in document.paragraphs
    for run in para.runs
    if run.bold
]

In [24]:
organization_names

['FCCC/CP/2017/INF.4',
 'Parties',
 'Afghanistan',
 'Albania',
 'Algeria',
 '2',
 'FCCC/CP/2017/INF.4',
 'Andorra',
 'Angola',
 'Antigua and Barbuda',
 'Argentina',
 '3',
 'FCCC/CP/2017/INF.4',
 'Argentina (continued)',
 'Armenia',
 'Australia',
 '4',
 'FCCC/CP/2017/INF.4',
 'Australia (continued)',
 'Austria',
 '5',
 'FCCC/CP/2017/INF.4',
 'Austria (continued)',
 'Azerbaijan',
 '6',
 'FCCC/CP/2017/INF.4',
 'Bahamas',
 'Bahrain',
 'Bangladesh',
 '7',
 'FCCC/CP/2017/INF.4',
 'Barbados',
 'Belarus',
 'Belgium',
 '8',
 'FCCC/CP/2017/INF.4',
 'Belgium (continued)',
 'Belize',
 'Benin',
 '9',
 'FCCC/CP/2017/INF.4',
 'Benin (continued)',
 '10',
 'FCCC/CP/2017/INF.4',
 'Benin (continued)',
 '11',
 'FCCC/CP/2017/INF.4',
 'Benin (continued)',
 'Bhutan',
 '12',
 'FCCC/CP/2017/INF.4',
 'Bhutan (continued)',
 'Bolivia (Plurinational State of)',
 'Bosnia and Herzegovina',
 'Botswana',
 '13',
 'FCCC/CP/2017/INF.4',
 'Botswana (continued)',
 'Brazil',
 '14',
 'FCCC/CP/2017/INF.4',
 'Brazil (continued

In [25]:
len(organization_names)

2257

In [26]:
def remove_items(test_list, item):
    """Return a new list with every element equal to `item` left out.

    `test_list` itself is not modified, and the kept elements stay in their
    original relative order.
    """
    kept = []
    for candidate in test_list:
        if candidate != item:
            kept.append(candidate)
    return kept

# 'FCCC/CP/2017/INF.4' is the string that recurs as a page header in the extracted list; drop every occurrence.
organization_names = remove_items(organization_names,'FCCC/CP/2017/INF.4') # removes page headers
# Preview entries 1-49 (index 0 is skipped).
organization_names[1:50]

['Afghanistan',
 'Albania',
 'Algeria',
 '2',
 'Andorra',
 'Angola',
 'Antigua and Barbuda',
 'Argentina',
 '3',
 'Argentina (continued)',
 'Armenia',
 'Australia',
 '4',
 'Australia (continued)',
 'Austria',
 '5',
 'Austria (continued)',
 'Azerbaijan',
 '6',
 'Bahamas',
 'Bahrain',
 'Bangladesh',
 '7',
 'Barbados',
 'Belarus',
 'Belgium',
 '8',
 'Belgium (continued)',
 'Belize',
 'Benin',
 '9',
 'Benin (continued)',
 '10',
 'Benin (continued)',
 '11',
 'Benin (continued)',
 'Bhutan',
 '12',
 'Bhutan (continued)',
 'Bolivia (Plurinational State of)',
 'Bosnia and Herzegovina',
 'Botswana',
 '13',
 'Botswana (continued)',
 'Brazil',
 '14',
 'Brazil (continued)',
 '15',
 'Brazil (continued)']

In [27]:
def int_filter(myList):
    """Return the elements of `myList` that `int()` cannot parse.

    Elements for which `int(element)` succeeds are dropped; elements for which
    it raises ValueError are kept, in their original order. Any other exception
    raised by `int()` propagates to the caller.
    """
    def _parses_as_int(value):
        # True when int() accepts the value, False when it raises ValueError.
        try:
            int(value)
        except ValueError:
            return False
        return True

    return [entry for entry in myList if not _parses_as_int(entry)]

# int_filter drops every entry that int() can parse; in this list those are the bare page numbers ('2', '3', ...).
organization_names = int_filter(organization_names) # filter out page numbers

In [28]:
organization_names[1:100]

['Afghanistan',
 'Albania',
 'Algeria',
 'Andorra',
 'Angola',
 'Antigua and Barbuda',
 'Argentina',
 'Argentina (continued)',
 'Armenia',
 'Australia',
 'Australia (continued)',
 'Austria',
 'Austria (continued)',
 'Azerbaijan',
 'Bahamas',
 'Bahrain',
 'Bangladesh',
 'Barbados',
 'Belarus',
 'Belgium',
 'Belgium (continued)',
 'Belize',
 'Benin',
 'Benin (continued)',
 'Benin (continued)',
 'Benin (continued)',
 'Bhutan',
 'Bhutan (continued)',
 'Bolivia (Plurinational State of)',
 'Bosnia and Herzegovina',
 'Botswana',
 'Botswana (continued)',
 'Brazil',
 'Brazil (continued)',
 'Brazil (continued)',
 'Brazil (continued)',
 'Brazil (continued)',
 'Brunei Darussalam',
 'Bulgaria',
 'Burkina Faso',
 'Burkina Faso (continued)',
 'Burkina Faso (continued)',
 'Burkina Faso (continued)',
 'Burundi',
 'Burundi (continued)',
 'Cabo Verde',
 'Cambodia',
 'Cambodia (continued)',
 'Cameroon',
 'Cameroon (continued)',
 'Canada',
 'Canada (continued)',
 'Canada (continued)',
 'Canada (continued)'

In [29]:
def continued_filter(myList):
    """Return the entries of `myList` that do not contain the substring '(continued)'.

    The input list is not modified and the order of the kept entries is preserved.
    """
    return [entry for entry in myList if '(continued)' not in entry]


# Entries such as 'Benin (continued)' are dropped; the plain name ('Benin') is a separate entry and is kept.
organization_names = continued_filter(organization_names) # eliminate entries containing "continued"
# Preview entries 1-99 (index 0 is skipped).
organization_names[1:100]

['Afghanistan',
 'Albania',
 'Algeria',
 'Andorra',
 'Angola',
 'Antigua and Barbuda',
 'Argentina',
 'Armenia',
 'Australia',
 'Austria',
 'Azerbaijan',
 'Bahamas',
 'Bahrain',
 'Bangladesh',
 'Barbados',
 'Belarus',
 'Belgium',
 'Belize',
 'Benin',
 'Bhutan',
 'Bolivia (Plurinational State of)',
 'Bosnia and Herzegovina',
 'Botswana',
 'Brazil',
 'Brunei Darussalam',
 'Bulgaria',
 'Burkina Faso',
 'Burundi',
 'Cabo Verde',
 'Cambodia',
 'Cameroon',
 'Canada',
 'Central African Republic',
 'Chad',
 'Chile',
 'China',
 'Colombia',
 'Comoros',
 'Congo',
 'Cook Islands',
 'Costa Rica',
 "Côte d'Ivoire",
 'Croatia',
 'Cuba',
 'Cyprus',
 'Czechia',
 "Democratic People's Republic of Korea",
 'Democratic Republic of the Congo',
 'Denmark',
 'Djibouti',
 'Dominica',
 'Dominican Republic',
 'Ecuador',
 'Egypt',
 'El Salvador',
 'Equatorial Guinea',
 'Estonia',
 'Ethiopia',
 'European Union',
 'European Union',
 'European Union',
 'Fiji',
 'Finland',
 'France',
 'Gabon',
 'Gambia',
 'Georgia',


In [30]:
len(organization_names)

1383

In [31]:
# De-duplicate while keeping the first occurrence of each name in its original position
# (dict keys are unique and preserve insertion order).
organization_names = list(dict.fromkeys(organization_names))

In [32]:
organization_names

['Parties',
 'Afghanistan',
 'Albania',
 'Algeria',
 'Andorra',
 'Angola',
 'Antigua and Barbuda',
 'Argentina',
 'Armenia',
 'Australia',
 'Austria',
 'Azerbaijan',
 'Bahamas',
 'Bahrain',
 'Bangladesh',
 'Barbados',
 'Belarus',
 'Belgium',
 'Belize',
 'Benin',
 'Bhutan',
 'Bolivia (Plurinational State of)',
 'Bosnia and Herzegovina',
 'Botswana',
 'Brazil',
 'Brunei Darussalam',
 'Bulgaria',
 'Burkina Faso',
 'Burundi',
 'Cabo Verde',
 'Cambodia',
 'Cameroon',
 'Canada',
 'Central African Republic',
 'Chad',
 'Chile',
 'China',
 'Colombia',
 'Comoros',
 'Congo',
 'Cook Islands',
 'Costa Rica',
 "Côte d'Ivoire",
 'Croatia',
 'Cuba',
 'Cyprus',
 'Czechia',
 "Democratic People's Republic of Korea",
 'Democratic Republic of the Congo',
 'Denmark',
 'Djibouti',
 'Dominica',
 'Dominican Republic',
 'Ecuador',
 'Egypt',
 'El Salvador',
 'Equatorial Guinea',
 'Estonia',
 'Ethiopia',
 'European Union',
 'Fiji',
 'Finland',
 'France',
 'Gabon',
 'Gambia',
 'Georgia',
 'Germany',
 'Ghana',
 'Gr

In [33]:
len(organization_names)

1353

In [34]:
# inspecting the list of entities:
organization_names[0:10]

['Parties',
 'Afghanistan',
 'Albania',
 'Algeria',
 'Andorra',
 'Angola',
 'Antigua and Barbuda',
 'Argentina',
 'Armenia',
 'Australia']

## Unfortunately, at this point the "Observer States" heading and its "Holy See" entry do not seem to have been read by the parser. The same holds for the "Non-governmental organizations" heading and the entries "A SEED Europe" and "Abibimman Foundation". I will add them manually here.

In [37]:
# Insertion point: the slot immediately after 'Zimbabwe'.
z = organization_names.index('Zimbabwe') + 1

In [38]:
# Manually add the "Observer States" heading and the "Holy See" entry at slot z
# (right after 'Zimbabwe'). Only names that are still absent are inserted, so
# re-running this cell does not create duplicate rows.
missing_observer_rows = [
    name for name in ["Observer States", "Holy See"] if name not in organization_names
]
temp = organization_names[0:z] + missing_observer_rows + organization_names[z:]
organization_names = temp
# Inspect the area around the insertion point.
organization_names[180:210]

['Tuvalu',
 'Uganda',
 'Ukraine',
 'United Arab Emirates',
 'United Kingdom of Great Britain and Northern Ireland',
 'United Republic of',
 'Tanzania',
 'United States of America',
 'Uruguay',
 'Uzbekistan',
 'Vanuatu',
 'Venezuela (Bolivarian Republic of)',
 'Viet Nam',
 'Yemen',
 'Zambia',
 'Zimbabwe',
 'Observer States',
 'Holy See',
 'United Nations Secretariat units and bodies',
 'United Nations Secretariat',
 'Convention on the Conservation of Migratory Species of Wild Animals',
 'Department of Economic and Social Affairs',
 'Economic and Social Commission for Asia and the Pacific',
 'Economic and Social Commission for Western Asia',
 'Economic Commission for Africa',
 'Intergovernmental Platform on Biodiversity and Ecosystem Services',
 'Office of the High Representative for the Least Developed Countries, Landlocked Developing Countries and Small Island Developing States',
 'Office of the United Nations High Commissioner for Human Rights',
 'Office of the United',
 'Nations High

In [54]:
# Insertion point: the slot immediately after "University for Peace".
u = organization_names.index("University for Peace") + 1

In [55]:
# Manually add the "Non-governmental organizations" heading and two NGO entries
# at slot u (right after "University for Peace"). Only names that are still
# absent are inserted, so re-running this cell does not create duplicate rows.
missing_ngo_rows = [
    name
    for name in ["Non-governmental organizations", "A SEED Europe", "Abibimman Foundation"]
    if name not in organization_names
]
temp = organization_names[0:u] + missing_ngo_rows + organization_names[u:]
organization_names = temp
# Inspect the area around the insertion point.
organization_names[330:350]

['Permanent Secretariat of the Alpine Convention',
 "Secrétariat général de l'union du Maghreb Arabe",
 'Secretariat of the Pacific Community',
 'Secretariat of the Union for the Mediterranean',
 'South Asia Co-operative Environment Programme',
 'South Centre',
 'Technical Centre for Agricultural and Rural Cooperation EU-ACP',
 'The Regional Organization for the Conservation of the Environment of the Red Sea and Gulf of Aden',
 'Union Economique et Monétaire Ouest Africaine',
 'University for Peace',
 'Non-governmental organizations',
 'A SEED Europe',
 'Abibimman Foundation',
 'Academy for Mountain Environics',
 'ACT Alliance - Action by Churches Together',
 'Action Against Hunger',
 'Action Planéterre',
 'Action Solidarité Tiers-Monde a.s.b.l.',
 'ActionAid International',
 'African Centre for']

In [56]:
organization_names.index("United Nations Secretariat units and bodies")

198

In [57]:
organization_names.index("Holy See")

197

In [58]:
organization_names.index("Observer States")

196

In [59]:
# checking the indices where the type of entity appears in the list (we will pop it out later)
organization_names.index("Parties")

0

In [60]:
organization_names.index("Observer States")

196

In [61]:
organization_names.index("United Nations Secretariat units and bodies")

198

In [62]:
organization_names.index("Specialized agencies and related organizations")

236

In [63]:
organization_names.index("Intergovernmental organizations")

258

In [64]:
organization_names.index("Non-governmental organizations")

340

In [65]:
# Populate the entity type list with one label per row of organization_names.
# The list is walked once and every section heading switches the current label,
# so the section boundaries come from the data itself instead of hand-copied
# positions (0, 196, 198, 236, 258, 340) that silently go stale whenever the
# list changes. Heading rows receive their own label; they are removed from
# both lists in the cells that follow.
ENTITY_TYPE_HEADINGS = (
    "Parties",
    "Observer States",
    "United Nations Secretariat units and bodies",
    "Specialized agencies and related organizations",
    "Intergovernmental organizations",
    "Non-governmental organizations",
)

entities = []
current_entity_type = None
for name in organization_names:
    if name in ENTITY_TYPE_HEADINGS:
        current_entity_type = name
    if current_entity_type is None:
        # A row before the first heading cannot be labelled.
        raise ValueError(f"{name!r} appears before any entity-type heading")
    entities.append(current_entity_type)

In [66]:
organization_names.index("Non-governmental organizations")

340

In [67]:
# popping the entity types out of the organization names
# Each .index() lookup runs on the list as it is at that moment, so the earlier
# pops do not invalidate the later ones. The value shown below the cell is the
# return value of the last pop.
# NOTE: not re-runnable -- once a heading is gone, .index() raises ValueError.
organization_names.pop(organization_names.index("Non-governmental organizations"))
organization_names.pop(organization_names.index("Intergovernmental organizations"))
organization_names.pop(organization_names.index("Specialized agencies and related organizations"))
organization_names.pop(organization_names.index("United Nations Secretariat units and bodies"))
organization_names.pop(organization_names.index("Observer States"))
organization_names.pop(organization_names.index("Parties"))

'Parties'

In [68]:
# since we modified the first list (organization_names), we have to modify the second (entities) accordingly:
# entities.index(label) returns the FIRST row carrying that label. This relies on
# each section's labels starting at the heading row itself, so the first
# occurrence of a label sits at the position the heading occupied in
# organization_names and the two lists stay aligned.
# Popping from the last section to the first keeps the earlier positions unchanged.
entities.pop(entities.index("Non-governmental organizations"))
entities.pop(entities.index("Intergovernmental organizations"))
entities.pop(entities.index("Specialized agencies and related organizations"))
entities.pop(entities.index("United Nations Secretariat units and bodies"))
entities.pop(entities.index("Observer States"))
entities.pop(entities.index("Parties"))

'Parties'

In [69]:
import pandas as pd # import pandas to create dataframe

In [70]:
# Combine the two parallel lists into a two-column table.
# zip() would silently truncate to the shorter list and hide a misalignment
# between names and labels, so the lengths are checked explicitly first.
assert len(organization_names) == len(entities), (
    f"{len(organization_names)} names but {len(entities)} entity labels"
)
df = pd.DataFrame({'name': organization_names, 'entity_type': entities})
df

Unnamed: 0,name,entity_type
0,Afghanistan,Parties
1,Albania,Parties
2,Algeria,Parties
3,Andorra,Parties
4,Angola,Parties
...,...,...
1347,York University,Non-governmental organizations
1348,Young Energy Specialists - Development Coopera...,Non-governmental organizations
1349,Young European,Non-governmental organizations
1350,Leadership,Non-governmental organizations


In [71]:
df.iloc[180:210]

Unnamed: 0,name,entity_type
180,Uganda,Parties
181,Ukraine,Parties
182,United Arab Emirates,Parties
183,United Kingdom of Great Britain and Northern I...,Parties
184,United Republic of,Parties
185,Tanzania,Parties
186,United States of America,Parties
187,Uruguay,Parties
188,Uzbekistan,Parties
189,Vanuatu,Parties


In [29]:
df.iloc[250:280]

Unnamed: 0,name,entity_type
250,WMO/UNEP Intergovernmental Panel on Climate Ch...,Specialized agencies and related organizations
251,World Bank,Specialized agencies and related organizations
252,World Health,Specialized agencies and related organizations
253,Organization,Specialized agencies and related organizations
254,World Intellectual Property Organization,Specialized agencies and related organizations
255,World Meteorological Organization,Specialized agencies and related organizations
256,World Tourism,Specialized agencies and related organizations
257,World Trade Organization,Specialized agencies and related organizations
258,African Centre of Meteorological Application f...,Intergovernmental organizations
259,African Development Bank Group,Intergovernmental organizations


In [72]:
# TODO: the Adaptation Fund Board is missing from the intergovernmental organizations (IGOs) and still needs to be added.

In [73]:
df.iloc[324:350]

Unnamed: 0,name,entity_type
324,Permanent Court of Arbitration,Intergovernmental organizations
325,Permanent Secretariat of the Alpine Convention,Intergovernmental organizations
326,Secrétariat général de l'union du Maghreb Arabe,Intergovernmental organizations
327,Secretariat of the Pacific Community,Intergovernmental organizations
328,Secretariat of the Union for the Mediterranean,Intergovernmental organizations
329,South Asia Co-operative Environment Programme,Intergovernmental organizations
330,South Centre,Intergovernmental organizations
331,Technical Centre for Agricultural and Rural Co...,Intergovernmental organizations
332,The Regional Organization for the Conservation...,Intergovernmental organizations
333,Union Economique et Monétaire Ouest Africaine,Intergovernmental organizations


In [74]:
# Saving the dataframe to .csv
# index=False leaves the row index out of the file, so it holds only the 'name' and 'entity_type' columns.
df.to_csv("COP2017_organization_names.csv", index=False)

# Round of Data Cleaning

At this point the exported data was cleaned (on 10 January 2022) and saved under a new name, `COP2017_orgnames.csv`.

This cleaned file is the one to use from here on.

In [75]:
# Load the cleaned file; from here on `df` refers to this data, replacing the frame built earlier in the notebook.
df = pd.read_csv("COP2017_orgnames.csv")
df

Unnamed: 0,name,entity_type
0,Afghanistan,Parties
1,Albania,Parties
2,Algeria,Parties
3,Andorra,Parties
4,Angola,Parties
...,...,...
1274,Yale University,Non-governmental organizations
1275,York University,Non-governmental organizations
1276,Young Energy Specialists - Development Coopera...,Non-governmental organizations
1277,Young European Leadership,Non-governmental organizations
