# Heatmap: parties vs document types

In [12]:
import sys

!{sys.executable} -m pip install seaborn wikipedia


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0[0m[39;49m -> [0m[32;49m23.1.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m


In [4]:
import sys
import os
from pathlib import Path
import itertools
from collections import Counter
from typing import Optional

import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt
import wikipedia
from tqdm.auto import tqdm

sys.path.append("../../..")

from src.opensearch.index_data import get_dataset_and_filter_values

# 1. Create list of authors for manual filling in country, organisation type

In [2]:
data = pd.read_csv(
    "/Users/kalyan/Documents/CPR/unfccc-global-stocktake-documents/CPR_UNFCCC_MASTER.csv"
)
data["Year"] = data["Date"].apply(lambda i: int(i[:4]))
data[["Author", "Submission Type"]] = data[["Author", "Submission Type"]].applymap(
    lambda i: i.split(",")
)

In [3]:
non_party_authors = (
    data[data["Author Type"] == "Non-Party"]
    .explode(column="Author")["Author"]
    .unique()
    .tolist()
)

In [10]:
def get_best_wikipedia_url(title: str) -> Optional[str]:
    pages = wikipedia.search(title)

    if pages:
        try:
            return wikipedia.page(pages[0]).url
        except Exception as e:
            print(f"failed for {title}: {e}")
            return None

    else:
        return None


get_best_wikipedia_url(non_party_authors[0])

'https://en.wikipedia.org/wiki/The_Nature_Conservancy'

In [11]:
wikipedia_urls = [get_best_wikipedia_url(author) for author in tqdm(non_party_authors)]

  0%|          | 0/208 [00:00<?, ?it/s]

failed for CAD2: Page id "autoca" does not match any pages. Try another id!
failed for Partnership on Sustainable: Page id "sustainable development goal 1" does not match any pages. Try another id!




  lis = BeautifulSoup(html).find_all('li')


failed for Organisation for Economic Co-operation and Development (OECD): "oe" may refer to: 
Old English
Œ
Oe (digraph)
Open front rounded vowel
Open-mid front rounded vowel
Ö
Ø
Ө
Oe, Estonia
Ōe, Yamagata
Oe District, Tokushima
Ōe, Kyoto
Oe (Attica)
Otrokovice
Olathe East High School
Ōe (surname)
Kenzaburō Ōe
Old Edwardian
Old Etonian
Ordem dos Engenheiros
Order of Excellence of Guyana
Okean Elzy
Cessna OE Bird Dog
°Oe
Oersted
On30
OpenEmbedded
Opportunistic encryption
Outlook Express
Ophryocystis elektroscirrha
Odakyū Enoshima Line
aircraft registration
Overseas experience
Overview effect
All pages with titles beginning with Oe
All pages with titles containing Oe
0e (disambiguation)
OES (disambiguation)
failed for Local Governments and Municipal Authorities (LGMA): Page id "icl4" does not match any pages. Try another id!
failed for Fundación Ecología y Desarrollo (ECODES): "encode" may refer to: 
APL (programming language)
Binary encoding
Binary-to-text encoding
Character encoding
Co

In [16]:
non_party_author_df = pd.DataFrame(
    {"Author": non_party_authors, "Wikipedia page best guess": wikipedia_urls}
).fillna("")
non_party_author_df[["Country code", "Organisation type"]] = ""

non_party_author_df.to_csv("./non-party-authors-for-completion.csv", index=False)

## 2. Produce table for visualisation

Author, Country, Organisation Type, Document Count

In [13]:
dataset, _ = get_dataset_and_filter_values(
    os.environ["DOCS_DIR_GST"],
    Path(
        "/Users/kalyan/Documents/CPR/unfccc-global-stocktake-documents/CPR_UNFCCC_MASTER.csv"
    ),
    Path("../../../concepts/"),
)

INFO:src.opensearch.index_data:Loading scraper CSV
INFO:src.opensearch.index_data:Loading dataset of parsed documents


  0%|          | 0/1697 [00:00<?, ?it/s]

  0%|          | 0/1679 [00:00<?, ?it/s]

INFO:src.opensearch.index_data:Loaded 1679 documents. 0 documents failed to load.
INFO:src.opensearch.index_data:Loading spans from concepts directory
INFO:src.opensearch.index_data:Adding spans to dataset


0docs [00:00, ?docs/s]

In [14]:
dataset_metadata = dataset.metadata_df

In [25]:
author_df = pd.read_csv("GST non-party authors-completed.csv")
author_df = author_df.drop_duplicates(subset="Author")
author_df.head()

Unnamed: 0,Author,Wikipedia page best guess (leave blank if there isn't one),Country,Organisation type
0,Nature Conservancy (TNC),https://en.wikipedia.org/wiki/The_Nature_Conse...,International,International organisation
1,Institute for Global Environmental Strategies ...,,International,International organisation
2,Wetlands International,https://en.wikipedia.org/wiki/Wetlands_Interna...,International,International organisation
3,Japan Aerospace Exploration Agency (JAXA),https://en.wikipedia.org/wiki/JAXA,Japan,Government
4,Aberystwyth University,https://en.wikipedia.org/wiki/Aberystwyth_Univ...,United Kingdom,Educational institution


In [17]:
dataset_metadata.columns

Index(['document_id', 'document_name', 'document_source_url',
       'document_content_type', 'document_md5_sum', 'languages', 'translated',
       'has_valid_text', 'page_metadata', '_text_block_idx_hash_map', 'source',
       'author', 'validation_status', 'themes', 'types', 'version', 'date',
       'link', 'data_error_type', 'author_is_party', 'document_variant',
       'topics', 'num_text_blocks', 'num_pages'],
      dtype='object')

In [64]:
dataset_metadata = dataset_metadata[~dataset_metadata["author_is_party"]]

In [76]:
dataset_metadata_expanded_author = dataset_metadata.explode(column="author")
dataset_metadata_expanded_author["author"] = dataset_metadata_expanded_author[
    "author"
].replace(
    "United NationsÂ Office for Disaster Risk Reduction (UNDRR)",
    "United Nations Office for Disaster Risk Reduction (UNDRR)",
)

joined_data = pd.merge(
    dataset_metadata_expanded_author,
    author_df,
    left_on="author",
    right_on="Author",
    how="left",
    validate="m:1",
).rename(columns={"Country": "Organisation country"})

In [96]:
nonparty_author_groups = (
    joined_data.groupby(["Organisation country", "Organisation type"])
    .count()["document_id"]
    .unstack()
    .fillna(0)
)
nonparty_author_groups.to_csv("nonparty-author-groups.csv", index=True)