In [None]:
# Reload imported modules automatically before each cell executes, so edits to
# local helper code (e.g. `utils`) are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import os

from dotenv import load_dotenv

# Load variables from a local .env file into the process environment
# (python-dotenv); SCRAPER_CSV_PATH is read via os.getenv further down.
load_dotenv()
from datetime import datetime
from pathlib import Path

import geopandas as gpd
import pandas as pd
from IPython.core.display_functions import display
from IPython.display import Markdown
from wbgapi import economy

from utils import (
    create_stacked_chart,
    create_choropleth_map,
    plot_ngrams,
    plot_submission_type_frequencies,
    create_geodataframe,
    preprocess_concept_df,
    process_spans,
)

In [None]:
import warnings

warnings.filterwarnings("ignore")

In [None]:
# Report configuration: the concept slug is also the folder name under
# `concepts/`; its title-cased form is what appears in headings.
concept = "loss-and-damage"
formatted_concept = " ".join(concept.split("-")).title()
# Day-month-year stamp of when the notebook was executed.
date_of_nb = f"{datetime.now():%d-%m-%Y}"

In [None]:
# Report header: title, summary paragraph and reference links, all rendered as
# Markdown. Everything concept-specific is derived from `concept` /
# `formatted_concept` so the same cell works for any concept folder.
display(Markdown(f"# GST Report on {formatted_concept}"))
display(Markdown("## Summary"))
# TODO: the "X" placeholders in the highlights sentence still need to be
# replaced with the actual findings.
display(
    Markdown(
        f"This report contains summary statistics and visualisations for all identified mentions of {formatted_concept} across UNFCCC input documents, as of {date_of_nb}. Highlights from this report: {formatted_concept} in UNFCCC input documents appear most frequently alongside mentions of X (context), in document type X and by Party members, particularly those located in the X region."
    )
)
display(
    Markdown(
        "[Link to Methodology](https://www.notion.so/climatepolicyradar/Concept-tracker-internal-a879dfc5c2fd49159838af86cd5e8955)"
    )
)
# Build the link from `concept` so it points at this report's own input file
# (the same concepts/<concept>/input.xlsx that is loaded further down).
display(
    Markdown(
        f"[Link to Linguistic input file](https://github.com/climatepolicyradar/global-stocktake/blob/main/concepts/{concept}/input.xlsx)"
    )
)

In [None]:
# set the display options to allow resizing columns
# Lift pandas' display limits (column text width, number of columns, total
# width) so frames are rendered without truncation.
for display_option in ("display.max_colwidth", "display.max_columns", "display.width"):
    pd.set_option(display_option, None)

In [None]:
# The `concepts` folder is a sibling of the directory the notebook runs from.
concepts_path = Path.cwd().parent / "concepts"

In [None]:
# Load this concept's artefacts from concepts/<concept>/: the concept matches
# with metadata, the raw spans, and the linguistic input sheet.
# `date_of_nb` is deliberately not reassigned here: it is set once in the
# configuration cell and has already been rendered in the summary by this point.
concept_dir = concepts_path / concept
df_concepts = pd.read_excel(concept_dir / "output_with_metadata.xlsx")
df_spans = pd.read_csv(concept_dir / "spans.csv")
df_input = pd.read_excel(concept_dir / "input.xlsx");

In [None]:
# Location of the scraper CSV, overridable through the SCRAPER_CSV_PATH
# environment variable; defaults to the relative path "scraper_csv".
scraper_csv_path = os.environ.get("SCRAPER_CSV_PATH", "scraper_csv")

In [None]:
# Reference data: the low-resolution Natural Earth country geometries bundled
# with geopandas, and the economy list from the World Bank API (wbgapi).
# NOTE(review): `geopandas.datasets` was deprecated in GeoPandas 0.14 and
# removed in 1.0 — confirm the pinned geopandas version still provides it.
naturalearth_path = gpd.datasets.get_path("naturalearth_lowres")
df_worldbank = gpd.read_file(naturalearth_path)
df_world_economics = pd.DataFrame(economy.list())

# Order matters: preprocess_concept_df receives the geometry frame while its
# column is still called "name"; the rename to "country" happens only afterwards.
df_concepts_processed = preprocess_concept_df(
    df_concepts,
    df_worldbank,
    df_world_economics,
)
df_worldbank = df_worldbank.rename({"name": "country"}, axis="columns")

In [None]:
# Combine the raw spans with the processed concept frame (utils.process_spans).
# NOTE(review): this rebinds `df_spans`, so re-running the cell without
# re-running the loading cell feeds already-processed spans back in — confirm
# process_spans tolerates that, or re-run the loader first.
df_spans = process_spans(df_spans, df_concepts_processed)

In [None]:
# Build the frame used for the maps from the processed concepts and the
# country geometries (whose name column has been renamed to "country" above).
df_concepts_geoplot = create_geodataframe(df_concepts_processed, df_worldbank)

In [None]:
# Section heading for the per-concept maps that follow.
party_members_heading = (
    f"## UNFCCC Party members mentioning {formatted_concept} across all input documents"
)
display(Markdown(party_members_heading))

In [None]:
# Draw one choropleth map per distinct value of the "Concept" column, in
# sorted order.
concept_column = df_concepts_geoplot["Concept"]
unique_concepts = sorted(concept_column.unique())
for conc in unique_concepts:
    choropleth_map = create_choropleth_map(df_concepts_geoplot, conc)
    choropleth_map.display()

In [None]:
# Distinct authors overall, and distinct authors within each
# (Concept, Author Type) group.
total_stakeholders = df_concepts_processed["Author"].nunique()
authors_by_concept_and_type = df_concepts_processed.groupby(["Concept", "Author Type"])
num_stakeholders_mentioning = authors_by_concept_and_type["Author"].nunique()

In [None]:
# Share of all distinct authors that each (Concept, Author Type) group represents.
percentage_stakeholders_mentioning = (
    num_stakeholders_mentioning / total_stakeholders * 100
)

# Flatten both Series into separately named frames. The grouped Series are left
# untouched so this cell can be re-run: rebinding `num_stakeholders_mentioning`
# to a frame would make the division above fail on the second run.
df_counts = num_stakeholders_mentioning.rename("count").reset_index()
df_percentages = percentage_stakeholders_mentioning.rename("percentage").reset_index()

# One row per (Concept, Author Type) with both the count and the percentage.
df_counts_and_percentages = df_counts.merge(
    df_percentages, on=["Concept", "Author Type"]
)

In [None]:
# Section heading for the stacked chart; uses the shared `formatted_concept`
# rather than re-deriving the title-cased name from `concept`.
# NOTE(review): the chart data counts distinct values of "Author", while this
# heading says "documents" — confirm the intended wording.
display(
    Markdown(
        f"## Number and percentage of UNFCCC Party and Non-Party documents that mention {formatted_concept} related concepts in submitted documents"
    )
)

In [None]:
# Chart the per-(Concept, Author Type) counts and percentages
# (utils.create_stacked_chart); the call's return value is the cell output.
create_stacked_chart(df_counts_and_percentages)

In [None]:
# Section heading for the submission-type plot; uses the shared
# `formatted_concept` rather than re-deriving it from `concept`.
display(
    Markdown(
        f"## Document types that mention {formatted_concept} in submitted documents"
    )
)

In [None]:
# Presumably the number of most frequent submission types to plot — TODO
# confirm against utils.plot_submission_type_frequencies; passed as its last
# positional argument.
NUM_TOP_SUBMISSION_TYPES = 5
plot_submission_type_frequencies(
    df_concepts_processed, formatted_concept, NUM_TOP_SUBMISSION_TYPES
)

# Frequent Word Combinations

In [None]:
import nltk

# Fetch NLTK's "punkt" resource without progress output; presumably required
# by the n-gram plotting below — TODO confirm in utils.
nltk.download("punkt", quiet=True);

In [None]:
from IPython.display import display, HTML

# Inline "What is an n-gram?" explainer. The native `title` attribute already
# gives a hover tooltip; the script only upgrades it to a Bootstrap tooltip
# when jQuery and the tooltip plugin are actually present on the page (assumed
# to be the classic Notebook front-end — confirm), and is skipped otherwise
# instead of raising a ReferenceError. `onclick="return false;"` stops the
# placeholder `href="#"` from scrolling the page to the top when clicked.
html = """
<div>
    <a href="#" onclick="return false;" data-toggle="tooltip" title="An n-gram is a contiguous sequence of n items from a given sample of text or speech. Here, 'n' can be any integer. When n is 1, we refer to it as a 'unigram'. Similarly, a 2-gram (bigram) is a two-word sequence of words like 'please turn', 'turn your', and so on, and a 3-gram (trigram) is a three-word sequence of words like 'please turn your', 'turn your computer', etc.">❓What is an n-gram?❓</a>
</div>

<script>
if (typeof $ !== "undefined" && $.fn && typeof $.fn.tooltip === "function") {
    $(document).ready(function(){
        $('[data-toggle="tooltip"]').tooltip();
    });
}
</script>
"""

display(HTML(html))

In [22]:
# Render the bigram/trigram frequency summary for the processed spans
# (utils.plot_ngrams); the heading and table below are its output.
plot_ngrams(df_spans)

## Top 10 bigrams and trigrams (frequent word combinations) relating to Loss And Damage across UNFCCC input documents


Unnamed: 0,Total,Total Bigrams,trigram,Total Trigrams
0,"(loss, damage)",780,"(addressing, loss, damage)",131
1,"(climate, change)",215,"(address, loss, damage)",118
2,"(Loss, Damage)",139,"(loss, damage, associated)",100
3,"(addressing, loss)",133,"(effects, climate, change)",85
4,"(address, loss)",120,"(adverse, effects, climate)",84
5,"(damage, associated)",101,"(damage, associated, adverse)",75
6,"(adverse, effects)",85,"(minimizing, addressing, loss)",74
7,"(effects, climate)",85,"(associated, adverse, effects)",68
8,"(Paris, Agreement)",76,"(Warsaw, International, Mechanism)",48
9,"(associated, adverse)",76,"(minimize, address, loss)",42
