In [None]:
import re
import warnings
from datetime import datetime
from pathlib import Path

import altair as alt
import geopandas as gpd
import nltk
import numpy as np
import pandas as pd
import pycountry
from IPython.core.display_functions import display
from IPython.display import Markdown
from nltk import FreqDist
from nltk.tokenize import word_tokenize
from nltk.util import ngrams
from wbgapi import economy
from wordcloud import STOPWORDS

In [None]:
import warnings

# Suppress every warning category for the rest of the session so warnings do
# not appear in the rendered report.
# NOTE(review): this also hides deprecation notices from the libraries used
# below — consider narrowing the filter while developing.
warnings.filterwarnings("ignore")

In [None]:
# Lift pandas' display limits (column width, column count, total width) so
# frames are rendered without truncation
for _display_option in ("display.max_colwidth", "display.max_columns", "display.width"):
    pd.set_option(_display_option, None)

In [None]:
# "concepts" folder that sits next to the current working directory
concepts_path = Path.cwd().parent / "concepts"

In [None]:
# Concept slug: names the sub-folder of `concepts_path` holding this report's
# files and is interpolated into the report text and links
concept = "fossil-fuels"
# Date stamp (dd-mm-YYYY) quoted in the introduction; computed once
date_of_nb = datetime.now().strftime("%d-%m-%Y")

concept_dir = concepts_path / concept
df_concepts = pd.read_excel(concept_dir / "output_with_metadata.xlsx")
df_spans = pd.read_csv(concept_dir / "spans.csv")
df_input = pd.read_excel(concept_dir / "input.xlsx")

In [None]:
# Load unfccc_files.csv and label each row "Party" when its "party" field is
# filled in, "Non-Party" otherwise
df_unfccc = pd.read_csv(concepts_path / "unfccc_files.csv").assign(
    category=lambda files: np.where(files["party"].notna(), "Party", "Non-Party")
)

# Contents

- [Introduction](#introduction)
- [Number of Documents Analysed](#number-of-documents-analysed)
- [Geography of High-Level Concept Mentions](#geographical-distribution-of-concepts)
- [Analysis of High-Level Concepts by Author Type (Party/Non-Party)](#high-level-analysis-of-concepts-by-author-type-party-non-party)
    - [Total Number of Mentions](#total-number-of-mentions)
    - [Total Number of Documents with Mentions](#total-number-of-documents-with-mentions)
    - [Number of Unique Authors with Mentions](#number-of-unique-authors-with-mentions)
    - [Percentage of Documents with Mentions](#percentage-of-documents-with-mentions)
    - [Percentage of Authors with Mentions](#percentage-of-authors-mentioning-each-concept-by-party-non-party-stakeholders)
- [Detailed Analysis of Concepts](#granular-analysis-of-concepts)
    - [Total Mentions](#total-mentions)
    - [Total Documents with Mentions](#total-mentions)
    - [Total Authors with Mentions](#total-authors)
- [Frequent Word Combinations](#word-co-occurrences)
- [Sample Data For Mentions of Concepts](#sample-data)
- [Annex: Methodology](#annex-methodology)

In [None]:
# Re-read the concept outputs from disk.
# NOTE(review): these two reads repeat the ones in the loading cell; presumably
# kept so that the processing cell that follows, which reassigns both frames,
# starts from fresh data when re-run — confirm before removing.
df_concepts = pd.read_excel(concepts_path / concept / "output_with_metadata.xlsx")
df_spans = pd.read_csv(concepts_path / concept / "spans.csv")

In [None]:
def find_country(text: str) -> str:
    """
    Given a text string, attempts to find a country mentioned in the text.

    Country names (ISO name, common name and official name) are matched
    case-insensitively and only as whole words, so "Nigeria" is not reported
    as "Niger". When several names match, the longest one wins, so
    "Papua New Guinea" is preferred over "Guinea". If no name matches, ISO
    alpha-2 / alpha-3 codes written in upper case as standalone tokens are
    tried.

    Always returns the pycountry ``name`` of the matched country (the value
    ``pycountry.countries.get(name=...)`` understands), or None when nothing
    is found or ``text`` is not a non-blank string (e.g. None / NaN).
    """
    # Missing values coming from pandas and blank strings carry no country
    if not isinstance(text, str) or not text.strip():
        return None

    folded_text = text.casefold()
    best_name, best_length = None, 0
    for country in pycountry.countries:
        for attribute in ("name", "common_name", "official_name"):
            candidate = getattr(country, attribute, None)
            # Only a strictly longer candidate can replace the current best match
            if not candidate or len(candidate) <= best_length:
                continue
            # The name must not be glued to another letter on either side;
            # digits, underscores and punctuation count as separators
            pattern = (
                rf"(?<![^\W\d_]){re.escape(candidate.casefold())}(?![^\W\d_])"
            )
            if re.search(pattern, folded_text):
                best_name, best_length = country.name, len(candidate)
    if best_name is not None:
        return best_name

    # Fall back to ISO codes; the tokens are extracted once, not per country.
    # NOTE(review): upper-case two-letter words such as "IN" or "TO" also match
    # alpha-2 codes — this mirrors the previous behaviour.
    alpha_2_tokens = set(re.findall(r"\b[A-Z]{2}\b", text))
    alpha_3_tokens = set(re.findall(r"\b[A-Z]{3}\b", text))
    if alpha_2_tokens or alpha_3_tokens:
        for country in pycountry.countries:
            if country.alpha_2 in alpha_2_tokens or country.alpha_3 in alpha_3_tokens:
                return country.name

    return None


def get_country_code(x: str) -> str:
    """
    Given the name of a country, returns its ISO 3166-1 alpha-3
    code. Returns None if the country is not found.

    The lookup goes through ``pycountry.countries.lookup`` so that common and
    official names (both of which ``find_country`` may return) resolve as well
    as the ISO ``name``. Values that are not non-blank strings (None, NaN)
    yield None.
    """
    if not isinstance(x, str) or not x.strip():
        return None
    try:
        return pycountry.countries.lookup(x.strip()).alpha_3
    except (AttributeError, LookupError):
        return None


def create_docs_table(df_concepts):
    """
    Count the distinct documents submitted by Party and Non-Party authors.

    A row belongs to "Party" when its "Party" value is not null and to
    "Non-Party" otherwise. The input frame is left untouched.

    Parameters
    ----------
    df_concepts : pandas.DataFrame
        Must contain the columns "Party" and "document_id".

    Returns
    -------
    pandas.DataFrame
        A single row labelled "Documents" with one column per author type that
        occurs in the data, holding the number of unique ``document_id`` values.
    """
    # Derive the label as a standalone array rather than writing a "category"
    # column into the caller's frame
    author_type = np.where(df_concepts["Party"].notna(), "Party", "Non-Party")

    docs_analysed = (
        df_concepts.groupby(author_type)["document_id"]
        .nunique()
        .rename_axis("category")
    )

    # Lay the per-category counts out as one row with a custom index label
    return pd.DataFrame(
        [docs_analysed.to_numpy()],
        columns=docs_analysed.index,
        index=["Documents"],
    )


# --- Reshape the concept indicators to long format and attach country data ---

# The indicator columns are those located strictly between "text" and
# "document_id" in the sheet
start_col_name = "text"
end_col_name = "document_id"
start_col = df_concepts.columns.get_loc(start_col_name)
end_col = df_concepts.columns.get_loc(end_col_name)
indicator_columns = df_concepts.columns[start_col + 1 : end_col]

df_concepts = df_concepts.rename(columns={"party": "Party"})
df_concepts["category"] = np.where(df_concepts["Party"].notna(), "Party", "Non-Party")

# Melt the indicators into (Concept, value) pairs, keeping every other column
df_concepts_melted = df_concepts.melt(
    id_vars=[col for col in df_concepts.columns if col not in indicator_columns],
    var_name="Concept",
    value_name="value",
)
# Keep only the rows whose indicator is 1; copy so the column assignments
# below write to an independent frame instead of a slice
df_concepts_melted = df_concepts_melted[df_concepts_melted["value"] == 1].copy()


def _country_per_name(names):
    """Run find_country once per distinct non-null name and map the result onto `names`."""
    found = {name: find_country(name) for name in names.dropna().unique()}
    return names.map(found)


# NOTE(review): "[_20]+" is a character class, so every run of "_", "2" and
# "0" characters becomes a space — possibly "_" / "%20" was intended; confirm.
for suffix in ("x", "y"):
    df_concepts_melted[f"document_name_{suffix}_reformatted"] = df_concepts_melted[
        f"document_name_{suffix}"
    ].str.replace(r"[_20]+", " ", regex=True)

# Look the country up in both document-name variants; the "x" variant wins
for suffix in ("x", "y"):
    df_concepts_melted[f"country_{suffix}"] = _country_per_name(
        df_concepts_melted[f"document_name_{suffix}_reformatted"]
    )
df_concepts_melted["country"] = df_concepts_melted["country_x"].combine_first(
    df_concepts_melted["country_y"]
)

# create 3 letter country code (once)
df_concepts_melted["country_code"] = df_concepts_melted["country"].apply(
    get_country_code
)

# Attach the "region" of each economy listed by wbgapi, joined on the 3-letter code
df_eco = pd.DataFrame(economy.list())
df_concepts_melted = pd.merge(
    df_concepts_melted,
    df_eco[["id", "region"]],
    left_on="country_code",
    right_on="id",
    how="left",
)

def _concept_by_category_table(long_counts):
    """Pivot a (Concept, category)-indexed Series into a Concept x category table with a "Total" column."""
    table = (
        long_counts.rename("n")
        .reset_index()
        .pivot(index="Concept", columns="category", values="n")
    )
    table["Total"] = table.sum(axis=1)
    table.columns.name = None
    return table


_by_concept_and_category = df_concepts_melted.groupby(["Concept", "category"])

# Distinct documents per concept and author type
df_documents = _concept_by_category_table(
    _by_concept_and_category["document_id"].nunique()
)
# Rows (mentions) per concept and author type
df_mentions = _concept_by_category_table(
    df_concepts_melted.groupby("Concept")["category"].value_counts()
)
# Distinct authors per concept and author type
df_authors = _concept_by_category_table(
    _by_concept_and_category["author"].nunique()
)

# --- Mentions per country and concept, laid out for the choropleth maps ---

# Number of melted rows per (country_code, Concept)
dd = (
    df_concepts_melted.groupby(["country_code", "Concept"])
    .size()
    .rename("count")
    .reset_index()
)

# NOTE(review): geopandas deprecated `gpd.datasets` in 0.14 and removed it in
# 1.0; on newer versions the Natural Earth file has to be loaded from a path.
world = gpd.read_file(gpd.datasets.get_path("naturalearth_lowres"))

# Build every (country, concept) combination in one pass and attach the counts,
# filling combinations without mentions with 0. Pairing on rows rather than on
# ISO codes also keeps all countries that share the placeholder code "-99".
concept_frame = pd.DataFrame({"Concept": sorted(dd["Concept"].unique())})
merged = (
    world.merge(concept_frame, how="cross")
    .merge(
        dd,
        left_on=["iso_a3", "Concept"],
        right_on=["country_code", "Concept"],
        how="left",
    )
    .drop(columns="country_code")
)
merged["count"] = merged["count"].fillna(0).astype(int)
merged = merged.rename(columns={"name": "country"})

# --- Enrich the spans with cleaned text and document metadata ---

# get a unique mapping between document_id and document_name_y for the df_concepts_melted
mapping = df_concepts_melted[
    ["document_id", "document_name_y", "category", "author"]
].drop_duplicates()

# Drop stop words; STOPWORDS is lower-case, so compare case-insensitively to
# also remove capitalised stop words such as a sentence-initial "The"
df_spans["processed_sentence"] = df_spans["sentence"].apply(
    lambda x: " ".join(
        [word for word in x.split() if word.lower() not in STOPWORDS]
    )
)
df_spans["normalised_text"] = df_spans["text"].str.lower()
# now apply the mapping to df_spans to add document name, category and author
df_spans = df_spans.merge(mapping, on=["document_id"], how="left")

<a id='introduction'></a>
# Introduction

In [None]:
# Introduction paragraph, rendered as Markdown; interpolates the concept slug
# and the notebook date
report_text = f"""
 This report contains summary statistics and visualisations for all identified mentions of {concept} within the Global Stocktake submissions, as of {date_of_nb}. It summarises the total number of mentions of {concept} and the number of documents these mentions come from,  broken down by Party and non-Party stakeholders. It also includes a full list of the Parties and non-Party stakeholders which mention {concept} within the Global Stocktake submissions, broken down by subtypes. A sample of the extracts where {concept} is mentioned is included below, with a link to the full set provided. The technical annex contains the methodology used to source the data for this report.
"""
display(Markdown(report_text))

<a id='number-of-documents-analysed'></a>
# Number of Documents Analysed

In [None]:
# Documents per author type, reshaped to one row per stakeholder group
docs_analysed_table = create_docs_table(df_concepts_melted)
docs_analysed_table_melted = pd.melt(
    docs_analysed_table.reset_index(),
    id_vars=["index"],
    var_name="stakeholder",
    value_name="count",
)

# Single stacked bar, segments ordered by stakeholder name
stacked_bars = alt.Chart(docs_analysed_table_melted).mark_bar()
chart = stacked_bars.encode(
    x="index:N",
    y="count:Q",
    color="stakeholder:N",
    order=alt.Order("stakeholder:N", sort="ascending"),
).properties(
    title="Number of documents analysed by author type",
    width=400,
    height=300,
)

chart

<a id='geographical-distribution-of-concepts'></a>
# Geography of High-Level Concept Mentions

In [None]:
def create_choropleth_map(merged, concept, range_min, range_max):
    """
    Build an Altair choropleth of mention counts for one concept.

    Only the rows of `merged` whose "Concept" equals `concept` are drawn.
    Colour encodes "count" on a viridis scale whose domain is fixed to
    [range_min, range_max]; the tooltip shows "country" and "count".
    Returns the chart without displaying it.
    """
    concept_rows = merged[merged["Concept"] == concept]
    colour = alt.Color(
        "count:Q",
        scale=alt.Scale(scheme="viridis", domain=[range_min, range_max]),
    )
    outlines = alt.Chart(concept_rows).mark_geoshape(stroke="black", strokeWidth=1)
    return outlines.encode(
        color=colour,
        tooltip=["country:N", "count:Q"],
    ).properties(
        width=800,
        height=400,
        title=f"Number of Mentions of {concept} by Country",
    )


# Antarctica is excluded from the maps
merged = merged.loc[merged["country"] != "Antarctica"]

# Overall count range across all concepts, passed to every map as its colour domain
global_min_count = merged["count"].min()
global_max_count = merged["count"].max()

# One map per concept, in alphabetical order
unique_concepts = sorted(merged["Concept"].unique())
for conc in unique_concepts:
    choropleth_map = create_choropleth_map(
        merged,
        conc,
        range_min=global_min_count,
        range_max=global_max_count,
    )
    choropleth_map.display()

<a id='high-level-analysis-of-concepts-by-author-type-party-non-party'></a>
# Analysis of High-Level Concepts by Author Type (Party/Non-Party)

In [None]:
# Section blurb (plain string: nothing is interpolated)
high_level_overview = """This section provides an overview of the high-level concepts extracted (see Annex for definition of high-level), split by author type (Party/Non-Party). This includes the number of mentions of each concept, the number of documents mentioning each concept, and the number of authors mentioning each concept, as well as all of these expressed in percentages."""
display(Markdown(high_level_overview))

<a id='total-number-of-mentions'></a>
## Total Number of Mentions

In [None]:
display(df_mentions)

<a id='total-number-of-documents-with-mentions'></a>
## Total Number of Documents with Mentions

In [None]:
display(df_documents)

<a id='number-of-unique-authors-with-mentions'></a>
## Number of Unique Authors with Mentions

In [None]:
display(df_authors)

In [None]:
def plot_percentages_stacked(df):
    """
    Display a 100%-stacked bar chart of the Party / Non-Party split per concept.

    Parameters
    ----------
    df : pandas.DataFrame
        Indexed by "Concept" with the columns "Non-Party", "Party" and "Total"
        (e.g. df_mentions, df_documents or df_authors).

    The bar heights are the raw counts normalised per concept by Altair
    (``stack="normalize"``). The tooltip reports both the count and the share
    of the concept's total as a percentage.
    """
    long_form = df.reset_index().melt(
        id_vars=["Concept", "Total"],
        value_vars=["Non-Party", "Party"],
        var_name="Group",
        value_name="Count",
    )
    # Share of each group within its concept, in percent (shown in the tooltip)
    long_form["Percentage"] = long_form["Count"] / long_form["Total"] * 100

    bar_chart = (
        alt.Chart(long_form)
        .mark_bar()
        .encode(
            x=alt.X("Concept:N", title="Concept"),
            y=alt.Y(
                "Count:Q",
                title="Percentage",
                stack="normalize",
                axis=alt.Axis(format="%"),
            ),
            color=alt.Color(
                "Group:N", title="Group", scale=alt.Scale(scheme="tableau10")
            ),
            tooltip=[
                "Concept",
                "Group",
                "Count",
                alt.Tooltip("Percentage:Q", format=".1f"),
            ],
        )
    )

    display(bar_chart)

<a id='percentage-of-documents-with-mentions'></a>
## Percentage of Documents with Mentions

In [None]:
# Distinct documents mentioning each concept, per author type
docs_with_mentions = df_concepts_melted.groupby(["category", "Concept"])[
    "document_id"
].nunique()
unique_docs = docs_with_mentions.reset_index().set_index("category")

# Denominator: number of rows of df_unfccc per author type (aligned on category)
unique_docs_by_type = df_unfccc.groupby("category").size()
unique_docs["Total"] = unique_docs_by_type
unique_docs["Percentage"] = (
    unique_docs["document_id"].div(unique_docs["Total"]).mul(100)
)
unique_docs = unique_docs.reset_index()

# One small bar chart per concept, bars split by author type
alt.Chart(unique_docs).mark_bar().encode(
    x=alt.X("category:O"),
    y=alt.Y("Percentage:Q", scale=alt.Scale(domain=[0, 100])),
    color="category:N",
    column="Concept:N",
)

<a id='percentage-of-authors-mentioning-each-concept-by-party-non-party-stakeholders'></a>
## Percentage of Authors with Mentions


In [None]:
# Distinct authors mentioning each concept, per author type
authors_with_mentions = df_concepts_melted.groupby(["category", "Concept"])[
    "author"
].nunique()
unique_authors = authors_with_mentions.reset_index().set_index("category")

# NOTE(review): the denominator is the number of rows of df_unfccc per author
# type, not a count of distinct authors — confirm this is the intended base
# for an author percentage.
unique_docs_by_type = df_unfccc.groupby("category").size()
unique_authors["Total"] = unique_docs_by_type
unique_authors["Percentage"] = (
    unique_authors["author"].div(unique_authors["Total"]).mul(100)
)
unique_authors = unique_authors.reset_index()

In [None]:
# Of all the Party / Non-Party entries in the corpus, what percentage have
# documents containing mentions of each concept? The figures come from
# `unique_authors`; the x-axis labels are kept visible (labels=True).
chart = (
    alt.Chart(unique_authors)
    .mark_bar()
    .encode(
        x=alt.X("category:O", axis=alt.Axis(labels=True)),
        y=alt.Y("Percentage:Q", scale=alt.Scale(domain=[0, 100])),
        color="category:N",
        column="Concept:N",
    )
)
chart

<a id='granular-analysis-of-concepts'></a>
# Detailed Analysis of Concepts

In [None]:
# Section blurb (plain string: nothing is interpolated)
detailed_analysis_intro = """This section provides a detailed analysis of the concepts extracted (see Annex for full definition of detailed). Here, we do not split by
Party/Non-Party but instead look at more granular subcategories of the meta-concepts to give a sense of which subcategories are important, potentially facilitating better search."""
display(Markdown(detailed_analysis_intro))

In [None]:
# Group the spans once by (id, concept) and derive the three aggregates from it
_spans_by_id_and_concept = df_spans.rename(columns={"type": "concept"}).groupby(
    ["id", "concept"]
)

# Number of spans
total_concept_mentions = _spans_by_id_and_concept["document_id"].count().reset_index(
    name="count"
)
# Number of distinct documents
document_concept_mentions = (
    _spans_by_id_and_concept["document_id"].nunique().reset_index(name="count")
)
# Number of distinct authors
author_concept_mentions = _spans_by_id_and_concept["author"].nunique().reset_index(
    name="count"
)


def plot_concept_mentions(data, title):
    """
    Display a bar chart of "count" per "id", coloured by "concept".

    `data` needs the columns "id", "concept" and "count"; `title` is used as
    the y-axis title. Bars are sorted by descending count.
    """
    count_axis = alt.Y("count:Q", title=title)
    id_axis = alt.X(
        "id:N",
        sort="-y",
        axis=alt.Axis(title=None, labels=True),
        title="ID",
    )
    concept_colour = alt.Color(
        "concept:N",
        scale=alt.Scale(scheme="category10"),
        legend=alt.Legend(title="Concept"),
    )

    bars = alt.Chart(data).mark_bar()
    base_chart = bars.encode(
        y=count_axis,
        x=id_axis,
        color=concept_colour,
        tooltip=["count:Q"],
    ).properties(width=600, height=300)

    display(base_chart)

<a id='total-mentions'></a>
## Total Mentions

In [None]:
plot_concept_mentions(
    data=total_concept_mentions,
    title="Total mentions of each concept",
)

<a id='total-documents'></a>
## Total Documents with Mentions

In [None]:
plot_concept_mentions(
    data=document_concept_mentions,
    title="Number of documents mentioning each concept",
)

<a id='total-authors'></a>
## Total Authors with Mentions

In [None]:
plot_concept_mentions(
    data=author_concept_mentions,
    title="Number of authors mentioning each concept",
)

<a id='word-co-occurrences'></a>
# Frequent Word Combinations

This section displays the most common co-occurring words for all mentions of a concept. Co-occurrence is measured by counting the pairs and triples of adjacent words ("bigrams" and "trigrams") in the sentences that contain the extracted concepts, and showing the 10 most frequent of each. This is useful because it allows us to pinpoint related concepts that are not necessarily included in high-level categories, thus providing hints for where to search for new concepts.

In [None]:
# nltk, FreqDist, ngrams and word_tokenize are imported in the import cell at
# the top of the notebook; this cell only fetches the "punkt" tokenizer data.
# NOTE(review): recent NLTK releases may require the "punkt_tab" resource
# instead — confirm against the installed version.
nltk.download("punkt", quiet=True);

In [None]:
def extract_ngrams(df, concept, n, text_col="processed_sentence"):
    """
    Count the word n-grams in the sentences recorded for one concept.

    Rows of `df` whose "type" equals `concept` are selected; each value of
    `text_col` is tokenised with `word_tokenize` and split into n-grams of
    length `n`. N-grams containing any token that is not made up purely of
    word characters are skipped. Returns an nltk `FreqDist` keyed by n-gram
    tuples.
    """
    sentences = df.loc[df["type"] == concept, text_col]

    freq_dist = FreqDist()
    for sentence in sentences:
        for ngram in ngrams(word_tokenize(sentence), n):
            # Keep the n-gram only if every token consists of word characters
            if all(re.match(r"^\w+$", word) for word in ngram):
                freq_dist[ngram] += 1

    return freq_dist

In [None]:
concepts = df_spans["type"].unique()

# Set the number of top bigrams and trigrams to display
num_top_ngrams = 10

for conc in concepts:
    display(
        Markdown(
            f"## Top {num_top_ngrams} Bigrams and Trigrams in Sentences Relating to {conc.title()}\n"
        )
    )

    # Extract bigrams and trigrams for the given concept
    bigrams_freq = extract_ngrams(df_spans, conc, n=2, text_col="processed_sentence")
    trigrams_freq = extract_ngrams(df_spans, conc, n=3, text_col="processed_sentence")

    # Build the two rankings separately and place them side by side. A concept
    # can have fewer distinct trigrams than bigrams in its top N; concatenating
    # pads the shorter ranking with NaN instead of raising a length mismatch.
    top_bigrams = pd.DataFrame(
        bigrams_freq.most_common(num_top_ngrams), columns=["bigram", "bigram_count"]
    )
    top_trigrams = pd.DataFrame(
        trigrams_freq.most_common(num_top_ngrams),
        columns=["trigram", "trigram_count"],
    )
    ngrams_df = pd.concat([top_bigrams, top_trigrams], axis=1)

    display(Markdown("### Most common ngrams\n"))
    display(ngrams_df)

<a id='sample-data'></a>
# Sample Data For Mentions of Concepts

In [None]:
# Link to the full spans file for the current concept in the GitHub repository
spans_url = f"https://github.com/climatepolicyradar/global-stocktake/blob/main/concepts/{concept}/spans.csv"
spans_link_text = f"""To see the full sample of mentions of {concept}, please click [here]({spans_url}) for a download link."""
display(Markdown(spans_link_text))

In [None]:
# Linguistic input sheet with a two-row header, read from the concept folder
# instead of a machine-specific absolute path
infile = pd.read_excel(concepts_path / concept / "input.xlsx", header=[0, 1])

# Attach document metadata to the spans only if it is not there yet: merging
# the same mapping a second time would duplicate the columns with _x / _y
# suffixes.
if "category" not in df_spans.columns:
    # unique mapping between document_id and its name, category and author
    mapping = df_concepts_melted[
        ["document_id", "document_name_y", "category", "author"]
    ].drop_duplicates()
    df_spans = df_spans.merge(mapping, on=["document_id"], how="left")

In [None]:
# Tab-separated dump of the input sheet's cell values, without index or header rows
excel_string = infile.to_csv(sep="\t", index=False, header=False)

In [None]:
def _flatten_header(col):
    """Join a two-level column label with " - ", or keep only the top level when the second is an "Unnamed" placeholder."""
    if "Unnamed" in col[1]:
        return col[0]
    return " - ".join(col).strip()


# Collapse the two header rows of the input sheet into single-level column names
infile.columns = [_flatten_header(col) for col in infile.columns]

<a id='annex-methodology'></a>
# Annex: Methodology

In [None]:
# Methodology paragraph rendered as Markdown; the concept slug is interpolated
# into the text and into the link to the input file
display(
    Markdown(
        f"""
The documents within the Global Submissions Information Portal were searched for all mentions of concepts related to the meta-concept "{concept}". This involved automatically translating all documents into English using the Google Translate API before identifying as many potential linguistic expressions of the meta-concept to feed in as rules for a search process. The methodology is explained in detail [here](https://www.notion.so/climatepolicyradar/Concept-tracker-a879dfc5c2fd49159838af86cd5e8955) and the relevant linguistic input file is available for download [here](https://github.com/climatepolicyradar/global-stocktake/tree/main/concepts/{concept}/input.xlsx).
"""
    )
)