# Experiment: visualising concept co-occurrence frequency

The result of this was a [chord diagram](https://public.flourish.studio/visualisation/13986187/).

<img src="chord-diagram.png" width="600">



In [None]:
import sys

!{sys.executable} -m pip install seaborn

In [34]:
import sys

sys.path.append("../..")
import os

from pathlib import Path
import itertools
from collections import Counter

import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt

from src.opensearch.index_data import get_dataset_and_filter_values

In [31]:
cpr_dataset, _ = get_dataset_and_filter_values(
    Path(os.environ["DOCS_DIR_GST"]),
    Path(
        "/Users/kalyan/Documents/CPR/unfccc-global-stocktake-documents/CPR_UNFCCC_MASTER.csv"
    ),
    Path("../concepts").absolute(),
    limit=None,
)

INFO:src.opensearch.index_data:Loading scraper CSV
INFO:src.opensearch.index_data:Loading dataset of parsed documents
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1697/1697 [01:41<00:00, 16.72it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1679/1679 [03:18<00:00,  8.44it/s]
INFO:src.opensearch.index_data:Loaded 1679 documents. 0 documents failed to load.
INFO:src.opensearch.index_data:Loading spans from concepts directory
INFO:src.opensearch.index_data:Adding spans to dataset
1558docs [01:24, 18.39docs/s]


In [32]:
# Create list of pairs of co-occurring concepts

concept_pairs = []

for doc in cpr_dataset.documents:
    if doc.text_blocks is None:
        continue

    for block in doc.text_blocks:
        # We only want blocks with 2 or more spans - looking for co-occurrences
        if len(block.spans) < 2:
            continue

        span_types = [span.type for span in block.spans]

        concept_pairs.extend(list(itertools.combinations(span_types, 2)))

len(concept_pairs)

6247984

In [33]:
df = pd.DataFrame(concept_pairs, columns=["row", "col"])

# Pivot the DataFrame to create a flat table
pivot_df = pd.pivot_table(df, index="row", columns="col", aggfunc=len, fill_value=0)

pivot_df.to_csv("./big-cooccurrence.csv")

In [40]:
sorted_data = [tuple(sorted(pair)) for pair in concept_pairs]

variable1 = [pair[0] for pair in pairwise_counts.keys()]
variable2 = [pair[1] for pair in pairwise_counts.keys()]
count = list(pairwise_counts.values())

# Create a DataFrame from the pairwise counts
df = pd.DataFrame(
    {
        "Concept 1": variable1 + variable2,
        "Concept 2": variable2 + variable1,
        "Count": count * 2,
    }
)

df.to_csv("./big-cooccurrence-long.csv")

In [28]:
# Create the heatmap using Seaborn
plt.figure(figsize=(8, 6))
sns.heatmap(
    pivot_df,
    cmap="Blues",
    annot=False,
    fmt="d",
    cbar=True,
)
plt.title("Heatmap of Co-occurrences")
plt.show()

FigureWidget({
    'data': [{'colorscale': [[0.0, 'rgb(247,251,255)'], [0.125,
                             'rgb(222,235,247)'], [0.25, 'rgb(198,219,239)'],
                             [0.375, 'rgb(158,202,225)'], [0.5,
                             'rgb(107,174,214)'], [0.625, 'rgb(66,146,198)'],
                             [0.75, 'rgb(33,113,181)'], [0.875, 'rgb(8,81,156)'],
                             [1.0, 'rgb(8,48,107)']],
              'type': 'heatmap',
              'uid': 'c22a371d-42fe-4357-aacf-81ab9e53b25a',
              'x': array(['Adaptation – Adaptation', 'Barriers And Challenges – Barriers',
                          'Barriers And Challenges – Challenges',
                          'Barriers And Challenges – Lessons Learned',
                          'Capacity Building – Capacity-Building',
                          'Climate Related Hazards – Environmental Degradatation',
                          'Climate Related Hazards – Extreme Weather',
                      

In [60]:
categories = list(set([i.split("–")[0].strip() for i in variable1]))
# https://gist.github.com/ollieglass/f6ddd781eeae1d24e391265432297538
kelly_colors = [
    "#F3C300",
    "#875692",
    "#F38400",
    "#A1CAF1",
    "#BE0032",
    "#C2B280",
    "#848482",
    "#008856",
    "#E68FAC",
    "#0067A5",
    "#F99379",
    "#604E97",
    "#F6A600",
    "#B3446C",
    "#DCD300",
    "#882D17",
    "#8DB600",
    "#654522",
    "#E25822",
    "#2B3D26",
]


colourmap = {categories[i]: kelly_colors[i] for i in range(len(categories))}

In [61]:
# Colourmap for chord diagram in Flourish UI

for i in variable1:
    print(i + ": " + colourmap[i.split("–")[0].strip()])

Barriers And Challenges – Challenges: #BE0032
Barriers And Challenges – Challenges: #BE0032
Barriers And Challenges – Challenges: #BE0032
Greenhouse Gases – Greenhouse Gases: #604E97
Deforestation – Forests: #C2B280
Deforestation – Deforestation: #C2B280
Policy Instruments – Governance: #0067A5
Deforestation – Rainforests: #C2B280
Sectors – Agriculture, Forestry And Other Land Use: #E68FAC
Barriers And Challenges – Challenges: #BE0032
Deforestation – Forests: #C2B280
Deforestation – Deforestation: #C2B280
Deforestation – Forests: #C2B280
Deforestation – Forests: #C2B280
Deforestation – Forests: #C2B280
Barriers And Challenges – Challenges: #BE0032
Deforestation – Deforestation: #C2B280
Deforestation – Deforestation: #C2B280
Deforestation – Deforestation: #C2B280
Barriers And Challenges – Challenges: #BE0032
Deforestation – Rainforests: #C2B280
Policy Instruments – Governance: #0067A5
Barriers And Challenges – Challenges: #BE0032
Deforestation – Rainforests: #C2B280
Barriers And Challen