In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
import sys

# To make our relative library imports work
current_dir = os.getcwd()
parent_dir = os.path.dirname(current_dir)
sys.path.append(parent_dir)
grandparent_dir = os.path.dirname(parent_dir)
sys.path.append(grandparent_dir)

In [3]:
import pickle

import pandas as pd
import altair as alt
import networkx as nx
import numpy as np
import numpy as np

from dataclasses import dataclass
from urllib.parse import unquote
from tqdm import tqdm

# Local Imports
from Library.embedding_extraction import extract_embeddings
from Library.narrative_landscape import NarrativeLandscape
from Library.storyline import Storyline
from metrics import similarity_metric, dtw_metric, absolute_metrics

  from .autonotebook import tqdm as notebook_tqdm


Using device: cpu


In [4]:
@dataclass
class Config:
    # The filename for the source data
    foldername = "wikispeedia"
    # The column with the contents of the article
    data_column = "plain_text"
    # The "date" column. This column will be renamed to "date" later.
    date_column = None
    # The column used as title for visualization and summary
    summary_column = "title"


CONFIG = Config()

In [5]:
data = pd.read_csv(f"../../data/{CONFIG.foldername}/text_data.csv")

data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3928 entries, 0 to 3927
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   title       3928 non-null   object
 1   plain_text  3928 non-null   object
dtypes: object(2)
memory usage: 61.5+ KB


In [6]:
RAW_TEXT = data[CONFIG.data_column]

embeddings, _ = extract_embeddings(
    text=RAW_TEXT.tolist(),
    foldername=f"../../data/{CONFIG.foldername}",
    model_name="gpt4"  # OneOf[mpnet, gpt4]
)

File '../../data/wikispeedia/embed_data-gpt4.pickle' loaded successfully.


## Create Narrative Landscapes

In [7]:
links = pd.read_csv(
    "../../data/Wikispeedia/original/links.tsv",
    sep="\t",
    names=["src", "tgt"]
)

links["src"] = links["src"].map(lambda x: unquote(x))
links["tgt"] = links["tgt"].map(lambda x: unquote(x))

links = links[(links["src"].isin(data["title"])) & (links["tgt"].isin(data["title"]))]

ordered_data_titles = data["title"].tolist()

src_indices = links["src"].map(lambda x: ordered_data_titles.index(x))
tgt_indices = links["tgt"].map(lambda x: ordered_data_titles.index(x))

links_graph = nx.from_edgelist(list(zip(src_indices.tolist(), tgt_indices.tolist())))
links_constraints = nx.adjacency_matrix(links_graph, nodelist=range(len(ordered_data_titles))).todense()

### Regular Narrative Landscape

In [8]:
landscape = NarrativeLandscape(verbose=True)
landscape.fit(embeddings.numpy(), constraints=links_constraints)

Step 1/4: Constructing Projection Space with UMAP
Step 2/4: Discovering topics with HDBSCAN
	>>> Discovered 87 Topics
Step 3/4: Computing Mean K-NN Dist
Step 4/4: Constructing Coherence Graph
	 >>> Computing base coherence
	 >>> Computing sparse coherence
		>>> Creating Undirected Graph
		>>> Finding Maximum Spanning Tree
		>>> Getting Min Weight
		----- BEFORE MST -----
		Critical Coherence: 0.5197929231231193
		Num Edges: 7712628
		Is Connected: True
		----- AFTER MST -----
		Num Edges: 3479213
		Is Connected: True
		----- AFTER Constraints -----
		Num Edges: 64117
		Is Connected: False
	 >>> Building NetworkX graph


### Narrative Landscape With Closeness Centrality Info

In [9]:
node_closeness = nx.closeness_centrality(links_graph)
node_closeness = np.array(list(node_closeness.values()))

landscape_closeness = NarrativeLandscape(verbose=True)
landscape_closeness.fit(embeddings.numpy(), constraints=links_constraints, node_ranks=node_closeness)

Step 1/4: Constructing Projection Space with UMAP
Step 2/4: Discovering topics with HDBSCAN
	>>> Discovered 87 Topics
Step 3/4: Computing Mean K-NN Dist
Step 4/4: Constructing Coherence Graph
	 >>> Computing base coherence
	 >>> Computing sparse coherence
		>>> Creating Undirected Graph
		>>> Finding Maximum Spanning Tree
		>>> Getting Min Weight
		----- BEFORE MST -----
		Critical Coherence: 0.20110551007990723
		Num Edges: 7712628
		Is Connected: True
		----- AFTER MST -----
		Num Edges: 4259941
		Is Connected: True
		----- AFTER Constraints -----
		Num Edges: 68250
		Is Connected: False
	 >>> Building NetworkX graph


## Narrative Extraction


In [10]:
ground_truths = pd.read_csv(f"../../data/{CONFIG.foldername}/ground_truth_paths.csv")

# Remove paths with duplicated documents (within the same path)
ground_truths = ground_truths[ground_truths["path"].map(
    lambda x: not pd.Series(x.split(";")).duplicated().any()
)]

In [11]:
# This can take hours to finish, so we save the extracted narratives to a pickle file
try:
    with open('./extracted_narratives.pkl', 'rb') as handle:
        all_narratives = pickle.load(handle)
except FileNotFoundError:
    all_narratives = dict()
    for idx, true_storyline in tqdm(enumerate(ground_truths["path"]), total=len(ground_truths)):
        true_storyline = true_storyline.split(";")
        s = ordered_data_titles.index(unquote(true_storyline[0]))
        t = ordered_data_titles.index(unquote(true_storyline[-1]))

        if nx.has_path(landscape.nx_graph, s, t) and nx.has_path(landscape_closeness.nx_graph, s, t):
            # Shortest path
            sp = []
            shortest_simple_paths = nx.shortest_simple_paths(landscape.nx_graph, s, t)
            for _ in range(3):
                sp_path = next(shortest_simple_paths, None)
                if sp_path is not None:
                    sp.append(sp_path)

            # Regular narrative landscape
            nt = []
            paths_nodes = []
            for _ in range(3):
                trail, _ = landscape.extract_narrative(s, t, hidden_nodes=paths_nodes)
                if trail:
                    nt.append(trail)
                    paths_nodes.extend(trail[1:-1])

            # Narrative landscape with node closeness information
            nt_cc = []
            paths_nodes = []
            for _ in range(3):
                trail, _ = landscape_closeness.extract_narrative(s, t, hidden_nodes=paths_nodes)
                if trail:
                    nt_cc.append(trail)
                    paths_nodes.extend(trail[1:-1])

            all_narratives[(s, t, idx)] = {
                "random_points": [
                    [s, *np.random.randint(0, len(data), np.random.randint(5, 18)), t],
                    [s, *np.random.randint(0, len(data), np.random.randint(5, 18)), t],
                    [s, *np.random.randint(0, len(data), np.random.randint(5, 18)), t]
                ],
                "shortest_path": sp,
                "narrative_trails": nt,
                "narrative_trails_closeness": nt_cc,
            }
    with open('./extracted_narratives.pkl', 'wb') as handle:
        pickle.dump(all_narratives, handle, protocol=pickle.HIGHEST_PROTOCOL)

## Evaluation

In [12]:
abs_comparison_counts = {"@1": 0, "@2": 0, "@3": 0}


def compute_absolute_metrics(kind):
    results = []  # (method, k, value)

    # Note that for metrics on the narratives with closeness centrality information,
    # we still use the original base coherence to keep all results in the same scale.
    for (_, _, story_idx), narratives in tqdm(all_narratives.items()):
        true_storyline = ground_truths["path"].iloc[story_idx].split(";")
        true_storyline = [ordered_data_titles.index(unquote(x)) for x in true_storyline]

        rp = narratives[f"random_points"]
        sp = narratives[f"shortest_path"]
        nt = narratives[f"narrative_trails"]
        cc = narratives[f"narrative_trails_closeness"]
        
        wikispeedia = absolute_metrics(true_storyline, landscape, kind)
        res_rp_k1 = absolute_metrics(rp[0], landscape, kind)
        res_sp_k1 = absolute_metrics(sp[0], landscape, kind)
        res_nt_k1 = absolute_metrics(nt[0], landscape, kind)
        res_rt_k1 = absolute_metrics(Storyline(landscape, nt[0]).reduce_redundancy(), landscape, kind)
        res_cc_k1 = absolute_metrics(cc[0], landscape, kind)
        res_rc_k1 = absolute_metrics(Storyline(landscape, cc[0]).reduce_redundancy(), landscape, kind)

        results.append(("Wikispeedia", 1, wikispeedia))
        results.append(("Random Points", 1, res_rp_k1))
        results.append(("Shortest Path", 1, res_sp_k1))
        results.append(("Narrative Trails", 1, res_nt_k1))
        results.append(("Reduced Trails", 1, res_rt_k1))
        results.append(("Narrative Trails (CC)", 1, res_cc_k1))
        results.append(("Reduced Trails (CC)", 1, res_rc_k1))

        abs_comparison_counts["@1"] += 1

        if len(rp) >= 2 and len(sp) >= 2 and len(nt) >= 2 and len(cc) >= 2:
            res_rp_k2 = absolute_metrics(rp[1], landscape, kind)
            res_sp_k2 = absolute_metrics(sp[1], landscape, kind)
            res_nt_k2 = absolute_metrics(nt[1], landscape, kind)
            res_rt_k2 = absolute_metrics(Storyline(landscape, nt[1]).reduce_redundancy(), landscape, kind)
            res_cc_k2 = absolute_metrics(cc[1], landscape, kind)
            res_rc_k2 = absolute_metrics(Storyline(landscape, cc[1]).reduce_redundancy(), landscape, kind)

            results.append(("Random Points", 2, sum((res_rp_k1, res_rp_k2)) / 2))
            results.append(("Shortest Path", 2, sum((res_sp_k1, res_sp_k2)) / 2))
            results.append(("Narrative Trails", 2, sum((res_nt_k1, res_nt_k2)) / 2))
            results.append(("Reduced Trails", 2, sum((res_rt_k1, res_rt_k2)) / 2))
            results.append(("Narrative Trails (CC)", 2, sum((res_cc_k1, res_cc_k2)) / 2))
            results.append(("Reduced Trails (CC)", 2, sum((res_rc_k1, res_rc_k2)) / 2))

            abs_comparison_counts["@2"] += 1

        if len(rp) == 3 and len(sp) == 3 and len(nt) == 3 and len(cc) == 3:
            res_rp_k3 = absolute_metrics(rp[2], landscape, kind)
            res_sp_k3 = absolute_metrics(sp[2], landscape, kind)
            res_nt_k3 = absolute_metrics(nt[2], landscape, kind)
            res_rt_k3 = absolute_metrics(Storyline(landscape, nt[2]).reduce_redundancy(), landscape, kind)
            res_cc_k3 = absolute_metrics(cc[2], landscape, kind)
            res_rc_k3 = absolute_metrics(Storyline(landscape, cc[2]).reduce_redundancy(), landscape, kind)

            results.append(("Random Points", 3, sum((res_rp_k1, res_rp_k2, res_rp_k3)) / 3))
            results.append(("Shortest Path", 3, sum((res_sp_k1, res_sp_k2, res_sp_k3)) / 3))
            results.append(("Narrative Trails", 3, sum((res_nt_k1, res_nt_k2, res_nt_k3)) / 3))
            results.append(("Reduced Trails", 3, sum((res_rt_k1, res_rt_k2, res_rt_k3)) / 3))
            results.append(("Narrative Trails (CC)", 3, sum((res_cc_k1, res_cc_k2, res_cc_k3)) / 3))
            results.append(("Reduced Trails (CC)", 3, sum((res_rc_k1, res_rc_k2, res_rc_k3)) / 3))

            abs_comparison_counts["@3"] += 1

    return results

In [13]:
gt_comparison_counts = {"@1": 0, "@2": 0, "@3": 0}


def compute_metric_with_ground_truth(metric, low_emb):
    results = []  # (method, k, value)

    for (_, _, story_idx), narratives in tqdm(all_narratives.items()):
        true_storyline = ground_truths["path"].iloc[story_idx].split(";")
        true_storyline = [ordered_data_titles.index(unquote(x)) for x in true_storyline]

        rp = narratives[f"random_points"]
        sp = narratives[f"shortest_path"]
        nt = narratives[f"narrative_trails"]
        cc = narratives[f"narrative_trails_closeness"]

        res_rp_k1 = metric(true_storyline, rp[0], low_emb)
        res_sp_k1 = metric(true_storyline, sp[0], low_emb)
        res_nt_k1 = metric(true_storyline, nt[0], low_emb)
        nt_st_line = Storyline(landscape, nt[0])
        res_rt_k1 = metric(true_storyline, nt_st_line.reduce_redundancy(), low_emb)
        res_cc_k1 = metric(true_storyline, cc[0], low_emb)
        nt_st_line = Storyline(landscape_closeness, cc[0])
        res_rc_k1 = metric(true_storyline, nt_st_line.reduce_redundancy(), low_emb)

        results.append(("Random Points", 1, res_rp_k1))
        results.append(("Shortest Path", 1, res_sp_k1))
        results.append(("Narrative Trails", 1, res_nt_k1))
        results.append(("Reduced Trails", 1, res_rt_k1))
        results.append(("Narrative Trails (CC)", 1, res_cc_k1))
        results.append(("Reduced Trails (CC)", 1, res_rc_k1))

        gt_comparison_counts["@1"] += 1

        if len(rp) >= 2 and len(sp) >= 2 and len(nt) >= 2 and len(cc) >= 2:
            res_rp_k2 = metric(true_storyline, rp[1], low_emb)
            res_sp_k2 = metric(true_storyline, sp[1], low_emb)
            res_nt_k2 = metric(true_storyline, nt[1], low_emb)
            nt_st_line = Storyline(landscape, nt[1])
            res_rt_k2 = metric(true_storyline, nt_st_line.reduce_redundancy(), low_emb)
            res_cc_k2 = metric(true_storyline, cc[1], low_emb)
            rc_st_line = Storyline(landscape_closeness, cc[1])
            res_rc_k2 = metric(true_storyline, rc_st_line.reduce_redundancy(), low_emb)

            results.append(("Random Points", 2, sum((res_rp_k1, res_rp_k2)) / 2))
            results.append(("Shortest Path", 2, sum((res_sp_k1, res_sp_k2)) / 2))
            results.append(("Narrative Trails", 2, sum((res_nt_k1, res_nt_k2)) / 2))
            results.append(("Reduced Trails", 2, sum((res_rt_k1, res_rt_k2)) / 2))
            results.append(("Narrative Trails (CC)", 2, sum((res_cc_k1, res_cc_k2)) / 2))
            results.append(("Reduced Trails (CC)", 2, sum((res_rc_k1, res_rc_k2)) / 2))

            gt_comparison_counts["@2"] += 1

        if len(rp) == 3 and len(sp) == 3 and len(nt) == 3 and len(cc) == 3:
            res_rp_k3 = metric(true_storyline, rp[2], low_emb)
            res_sp_k3 = metric(true_storyline, sp[2], low_emb)
            res_nt_k3 = metric(true_storyline, nt[2], low_emb)
            nt_st_line = Storyline(landscape, nt[2])
            res_rt_k3 = metric(true_storyline, nt_st_line.reduce_redundancy(), low_emb)
            res_cc_k3 = metric(true_storyline, cc[2], low_emb)
            rc_st_line = Storyline(landscape_closeness, cc[2])
            res_rc_k3 = metric(true_storyline, rc_st_line.reduce_redundancy(), low_emb)

            results.append(("Random Points", 3, sum((res_rp_k1, res_rp_k2, res_rp_k3)) / 3))
            results.append(("Shortest Path", 3, sum((res_sp_k1, res_sp_k2, res_sp_k3)) / 3))
            results.append(("Narrative Trails", 3, sum((res_nt_k1, res_nt_k2, res_nt_k3)) / 3))
            results.append(("Reduced Trails", 3, sum((res_rt_k1, res_rt_k2, res_rt_k3)) / 3))
            results.append(("Narrative Trails (CC)", 3, sum((res_cc_k1, res_cc_k2, res_cc_k3)) / 3))
            results.append(("Reduced Trails (CC)", 3, sum((res_rc_k1, res_rc_k2, res_rc_k3)) / 3))

            gt_comparison_counts["@3"] += 1

    return results

In [14]:
df_rel = pd.DataFrame(compute_absolute_metrics("reliability"), columns=["Method", "k", "Rel"])
df_rel = df_rel.groupby(by=["Method", "k"]).mean()

df_coh = pd.DataFrame(compute_absolute_metrics("min_coherence"), columns=["Method", "k", "Coh"])
df_coh = df_coh.groupby(by=["Method", "k"]).mean()

gt_sim_res = compute_metric_with_ground_truth(similarity_metric, landscape.low_dim_embeds)
df_sim = pd.DataFrame(gt_sim_res, columns=["Method", "k", "Sim"]).groupby(by=["Method", "k"]).mean()

gt_dist_res = compute_metric_with_ground_truth(dtw_metric, landscape.low_dim_embeds)
df_dtw = pd.DataFrame(gt_dist_res, columns=["Method", "k", "DTW"]).groupby(by=["Method", "k"]).mean()

100%|██████████| 10607/10607 [00:03<00:00, 2883.82it/s]
100%|██████████| 10607/10607 [00:03<00:00, 3097.65it/s]
100%|██████████| 10607/10607 [00:40<00:00, 264.41it/s]
100%|██████████| 10607/10607 [00:23<00:00, 460.88it/s]


In [15]:
print(len(all_narratives.items()), "of", len(ground_truths))
print(gt_comparison_counts) # Divide by 2
print(abs_comparison_counts) # Divide by 2

10607 of 10832
{'@1': 21214, '@2': 19172, '@3': 16770}
{'@1': 21214, '@2': 19172, '@3': 16770}


In [16]:
# Reset the index
df = pd.concat([df_rel, df_coh, df_dtw, df_sim], axis=1)
df = df.reset_index().round(3)

# Set the new index
df = df.set_index(['Method'])

# Melt the DataFrame
df_melted = df.reset_index().melt(
    id_vars=['Method', 'k'],
    value_vars=['Rel', 'Coh', 'Sim', 'DTW'],
    var_name='metric',
    value_name='value'
)

# Pivot the DataFrame
df_pivot = df_melted.pivot_table(
    index=['Method'],
    columns=['metric', 'k'],
    values='value'
)

# Flatten the MultiIndex columns
df_pivot.columns = [
    f"{metric}_{k}" for metric, k in df_pivot.columns
]

df_pivot = df_pivot[["Coh_1", "Coh_2", "Coh_3", "Rel_1", "Rel_2", "Rel_3", "Sim_1", "Sim_2", "Sim_3", "DTW_1", "DTW_2", "DTW_3"]]
df_pivot = df_pivot.loc[[
    "Wikispeedia",
    "Random Points",
    "Shortest Path",
    "Narrative Trails",
    "Reduced Trails",
    "Narrative Trails (CC)",
    "Reduced Trails (CC)",
]]

df_pivot.to_csv("benchmark_results.csv")
df_pivot

Unnamed: 0_level_0,Coh_1,Coh_2,Coh_3,Rel_1,Rel_2,Rel_3,Sim_1,Sim_2,Sim_3,DTW_1,DTW_2,DTW_3
Method,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
Wikispeedia,0.419,,,0.609,,,,,,,,
Random Points,0.32,0.321,0.322,0.454,0.455,0.456,0.347,0.347,0.347,2.2,2.201,2.2
Shortest Path,0.558,0.56,0.563,0.614,0.615,0.62,0.742,0.742,0.746,0.967,0.978,0.971
Narrative Trails,0.709,0.704,0.704,0.776,0.769,0.767,0.788,0.785,0.787,1.029,1.049,1.063
Reduced Trails,0.668,0.667,0.669,0.76,0.756,0.754,0.768,0.768,0.771,1.056,1.077,1.088
Narrative Trails (CC),0.64,0.631,0.63,0.753,0.748,0.746,0.777,0.778,0.766,1.029,1.049,1.093
Reduced Trails (CC),0.63,0.625,0.624,0.736,0.734,0.734,0.758,0.76,0.75,1.069,1.082,1.119


## Example

In [17]:
story_idx = pd.Series(all_narratives.keys()).sample(1, random_state=0).values[0]
gt = [ordered_data_titles.index(unquote(x)) for x in ground_truths["path"].iloc[story_idx[-1]].split(";")]

print("Reliability:", Storyline(landscape, gt).reliability())
print("Coherences:", Storyline(landscape, gt).path_base_coherence())

data.iloc[gt]["title"]

Reliability: 0.6141220377905909
Coherences: [0.67154048 0.66929064 0.63787183 0.58873993 0.55455038 0.57311691]


913     Cornell_University
3668         United_States
926                Country
1101                 Earth
3254         South_America
286              Argentina
624           Buenos_Aires
Name: title, dtype: object

In [18]:
sp = all_narratives[story_idx]["shortest_path"][0]
print("Reliability:", Storyline(landscape, sp).reliability())
print("Coherences:", Storyline(landscape, sp).path_base_coherence())

data.iloc[sp]["title"]

Reliability: 0.666284836733653
Coherences: [0.67469644 0.65797811]


913    Cornell_University
464               Beijing
624          Buenos_Aires
Name: title, dtype: object

In [19]:
nt = all_narratives[story_idx]["narrative_trails"][0]

print("Reliability:", Storyline(landscape, nt).reliability())
print("Coherences:", Storyline(landscape, nt).path_base_coherence())

data.iloc[nt]["title"]

Reliability: 0.7292108849616922
Coherences: [0.70087677 0.72335243 0.72531052 0.76853098 0.71967434 0.7322392
 0.71036678 0.7557472 ]


913       Cornell_University
665                   Canada
17           1973_oil_crisis
268    Arab-Israeli_conflict
269              Arab_League
116            African_Union
86                     Abuja
586                   Brazil
624             Buenos_Aires
Name: title, dtype: object

In [20]:
nt_reduced = Storyline(landscape, nt).reduce_redundancy()

print("Reliability:", Storyline(landscape, nt_reduced).reliability())
print("Coherences:", Storyline(landscape, nt_reduced).path_base_coherence())

data.iloc[nt_reduced]["title"]

Reliability: 0.7292108849616922
Coherences: [0.70087677 0.72335243 0.72531052 0.76853098 0.71967434 0.7322392
 0.71036678 0.7557472 ]


913       Cornell_University
665                   Canada
17           1973_oil_crisis
268    Arab-Israeli_conflict
269              Arab_League
116            African_Union
86                     Abuja
586                   Brazil
624             Buenos_Aires
Name: title, dtype: object

In [21]:
# Create DataFrame
df = pd.DataFrame(data.iloc[nt_reduced]["title"])
df['index'] = range(len(df))

# Base chart: Line
line = alt.Chart(df).mark_line(color='blue').encode(
    x=alt.X('index', axis=None),
    y=alt.value(0)
)

# Points
points = alt.Chart(df).mark_point(color='blue').encode(
    x='index',
    y=alt.value(0)
)

# Text labels
text = alt.Chart(df).mark_text(
    align='left',
    baseline='bottom',
    dx=2,
    dy=-5,
    angle=315
).encode(
    x='index',
    y=alt.value(0),
    text='title'
)

# Combine charts
chart = (line + points + text).configure_view(
    strokeWidth=0
).properties(
    width=300,
    height=50
)

# Display the chart
chart