In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import os
import sys

# Make the relative library imports resolvable: expose both the parent and the
# grandparent of the current working directory on the import path.
current_dir = os.getcwd()
parent_dir, grandparent_dir = (
    os.path.dirname(current_dir),
    os.path.dirname(os.path.dirname(current_dir)),
)
sys.path.extend([parent_dir, grandparent_dir])

In [3]:
import pickle

import pandas as pd
import altair as alt
import networkx as nx
import matplotlib.pyplot as plt
import numpy as np

from dataclasses import dataclass
from time import time
from urllib.parse import unquote
from tqdm import tqdm

# Local Imports
from Library.embedding_extraction import extract_embeddings
from Library.helper import plot_embedding_space, plot_highlighted_nodes, plot_topk_tfidf
from Library.narrative_landscape import NarrativeLandscape
from Library.storyline import Storyline
from metrics import similarity_metric, dtw_metric, absolute_metrics

  from .autonotebook import tqdm as notebook_tqdm


Using device: cpu


In [4]:
# Seed NumPy's global RNG so the `np.random.*` draws made later in this
# notebook are reproducible. `np.random.seed` must be *called*: assigning to it
# replaces the seeding function with an integer and leaves the RNG unseeded.
np.random.seed(420)

## Load the Datasets

In [5]:
# --- NewsData: dates are stored as month/day/two-digit-year strings ---
data_news_articles = pd.read_csv("../../data/NewsData/text_data.csv")
data_news_articles["date"] = pd.to_datetime(data_news_articles["date"], format="%m/%d/%y")

# --- VisPubData: parse the publication year, expose it as "date", and keep
# only the rows that have an abstract (re-indexed from zero) ---
data_vis_pub = pd.read_csv("../../data/VisPubData/text_data.csv")
data_vis_pub = (
    data_vis_pub
    .assign(Year=pd.to_datetime(data_vis_pub["Year"], format="%Y"))
    .rename(columns={"Year": "date"})
    .dropna(subset=["Abstract"])
    .reset_index(drop=True)
)

# --- AMiner: parse the dates and replace the stored index ---
data_aminer = pd.read_feather("../../data/AMiner/aminer-subset.feather")
data_aminer["date"] = pd.to_datetime(data_aminer["date"])
# The original indices are incorrect, so a fresh positional index is assigned.
data_aminer = data_aminer.reset_index(drop=True)

### Load the embeddings

In [6]:
# Text handed to the embedding step for each corpus: the full article text for
# NewsData, and "title;abstract" for the two publication datasets.
news_texts = data_news_articles["full_text"].tolist()
vis_pub_texts = (data_vis_pub["Title"] + ";" + data_vis_pub["Abstract"]).tolist()
aminer_texts = (data_aminer["title"] + ";" + data_aminer["abstract"]).tolist()

# extract_embeddings returns a pair; only its first element (the embeddings)
# is kept for each dataset.

# Embeddings for NewsData
news_data_embeds, _ = extract_embeddings(
    text=news_texts, foldername="../../data/NewsData", model_name="gpt4"
)

# Embeddings for VisPubData
vis_pub_embeds, _ = extract_embeddings(
    text=vis_pub_texts, foldername="../../data/VisPubData", model_name="gpt4"
)

# Embeddings for AMiner
aminer_embeds, _ = extract_embeddings(
    text=aminer_texts, foldername="../../data/AMiner", model_name="gpt4"
)

File '../../data/NewsData/embed_data-gpt4.pickle' loaded successfully.
File '../../data/VisPubData/embed_data-gpt4.pickle' loaded successfully.
File '../../data/AMiner/embed_data-gpt4.pickle' loaded successfully.


### Sample Subsets and Match Embeddings

In [7]:
# Sampling has to happen after both the datasets and the embeddings are loaded:
# the sampled rows' index labels are what select the matching embedding rows.

# VisPubData: draw 840 rows, order them by date, align the embeddings, then
# keep the pre-sampling row label in an "idx" column.
vis_pub_sample = data_vis_pub.sample(840, replace=False, random_state=420).sort_values(by="date")
vis_pub_embeds = vis_pub_embeds[vis_pub_sample.index]
data_vis_pub = vis_pub_sample.reset_index(names="idx")

# AMiner: same procedure with 1140 rows.
aminer_sample = data_aminer.sample(1140, replace=False, random_state=420).sort_values(by="date")
aminer_embeds = aminer_embeds[aminer_sample.index]
data_aminer = aminer_sample.reset_index(names="idx")

In [8]:
# Fit the narrative landscape for the news corpus from its embeddings and
# article dates (verbose progress output, date constraint enabled).
landscape_news_data = NarrativeLandscape(impose_date_constraint=True, verbose=True)
news_dates = data_news_articles["date"].values
landscape_news_data.fit(news_data_embeds.numpy(), dates=news_dates)

Step 1/4: Constructing Projection Space with UMAP
Step 2/4: Discovering topics with HDBSCAN
	>>> Discovered 16 Topics
Step 3/4: Computing Mean K-NN Dist
Step 4/4: Constructing Coherence Graph
	 >>> Computing base coherence
	 >>> Computing sparse coherence
		>>> Creating Undirected Graph
		>>> Finding Maximum Spanning Tree
		>>> Getting Min Weight
		----- BEFORE MST -----
		Critical Coherence: 0.45204710055058356
		Num Edges: 145530
		Is Connected: True
		----- AFTER MST -----
		Num Edges: 75388
		Is Connected: True
		----- AFTER Constraints -----
		Num Edges: 75388
		Is Connected: True
	 >>> Building NetworkX graph


In [9]:
# Fit the narrative landscape for the sampled VisPubData subset from its
# embeddings and dates (verbose progress output, date constraint enabled).
landscape_vis_pub = NarrativeLandscape(impose_date_constraint=True, verbose=True)
vis_pub_dates = data_vis_pub["date"].values
landscape_vis_pub.fit(vis_pub_embeds.numpy(), dates=vis_pub_dates)

Step 1/4: Constructing Projection Space with UMAP
Step 2/4: Discovering topics with HDBSCAN
	>>> Discovered 5 Topics
Step 3/4: Computing Mean K-NN Dist
Step 4/4: Constructing Coherence Graph
	 >>> Computing base coherence
	 >>> Computing sparse coherence
		>>> Creating Undirected Graph
		>>> Finding Maximum Spanning Tree
		>>> Getting Min Weight
		----- BEFORE MST -----
		Critical Coherence: 0.7323444684199538
		Num Edges: 352380
		Is Connected: True
		----- AFTER MST -----
		Num Edges: 65114
		Is Connected: True
		----- AFTER Constraints -----
		Num Edges: 65114
		Is Connected: True
	 >>> Building NetworkX graph


In [10]:
# Fit the narrative landscape for the sampled AMiner subset from its
# embeddings and dates (verbose progress output, date constraint enabled).
landscape_aminer = NarrativeLandscape(impose_date_constraint=True, verbose=True)
aminer_dates = data_aminer["date"].values
landscape_aminer.fit(aminer_embeds.numpy(), dates=aminer_dates)

Step 1/4: Constructing Projection Space with UMAP
Step 2/4: Discovering topics with HDBSCAN
	>>> Discovered 18 Topics
Step 3/4: Computing Mean K-NN Dist
Step 4/4: Constructing Coherence Graph
	 >>> Computing base coherence
	 >>> Computing sparse coherence
		>>> Creating Undirected Graph
		>>> Finding Maximum Spanning Tree
		>>> Getting Min Weight
		----- BEFORE MST -----
		Critical Coherence: 0.6010767643379847
		Num Edges: 649230
		Is Connected: True
		----- AFTER MST -----
		Num Edges: 279249
		Is Connected: True
		----- AFTER Constraints -----
		Num Edges: 279249
		Is Connected: True
	 >>> Building NetworkX graph


In [11]:
# NOTE(review): disabled code. It would draw 50 random source indices from the
# first half of each dataset and 50 random target indices from the second half,
# leaving a 50-row gap on each side of the midpoint. Presumably superseded by
# the src/tgt pairs stored in the ground-truth files loaded below; confirm
# before removing.
# news_data_src = np.random.choice(data_news_articles.index[:(len(data_news_articles) // 2) - 50], 50)
# news_data_tgt = np.random.choice(data_news_articles.index[(len(data_news_articles) // 2) + 50:], 50)

# vis_pub_src = np.random.choice(data_vis_pub.index[:(len(data_vis_pub) // 2) - 50], 50)
# vis_pub_tgt = np.random.choice(data_vis_pub.index[(len(data_vis_pub) // 2) + 50:], 50)

# aminer_src = np.random.choice(data_aminer.index[:(len(data_aminer) // 2) - 50], 50)
# aminer_tgt = np.random.choice(data_aminer.index[(len(data_aminer) // 2) + 50:], 50)

## Narrative Extraction


In [12]:
# Load the stored Narrative Maps results for each dataset and tag every row
# with the dataset it belongs to, then stack the three tables.
# NOTE: unpickling can execute arbitrary code; only load trusted result files.
news_data_ground_truths = pd.read_pickle(
    "./narrative_maps/results/news_data.pkl"
).assign(dataset_name="news")

vispub_ground_truths = pd.read_pickle(
    "./narrative_maps/results/vispub.pkl"
).assign(dataset_name="vispub")

aminer_ground_truths = pd.read_pickle(
    "./narrative_maps/results/aminer.pkl"
).assign(dataset_name="aminer")

ground_truths = pd.concat(
    [news_data_ground_truths, vispub_ground_truths, aminer_ground_truths],
    axis=0,
)
ground_truths

Unnamed: 0,algorithm,src,tgt,exec_time,effective_exec_time,main_storyline,storylines,dataset_name
0,narrative_maps,157,507,9.308078,0.930808,"[157, 257, 268, 281, 347, 400, 443, 507]","[[157, 257, 268, 281, 347, 400, 443, 507], [35...",news
1,narrative_maps,157,507,8.502937,0.850294,"[157, 257, 268, 281, 347, 400, 443, 507]","[[157, 257, 268, 281, 347, 400, 443, 507], [35...",news
2,narrative_maps,157,507,8.949423,0.894942,"[157, 257, 268, 281, 347, 400, 443, 507]","[[157, 257, 268, 281, 347, 400, 443, 507], [35...",news
3,narrative_maps,157,507,8.586312,0.858631,"[157, 257, 268, 281, 347, 400, 443, 507]","[[157, 257, 268, 281, 347, 400, 443, 507], [35...",news
4,narrative_maps,23,393,8.746982,0.971887,"[23, 45, 74, 182, 183, 260, 292, 315, 336, 371...","[[23, 45, 74, 182, 183, 260, 292, 315, 336, 37...",news
...,...,...,...,...,...,...,...,...
195,narrative_maps,58,861,54.482687,4.952972,"[58, 186, 250, 346, 649, 861]","[[58, 186, 250, 346, 649, 861], [260, 298, 322...",aminer
196,narrative_maps,107,1114,48.206299,4.382391,"[107, 328, 427, 446, 536, 610, 630, 1114]","[[107, 328, 427, 446, 536, 610, 630, 1114], [1...",aminer
197,narrative_maps,107,1114,47.526864,4.320624,"[107, 328, 427, 446, 536, 610, 630, 1114]","[[107, 328, 427, 446, 536, 610, 630, 1114], [1...",aminer
198,narrative_maps,107,1114,48.335083,4.394098,"[107, 328, 427, 446, 536, 610, 630, 1114]","[[107, 328, 427, 446, 536, 610, 630, 1114], [1...",aminer


In [13]:
# Building the comparison data is quick, but the result is cached on disk so
# that later runs reuse exactly the same extractions.
try:
    with open('./comparison_data.pkl', 'rb') as handle:
        comparison_data = pickle.load(handle)
except FileNotFoundError:
    # Record layout:
    # (algorithm, src, tgt, exec_time, effective_exec_time, main_storyline, storylines, dataset_name, gt_index)
    comparison_data = []

    # Per-dataset lookups; any name other than "news"/"vispub" falls back to
    # the AMiner objects.
    landscapes_by_dataset = {"news": landscape_news_data, "vispub": landscape_vis_pub}
    frames_by_dataset = {"news": data_news_articles, "vispub": data_vis_pub}

    # Pick one ground-truth row per (dataset, src, tgt) group. Rows within a
    # group should all describe the same storyline. The index is reset first,
    # so `gt_idx` below is a position into `ground_truths`.
    group_keys = ["dataset_name", "src", "tgt"]
    comparison_ground_truths = (
        ground_truths[group_keys]
        .reset_index(drop=True)
        .groupby(by=group_keys)
        .sample(1, random_state=1)
    )

    for gt_idx, row_data in tqdm(comparison_ground_truths.iterrows(), total=len(comparison_ground_truths)):
        dataset_name, src, tgt = row_data.values
        landscape = landscapes_by_dataset.get(dataset_name, landscape_aminer)
        ds = frames_by_dataset.get(dataset_name, data_aminer)

        # A couple of src/tgt pairs have no connecting path; report and skip them.
        if not nx.has_path(landscape.nx_graph, src, tgt):
            print(dataset_name, src, tgt)
            continue

        # Baseline 1: src and tgt joined by a random number (5..17) of distinct random nodes.
        n_random = np.random.randint(5, 18)
        random_points = [src, *np.random.choice(len(ds) - 1, n_random, replace=False), tgt]
        comparison_data.append((
            "random_points", src, tgt, 0, 0,
            random_points, [random_points], dataset_name, gt_idx
        ))

        # Baseline 2: the first path yielded by networkx's shortest_simple_paths.
        shortest_path = next(nx.shortest_simple_paths(landscape.nx_graph, src, tgt))
        comparison_data.append((
            "shortest_path", src, tgt, 0, 0,
            shortest_path, [shortest_path], dataset_name, gt_idx
        ))

        # Narrative Trails extraction, timed with wall-clock time.
        start_time = time()
        narrative_trails, _ = landscape.extract_narrative(src, tgt)
        elapsed = time() - start_time
        comparison_data.append((
            "narrative_trails", src, tgt, elapsed, elapsed,
            narrative_trails, [narrative_trails], dataset_name, gt_idx
        ))

        # Redundancy-reduced variant of the extracted trail (not timed).
        reduced_trails = Storyline(landscape, narrative_trails).reduce_redundancy()
        comparison_data.append((
            "reduced_trails", src, tgt, 0, 0,
            reduced_trails, [reduced_trails], dataset_name, gt_idx
        ))

    with open('./comparison_data.pkl', 'wb') as handle:
        pickle.dump(comparison_data, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [14]:
comparison_results = []  # rows of (method, value, metric, dataset)

# The relative metrics are evaluated on each landscape's low-dimensional
# embeddings (not on the raw embeddings). Names other than "news"/"vispub"
# fall back to AMiner.
low_dim_embeds_by_dataset = {
    "news": landscape_news_data.low_dim_embeds,
    "vispub": landscape_vis_pub.low_dim_embeds,
}

for alg, _, _, _, _, path, _, ds_name, gt_idx in tqdm(comparison_data):
    # Reference storyline for this run, coerced to plain ints.
    true_storyline = [int(node) for node in ground_truths.iloc[gt_idx]["main_storyline"]]
    embeds = low_dim_embeds_by_dataset.get(ds_name, landscape_aminer.low_dim_embeds)

    sim_res = similarity_metric(true_storyline, path, embeds)
    dtw_res = dtw_metric(true_storyline, path, embeds)

    comparison_results.extend([
        (alg, sim_res, "sim", ds_name),
        (alg, dtw_res, "dtw", ds_name),
    ])

100%|██████████| 560/560 [00:00<00:00, 3853.01it/s]


In [15]:
absolute_results = []  # rows of (method, value, metric, dataset)

# Names other than "news"/"vispub" fall back to the AMiner landscape.
landscapes_by_dataset = {"news": landscape_news_data, "vispub": landscape_vis_pub}
# (short label stored in the results, metric name passed to absolute_metrics)
metric_specs = (("coh", "min_coherence"), ("rel", "reliability"))

for alg, _, _, _, _, path, _, ds_name, gt_idx in tqdm(comparison_data):
    true_storyline = [int(node) for node in ground_truths.iloc[gt_idx]["main_storyline"]]
    landscape = landscapes_by_dataset.get(ds_name, landscape_aminer)

    # Score the reference storyline first (recorded as "narrative_maps"),
    # then the path produced by the algorithm under comparison.
    for method, storyline in (("narrative_maps", true_storyline), (alg, path)):
        for label, metric_name in metric_specs:
            value = absolute_metrics(storyline, landscape, metric_name)
            absolute_results.append((method, value, label, ds_name))

100%|██████████| 560/560 [00:00<00:00, 24253.79it/s]


In [16]:
# Both result lists share the same record layout, so they share one schema.
result_columns = ["method", "value", "metric", "dataset"]
df_comparison = pd.DataFrame(comparison_results, columns=result_columns)
df_absolute = pd.DataFrame(absolute_results, columns=result_columns)

In [17]:
# Stack the relative (sim/dtw) and absolute (coh/rel) results into one long
# table. Values stay at full precision here: rounding before aggregation would
# feed rounding error into the per-group means, so rounding is applied only to
# the displayed table at the end of the cell.
df = pd.concat([df_comparison, df_absolute], axis=0).reset_index(drop=True)

# One row per method, one column per (metric, dataset). pivot_table aggregates
# repeated entries with its default aggregation (the mean).
df_pivot = df.pivot_table(
    index=['method'],
    columns=['metric', 'dataset'],
    values='value'
)

# Flatten the MultiIndex columns into "metric:dataset" labels
df_pivot.columns = [f"{metric}:{dataset}" for metric, dataset in df_pivot.columns]

# Fixed presentation order: metrics grouped together, datasets in the same
# order within each metric.
pivot_column_order = [
    f"{metric}:{dataset}"
    for metric in ("coh", "rel", "sim", "dtw")
    for dataset in ("news", "vispub", "aminer")
]
pivot_method_order = [
    "narrative_maps",
    "random_points",
    "shortest_path",
    "narrative_trails",
    "reduced_trails",
]
df_pivot = df_pivot[pivot_column_order].loc[pivot_method_order]

# df_pivot.to_csv("benchmark_results.csv")
df_pivot.round(3)

Unnamed: 0_level_0,coh:news,coh:vispub,coh:aminer,rel:news,rel:vispub,rel:aminer,sim:news,sim:vispub,sim:aminer,dtw:news,dtw:vispub,dtw:aminer
method,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
narrative_maps,0.499,0.554,0.502,0.702,0.677,0.629,,,,,,
random_points,0.343,0.412,0.357,0.492,0.577,0.512,0.621,0.337,0.278,2.466,1.397,1.427
shortest_path,0.557,0.743,0.635,0.593,0.753,0.644,0.854,0.519,0.411,1.001,0.991,1.108
narrative_trails,0.689,0.784,0.736,0.786,0.8,0.764,0.872,0.63,0.556,0.762,0.915,0.962
reduced_trails,0.638,0.756,0.691,0.739,0.777,0.724,0.863,0.597,0.465,0.825,0.946,1.025


## Execution Time Comparison

In [18]:
# From this cell on, `comparison_data` is a DataFrame rather than the list of
# tuples it held above.
comparison_columns = (
    "algorithm", "src", "tgt", "exec_time", "effective_exec_time",
    "main_storyline", "storylines", "dataset_name", "gt_index",
)
comparison_data = pd.DataFrame(comparison_data, columns=comparison_columns)

# Keep only the Narrative Trails runs, and the timing columns of those runs.
is_narrative_trails = comparison_data["algorithm"] == "narrative_trails"
nt_comparison_data = comparison_data[is_narrative_trails]
nt_exec_times = nt_comparison_data[["dataset_name", "exec_time"]]

In [19]:
extended_nt_time_comparison = []  # rows of (dataset_name, exec_time)

# Every run has to be timed on the landscape of its own dataset. Relying on
# whatever `landscape` an earlier loop happened to leave behind would time all
# datasets against a single landscape. Names other than "news"/"vispub" fall
# back to AMiner, matching the dispatch used in the cells above.
landscapes_by_dataset = {"news": landscape_news_data, "vispub": landscape_vis_pub}

nt_runs = nt_comparison_data[["src", "tgt", "dataset_name"]]
for _idx, (src, tgt, dataset_name) in tqdm(nt_runs.iterrows(), total=len(nt_runs)):
    landscape = landscapes_by_dataset.get(dataset_name, landscape_aminer)

    # Repeat each extraction three times to collect additional timing samples.
    for _ in range(3):
        start_time = time()
        landscape.extract_narrative(src, tgt)
        end_time = time() - start_time  # elapsed wall-clock seconds

        extended_nt_time_comparison.append((dataset_name, end_time))

140it [00:06, 21.94it/s]


In [20]:
# Combine the timings recorded during the original extraction with the repeated
# runs. The first part is taken from `nt_comparison_data` instead of being read
# back from `nt_exec_times`, so re-running this cell does not append the
# repeated timings a second time.
nt_exec_times = pd.concat((
    nt_comparison_data[["dataset_name", "exec_time"]],  # execution times from the original run
    pd.DataFrame(extended_nt_time_comparison, columns=["dataset_name", "exec_time"]),
))

nt_exec_times

Unnamed: 0,dataset_name,exec_time
2,aminer,0.016740
6,aminer,0.043016
10,aminer,0.100665
14,aminer,0.337646
18,aminer,0.028281
...,...,...
415,vispub,0.000036
416,vispub,0.000037
417,vispub,0.069215
418,vispub,0.027089


In [21]:
# Keep only the Narrative Maps runs whose (dataset, src, tgt) was actually
# compared. src/tgt ids are only meaningful within their own dataset (each
# dataset has its own landscape graph), so the dataset name is part of the key.
# Matching on (src, tgt) alone can keep a run that was skipped in one dataset
# because the same id pair exists in another.
key_columns = ["dataset_name", "src", "tgt"]
compared_keys = set(zip(*(comparison_data[col] for col in key_columns)))
was_compared = np.array(
    [key in compared_keys for key in zip(*(ground_truths[col] for col in key_columns))],
    dtype=bool,
)

# Per-storyline ("effective") execution time, exposed under the common
# "exec_time" column name.
nm_exec_times = (
    ground_truths
    .loc[was_compared, ["dataset_name", "effective_exec_time"]]
    .rename(columns={"effective_exec_time": "exec_time"})
)

nm_exec_times

Unnamed: 0,dataset_name,exec_time
0,news,0.930808
1,news,0.850294
2,news,0.894942
3,news,0.858631
4,news,0.971887
...,...,...
195,aminer,4.952972
196,aminer,4.382391
197,aminer,4.320624
198,aminer,4.394098


In [22]:
# Same filter as for the per-storyline timings, but keeping the full execution
# time of each Narrative Maps run. Runs are keyed by (dataset, src, tgt)
# because src/tgt ids are only meaningful within their own dataset; matching on
# (src, tgt) alone can keep a run that was skipped in one dataset because the
# same id pair exists in another.
key_columns = ["dataset_name", "src", "tgt"]
compared_keys = set(zip(*(comparison_data[col] for col in key_columns)))
was_compared = np.array(
    [key in compared_keys for key in zip(*(ground_truths[col] for col in key_columns))],
    dtype=bool,
)

# `.copy()` makes this an independent frame, so adding columns to it later does
# not write onto a slice of `ground_truths`.
nm_exec_times_full = ground_truths.loc[was_compared, ["dataset_name", "exec_time"]].copy()

nm_exec_times_full

Unnamed: 0,dataset_name,exec_time
0,news,9.308078
1,news,8.502937
2,news,8.949423
3,news,8.586312
4,news,8.746982
...,...,...
195,aminer,54.482687
196,aminer,48.206299
197,aminer,47.526864
198,aminer,48.335083


In [23]:
# Tag each timing table with the algorithm it belongs to. `assign` returns a
# new frame, which avoids writing a column onto a frame obtained by slicing.
nm_exec_times = nm_exec_times.assign(algorithm='Narrative Maps')
nm_exec_times_full = nm_exec_times_full.assign(algorithm="Narrative Maps (Full)")
nt_exec_times = nt_exec_times.assign(algorithm='Narrative Trails')

# Combine the three timing tables for plotting
combined_exec_times = pd.concat([nt_exec_times, nm_exec_times, nm_exec_times_full])
dataset_labels = {
    "news": "News",
    "vispub": "VisPub",
    "aminer": "Aminer"
}
combined_exec_times['dataset_display_name'] = combined_exec_times['dataset_name'].map(dataset_labels)

# Define the x-axis order for datasets
dataset_order = ["News", "VisPub", "Aminer"]


def _exec_time_panel(exec_times, algorithm, title, order, color=None, y_title='', y_axis=None):
    """Build the point chart and the confidence-interval band for one algorithm.

    Parameters
    ----------
    exec_times : pandas.DataFrame
        Table with `algorithm`, `dataset_display_name` and `exec_time` columns.
    algorithm : str
        Value of the `algorithm` column selecting the rows to plot.
    title : str
        Title shown above the panel.
    order : list of str
        Order of the dataset labels on the x axis.
    color : str, optional
        Fixed mark colour; when omitted, no colour encoding is set.
    y_title : str, optional
        Title of the y axis of the point chart.
    y_axis : alt.Axis, optional
        Axis configuration for the y axis of the point chart.

    Returns
    -------
    tuple
        `(points, band)`: the point chart and the matching error band.
    """
    panel_data = exec_times[exec_times['algorithm'] == algorithm]

    y_kwargs = {"title": y_title}
    if y_axis is not None:
        y_kwargs["axis"] = y_axis

    point_encodings = {
        "x": alt.X('dataset_display_name:N', title='', sort=order, axis=alt.Axis(labelAngle=0)),
        "y": alt.Y('exec_time:Q', **y_kwargs),
    }
    band_encodings = {
        "x": alt.X('dataset_display_name:N', sort=order),
        "y": alt.Y('exec_time:Q', title=''),
    }
    if color is not None:
        point_encodings["color"] = alt.value(color)
        band_encodings["color"] = alt.value(color)

    points = alt.Chart(panel_data).mark_point(filled=True).encode(
        **point_encodings
    ).properties(
        width=200,
        height=100,
        title=title
    )
    # Confidence interval drawn as an error band
    band = alt.Chart(panel_data).mark_errorband(extent='ci').encode(**band_encodings)
    return points, band


# One panel per algorithm; only the left-most panel carries the y-axis title.
nt_chart, nt_error_bars = _exec_time_panel(
    combined_exec_times, 'Narrative Trails', 'Narrative Trails', dataset_order,
    y_title='Execution Time (s)', y_axis=alt.Axis(titlePadding=20),
)
nm_chart, nm_error_bars = _exec_time_panel(
    combined_exec_times, 'Narrative Maps', 'Narrative Maps (Per Storyline)', dataset_order,
    color='#F87B0E',
)
nm_full_chart, nm_full_bars = _exec_time_panel(
    combined_exec_times, 'Narrative Maps (Full)', 'Narrative Maps (Full)', dataset_order,
    color='#D02626',
)

# Combine the charts side by side
combined_chart = (nt_chart + nt_error_bars) | (nm_chart + nm_error_bars) | (nm_full_chart + nm_full_bars)
combined_chart.configure_axis(
    labelFontSize=14,
    titleFontSize=12
)