# Narrative Trails


In [1]:
# Enable IPython's autoreload extension in mode 2 so that imported modules are
# re-imported before each cell executes; edits to the local library code are
# then picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import os
import sys

# To make our relative library imports work: the notebook runs from a
# sub-folder, so the parent of the working directory is put on the import path.
current_dir = os.getcwd()
parent_dir = os.path.dirname(current_dir)

# Guard the append so re-running this cell does not keep adding duplicate
# entries to sys.path.
if parent_dir not in sys.path:
    sys.path.append(parent_dir)

In [3]:
import pandas as pd
import networkx as nx

from dataclasses import dataclass
from time import time

# Local Imports
from Library.embedding_extraction import extract_embeddings
from Library.narrative_landscape import NarrativeLandscape
from Library.storyline import Storyline

  from .autonotebook import tqdm as notebook_tqdm


Using device: cpu


In [4]:
@dataclass
class Config:
    """Notebook-wide settings describing the dataset and how it is read.

    Every attribute carries a type annotation so that ``@dataclass`` registers
    it as a real field: the values can be overridden per instance through
    keyword arguments (e.g. ``Config(foldername="Other")``) and they appear in
    ``repr``/``==``. Calling ``Config()`` with no arguments yields the defaults
    below.
    """

    # The folder name (under ../data) holding the source data
    foldername: str = "VisPubData"
    # The column with the contents of the article
    data_column: str = "Abstract"
    # The "date" column. This column will be renamed to "date" later.
    date_column: str = "Year"
    # The column used as title for visualization and summary
    summary_column: str = "Title"
    # Whether or not to force events to follow a timeline
    apply_time_order: bool = True


CONFIG = Config()

In [5]:
# Load the raw table for the configured dataset folder.
data = pd.read_csv(f"../data/{CONFIG.foldername}/text_data.csv")

# Convert the configured date column to datetime objects (parsed with the
# year-only format "%Y"), then expose it under the fixed name "date", which is
# the column name the rest of the notebook reads.
data[CONFIG.date_column] = pd.to_datetime(data[CONFIG.date_column], format="%Y")
data = data.rename(columns={CONFIG.date_column: "date"})

# Drop rows with no text in the configured content column. The column name is
# taken from CONFIG (rather than hard-coded) so that switching datasets only
# requires editing the config cell.
data = data.dropna(subset=[CONFIG.data_column]).reset_index(drop=True)

data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3549 entries, 0 to 3548
Data columns (total 18 columns):
 #   Column                  Non-Null Count  Dtype         
---  ------                  --------------  -----         
 0   Conference              3549 non-null   object        
 1   date                    3549 non-null   datetime64[ns]
 2   Title                   3549 non-null   object        
 3   DOI                     3549 non-null   object        
 4   Link                    3549 non-null   object        
 5   FirstPage               3516 non-null   object        
 6   LastPage                3509 non-null   object        
 7   PaperType               3549 non-null   object        
 8   Abstract                3549 non-null   object        
 9   AuthorNames-Deduped     3549 non-null   object        
 10  AuthorNames             3548 non-null   object        
 11  AuthorAffiliation       3544 non-null   object        
 12  InternalReferences      2845 non-null   object  

## Construct Narrative Landscape

In [6]:
# Text that gets embedded for each row: "<title>;<content>". Both column names
# come from CONFIG so the cell follows the dataset chosen in the config cell.
RAW_TEXT = data[CONFIG.summary_column] + ";" + data[CONFIG.data_column]

# NOTE(review): the recorded output shows a pickle being loaded from this
# folder, so extract_embeddings presumably caches per model name — confirm
# before changing RAW_TEXT, as a stale cache may not reflect the new text.
embeddings, _ = extract_embeddings(
    text=RAW_TEXT.tolist(),
    foldername=f"../data/{CONFIG.foldername}",
    model_name="gpt4"  # OneOf[mpnet, gpt4]
)

File '../data/VisPubData/embed_data-gpt4.pickle' loaded successfully.


In [7]:
# Build the narrative landscape from the document embeddings and their dates.
# With verbose=True the fit logs its four stages (projection, topic discovery,
# mean k-NN distance, coherence graph), as seen in the recorded output.
# NOTE(review): CONFIG.apply_time_order is not referenced in this cell —
# confirm whether it is meant to be passed to the landscape.
embedding_matrix = embeddings.numpy()
event_dates = data["date"].values

landscape = NarrativeLandscape(verbose=True)
landscape.fit(embedding_matrix, dates=event_dates)

Step 1/4: Constructing Projection Space with UMAP
Step 2/4: Discovering topics with HDBSCAN
	>>> Discovered 59 Topics
Step 3/4: Computing Mean K-NN Dist
Step 4/4: Constructing Coherence Graph
	 >>> Computing base coherence
	 >>> Computing sparse coherence
		>>> Creating Undirected Graph
		>>> Finding Maximum Spanning Tree
		>>> Getting Min Weight
		----- BEFORE MST -----
		Critical Coherence: 0.4652723459983509
		Num Edges: 6295926
		Is Connected: True
		----- AFTER MST -----
		Num Edges: 4403153
		Is Connected: True
		----- AFTER Constraints -----
		Num Edges: 4403153
		Is Connected: True
	 >>> Building NetworkX graph


## Narrative Extraction


In [8]:
# Endpoints of the storyline to extract (node ids; the printed table lists
# them under `idx`).
# This pair is an example of a very long original storyline, with a lot of
# redundancy; the reduced storyline is much more concise.
# Other (source, target) pairs kept for reference:
#   (185, 445), (15, 460), (85, 114), (101, 109)
SRC_NODE = 129
TGT_NODE = 345

# Interior nodes of storylines found so far; they are handed to the extractor
# as `hidden_nodes` on subsequent iterations.
paths_nodes = []
n_paths = 1

all_narratives = []
all_trail_coherences = []

_FULL_LABELS = ("Path:", "Bottleneck: ", "Reliability: ", "Length of Path:")
_REDUCED_LABELS = (
    "Reduced Path:",
    "Reduced Bottleneck: ",
    "Reduced Reliability: ",
    "Reduced Length of Path:",
)


def _show_storyline(story, labels, length_trailer=()):
    """Print the summary metrics of ``story`` followed by its event table.

    ``labels`` holds the four line labels (path, bottleneck, reliability,
    length) in that order; ``length_trailer`` is appended as extra positional
    arguments to the final ``print`` call.
    """
    path_label, bottleneck_label, reliability_label, length_label = labels

    print("-" * 13)
    print(path_label, story.chain)
    print(bottleneck_label, story.bottleneck_weight())
    print(reliability_label, story.reliability())
    print(length_label, len(story.chain), *length_trailer)

    Storyline.print_narrative_path(
        data,
        landscape.cluster_labels,
        story.chain,
        CONFIG
    )


for path_idx in range(n_paths):
    start_time = time()
    narrative_path, narrative_cost = landscape.extract_narrative(
        SRC_NODE, TGT_NODE, hidden_nodes=paths_nodes
    )
    elapsed_seconds = time() - start_time

    # Nothing to report when the extractor returns an empty path.
    if not narrative_path:
        print("No Path Found")
        continue

    print(f"Execution Time: {elapsed_seconds} seconds")

    storyline = Storyline(landscape, narrative_path)
    _show_storyline(storyline, _FULL_LABELS)

    # Record the interior nodes of the *unreduced* chain (endpoints excluded)
    # before the chain is shortened in place below.
    paths_nodes.extend(storyline.chain[1:-1])

    storyline.reduce_redundancy(inplace=True, delta=1)
    _show_storyline(storyline, _REDUCED_LABELS, length_trailer=("\n",))

    all_narratives.append(storyline.chain)
    all_trail_coherences.append(storyline.path_base_coherence())

    # Blank separator between consecutive storylines, but not after the last.
    if path_idx != n_paths - 1:
        print("\n\n")

Execution Time: 2.0768439769744873 seconds
-------------
Path: [129, 1074, 620, 1280, 615, 5, 317, 439, 55, 75, 109, 66, 258, 172, 29, 247, 2418, 3125, 3352, 3263, 1632, 2051, 423, 714, 2924, 3198, 3454, 3273, 861, 1429, 1291, 581, 1764, 864, 1517, 1891, 1879, 1386, 2008, 2518, 1750, 926, 1014, 887, 1563, 2279, 2368, 1868, 2379, 1992, 345]
Bottleneck:  0.8161739176231043
Reliability:  0.8256881540367818
Length of Path: 51
idx    Topic   Date             Title
----------------------------------------------------------------
129    40      Jan 01, 2021     M2Lens: Visualizing and Explaining Multimodal Models for Sentiment Analysis
1074   28      Jan 01, 2014     Multi-Model Semantic Interaction for Text Analytics
620    41      Jan 01, 2018     The Effect of Semantic Interaction on Foraging in Text Analysis
1280   41      Jan 01, 2012     Semantic Interaction for Sensemaking: Inferring Analytical Reasoning for Model Steering
615    41      Jan 01, 2018     Enhancing Web-based Analytics A