# Narrative Trails


In [1]:
# Load IPython's autoreload extension and switch it to mode 2, so that edits
# to imported modules (such as the local Library modules imported further
# down) are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import os
import sys

# To make our relative library imports work, expose the parent of the
# notebook's working directory on the module search path.
current_dir = os.getcwd()
parent_dir = os.path.dirname(current_dir)
# Guard the append so that re-running this cell does not pile up duplicate
# entries in sys.path.
if parent_dir not in sys.path:
    sys.path.append(parent_dir)

In [3]:
import pandas as pd
import networkx as nx

from dataclasses import dataclass
from time import time

# Local Imports
from Library.embedding_extraction import extract_embeddings
from Library.narrative_landscape import NarrativeLandscape
from Library.storyline import Storyline

  from .autonotebook import tqdm as notebook_tqdm


Using device: cpu


In [4]:
@dataclass
class Config:
    """Notebook-wide settings describing the input dataset.

    Every attribute carries a type annotation so that ``@dataclass`` registers
    it as a real field. Without annotations the decorator ignores plain class
    attributes, which leaves the class with no fields, no keyword constructor
    arguments and an empty ``repr``.
    """

    # Name of the data folder (used to build the ../data/<foldername> paths)
    foldername: str = "NewsData"
    # The column with the contents of the article
    data_column: str = "full_text"
    # The "date" column. This column will be renamed to "date" later.
    date_column: str = "date"
    # The column used as title for visualization and summary
    summary_column: str = "title"


# Single shared configuration instance used by the rest of the notebook.
CONFIG = Config()

In [5]:
# Load the article table for the configured dataset folder.
data = pd.read_csv(f"../data/{CONFIG.foldername}/text_data.csv")

# Rename the configured date column to the canonical name "date" *before*
# converting it, so the conversion also works when the CSV uses a different
# column name than "date".
data = data.rename(columns={CONFIG.date_column: "date"})

# Convert the date to a datetime column (values are expected as mm/dd/yy).
data["date"] = pd.to_datetime(data["date"], format="%m/%d/%y")

data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 540 entries, 0 to 539
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   title        540 non-null    object        
 1   url          540 non-null    object        
 2   date         540 non-null    datetime64[ns]
 3   publication  540 non-null    object        
 4   full_text    540 non-null    object        
 5   Unnamed: 5   0 non-null      float64       
dtypes: datetime64[ns](1), float64(1), object(4)
memory usage: 25.4+ KB


## Construct Narrative Landscape

In [None]:
# Article bodies to embed, taken from the configured text column.
RAW_TEXT = data[CONFIG.data_column]

# Only the first element of the returned pair is used in this notebook.
embeddings, _ = extract_embeddings(
    model_name="gpt4",  # OneOf[mpnet, gpt4]
    foldername=f"../data/{CONFIG.foldername}",
    text=RAW_TEXT.tolist(),
)

File '../data/NewsData/embed_data-gpt4.pickle' loaded successfully.


In [7]:
# Configure the narrative landscape (date constraint on, verbose progress
# output enabled).
landscape = NarrativeLandscape(
    impose_date_constraint=True,
    n_neighbors=16,
    min_cluster_size=4,
    verbose=True,
)

# The date column was renamed to "date" when the data was loaded, so index it
# by that canonical name; CONFIG.date_column only matches while the CSV's
# original column happens to be called "date".
landscape.fit(embeddings.numpy(), dates=data["date"].values)

Step 1/4: Constructing Projection Space with UMAP
Step 2/4: Discovering topics with HDBSCAN
	>>> Discovered 28 Topics
Step 3/4: Computing Mean K-NN Dist
Step 4/4: Constructing Coherence Graph
	 >>> Computing base coherence
	 >>> Computing sparse coherence
		>>> Creating Undirected Graph
		>>> Finding Maximum Spanning Tree
		>>> Getting Min Weight
		----- BEFORE MST -----
		Critical Coherence: 0.4774725550696371
		Num Edges: 145530
		Is Connected: True
		----- AFTER MST -----
		Num Edges: 66605
		Is Connected: True
		----- AFTER Constraints -----
		Num Edges: 66605
		Is Connected: True
	 >>> Building NetworkX graph


In [8]:
# NOTE(review): clears the landscape's `low_dim_embeds` attribute; presumably
# this forces the 2-D plot to recompute its projection -- confirm against
# NarrativeLandscape before relying on it.
landscape.low_dim_embeds = None

In [9]:
# Draw the 2-D view of the landscape, passing the article table and the
# column configured as the title/summary text (CONFIG.summary_column).
landscape.plot_2d(data, CONFIG.summary_column)

## Narrative Extraction


In [10]:
# Endpoints of the storyline to extract (node ids; the printed table lists
# them in its "idx" column).
SRC_NODE = 15
TGT_NODE = 460

# Other (SRC_NODE, TGT_NODE) pairs that were tried previously:
#   (185, 445), (85, 114), (101, 109), (129, 345), (91, 113)

# Interior nodes of the paths found so far; they are passed as hidden_nodes
# to every later search so that each new path avoids them.
paths_nodes = []
n_paths = 3

all_narratives = []
all_trail_coherences = []

for i in range(n_paths):
    start_time = time()
    narrative_path, narrative_cost = landscape.extract_narrative(
        SRC_NODE, TGT_NODE, hidden_nodes=paths_nodes
    )
    elapsed_seconds = time() - start_time

    if not narrative_path:
        print("No Path Found")
        # paths_nodes is unchanged after a failed search, so the remaining
        # iterations would only repeat the identical query.
        break

    print(f"Execution Time: {elapsed_seconds} seconds")

    storyline = Storyline(landscape, narrative_path)

    print("-" * 13)
    print("Path:", storyline.chain)
    print("Bottleneck: ", storyline.bottleneck_weight())
    print("Reliability: ", storyline.reliability())
    print("Length of Path:", len(storyline.chain))

    # Hide only the interior nodes: the source and target must stay
    # reachable for the next search.
    paths_nodes.extend(storyline.chain[1:-1])

    Storyline.print_narrative_path(
        data,
        landscape.cluster_labels,
        storyline.chain,
        CONFIG
    )

    all_narratives.append(storyline.chain)
    all_trail_coherences.append(storyline.path_base_coherence())

    # Blank lines between consecutive storylines, but not after the last one.
    if i != n_paths - 1:
        print("\n\n")

Execution Time: 0.059001922607421875 seconds
-------------
Path: [15, 16, 17, 22, 25, 35, 37, 80, 162, 186, 460]
Bottleneck:  0.6488019139765411
Reliability:  0.760350685465444
Length of Path: 11
idx    Topic   Date             title
----------------------------------------------------------------
15     2       Mar 21, 2016     Obama on Cuba: Differences Remain but Change in Sight
16     2       Mar 21, 2016     'Historic visit, historic opportunity': Obama arrives in Cuba, becomes first US president to do so in 88 years-World News , Firstpost
17     2       Mar 21, 2016     Barack Obama: 'Change going to happen' in Cuba
22     2       Mar 22, 2016     Obama presses Cuba's Castro on human rights during historic visit
25     3       May 02, 2016     Historic U.S. cruise docks in Havana
35     1       Nov 27, 2016     Cuba faces renewed tensions with U.S., but without Fidel Castro, its field marshal
37     26      Jan 02, 2017     Cuba puts on show of strength as Trump inauguration near