# Example Narrative

In [1]:
import os
import sys

# Expose the parent and grandparent of the working directory on sys.path
# so that the project-local library imports below can be resolved.
current_dir = os.getcwd()
parent_dir, grandparent_dir = (
    os.path.dirname(current_dir),
    os.path.dirname(os.path.dirname(current_dir)),
)
sys.path.extend([parent_dir, grandparent_dir])

## Imports


In [2]:
import pandas as pd
import numpy as np

import re
from ast import literal_eval
from urllib.parse import urlparse

import networkx as nx
from networkx.drawing.nx_agraph import write_dot, graphviz_layout

import itertools
import pickle
import json
from time import time

import math
from math import log, exp, pi, sqrt, ceil
from sklearn.metrics.pairwise import cosine_similarity
from scipy.spatial import distance
from tqdm import tqdm

import umap
import hdbscan

from pulp import *

from Library.embedding_extraction import extract_embeddings
from Library.narrative_landscape import NarrativeLandscape
from Library.storyline import Storyline
from metrics import embedding_based_dtw

from narrative_maps import (
    extract_varsdict,
    compute_temp_distance_table,
    build_graph,
    graph_stories,
)

  from .autonotebook import tqdm as notebook_tqdm


Using device: cpu


In [3]:
# Seed NumPy's legacy global RNG so the random sampling below is reproducible.
# (Assigning `np.random.seed = 420` would replace the seeding function with an
# int and seed nothing; the function has to be *called*.)
np.random.seed(420)

## Linear Program Construction

`create_LP` takes many parameters; several of them ended up unused in the current formulation.


In [4]:
def create_LP(query, sim_table, membership_vectors, clust_sim_table, exp_temp_table, ent_table, numclust, relevance_table,
              K, mincover, sigma_t, credibility=[], bias=[], operations=[],
              has_start=True, has_end=False, window_time=None, cluster_list=[], start_nodes=[], end_nodes=[],
              verbose=True, force_cluster=True, previous_varsdict=None):
    """Build the max-min ("weakest link") linear program for a story chain.

    Every document ``i`` gets an activation variable ``node_act_i`` in [0, 1]
    and every candidate edge ``i -> j`` (with ``j`` taken from the module-level
    ``window_i_j[i]``) gets a variable ``node_next_i_j`` in [0, 1].  The
    objective maximises ``minedge``, which is constrained to be at most
    ``1 - node_next_i_j + weighted_coherence(i, j)`` for every candidate edge.

    Module-level state read by this function: ``window_i_j`` and ``window_j_i``
    (candidate out-/in-edges per node) and ``start_time`` (only for the
    elapsed-time log lines printed when ``verbose`` is true).

    Parameters
    ----------
    query : object exposing ``.index``; only ``len(query.index)`` is used.
    sim_table, clust_sim_table, ent_table : 2-D tables indexed as ``t[i, j]``.
    relevance_table : sequence indexed by node position.
    K : required sum of all node activations (expected chain length).
    has_start, has_end : whether start / end constraints are generated.
    start_nodes, end_nodes : node indices to pin; when the corresponding flag
        is set and the list is empty, node ``0`` / node ``n - 1`` is pinned to 1.
        ``None`` is treated as an empty list.
    verbose : print progress messages.
    previous_varsdict : optional ``{variable name: value}`` mapping used to set
        initial values on matching ``node_act_*`` variables.
    membership_vectors, exp_temp_table, numclust, mincover, sigma_t,
    credibility, bias, operations, window_time, cluster_list, force_cluster :
        accepted for interface compatibility; not used by the body.

    Returns
    -------
    The constructed ``LpProblem`` (not yet solved).
    """
    n = len(query.index)  # We can cut out everything after the end.
    # The default lists are never mutated; None is accepted as "no nodes given".
    start_nodes = [] if start_nodes is None else start_nodes
    end_nodes = [] if end_nodes is None else end_nodes

    def _log(message):
        # Progress line followed by the time elapsed since the module-level
        # `start_time`. Falls back to 0.0 while `start_time` is unset so that
        # calling create_LP directly with verbose=True does not raise.
        if not verbose:
            return
        print(message)
        origin = globals().get("start_time")
        elapsed = time() - origin if origin is not None else 0.0
        print("--- %s seconds ---" % elapsed)

    def _edge(i, j):
        # Key / name suffix of the edge variable i -> j.
        return str(i) + "_" + str(j)

    # Variable names and indices
    var_i = [str(i) for i in range(n)]
    var_ij = []
    for i in range(n):
        for j in window_i_j[i]:
            if i == j:
                print("ERROR IN WINDOW - BASE")
            var_ij.append(_edge(i, j))

    # Linear program variable declaration.
    minedge = LpVariable("minedge", lowBound=0, upBound=1)
    node_act_vars = LpVariable.dicts("node_act", var_i, lowBound=0, upBound=1)
    node_next_vars = LpVariable.dicts("node_next", var_ij, lowBound=0, upBound=1)

    # Create the 'prob' variable to contain the problem data
    prob = LpProblem("StoryChainProblem", LpMaximize)
    # The objective function is added to 'prob' first
    prob += minedge, "WeakestLink"

    # Chain restrictions
    if has_start:
        num_starts = len(start_nodes)
        if verbose:
            print("Start node(s):")
            print(start_nodes)
        if num_starts == 0:  # Default when the chain has a start but no list is given.
            prob += node_act_vars[str(0)] == 1, 'InitialNode'
        else:
            _log("Added start node(s)")
            # The unit of activation is shared equally among the start nodes.
            initial_energy = 1.0 / num_starts
            earliest_start = min(start_nodes)
            for node in start_nodes:
                prob += node_act_vars[str(node)] == initial_energy, 'InitialNode' + str(node)
            # Nothing before the earliest start node may be active.
            for node in range(0, earliest_start):
                prob += node_act_vars[str(node)] == 0, 'BeforeStart' + str(node)
    if has_end:
        num_ends = len(end_nodes)
        if verbose:
            print("End node(s):")
            print(end_nodes)
        if num_ends == 0:  # Default when the chain has an end but no list is given.
            prob += node_act_vars[str(n - 1)] == 1, 'FinalNode'
        else:
            _log("Added end node(s)")
            final_energy = 1.0 / num_ends
            # Must be the LATEST end node: using min() here would force every end
            # node after the first one to 0 while also pinning it to final_energy.
            latest_end = max(end_nodes)
            for node in end_nodes:
                prob += node_act_vars[str(node)] == final_energy, 'FinalNode' + str(node)
            # Nothing after the latest end node may be active.
            for node in range(latest_end + 1, n):
                prob += node_act_vars[str(node)] == 0, 'AfterEnd' + str(node)

    _log("Chain constraints created.")
    prob += lpSum([node_act_vars[i] for i in var_i]) == K, 'KNodes'
    _log("Expected length constraints created.")

    # In-degree: incoming edge mass of j versus the activation of j.
    if has_start:
        _log("Equality constraints.")
        for j in range(1, n):
            incoming = lpSum([node_next_vars[_edge(i, j)] for i in window_j_i[j]])
            if j not in start_nodes:
                prob += incoming == node_act_vars[str(j)], 'InEdgeReq' + str(j)
            else:
                # Start nodes receive no incoming edges.
                _log("Generating specific starting node constraints.")
                prob += incoming == 0, 'InEdgeReq' + str(j)
    else:
        _log("Inequality constraints.")
        for j in range(1, n):
            incoming = lpSum([node_next_vars[_edge(i, j)] for i in window_j_i[j]])
            prob += incoming <= node_act_vars[str(j)], 'InEdgeReq' + str(j)
    _log("In-degree constraints created.")

    # Out-degree: outgoing edge mass of i versus the activation of i.
    if has_end:
        _log("Equality constraints.")
        for i in range(0, n - 1):
            outgoing = lpSum([node_next_vars[_edge(i, j)] for j in window_i_j[i]])
            if i not in end_nodes:
                prob += outgoing == node_act_vars[str(i)], 'OutEdgeReq' + str(i)
            else:
                # End nodes emit no outgoing edges.
                _log("Generating specific ending node constraints.")
                prob += outgoing == 0, 'OutEdgeReq' + str(i)
    else:
        _log("Inequality constraints.")
        for i in range(0, n - 1):
            outgoing = lpSum([node_next_vars[_edge(i, j)] for j in window_i_j[i]])
            prob += outgoing <= node_act_vars[str(i)], 'OutEdgeReq' + str(i)
    _log("Out-degree constraints created.")

    # Objective: one upper bound on `minedge` per candidate edge.
    sim_weight, clust_weight = 0.5, 0.5  # exponents of the two similarity terms
    for i in range(n):
        for j in window_i_j[i]:
            # Entity bonus, capped so it can at most double the connection strength.
            entity_multiplier = min(1 + ent_table[i, j], 2)
            # Geometric mean of the two endpoint relevances.
            relevance_multiplier = (relevance_table[i] * relevance_table[j]) ** 0.5
            coherence = (sim_table[i, j] ** sim_weight) * (clust_sim_table[i, j] ** clust_weight)
            weighted_coherence = min(coherence * entity_multiplier * relevance_multiplier, 1.0)
            prob += minedge <= 1 - node_next_vars[_edge(i, j)] + \
                weighted_coherence, "Objective" + _edge(i, j)
    _log("Objective constraints created.")

    # Optional warm start from a previous solution.
    if previous_varsdict:
        current_names = {v.name for v in prob.variables() if "node_act" in v.name}
        _log("Generated list of names.")
        for name, value in previous_varsdict.items():
            if "node_act" in name and name in current_names:
                node_act_vars[name.replace("node_act_", "")].setInitialValue(value)
        _log("Used previous solution as starting point.")
    else:
        _log("No previous solution available.")

    return prob

## Building the Graph Data Frame


In [5]:
def build_graph_df_multiple_starts(query, varsdict, prune=None, threshold=0.01, cluster_dict={}, start_nodes=[]):
    """Turn an LP solution into a per-node adjacency data frame.

    For every node ``i`` whose activation ``varsdict["node_act_i"]`` exceeds
    ``threshold``, the outgoing edges ``i -> j`` (``j`` from the module-level
    ``window_i_j[i]``) are kept when the edge value is non-zero after rounding
    to 8 decimals, exceeds ``threshold``, and the target activation also
    exceeds ``threshold``.  If ``prune`` is truthy only the ``prune`` strongest
    edges are kept (ordered from weakest to strongest).  Edge weights are then
    normalised to sum to one.

    A node becomes a row when it has at least one surviving edge or when it is
    itself, or is the target of an edge from, a previously added node.

    Parameters
    ----------
    query : DataFrame with ``title``, ``url``, ``date`` and ``publication``
        columns and, optionally, ``bias``; rows are addressed positionally.
    varsdict : mapping with ``node_act_<i>`` and ``node_next_<i>_<j>`` values.
    prune : maximum number of outgoing edges per node, or ``None``/0 for no limit.
    threshold : minimum activation / edge value to keep.
    cluster_dict : ``{label: collection of node ids as strings}``; each matching
        label is prefixed to the title as ``"[label] "``. ``None`` means empty.
    start_nodes : accepted for interface compatibility; not used by the body.

    Returns
    -------
    DataFrame with columns ``id, adj_list, adj_weights, date, publication,
    title, text, url[, bias], coherence`` (``text`` is always ``''`` and
    ``coherence`` holds the node activation).
    """
    cluster_dict = cluster_dict or {}
    n = len(query)
    has_bias = 'bias' in query.columns

    columns = ['id', 'adj_list', 'adj_weights', 'date', 'publication', 'title', 'text', 'url']
    if has_bias:
        columns.append('bias')
    columns.append('coherence')
    graph_df = pd.DataFrame(columns=columns)

    # Nodes that were added themselves or are targets of an added node's edges.
    reachable = set()
    for i in range(n):
        coherence = varsdict["node_act_" + str(i)]
        if coherence <= threshold:
            continue

        # Collect the surviving outgoing edges as (target, weight) pairs.
        edges = []
        for j in window_i_j[i]:
            weight = varsdict["node_next_" + str(i) + "_" + str(j)]
            target_activation = varsdict["node_act_" + str(j)]
            if round(weight, 8) != 0 and weight > threshold and target_activation > threshold:
                edges.append((j, weight))

        if prune and len(edges) > prune:
            # Stable sort by weight; keep the `prune` strongest, weakest first.
            edges = sorted(edges, key=lambda edge: edge[1])[-prune:]

        idx_list = [target for target, _ in edges]
        weights = [weight for _, weight in edges]
        total = sum(weights)
        if total != 0:  # guards the normalisation against a zero denominator
            weights = [weight / total for weight in weights]

        if i in reachable or total > 0:
            record = query.iloc[i]
            title = record['title']
            for key, value in cluster_dict.items():
                if str(i) in value:
                    title = "[" + str(key) + "] " + title

            row = [i, idx_list, weights, record['date'], record['publication'],
                   title, '', record['url']]
            if has_bias:
                row.append(record['bias'])
            row.append(coherence)
            # Row-wise enlargement is kept on purpose: it leaves every column
            # with object dtype, exactly as downstream consumers received it.
            graph_df.loc[len(graph_df)] = row

            reachable.add(i)
            reachable.update(idx_list)
    return graph_df

In [6]:
# Module-level state shared by solve_LP, create_LP and build_graph_df_multiple_starts.
start_time = None  # set by solve_LP (via `global start_time`); basis of the "--- %s seconds ---" log lines
window_i_j = {}  # filled by solve_LP: node index i -> list of later node indices j that i may link to
window_j_i = {}  # filled by solve_LP: node index j -> list of earlier node indices i that may link to j


def solve_LP(
    query,
    dataset,
    membership_vectors,
    K=6,
    mincover=0.20,
    sigma_t=30,
    start_nodes=[],
    end_nodes=[],
    verbose=True,
    force_cluster=True,
    use_entities=True,
    use_temporal=True,
    strict_start=False,
):
    """Build, solve and post-process the story-chain linear program.

    Steps: (1) fill the module-level edge windows ``window_i_j`` /
    ``window_j_i``, (2) build the LP with ``create_LP``, (3) solve its
    relaxation with CBC, (4) convert the solution into a graph data frame
    with ``build_graph_df_multiple_starts``.

    Besides its arguments this function reads the notebook-level globals
    ``temporal_distance_table``, ``sim_table``, ``clust_sim_table`` and
    ``numclust``; they must be defined before it is called.  It also resets
    the module-level ``start_time``.

    Parameters
    ----------
    query : DataFrame of documents; ``len(query.index)`` is the node count.
    dataset : data set name, forwarded to ``get_entity_table`` when
        ``use_entities`` is true.
    membership_vectors : array whose ``shape[0]`` sizes the relevance table;
        also forwarded to ``create_LP``.
    K : required total node activation (expected chain length).
    mincover, force_cluster : forwarded to ``create_LP``.
    sigma_t : temporal scale; when non-zero and ``use_temporal`` is true,
        candidate edges are limited by a window of ``3 * sigma_t``.
    start_nodes, end_nodes : node indices to pin; ``None`` or an empty
        sequence disables the corresponding constraint.
    verbose : print progress and let the solver print its log.
    strict_start : when true and start nodes are given, the graph is passed
        through ``graph_clean_up``.

    Returns
    -------
    list
        ``[graph_df, (numclust, status_string), sim_table, clust_sim_table,
        ent_table, ent_doc_list]``.
    """
    global start_time
    start_time = time()

    def _log(message):
        # Progress line plus seconds elapsed since this call started.
        if verbose:
            print(message)
            print("--- %s seconds ---" % (time() - start_time))

    n = len(query.index)

    temporal_enabled = sigma_t != 0 and use_temporal
    if temporal_enabled:
        exp_temp_table = np.exp(-temporal_distance_table / sigma_t)
    else:
        exp_temp_table = np.ones(temporal_distance_table.shape)
    _log("Computed temporal distance table.")

    window_time = sigma_t * 3 if temporal_enabled else None  # Days

    if window_time is None:
        # No temporal limit: every earlier node may link to every later node.
        for i in range(0, n):
            window_i_j[i] = list(range(i + 1, n))
        for j in range(0, n):
            window_j_i[j] = list(range(0, j))
    else:
        for j in range(0, n):
            window_j_i[j] = []
        for i in range(0, n):
            window_i_j[i] = []
        for i in range(0, n - 1):
            # Count the later nodes that fall inside the temporal window of i ...
            window = 0
            for j in range(i + 1, n):
                if temporal_distance_table[i, j] <= window_time:
                    window += 1
            # ... but never go below min(5, n - i).
            window = max(min(5, n - i), window)
            window_i_j[i] = list(range(i + 1, min(i + window, n)))
            for j in window_i_j[i]:
                window_j_i[j].append(i)
    _log("Computed temporal windows.")

    ent_table = np.zeros((n, n))  # Fill entity information with zeros by default.
    ent_doc_list = None
    if use_entities:
        # NOTE(review): get_entity_table is not defined or imported in the visible
        # part of this notebook; confirm it is in scope before enabling use_entities.
        ent_table, ent_doc_list = get_entity_table(query, dataset)
    _log("Computed entity similarities.")

    # Deprecated relevance table computation
    relevance_table = [1.0] * membership_vectors.shape[0]  # Create a vector full of 1s

    # Both flags are always bound, so passing None for either list is safe.
    has_start = start_nodes is not None and len(start_nodes) > 0
    has_end = end_nodes is not None and len(end_nodes) > 0
    if verbose:
        print("Creating LP...")

    # No previous solution is loaded, so the LP gets no warm-start values.
    previous_varsdict = None

    prob = create_LP(
        query,
        sim_table,
        membership_vectors,
        clust_sim_table,
        exp_temp_table,
        ent_table,
        numclust,
        relevance_table,
        K=K,
        mincover=mincover,
        sigma_t=sigma_t,
        has_start=has_start,
        has_end=has_end,
        start_nodes=start_nodes,
        end_nodes=end_nodes,
        verbose=verbose,
        force_cluster=force_cluster,
        previous_varsdict=previous_varsdict
    )

    _log("Solving model...")

    # mip=False: solve the continuous relaxation only.
    prob.solve(PULP_CBC_CMD(mip=False, warmStart=True, msg=verbose))

    varsdict = extract_varsdict(prob)

    graph_df = build_graph_df_multiple_starts(query, varsdict, prune=ceil(
        sqrt(K)), threshold=0.1/K, cluster_dict={})
    _log("Graph data frame construction...")

    if strict_start and has_start:
        # NOTE(review): graph_clean_up is not defined or imported in the visible
        # part of this notebook; confirm it is in scope before enabling strict_start.
        graph_df = graph_clean_up(graph_df, start_nodes)
    _log("Graph clean up...")

    return [graph_df, (numclust, LpStatus[prob.status]), sim_table, clust_sim_table, ent_table, ent_doc_list]

## MAIN: Calling the Map Construction Method


In [34]:
# Load the news articles and parse their dates.
data_news_articles = pd.read_csv("../../data/NewsData/text_data.csv")
data_news_articles["date"] = pd.to_datetime(data_news_articles["date"], format="%m/%d/%y")

# Load embeddings for news articles
news_data_embeds, _ = extract_embeddings(
    text=data_news_articles["full_text"].tolist(),
    foldername="../../data/NewsData",
    model_name="gpt4"
)

data_news_articles["embed"] = news_data_embeds.tolist()
data_news_articles["publication"] = ""
# NOTE(review): this cell used to call
#     data_news_articles.sort_values(by="date").reset_index(names="idx")
# and discard the result, so the frame was never actually re-ordered. The dead
# statement is removed rather than assigned, because `news_data_embeds` is
# aligned with the current row order and is indexed positionally further down.
# If chronological order is required, sort before extracting the embeddings and
# confirm the embedding cache follows the same order.

# select sources and targets
half = len(data_news_articles) // 2
news_data_src = np.random.choice(data_news_articles.index[:half - 50], 50)
news_data_tgt = np.random.choice(data_news_articles.index[half + 50:], 50)

data_news_articles

File '../../data/NewsData/embed_data-gpt4.pickle' loaded successfully.


Unnamed: 0,title,url,date,publication,full_text,Unnamed: 5,embed
0,U.S. restores commercial flights to Cuba,https://www.cnn.com/2016/02/12/politics/u-s-to...,2016-02-12,,President Barack Obama and first lady Michelle...,,"[0.013048902153968811, -0.015582204796373844, ..."
1,Obama announces Cuba visit,https://www.cnn.com/2016/02/17/politics/obama-...,2016-02-18,,President Barack Obama and first lady Michelle...,,"[0.029543746262788773, -0.030289117246866226, ..."
2,White House sees Cuba visit as chance to conso...,https://www.washingtonpost.com/politics/white-...,2016-02-19,,In the months since President Obama announced ...,,"[0.04994184896349907, -0.014324406161904335, -..."
3,Obama's Cuba trip guarantees another Castro cr...,https://nypost.com/2016/02/18/obamas-cuba-trip...,2016-02-19,,President Obama says hes headed to Cuba next m...,,"[0.03186975419521332, 0.005861963611096144, 0...."
4,Dissidents hope for public recognition from Ob...,http://www.firstpost.com/world/dissidents-hope...,2016-02-19,,HAVANA Ostracized by the government and mistru...,,"[0.04770871251821518, -0.030910532921552658, 0..."
...,...,...,...,...,...,...,...
535,Cuba: Disobeying Protest Ban to have Serious C...,https://havanatimes.org/news/cuba-disobeying-p...,2021-10-21,,The Cuban Attorney Generals Office issues its ...,,"[0.05380697548389435, 0.014619889669120312, 0...."
536,Report: Cuba Engaged in 'Brutal Abuses' Agains...,https://www.breitbart.com/politics/2021/10/21/...,2021-10-21,,"Cuban officials are committing ""brutal abuses""...",,"[0.023437807336449623, 0.016891997307538986, 0..."
537,Cuba-U.S. tensions mount over pending protests...,https://www.reuters.com/world/americas/cuba-us...,2021-10-25,,A vintage car passes by the U.S. Embassy carry...,,"[0.052503641694784164, -0.02012248896062374, 0..."
538,Cuba: Communists Display Weapons for 'People's...,https://www.breitbart.com/latin-america/2021/1...,2021-10-26,,Cuban communists flooded social media outlets ...,,"[0.0667709931731224, 0.04038030654191971, 0.02..."


In [35]:
# Build the narrative landscape and fit it on the article embeddings and dates.
landscape_news_data = NarrativeLandscape(impose_date_constraint=True, verbose=True)

landscape_news_data.fit(news_data_embeds.numpy(), dates=data_news_articles["date"].values)

Step 1/4: Constructing Projection Space with UMAP
Step 2/4: Discovering topics with HDBSCAN
	>>> Discovered 16 Topics
Step 3/4: Computing Mean K-NN Dist
Step 4/4: Constructing Coherence Graph
	 >>> Computing base coherence
	 >>> Computing sparse coherence
		>>> Creating Undirected Graph
		>>> Finding Maximum Spanning Tree
		>>> Getting Min Weight
		----- BEFORE MST -----
		Critical Coherence: 0.45204710055058356
		Num Edges: 145530
		Is Connected: True
		----- AFTER MST -----
		Num Edges: 75388
		Is Connected: True
		----- AFTER Constraints -----
		Num Edges: 75388
		Is Connected: True
	 >>> Building NetworkX graph


In [36]:
# Bind two attributes of the fitted landscape to notebook-level names.
# NOTE(review): these are presumably references to the landscape's own arrays,
# not copies - in-place edits would then also change the landscape; verify.
low_dim_embeds = landscape_news_data.low_dim_embeds
cluster_label_probs = landscape_news_data.cluster_label_probs

In [37]:
# Look up the index labels of the source and target articles by exact title.
# `.index[0]` raises IndexError when no article carries the given title.
src = data_news_articles.loc[
    data_news_articles["title"] == "China reports first death from mysterious outbreak in Wuhan"
].index[0]

tgt = data_news_articles.loc[
    data_news_articles["title"] == "Airlines around the world are suspending flights to China as the coronavirus spreads"
].index[0]

In [38]:
# Map Length (usually values from 6 to 12 produce decent maps, but it depends on data set size and probably the underlying distribution of similarities).
k_input = 8

# % of average coverage we require. For small data sets 50-80 is good. For bigger data sets with many clusters you will likely only get 20%.
# This was tested with values up to 500. After that I'm not sure how well the model will perform.
mincover_input = 0

# Temporal distance penalty in DAYS (solve_LP defaults to 30).
# solve_LP limits candidate edges to a window of 3 * sigma_t, so HIGHER values allow more
# temporally distant connections. Consider temporal density of the data when adjusting.
# Can set it to 0 and it will be discarded from the computation.
sigma_t = 0
use_temporal = False  # Enables/disables the temporal penalty; solve_LP enables it by default, it is disabled here.

# Leave this as false, there was supposed to be a reward factor for events with common entities, but it adds too much computational time so not worth it.
use_entities = False

# If you enable strict start you will discard any storyline that does not start from the user-defined start node.
# It is recommended to disable this to allow for extra storylines that emerge from the LP solution.
strict_start = False

# Compute angular similarity, then min-max rescale it to [0, 1] using off-diagonal entries only.
similarities = np.clip(cosine_similarity(np.array(data_news_articles["embed"].tolist())), -1, 1)
sim_table = (1 - np.arccos(similarities) / pi)
mask = np.ones(sim_table.shape, dtype=bool)
np.fill_diagonal(mask, 0)
max_value = sim_table[mask].max()
min_value = sim_table[mask].min()
sim_table = (sim_table - min_value) / (max_value - min_value)
sim_table = np.clip(sim_table, 0, 1)

# Compute topic similarity
numclust = 1
clust_sim = np.zeros((cluster_label_probs.shape[0], cluster_label_probs.shape[0]))

if len(cluster_label_probs.shape) > 1:
    # Work on a private copy: the thresholding below assigns in place, and
    # `cluster_label_probs` was bound from `landscape_news_data.cluster_label_probs`,
    # so without the copy the landscape's own array could be modified as well.
    cluster_label_probs = np.array(cluster_label_probs, copy=True)
    numclust = cluster_label_probs.shape[1]
    # Zero out memberships below the uniform level 1/numclust ...
    cluster_label_probs[cluster_label_probs < 1/numclust] = 0
    # ... give rows that lost every membership a uniform distribution ...
    cluster_label_probs[np.all(cluster_label_probs == 0,
                                        axis=1)] = np.ones(numclust) / numclust
    # ... and renormalise each row to sum to one.
    row_sums = cluster_label_probs.sum(axis=1)
    cluster_label_probs = cluster_label_probs / row_sums[:, np.newaxis]

    # Pairwise Jensen-Shannon distance (base 2) between membership rows.
    clust_sim = distance.cdist(
        cluster_label_probs,
        cluster_label_probs,
        lambda u, v: distance.jensenshannon(u, v, base=2.0)
    )
else:
    cluster_label_probs = np.ones((cluster_label_probs.shape[0], 1))

clust_sim_table = 1 - clust_sim

# Compute temporal distance
temporal_distance_table = compute_temp_distance_table(data_news_articles, "./narrative_maps/temp/news_articles")

In [39]:
# Solve the LP for the chosen (src, tgt) pair; only the graph and the status are kept.
graph_df_new, status, _, _, _, _ = solve_LP(
    data_news_articles,
    dataset="news_articles",
    membership_vectors=cluster_label_probs,
    K=k_input, mincover=mincover_input/100, sigma_t=sigma_t,
    start_nodes=[src], end_nodes=[tgt],
    verbose=False,
    use_entities=use_entities, use_temporal=use_temporal, strict_start=strict_start,
)
# Seconds elapsed since solve_LP reset the module-level start_time.
end_time = time() - start_time

# Post Processing
# Note: nm_storyline is only (re)assigned when the solver status contains 'Optimal'.
if 'Optimal' not in status[1]:
    print(f"** Warning: Experiment '({src}, {tgt})' not optimal")
else:
    G = build_graph(graph_df_new)
    nm_storyline = graph_stories(G, start_nodes=[src], end_nodes=[tgt])[0]
    nm_storyline = [int(node) for node in nm_storyline]

In [40]:
# Ask the fitted landscape for its storyline from src to tgt; the second returned value is discarded.
nt_storyline, _ = landscape_news_data.extract_narrative(src, tgt)

### Narrative Maps Storyline

In [41]:
# Print the reliability and per-step base coherences of the Narrative Maps storyline,
# then display the titles of its articles (rows selected positionally).
print(Storyline(landscape_news_data, nm_storyline).reliability())
print("Coherences:", Storyline(landscape_news_data, nm_storyline).path_base_coherence())

data_news_articles.iloc[nm_storyline]["title"].values

0.7881041355276819
Coherences: [0.84412366 0.8076034  0.86883271 0.84179297 0.75563653 0.7149173
 0.70108524]


array(['China reports first death from mysterious outbreak in Wuhan',
       'Coronavirus: more cases and second death reported in China',
       "The Test a Deadly Coronavirus Outbreak Poses to China's Leadership",
       'Coronavirus: China advises against travel to Wuhan as deaths surge',
       'Chinese state media downplays coronavirus as Xi strikes positive tone',
       "China's Omnivorous Markets Are in the Eye of a Lethal Outbreak Once Again",
       "Coronavirus Crisis Exposes Cracks in China's Facade of Unity",
       'Airlines around the world are suspending flights to China as the coronavirus spreads'],
      dtype=object)

In [42]:
# Dates of the articles on the Narrative Maps storyline.
data_news_articles.iloc[nm_storyline]["date"]

85    2020-01-10
87    2020-01-17
93    2020-01-21
95    2020-01-22
97    2020-01-23
105   2020-01-25
112   2020-01-28
114   2020-01-29
Name: date, dtype: datetime64[ns]

### Narrative Trails Storyline

In [43]:
# Print the reliability and per-step base coherences of the Narrative Trails storyline,
# then display the titles of its articles (rows selected positionally).
print(Storyline(landscape_news_data, nt_storyline).reliability())
print("Coherences:", Storyline(landscape_news_data, nt_storyline).path_base_coherence())

data_news_articles.iloc[nt_storyline]["title"].values

0.8766056760673703
Coherences: [0.87832668 0.88844851 0.88948914 0.8907074  0.83725422]


array(['China reports first death from mysterious outbreak in Wuhan',
       'Japan confirms first case of coronavirus infection',
       'New China virus: Cases triple as infection spreads to Beijing and Shanghai',
       'China coronavirus: Lockdown measures rise across Hubei province',
       "Coronavirus: Foreign Office warns against 'all but essential travel' to China",
       'Airlines around the world are suspending flights to China as the coronavirus spreads'],
      dtype=object)

In [44]:
# Dates of the articles on the Narrative Trails storyline.
data_news_articles.iloc[nt_storyline]["date"]

85    2020-01-10
86    2020-01-15
91    2020-01-20
98    2020-01-23
111   2020-01-28
114   2020-01-29
Name: date, dtype: datetime64[ns]

In [45]:
# Run embedding_based_dtw on the embeddings of the two storylines and display the
# third returned value (bound to matched_path); the first two are discarded.
_, _, matched_path = embedding_based_dtw(news_data_embeds[nm_storyline], news_data_embeds[nt_storyline])
matched_path

[(0, 0), (1, 1), (2, 2), (3, 3), (4, 3), (5, 3), (6, 4), (7, 5)]