# Narreme Visualization

This second part of the code focuses on visualizing and exploring the documents and topics extracted in the previous notebook. Doing so helps us understand how the documents are linked to one another, so that we can create narrative connections between them.

This notebook is part of a master's thesis project in Digital Interaction Design at Politecnico di Milano, by Federico Denni.

In [None]:
# Install the visualization / NLP packages used below and download the spaCy
# "xx_sent_ud_sm" pipeline, then wipe the installer output from the cell.
# NOTE(review): a later cell loads "en_core_web_sm", which is not downloaded
# here - presumably it is preinstalled in the runtime; confirm.
from IPython.display import clear_output
!pip install pyvis networkx seaborn spacy
!python -m spacy download xx_sent_ud_sm
clear_output()

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import spacy #nlp
from spacy import displacy #spacy visualizer library
import pyvis #interactive visualization
import networkx as nx
from pyvis.network import Network
import matplotlib.pyplot as plt #plots
import seaborn as sns #make these plots nice

# Use the ggplot look for every matplotlib figure
plt.style.use('ggplot')

import os

# Print the full path of every file found under the Kaggle input directory
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

In [None]:
# Change this path to analyse another dataset
dataset_path = r'/kaggle/input/probes-cultural/probes_analysis.csv'

try:
    df = pd.read_csv(dataset_path, engine='python', encoding="utf-8")
except FileNotFoundError:
    # Only a message is printed: `df` stays undefined when the file is missing
    print("Error: file not found. Please upload the file or provide the correct path.")
# TODO: eliminate compound scores (noted by the author, not implemented here)

In [None]:
# Preview the first ten rows of the loaded dataset
df.head(10)

---

Now that the dataset is loaded, let's proceed to analyze and clean its different elements.

In [None]:
# Let's split the docs_n columns into sentences.
# Load spaCy's "xx_sent_ud_sm" pipeline; the helper functions defined below run
# their text through this `nlp` object.
# Author's note: translate all sentences into English before using this.
nlp = spacy.load("xx_sent_ud_sm")

# Break a text into its tokens with the loaded spaCy pipeline
def tokenize(text):
    """Return the token strings of ``text``.

    Anything that is not a ``str`` (for example a missing value) yields an
    empty list instead of being passed to spaCy.
    """
    if not isinstance(text, str):
        return []
    return [tok.text for tok in nlp(text)]

# Break a text into sentences with the loaded spaCy pipeline
def split_sentences(text):
    """Return the sentences of ``text`` as a list of strings.

    Anything that is not a ``str`` (for example a missing value) yields an
    empty list instead of being passed to spaCy.
    """
    if not isinstance(text, str):
        return []
    parsed = nlp(text)
    return [sentence.text for sentence in parsed.sents]

# Add one "<column>_sentences" column per document column, holding the list of
# sentences produced by split_sentences
sentence_columns = (
    ('docs_1', 'docs_1_sentences'),
    ('docs_2', 'docs_2_sentences'),
    ('docs_3', 'docs_3_sentences'),
)
for text_column, sentences_column in sentence_columns:
    df[sentences_column] = df[text_column].apply(split_sentences)

# Persist the enriched table so the next cell can reload it
df.to_csv(r'/kaggle/working/sentences.csv', index=False)

df.head(10)

In [None]:
# Let's now identify and extract the entity types in the sentences.
# `nlp` is rebound to the "en_core_web_sm" pipeline; get_entities_and_keywords
# below reads `doc.ents` from it.
# Author's note: translate all sentences into English before using this.
# NOTE(review): the setup cell only downloads "xx_sent_ud_sm" - presumably
# "en_core_web_sm" is preinstalled in the runtime; confirm.
nlp = spacy.load("en_core_web_sm")

# Reload the table saved by the previous cell; the *_sentences cells come back
# as text, which is why the code below parses them again.
df = pd.read_csv(r'/kaggle/working/sentences.csv', encoding="utf-8")
# Names of the sentence columns (not referenced by the code visible in this notebook)
columns_to_extract = ['docs_1_sentences', 'docs_2_sentences', 'docs_3_sentences']

def get_entities_and_keywords(sentences, keywords):
    """Collect, per sentence, its entity labels plus the keywords it contains.

    For every sentence the spaCy entity labels (``ent.label_``) are listed
    first, followed by each keyword that occurs in the sentence as a
    case-sensitive substring. One list is returned per input sentence.
    """
    per_sentence = []
    for sentence in sentences:
        found = [entity.label_ for entity in nlp(sentence).ents]  # if MISC, add word
        found.extend(keyword for keyword in keywords if keyword in sentence)
        per_sentence.append(found)
    return per_sentence

# Apply the function to each sentence column.
# The sentence lists and the Keywords cells are stored as text, so they must be
# parsed back into Python lists. ast.literal_eval only accepts Python literals,
# whereas eval would execute any expression found in the CSV.
import ast  # imported here because the notebook's own `import ast` sits in a later cell

entity_list_columns = (
    ('docs_1_sentences', 'doc_1_entity_list'),
    ('docs_2_sentences', 'doc_2_entity_list'),
    ('docs_3_sentences', 'doc_3_entity_list'),
)
for sentences_column, entity_column in entity_list_columns:
    # `column=sentences_column` freezes the current column name inside the lambda
    df[entity_column] = df.apply(
        lambda row, column=sentences_column: get_entities_and_keywords(
            ast.literal_eval(row[column]),
            ast.literal_eval(row['Keywords']),
        ),
        axis=1,
    )

df.head(10)

In [None]:
# Flatten the per-sentence lists into a single list without repetitions,
# filtering out the CARDINAL and PERCENT labels.
def extract_unique_words(list_of_lists):
    """Return the distinct words of ``list_of_lists`` in first-seen order.

    ``list_of_lists`` is an iterable of word lists. The labels 'CARDINAL' and
    'PERCENT' are dropped because they are not of interest here. The order of
    first appearance is kept so that the result is the same on every run;
    iterating a ``set`` of strings gives an order that can change between
    interpreter sessions.
    """
    excluded = {'CARDINAL', 'PERCENT'}  # we are not interested in those
    ordered = {}  # dict keys keep insertion order and reject repetitions
    for sublist in list_of_lists:
        for word in sublist:
            if word not in excluded:
                ordered.setdefault(word, None)
    return list(ordered)

# Apply the function to the specified columns: each cell (a list of lists)
# is replaced by the flat list returned by extract_unique_words
for column in ['doc_1_entity_list', 'doc_2_entity_list', 'doc_3_entity_list']:
    df[column] = df[column].apply(extract_unique_words)


# Save the table so the next cell can reload it, then preview it
df.to_csv(r'/kaggle/working/ent_list.csv', index=False)
df.head(10)

In [None]:
import ast  # To safely evaluate string representations of lists
# Reload the table written by the previous cell
df = pd.read_csv(r'/kaggle/working/ent_list.csv', encoding="utf-8")

# Convert string representations of lists to actual Python lists
# (ast.literal_eval only accepts literals, so no code from the file is executed)
for col in ['doc_1_entity_list', 'doc_2_entity_list', 'doc_3_entity_list']:
    df[col] = df[col].apply(ast.literal_eval)

# Turn the per-document entity lists into one long table
def extract_entities(df):
    """Return one record per (word, document) found in the entity-list columns.

    For every row of ``df`` and every ``doc_<n>_entity_list`` column, each word
    of the list becomes a record with:
      - 'word':  the entity label or keyword,
      - 'topic': the row's 'Topic' value,
      - 'doc':   the identifier '<n>_topic_<Topic>',
      - 'text':  the row's 'docs_<n>' value.
    Cells that are not lists are skipped after printing a warning.
    """
    entity_columns = ('doc_1_entity_list', 'doc_2_entity_list', 'doc_3_entity_list')
    records = []
    for _, record in df.iterrows():
        topic = record['Topic']
        for column in entity_columns:
            number = column.split('_')[1]  # '1', '2' or '3'
            words = record[column]
            if not isinstance(words, list):
                # Check this row, something happened
                print(f"Warning: {column} is not a list for row {record.name}")
                continue
            records.extend(
                {
                    'word': word,
                    'topic': topic,
                    'doc': f'{number}_topic_{topic}',
                    'text': record[f'docs_{number}'],
                }
                for word in words
            )
    return pd.DataFrame(records)

# Create a DataFrame with all entities, their topics, and document sources
entities_df = extract_entities(df)

# Self-join on 'word': every pair of records sharing a word becomes one row,
# with the two sides distinguished by the '_source' / '_target' suffixes
matches_df = entities_df.merge(
    entities_df,
    on='word',
    suffixes=('_source', '_target')
)

# Filter out self-matches (where source and target are the same document).
# .copy() makes the filtered frame independent before a column is added to it.
matches_df = matches_df[matches_df['doc_source'] != matches_df['doc_target']].copy()

# Give (A, B) and (B, A) the same key by sorting the two document ids.
# Built from zip rather than a row-wise DataFrame.apply: with zero remaining
# rows apply(axis=1) hands back a DataFrame instead of a Series, which cannot
# be stored in a single column, and the row-wise call is also much slower.
matches_df['match'] = [
    tuple(sorted(pair))
    for pair in zip(matches_df['doc_source'], matches_df['doc_target'])
]

# Keep only the first row of every document pair, then drop the helper column
uniques = matches_df.drop_duplicates(subset='match').drop(columns='match')

# Save the results to a CSV file (later cells reload it)
uniques.to_csv(r'/kaggle/working/matches.csv', index=False)
uniques.head(100)

In [None]:
# Build the nodes file from the saved matches
matches_df = pd.read_csv(r'/kaggle/working/matches.csv')

# Column renames that give both sides of a match the same node layout
source_renames = {'doc_source': 'Id', 'topic_source': 'topic', 'text_source': 'text'}
target_renames = {'doc_target': 'Id', 'topic_target': 'topic', 'text_target': 'text'}

source_nodes = matches_df[list(source_renames)].rename(columns=source_renames)
target_nodes = matches_df[list(target_renames)].rename(columns=target_renames)

# Stack both sides and keep the first record of every document id
all_nodes = pd.concat([source_nodes, target_nodes])
nodes_df = all_nodes.drop_duplicates(subset=['Id'])


def _document_number(node_id):
    """Return the number in front of the first '_' of a node id."""
    return int(node_id.split('_')[0])


def _topic_number(node_id):
    """Return the number that follows '_topic_' in a node id."""
    return int(node_id.split('_topic_')[1])


# Extra attributes derived from the id
nodes_df['document_number'] = nodes_df['Id'].map(_document_number)
nodes_df['topic_number'] = nodes_df['Id'].map(_topic_number)

# Save the nodes file
nodes_df.to_csv('nodes.csv', index=False)
nodes_df.head(10)


In [None]:
# Build the edges file: load the matches saved earlier
matches_df = pd.read_csv(r'/kaggle/working/matches.csv')

# Create the edges DataFrame: one row per match, columns renamed to
# Source / Target / Shared_Word
edges_df = matches_df[['doc_source', 'doc_target', 'word']]
edges_df = edges_df.rename(columns={'doc_source': 'Source', 'doc_target': 'Target', 'word': 'Shared_Word'})

# Calculate edge weights as the number of rows per (Source, Target) pair.
# NOTE(review): matches.csv was written after de-duplicating on the document
# pair, so every pair occurs once and Weight comes out as 1 - confirm whether
# the number of shared words was intended instead.
edge_weights = edges_df.groupby(['Source', 'Target']).size().reset_index(name='Weight')

# Merge edge weights back into the edges DataFrame
edges_df = pd.merge(edges_df[['Source', 'Target', 'Shared_Word']], edge_weights, on=['Source', 'Target'], how='left')

# Remove duplicate edges, keeping the weight (the first Shared_Word of each pair is kept)
edges_df = edges_df.drop_duplicates(subset=['Source', 'Target'])

# Save the edges file
edges_df.to_csv('edges.csv', index=False)

edges_df.head(10)

In [None]:
import json

# NOTE(review): this reload rebinds `df`, but nothing below reads it - the
# graph is built from `uniques`, which an earlier cell must have left in memory.
df = pd.read_csv(r'/kaggle/working/matches.csv', encoding="utf-8")

# Empty networkx graph; it is only handed to net.from_nx() further down
G = nx.Graph()

# Define a color mapping for topics (6 topics)
topic_colors = {
    0: 'red',
    1: 'blue',
    2: 'green',
    3: 'orange',
    4: 'purple',
    5: 'cyan'
}
# Color used for any topic id missing from topic_colors, so that an
# unexpected id no longer aborts the cell with a KeyError
default_topic_color = 'gray'

# Create a Pyvis network visualization
net = Network(
    notebook=True,
    cdn_resources="remote",
    bgcolor="#222222",
    font_color="white",
    height="750px",
    width="100%",
    select_menu=True,
    filter_menu=True
)

# Number of matches each document takes part in, used below to size the nodes
node_connections = {}

# One pass over the matches: add both documents as nodes (colored by topic,
# with their text shown on hover), count the connection and add the edge
for _, row in uniques.iterrows():
    for side in ('source', 'target'):
        doc_id = row[f'doc_{side}']
        # The topic id is the number that follows '_topic_' in the document name
        topic_id = int(doc_id.split('_topic_')[1])
        net.add_node(
            doc_id,
            label=doc_id,
            color=topic_colors.get(topic_id, default_topic_color),
            title=str(row[f'text_{side}'])  # Document text used for the hover frame
        )
        node_connections[doc_id] = node_connections.get(doc_id, 0) + 1

    net.add_edge(row['doc_source'], row['doc_target'])

# Adjust node sizes based on their number of connections (degree)
for node_id, degree in node_connections.items():
    net.get_node(node_id)['size'] = degree * 0.8  # Scale size; adjust multiplier as needed

# G holds no nodes or edges at this point, so this call adds nothing to `net`
net.from_nx(G)

# Display options: smaller arrows, text-only nodes, straight edges.
# The hierarchical layout settings are present but disabled.
options = {
    "layout": {
        "hierarchical": {
            "enabled": False,
            "levelSeparation": 150,
            "nodeSpacing": 200,
            "treeSpacing": 200,
            "direction": "UD",
            "sortMethod": "hubsize"
        }
    },
    "nodes": {
        "shape": "text",
        "font": {
            "size": 12,
            "color": "#ffffff"
        }
    },
    "edges": {
        "arrows": {
            "to": {
                "enabled": True,
                "scaleFactor": 0.5,  # Smaller arrows
                "type": "arrow"
            }
        },
        "smooth": False
    }
}

net.set_options(json.dumps(options))

# Save the network visualization to an HTML file
net.save_graph('network_visualization.html')
# Save and show the network visualization
net.show('network_visualization.html')