# Network visualization

Last updated: 09202023  
By: Lauren Liao  
Purpose: visualize and analyze from word similarity matrix 

In [1]:
import networkx as nx
import pandas as pd
import matplotlib.pyplot as plt
from collections import Counter
import numpy as np
import seaborn as sns
import graph_tool.all as gt

%matplotlib inline

**Data loading**  
the data starts with the preprocessed text (pickled) file  
taking the upper right triangle plus a threshold of 50th quartile

In [3]:
mean_similarity_df = pd.read_pickle("../data/similarity_df_mean.pkl")

In [4]:
words = mean_similarity_df.columns

In [5]:
upper_tri_sim = mean_similarity_df.values[np.triu_indices(mean_similarity_df.shape[0], k=1)]
print("The 50th quartile is " + str(round(np.percentile(upper_tri_sim, q=50), 3)))

The 50th quartile is 0.867


In [6]:
upper_triangle = np.triu(mean_similarity_df, k=1)
upper_triangle[upper_triangle < np.percentile(upper_tri_sim, q=50)] = 0

In [7]:
upper_triangle = np.where(upper_triangle > 0, 1 - upper_triangle, upper_triangle)

## Graph creation and analysis

In [8]:
G = nx.Graph(upper_triangle)
node_labels = {i:word for i, word in enumerate(words)}
G = nx.relabel_nodes(G, node_labels)
num_nodes = len(G.nodes())
num_edges = len(G.edges())
print(num_nodes)
print(num_edges)

9242
21351331


In [62]:
# mean_similarity_df['health'].sort_values(ascending=False)

In [9]:
def print_distance(source_node, target_node):
    try:
        shortest_path = nx.shortest_path(G, source=source_node, target=target_node, weight="weight", method="dijkstra")
        weighted_distance = nx.shortest_path_length(G, source=source_node, target=target_node, weight="weight", method="dijkstra")
        print(f"shortest path from {source_node} to {target_node}: {shortest_path}")
        print(f"weighted distance: {round(weighted_distance, 3)}")
    except nx.NetworkXNoPath:
        print(f"there is no path from {source_node} to {target_node}.")

In [55]:
print_distance("health", "healthcare")

there is no path from health to healthcare.


In [None]:
print_distance("health", "racism")

In [11]:
print_distance("poverty", "racism")
print_distance("environment", "racism")
print_distance("environmental", "racism")
print_distance("disparity", "racism")

shortest path from poverty to racism: ['poverty', 'consequence', 'interpersonal', 'discrimination', 'racism']
weighted distance: 0.391
shortest path from environment to racism: ['environment', 'influence', 'interpersonal', 'discrimination', 'racism']
weighted distance: 0.378
shortest path from environmental to racism: ['environmental', 'structural', 'interpersonal', 'discrimination', 'racism']
weighted distance: 0.383
shortest path from disparity to racism: ['disparity', 'impact', 'disadvantage', 'interpersonal', 'discrimination', 'racism']
weighted distance: 0.469


In [12]:
print_distance("deprivation", "racism")
print_distance("economic", "racism")
print_distance("psychosocial", "racism")
print_distance("stress", "racism")

shortest path from deprivation to racism: ['deprivation', 'migratory', 'geography', 'inferiority', 'overt', 'prejudice', 'racism']
weighted distance: 0.342
shortest path from economic to racism: ['economic', 'environmental', 'structural', 'interpersonal', 'discrimination', 'racism']
weighted distance: 0.438
shortest path from psychosocial to racism: ['psychosocial', 'interpersonal', 'discrimination', 'racism']
weighted distance: 0.294
shortest path from stress to racism: ['stress', 'psychological', 'stigma', 'interpersonal', 'discrimination', 'racism']
weighted distance: 0.349


In [65]:
print_distance("structural", "interpersonal")
print_distance("structural", "form")
print_distance("form", "discrimination")

shortest path from structural to interpersonal: ['structural', 'interpersonal']
weighted distance: 0.132
shortest path from structural to form: ['structural', 'form']
weighted distance: 0.075
shortest path from form to discrimination: ['form', 'interpersonal', 'discrimination']
weighted distance: 0.238


In [13]:
print_distance("inequality", "discrimination")
print_distance("harassment", "discrimination")
print_distance("harassment", "racism")

shortest path from inequality to discrimination: ['inequality', 'disadvantage', 'interpersonal', 'discrimination']
weighted distance: 0.342
shortest path from harassment to discrimination: ['harassment', 'prejudice', 'discrimination']
weighted distance: 0.176
shortest path from harassment to racism: ['harassment', 'prejudice', 'racism']
weighted distance: 0.216


### Graph visualization with graph tool

this still needs to be edited

In [14]:
full_text = pd.read_pickle('../data/full_data_w_processedtext.pkl')
processed_text = full_text['processed_text'] 
sentences = [x for x in processed_text]
all_text_tokens = [item for sublist in sentences for item in sublist]
token_counts = Counter(all_text_tokens)

In [15]:
token_counts

Counter({'covid': 915,
         'failure': 264,
         'control': 492,
         'pandemic': 356,
         'inequality': 785,
         'made': 668,
         'england': 234,
         'worst': 75,
         'affected': 188,
         'europe': 249,
         'say': 715,
         'report': 1115,
         'griffin': 1,
         'coronavirus': 71,
         'coupled': 24,
         'widening': 33,
         'past_decade': 55,
         'led': 391,
         'highest': 205,
         'rate': 1106,
         'excess_death': 25,
         'new': 1171,
         'concluded': 76,
         'rather': 540,
         'focusing': 67,
         'narrow': 50,
         'economic': 685,
         'goal': 483,
         'health': 10937,
         'wellbeing': 346,
         'heart': 130,
         'government': 1059,
         'strategy': 620,
         'finding': 529,
         'recommendation': 250,
         'michael': 45,
         'marmot': 24,
         'director': 179,
         'university': 649,
         'college_london'

In [21]:
top_tokens = token_counts.most_common(100)

In [43]:
top_tokens_position = [key for key, value in node_labels.items() if value in dict(top_tokens).keys()]
mtx_gt = mean_similarity_df.iloc[top_tokens_position, top_tokens_position]

In [None]:
idx = mtx_gt.values.nonzero()
g = gt.Graph(directed=False)
g.add_edge_list(np.transpose(mtx_gt.values))

In [48]:
state = gt.minimize_nested_blockmodel_dl(g)
state.draw(output="../results/network_graph.pdf")

(<VertexPropertyMap object with value type 'vector<double>', for Graph 0x7fc6bc48c310, at 0x7fc6c51005d0>,
 <GraphView object, directed, with 3 vertices and 2 edges, edges filtered by (<EdgePropertyMap object with value type 'bool', for Graph 0x7fc6bcf78350, at 0x7fc6c45b56d0>, False), vertices filtered by (<VertexPropertyMap object with value type 'bool', for Graph 0x7fc6bcf78350, at 0x7fc6c26fc1d0>, False), at 0x7fc6bcf78350>,
 <VertexPropertyMap object with value type 'vector<double>', for Graph 0x7fc6bcf78350, at 0x7fc6bce3e1d0>)