# Visualizing topic words and connections using pyvis

Visualizing a network can make understanding the relationships much easier. In this notebook, we connect the top words per cluster for scikit-learn's 20newsgroups dataset.<br>

1. Load and prepare the data

In [6]:
import pandas as pd
from pyvis.network import Network

# Number of topics kept in the graph; a smaller graph loads much faster.
MAX_TOPICS = 30

# Load the top words per topic. The code below reads the columns
# 'topic', 'word' and 'probability'.
df = pd.read_csv('20newsgroups_top_words.csv')
df = df[df['probability'] > 0]  # exclude words with no probability

# Cast the topic id once (after the probability filter, so only surviving
# rows are converted) and reuse it for both bounds of the range check.
topic_ids = df['topic'].astype(int)

# .copy() detaches the filtered rows from the frame they were sliced from, so
# the column assignments below write to an independent frame instead of
# triggering pandas' SettingWithCopyWarning.
df = df[(topic_ids >= 0) & (topic_ids < MAX_TOPICS)].copy()

# Node ids in the graph are strings, so normalise both id columns.
df['topic'] = df['topic'].astype(str)
df['word'] = df['word'].astype(str)

2. Generate the graph, using the data to determine things like node size. Keep track of the nodes that have already been drawn, so that a word shared by several topics is drawn once and bridges those topics.

In [7]:
graph_name = '20 news groups topic words'

# Create the network in notebook mode and set its canvas size and heading.
net = Network(notebook=True)
net.width = '800px'
net.height = '800px'
net.heading = graph_name

color = '#950001'  # single colour used for every node and edge

# How many rows each word appears in, counted once up front instead of
# re-filtering the whole frame for every row of the loop.
word_counts = df['word'].value_counts().to_dict()

nodes = set()  # ids of nodes already drawn; a set keeps the membership test cheap

# Iterate through the records, drawing each node and connection as you go
for source, target, probability in zip(df['topic'], df['word'], df['probability']):
    if source not in nodes:
        nodes.add(source)
        size = 10  # topic nodes all share a fixed size
        net.add_node(source, source, title=source, size=size, color=color)

    if target not in nodes:
        nodes.add(target)
        # A word is sized by its probability in the first row where it appears.
        # float() keeps the value a plain Python number for serialization.
        size = float(probability) * 100
        net.add_node(target, target, title=target, size=size, color=color)

    # Words that occur in more than one row get a heavier edge (count * 5).
    # int() keeps the value a plain Python number for serialization.
    weight = int(word_counts[target])
    if weight > 1:
        weight = weight * 5
    net.add_edge(source, target, weight=weight, color=color)

# Save, then show the graph
net.save_graph(graph_name + '.html')
net.show(graph_name + '.html')