In [1]:
import os
import sys
import pandas as pd
import yaml 
from matplotlib import pyplot as plt
from matplotlib import ticker as mticker
from matplotlib import colors as mcolors
import statsmodels.api as sm
import numpy as np
from itertools import product
import subprocess
import networkx as nx

with open("../../config.yaml.local", "r") as f:
    LOCAL_CONFIG = yaml.safe_load(f)
#with open("../../config.yaml", "r") as f:
#    CONFIG = yaml.safe_load(f)
sys.path.append("../python")

import globals
import data_tools as dt

LOCAL_PATH = LOCAL_CONFIG["LOCAL_PATH"]
RAW_DATA_PATH = LOCAL_CONFIG["RAW_DATA_PATH"]
DATA_PATH = LOCAL_CONFIG["DATA_PATH"]
R_PATH = LOCAL_CONFIG["R_PATH"]

RUN_R_SCRIPTS = False
OVERWRITE = False


In [2]:
items = dt.get_items(overwrite=OVERWRITE)
posts = dt.get_posts(overwrite=OVERWRITE)
comments = dt.get_comments(overwrite=OVERWRITE)

In [3]:
selector = (posts['text'].notnull()) & \
           (posts['text'].str.len() > 0) & \
           (posts['invoiceActionState'] != 'FAILED')
posts = posts.loc[selector].reset_index(drop=True)
posts['num_internal_links'] = posts['text'].apply(
    lambda x: len(dt.extract_internal_links(x))
)
posts_w_links = posts.loc[posts['num_internal_links']>0].reset_index(drop=True)

In [4]:
# create graph

DG = nx.DiGraph()
for idx, row in posts_w_links.iterrows():
    source_id = int(row['itemId'])
    source_user = row['userId']
    source_title = row['title']
    source_sub = row['subName']
    source_link = f"https://stacker.news/items/{source_id}"
    internal_links = dt.extract_internal_links(row['text'])
    for dest_id in [int(item) for item in internal_links]:
        destrow = posts.loc[posts['itemId']==dest_id]
        if (not destrow.empty) and (dest_id != source_id):
            dest_user = destrow['userId'].values[0]
            dest_title = destrow['title'].values[0]
            dest_sub = destrow['subName'].values[0]
            dest_link = f"https://stacker.news/items/{dest_id}"
            DG.add_node(source_id, title=source_title, territory=source_sub, link=source_link)
            DG.add_node(dest_id, title=dest_title, territory=dest_sub, link=dest_link)
            DG.add_edge(source_id, dest_id)

print(f"Number of nodes: {DG.number_of_nodes()}")
print(f"Number of edges: {DG.number_of_edges()}")

Number of nodes: 14797
Number of edges: 33649


In [5]:
# Node with most incoming links
max_in_degree_node = max(DG.in_degree, key=lambda x: x[1])
print(f"Node with most incoming links: {max_in_degree_node[0]} with {max_in_degree_node[1]} links")

Node with most incoming links: 410853 with 135 links


In [6]:
# Node with most outgoing links
max_out_degree_node = max(DG.out_degree, key=lambda x: x[1])
print(f"Node with most outgoing links: {max_out_degree_node[0]} with {max_out_degree_node[1]} links")

Node with most outgoing links: 1214495 with 167 links


In [7]:
# Get the subs
subs = set(nx.get_node_attributes(DG, 'territory').values())
sub2nodes = {}
for sub in subs:
    sub2nodes[sub] = [n for n, attr in DG.nodes(data=True) if attr.get('territory') == sub]
sub2colors = {}
for sub in subs:
    sub2colors[sub] = 'gray'  # default color
sub2colors['bitcoin'] = 'orange'
sub2colors['econ'] = 'green'
sub2colors['booksandarticles'] = 'tab:brown'
sub2colors['ai'] = 'purple'
sub2colors['stacker_sports'] = 'red'
sub2colors['tech'] = 'blue'
node_colors = [sub2colors[DG.nodes[n]['territory']] for n in DG.nodes()]

In [None]:
# Get positions with territory based separation

# supergraph of territory positions
supergraph = nx.cycle_graph(len(subs))
superpos = nx.spring_layout(supergraph, scale=3)
centers = list(superpos.values())
pos = {}
for center, sub in zip(centers, subs):
    nodes = sub2nodes[sub]
    subgraph = DG.subgraph(nodes)
    pos.update(nx.spring_layout(subgraph, center=center))


In [None]:
# Get positions based on automated communities

communities = nx.community.greedy_modularity_communities(DG.to_undirected())
supergraph = nx.cycle_graph(len(communities))
superpos = nx.spring_layout(supergraph, scale=2)
centers = list(superpos.values())
pos = {}
for center, comm in zip(centers, communities):
    subgraph = DG.subgraph(comm)
    pos.update(nx.spring_layout(subgraph, center=center))


In [8]:
# Get positions based on sfdp layout

pos = nx.nx_agraph.graphviz_layout(DG, prog='sfdp')

ImportError: requires pygraphviz http://pygraphviz.github.io/

In [None]:
plt.figure(figsize=(12, 12))

for sub, nodes in sub2nodes.items():
    nx.draw_networkx_nodes(
        DG, pos, 
        nodelist=nodes,
        node_color=sub2colors[sub],
        node_size=3,
        alpha=0.7 
    )
nx.draw_networkx_edges(
    DG, pos, 
    edge_color = 'gray',
    width = 0.2,
    alpha = 0.3,
    node_size=3,
    arrowsize=1
)

plt.show()

In [None]:
# Get positions based on layout


In [None]:
plt.figure(figsize=(12, 12))
nx.draw_networkx_nodes(
    DG, pos, 
    node_color = node_colors,
    node_size = 3,
    alpha = 0.7
)
nx.draw_networkx_edges(
    DG, pos,
    edge_color = 'gray',
    width = 0.2,
    alpha = 0.3,
    arrowsize = 1,
    node_size = 3
)
plt.show()

In [None]:
nodes = sub2nodes['econ']
subgraph = DG.subgraph(nodes)

communities = nx.community.greedy_modularity_communities(subgraph)

supergraph = nx.cycle_graph(len(communities))
superpos = nx.spring_layout(supergraph, scale=2)

centers = list(superpos.values())
pos = {}
for center, comm in zip(centers, communities):
    pos.update(nx.spring_layout(nx.subgraph(subgraph, comm), center=center))

colors = plt.cm.get_cmap('tab20', len(communities))

plt.figure(figsize=(12, 12))

for i, nodes in enumerate(communities):
    nx.draw_networkx_nodes(
        subgraph, pos,
        nodelist=nodes,
        node_size=3,
        node_color=colors(i),
        alpha=0.7
    )
nx.draw_networkx_edges(
    subgraph, pos,
    width=0.2,
    alpha=0.3,
    arrowsize=1,
    node_size=3
)

plt.show()

In [None]:
communities[0]