In [None]:
# required libraries
import os

import pandas as pd
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
import plotly.graph_objects as go

In [None]:
from IPython.display import HTML, display

# Inject a CSS rule that forces the page's ".container" element to use the full width.
full_width_css = "<style>.container { width:100% !important; }</style>"
display(HTML(full_width_css))

In [None]:
# Folder that holds the CSV files read by the cells below.
# The original machine-specific location remains the default, but it can be
# overridden without editing the notebook by setting the BRANDWATCH_DATA_FOLDER
# environment variable before the kernel starts (an empty value falls back to
# the default). Kept as a plain string because later cells build file paths
# with `data_folder + '/...'`.
data_folder = os.environ.get('BRANDWATCH_DATA_FOLDER') or 'C:/STUFF/RESEARCH/Brandwatch/OUTPUT/v2'

In [None]:
# Read the node table from gephi_nodes.csv and show it as the cell output.
nodes_csv_path = data_folder + '/gephi_nodes.csv'
nodes_df = pd.read_csv(nodes_csv_path)
nodes_df

In [None]:
# Read the edge table from gephi_actor_te_edges.csv and show it as the cell output.
edges_csv_path = data_folder + '/gephi_actor_te_edges.csv'
edges_df = pd.read_csv(edges_csv_path)
edges_df

In [None]:
# Simplify the graph: keep only the edges whose total_te is strictly above the threshold.
te_threshold = 0.1
above_threshold = edges_df['total_te'] > te_threshold
edges_df = edges_df[above_threshold]
edges_df

In [None]:
# Collect every node that still appears in the simplified graph,
# either as the source or as the target of an edge.
src_nodes = set(edges_df['Source'].unique())
tgt_nodes = set(edges_df['Target'].unique())
all_nodes = src_nodes | tgt_nodes
print(f'num_nodes: {len(all_nodes)}')

In [None]:
# Relations between the class codes and the "<source>_<target>" column names.
# Colour used for each two-letter class code.
cmap = {
    'UF': 'tomato',
    'UM': 'lawngreen',
    'TF': 'cornflowerblue',
    'TM': 'fuchsia',
}
classes = ['UF', 'UM', 'TF', 'TM']

# For every source class, the column names pairing it with each target class.
src_cols = {
    src_cls: [f"{src_cls}_{tgt_cls}" for tgt_cls in classes]
    for src_cls in classes
}
# Flat list of all column names, grouped by source class in `classes` order.
all_cols = [col for src_cls in classes for col in src_cols[src_cls]]

print('all cols',all_cols)
print('src cols',src_cols)

In [None]:
# Root nodes: nodes that never appear as the target of an edge (in-degree 0).
roots = all_nodes - tgt_nodes
roots

In [None]:
# Build a directed graph from the filtered edge list; each edge carries its total_te value.
G = nx.from_pandas_edgelist(
    edges_df,
    source='Source',
    target='Target',
    edge_attr=['total_te'],
    create_using=nx.DiGraph(),
)
print(G)

In [None]:
# Render the filtered directed graph G with nx.draw, passing no layout, label or styling options.
nx.draw(G)

In [None]:
# Walk the graph level by level, starting from the roots, and record for each
# level how the TE on its outgoing edges is distributed over the
# "<source>_<target>" columns listed in all_cols.
this_level_nodes = roots
visited = set()
te_values = []
this_level = 0
# Stop when every node has been visited OR when the frontier is empty. Without
# the frontier check the loop never terminates if some nodes cannot be reached
# from the roots (e.g. a cycle with no root), because `visited` stops growing.
while this_level_nodes and len(visited) < len(all_nodes):
    # mark this level as visited
    print(f" nodes in this level : {this_level_nodes}")
    visited.update(this_level_nodes)
    this_level += 1
    # process this level
    edges_from_this_level = edges_df[edges_df['Source'].isin(this_level_nodes)]
    sum_te_this_level = edges_from_this_level[all_cols].sum()
    level_total = sum_te_this_level.sum()
    if level_total != 0:
        # normalize so that the values of this level sum to 1
        sum_te_this_level = sum_te_this_level / level_total
    # otherwise the level has no outgoing TE (typically the final level of leaf
    # nodes): keep the zeros instead of storing the NaNs produced by 0 / 0
    te_values.append(sum_te_this_level)
    # calc nodes on next level
    this_level_nodes = set(edges_from_this_level['Target'].to_list()).difference(visited)

# Report nodes the traversal could not reach instead of silently ignoring them.
unreached_nodes = set(all_nodes).difference(visited)
if unreached_nodes:
    print(f" nodes not reachable from the roots : {unreached_nodes}")

total_levels = this_level
if te_values:
    # one column per level (0 .. total_levels - 1), one row per entry of all_cols
    te_levels_df = pd.concat(te_values, axis=1)
else:
    # no level was processed; pd.concat would raise on an empty list
    te_levels_df = pd.DataFrame(index=all_cols)
te_levels_df

In [None]:
# Build the node and link lists consumed by the Sankey figure.
# Nodes are labelled "<class>_<level>"; every "<source>_<target>" column of a
# level becomes one link from "<source>_<level>" to "<target>_<level + 1>".
all_labels = []   # node labels, positioned by node index
label_color = []  # node colours, parallel to all_labels
label_index = {}  # label -> node index
next_index = 0

src = []  # link source node indices
tgt = []  # link target node indices
val = []  # link values taken from te_levels_df

def get_label_index(this_label):
    """Return the node index of `this_label`, registering the label on first use.

    A label seen for the first time is appended to `all_labels`, its colour is
    looked up in `cmap` using the first two characters of the label (the class
    code) and appended to `label_color`, and it receives the next free index.

    Raises KeyError if the first two characters of the label are not a key of `cmap`.
    """
    global next_index  # rebound below; the containers are only mutated in place
    if this_label not in label_index:
        all_labels.append(this_label)
        label_color.append(cmap[this_label[:2]])
        label_index[this_label] = next_index
        next_index += 1
    return label_index[this_label]

for this_level in range(total_levels):
    # Iterate over all "<source>_<target>" column names. The original looped over
    # `cols`, a name that is never defined, so the cell failed on a fresh kernel.
    for this_src_class in all_cols:
        this_src_label = f"{this_src_class[:2]}_{this_level}"
        this_tgt_label = f"{this_src_class[3:]}_{this_level + 1}"
        this_te_value = te_levels_df.loc[this_src_class, this_level]
        print(this_src_label, this_tgt_label, this_te_value, cmap[this_src_label[:2]])
        this_src_label_index = get_label_index(this_src_label)
        this_tgt_label_index = get_label_index(this_tgt_label)
        src.append(this_src_label_index)
        tgt.append(this_tgt_label_index)
        val.append(this_te_value)
        

In [None]:
# Node settings: labels and colours come from the lists built in the previous cell.
sankey_nodes = dict(
    pad=15,
    thickness=20,
    line=dict(color="black", width=0.5),
    label=all_labels,
    color=label_color,
)
# Link settings: every link is drawn in the colour of its source node.
sankey_links = dict(
    source=src,
    target=tgt,
    value=val,
    color=[label_color[node_idx] for node_idx in src],
)

fig = go.Figure(data=[go.Sankey(node=sankey_nodes, link=sankey_links)])
fig.update_layout(title_text="Influence Cascade", font_size=10)
fig.show()