In [3]:
import pandas as pd
import networkx as nx
import pickle

### Dataset loading

In [2]:
# load dataset into pandas dataframe
df = pd.read_csv("../Dataset/comments_2022_step3.csv", header=0)
df['topics'] = df['topics'].apply(lambda x: x.replace('[','').replace(']','').replace("'",'').split(', '))
df = df.loc[df['subreddit']!='[deleted]']
df = df.loc[df['author']!='[deleted]']
df.head()

Unnamed: 0,id,parent_id,author,subreddit,body,topics,sub-types
0,hqqvfvb,t1_hqqt3r3,BasicComplexities,VaushV,"\n&gt;. You use French people flippantly, but ...",[racist],"['L', 'S', 'M', 'A']"
1,hqqxbu8,t1_hqqhgr2,RagingAardvark,daddit,Thank you! I was actually recently mulling ove...,"[culture appropriation, white supremacy, culture]","['L', 'S', 'E', 'M']"
2,hqr027z,t1_hqqz64g,AdventurousAnxiety78,Afghan,"No, the coins are one of many diverse versions...",[traditional dress],"['M', 'A']"
3,hqr0j0w,t3_rt7dak,stjeana,TooAfraidToAsk,Cultural appropriation is vs and doesnt have a...,"[culture appropriation, cool, respect, culture]","['I', 'S']"
4,hqr0ska,t3_rt0uqs,kbell2020,Dreadlocks,"Hi,\n\nWhite mom here, straight hair. My son i...",[culture appropriation],['M']


### Empty multilayer

In [4]:
# create multilayer network
MLN = nx.MultiGraph()
pickle.dump(MLN, open('MLN.pickle', 'wb'))

### User layer

In [5]:
for user in df['author'].unique():
    MLN.add_node(user, layer='user')

for row in df.groupby('subreddit')['author'].apply(list).to_frame().iterrows():
    sub = row[0]
    auts = list(row[1])[0]
    if len(auts) > 1:
        edges_u = [(a,b,sub) for idx, a in enumerate(auts) for b in auts[idx+1:]]
        for edge in edges_u:
            MLN.add_edge(edge[0], edge[1], layer='user', label=edge[2])

### Content layer

In [6]:
# nodes
for comment in df.iterrows():
    MLN.add_node(comment[1]['id'], layer='content')

In [6]:
edges_c = []
for i in range(0, len(df)):
    current_id = df.iloc[i]['id']
    current_topics = df.iloc[i]['topics']
    for j in range(i+1, len(df)):
        next_id = df.iloc[j]['id']
        next_topics = df.iloc[j]['topics']
        if len(set(current_topics).intersection(set(next_topics))) > 0:
            edges_c.append((current_id, next_id, list(set(current_topics).intersection(set(next_topics)))))
    if i % 1000 == 0:
        with open(f'tmp/edges_c_{i}.pickle', 'wb') as f:
            pickle.dump(edges_c, f)
        del edges_c[:]
        del edges_c
        edges_c = []

In [7]:
from os import listdir
from os.path import isfile, join
pickles = [f for f in listdir('tmp') if isfile(join('tmp', f))]
for file in pickles:
    with open(f'tmp/{file}', 'rb') as f:
        current = list(pickle.load(f))
    for edge in current:
        MLN.add_edge(edge[0], edge[1], layer='content', label=edge[2])
    del current[:]
    del current

### Social phenomenon layer

In [9]:
spl_nodes = ['A', 'M', 'L', 'S', 'I', 'E']
for node in spl_nodes:
    MLN.add_node(node, layer='socialphenomenon')
spl_edges = [('A', 'M', 'similarity'), ('A', 'S', 'association'), ('A', 'I', 'causality'), ('A', 'E', 'causality'),
            ('M', 'S', 'causality'), ('M', 'I', 'association'), ('M', 'E', 'causality'), ('L', 'S', 'association')]
for edge in spl_edges:
    MLN.add_edge(edge[0], edge[1], layer='socialphenomenon', label=edge[2])

### Multi-layer edges

In [96]:
# Edges between U and C
posting_edges = list(df[['author', 'id']].itertuples(index=False, name=None))
df_interactions = df[['author', 'id', 'parent_id']].copy()
df_interactions['parent_id'] = df_interactions['parent_id'].apply(lambda x: x.split('_')[1])
df_interactions = df_interactions.loc[df_interactions['parent_id'].isin(list(df_interactions['id']))]
interactions_edges = list(df_interactions[['author', 'parent_id']].itertuples(index=False, name=None))
u_c_edges = []
u_c_edges.extend(posting_edges)
u_c_edges.extend(interactions_edges)
for edge in u_c_edges:
    MLN.add_edge(edge[0], edge[1], layer='multi_u_c')

In [25]:
# Edges between C and S
c_s_edges = []
for row in df[['id', 'sub-types']].itertuples():
    _id = row[1]
    _sts = row[2]
    for st in _sts:
        c_s_edges.append((_id, st))
for edge in c_s_edges:
    MLN.add_edge(edge[0], edge[1], layer='multi_c_s')

# Clean

In [72]:
# Remove nodes and edges with no "layer"
nodes_to_remove = [node for node in MLN.nodes() if 'layer' not in MLN.nodes[node]]
edges_to_remove = [(u, v) for u, v, data in MLN.edges(data=True) if 'layer' not in data]
MLN.remove_nodes_from(nodes_to_remove)
MLN.remove_edges_from(edges_to_remove)
pickle.dump(MLN, open('MLN.pickle', 'wb'))

# Descriptive analyses

In [77]:
def get_sub_graph(G, layer):
    sub_graph = nx.Graph()
    for node, data in G.nodes(data=True):
        if 'layer' in data and data['layer'] == layer:
            sub_graph.add_node(node, **data)
    for node1, node2, data in G.edges(data=True):
        if 'layer' in data and data['layer'] == layer:
            sub_graph.add_edge(node1, node2, **data)
    return sub_graph

In [83]:
user_layer = get_sub_graph(MLN, 'user')
print(f'Nodes {len(user_layer.nodes)}')
print(f'Edges {len(user_layer.edges)}')
conn_comps = list(nx.connected_components(user_layer))
print(f'Num conn comp {len(conn_comps)}')
print(f'Max conn comp {len(max(conn_comps))}')

Nodes 16976
Edges 2642943
Num conn comp 1172
Max conn comp 21


In [84]:
content_layer = get_sub_graph(MLN, 'content')
print(f'Nodes {len(content_layer.nodes)}')
print(f'Edges {len(content_layer.edges)}')
conn_comps = list(nx.connected_components(content_layer))
print(f'Num conn comp {len(conn_comps)}')
print(f'Max conn comp {len(max(conn_comps))}')

Nodes 20102
Edges 75168155
Num conn comp 200
Max conn comp 19890


In [87]:
pickle.dump(user_layer, open('user_layer.pickle', 'wb'))
pickle.dump(content_layer, open('content_layer.pickle', 'wb'))

In [88]:
print(f'User density {nx.density(user_layer)}')
print(f'Content density {nx.density(content_layer)}')

User density 0.018343096170423047
Content density 0.37205483614086543


In [99]:
len(u_c_edges)

22439

In [97]:
nodes_edges = [u[0] for u in u_c_edges]
len(set(user_layer.nodes).difference(set(nodes_edges)))

0

In [98]:
nodes_edges = [u[0] for u in c_s_edges]
len(set(content_layer.nodes).difference(set(nodes_edges)))

0

In [100]:
del user_layer

In [101]:
del content_layer

# Draw graph

In [102]:
import networkx as nx
import matplotlib.pyplot as plt

user_layer = pickle.load(open('user_layer.pickle', 'rb'))
content_layer = pickle.load(open('content_layer.pickle', 'rb'))

connected_components = nx.connected_components(user_layer)
biggest_cc = max(connected_components, key=len)
nodes = list(biggest_cc)

for u, v, data in MLN.edges(data=True):
    if data['layer'] == 'multi_u_c' and u in nodes:
        nodes.append(v)
for u, data in MLN.nodes(data=True):
    if data['layer'] == 'socialphenomenon':
        nodes.append(u)
        
del user_layer
del content_layer

def draw_multilayer_graph(G):
    pos = nx.spring_layout(G)
    node_labels = {}
    for node, data in G.nodes(data=True):
        node_labels[node] = node
        if data['layer'] == 'user':
            nx.draw_networkx_nodes(G, pos, [node], node_size=50, node_color='red')
        elif data['layer'] == 'content':
            nx.draw_networkx_nodes(G, pos, [node], node_size=50, node_color='green')
        elif data['layer'] == 'socialphenomenon':
            nx.draw_networkx_nodes(G, pos, [node], node_size=50, node_color='blue')
    for u, v, data in G.edges(data=True):
        if data['layer'] == 'multi_u_c':
            nx.draw_networkx_edges(G, pos, [(u, v)], edge_color='red', width=1.0)
        elif data['layer'] == 'multi_c_s':
            nx.draw_networkx_edges(G, pos, [(u, v)], edge_color='green', width=1.0)
    nx.draw_networkx_labels(G, pos, node_labels)
    plt.axis('off')
    plt.show()

# Take a part of the graph MLN
sub_graph = nx.Graph(MLN.subgraph(nodes))
draw_multilayer_graph(sub_graph)

KeyboardInterrupt: 