In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import networkx as nx
import os
# The local `utilities` module is imported by temporarily switching the
# process working directory to ../Network-analysis/ (presumably where
# utilities.py lives -- TODO confirm), then switching to ../src/.
# NOTE(review): os.chdir changes process-wide state; every relative path used
# later in this notebook ("../network-data/...", "../output/...") is resolved
# against the directory set by the second chdir below.
os.chdir('../Network-analysis/')
import utilities as util
os.chdir('../src/')


['C:\\Users\\drobi\\Desktop\\uni\\CS4195\\higgs-twitter-network-science\\src', 'C:\\Users\\drobi\\Anaconda3\\python39.zip', 'C:\\Users\\drobi\\Anaconda3\\DLLs', 'C:\\Users\\drobi\\Anaconda3\\lib', 'C:\\Users\\drobi\\Anaconda3', '', 'C:\\Users\\drobi\\AppData\\Roaming\\Python\\Python39\\site-packages', 'C:\\Users\\drobi\\Anaconda3\\lib\\site-packages', 'C:\\Users\\drobi\\Anaconda3\\lib\\site-packages\\win32', 'C:\\Users\\drobi\\Anaconda3\\lib\\site-packages\\win32\\lib', 'C:\\Users\\drobi\\Anaconda3\\lib\\site-packages\\Pythonwin', '../']


In [12]:
# Edgelist of the follower/following network.
social_file_path = "../network-data/higgs-social_network.edgelist"

# Interaction edgelists with full timestamps (read from ../output/).
retweet_file_path = "../output/retweet_timestamps.edgelist"
mention_file_path = "../output/mention_timestamps.edgelist"
reply_file_path = "../output/reply_timestamps.edgelist"

# Alternative inputs with day-resolution timestamps (disabled):
# mention_file_path = "../network-data/higgs-mention_network.edgelist"
# retweet_file_path = "../network-data/higgs-retweet_network.edgelist"
# reply_file_path = "../network-data/higgs-reply_network.edgelist"

In [13]:
def extract_nodes(df):
    """Return the set of all node ids appearing in an edge table.

    Parameters
    ----------
    df : pandas.DataFrame
        Edge table with a 'source' and a 'target' column.

    Returns
    -------
    set
        Every distinct value found in either endpoint column.
    """
    sources = set(df['source'].unique())
    targets = set(df['target'].unique())
    return sources | targets

def load_nodes(file_path):
    """Read a space-separated edgelist and return the set of its node ids.

    The file is parsed with pandas (rather than line by line) because the
    set operations on whole columns are much faster.

    Parameters
    ----------
    file_path : str
        Path to an edgelist whose columns are read as source, target, day.

    Returns
    -------
    set
        All distinct node ids found in the source and target columns.
    """
    edge_table = pd.read_csv(file_path, sep=' ', names=['source', 'target', 'day'])
    node_set = extract_nodes(edge_table)
    print(f"loaded {file_path} has {len(node_set)} nodes")
    return node_set


In [14]:
# Keep only the users that take part in every interaction mechanism
# (reply, mention and retweet).

# The two full node sets are never bound to a name, so they are released
# as soon as their intersection has been computed.
reply_mention_nodes = load_nodes(reply_file_path) & load_nodes(mention_file_path)

reply_mention_retweet_nodes = reply_mention_nodes & load_nodes(retweet_file_path)
del reply_mention_nodes

print(f"intersected network has {len(reply_mention_retweet_nodes)} nodes")

loaded ../output/reply_timestamps.edgelist has 38918 nodes
loaded ../output/mention_timestamps.edgelist has 116408 nodes
loaded ../output/retweet_timestamps.edgelist has 256491 nodes
intersected network has 21346 nodes


In [15]:
def largest_connected_component(G):
    """Return the subgraph of ``G`` induced by its largest connected component.

    Parameters
    ----------
    G : networkx.Graph
        An undirected graph. ``nx.connected_components`` is not supported
        for directed graphs, so callers must convert first.

    Returns
    -------
    networkx.Graph
        ``G.subgraph(...)`` over the nodes of the largest component. If ``G``
        has no nodes, the subgraph over an empty node collection is returned
        instead of raising.
    """
    # A single pass with max() replaces sorting every component just to take
    # the first one; on ties max() keeps the first component encountered,
    # which is what the stable descending sort selected as well.
    # default=() covers a graph without any component.
    biggest = max(nx.connected_components(G), key=len, default=())
    return G.subgraph(biggest)

In [16]:
def build_network(df, directed = False):
    """Build a networkx graph from an edge table.

    Parameters
    ----------
    df : pandas.DataFrame
        Edge table with 'source' and 'target' columns and, optionally, a
        'day' column.
    directed : bool, optional
        Build an ``nx.DiGraph`` when true, an ``nx.Graph`` otherwise.

    Returns
    -------
    networkx.Graph or networkx.DiGraph
        Graph holding one edge per row; when a 'day' column is present its
        value is stored as the edge attribute ``"day"``.
    """
    graph = nx.DiGraph() if directed else nx.Graph()

    endpoints = zip(df["source"], df["target"])
    if "day" not in df.columns:
        # Plain (source, target) pairs, no edge attributes.
        graph.add_edges_from(endpoints)
        return graph

    # Attach the day of each row as an attribute of the corresponding edge.
    graph.add_edges_from(
        (src, dst, {"day": day})
        for (src, dst), day in zip(endpoints, df["day"])
    )
    return graph

In [17]:
def extract_bidirectional_subnetwork(G):
    """Build an undirected graph from the reciprocated edges of ``G``.

    An edge (u, v) is kept only when ``G`` also has the edge (v, u). In the
    followers/following sense this is the sub-network of users who follow
    each other.

    Parameters
    ----------
    G : networkx graph
        Graph whose edges are tested for reciprocity.

    Returns
    -------
    networkx.Graph
        New undirected graph containing only the reciprocated edges (and
        therefore only the nodes that have at least one such edge).
    """
    mutual = nx.Graph()
    # Edge tuples may carry a third element (the original code accepted
    # 2- and 3-tuples); only the two endpoints are used.
    reciprocated = (
        (edge[0], edge[1])
        for edge in G.edges
        if G.has_edge(edge[1], edge[0])
    )
    mutual.add_edges_from(reciprocated)
    return mutual

In [18]:
def load_subnetwork(file_path, sub_nodes, directed = False):
    """Load an edgelist restricted to edges between the given nodes.

    Parameters
    ----------
    file_path : str
        Path to a space-separated edgelist (source, target and optionally a
        third column read as 'day').
    sub_nodes : collection
        Node ids to keep; an edge survives only if both endpoints are in it.
    directed : bool, optional
        Passed through to ``build_network``.

    Returns
    -------
    networkx.Graph or networkx.DiGraph
        Graph built by ``build_network`` from the filtered edge table.
    """
    edges = pd.read_csv(file_path, sep=' ', names=['source', 'target', 'day'])

    # A file with only two columns yields an all-NaN 'day' column; drop it
    # so that no 'day' attribute is attached to the edges.
    if edges['day'].isna().all():
        edges = edges.drop(columns=['day'])

    source_kept = edges['source'].isin(sub_nodes)
    target_kept = edges['target'].isin(sub_nodes)
    return build_network(edges[source_kept & target_kept], directed)

In [19]:
def get_friendships(G):
    """Return the undirected network of mutual (reciprocated) edges of ``G``.

    Parameters
    ----------
    G : networkx graph
        Typically a directed follower graph. For an undirected graph every
        edge is trivially reciprocated, so all edges are kept.

    Returns
    -------
    networkx.Graph
        Result of ``extract_bidirectional_subnetwork(G)``.
    """
    # Do NOT call G.to_undirected() first: once direction is discarded,
    # has_edge(v, u) is true for every edge (u, v) and the reciprocity
    # filter keeps the whole network instead of only the mutual follows.
    return extract_bidirectional_subnetwork(G)

def get_lcc(G, directed = False):
    """Return the largest connected component of ``G``, ignoring direction.

    Parameters
    ----------
    G : networkx graph
        Graph to analyse; it is converted with ``to_undirected()`` first.
    directed : bool, optional
        Accepted but not used by this function.

    Returns
    -------
    networkx.Graph
        Whatever ``largest_connected_component`` returns for the undirected
        version of ``G``.
    """
    undirected_copy = G.to_undirected()
    return largest_connected_component(undirected_copy)

In [20]:
# The following/followers network is directed, so it has to be loaded as a
# DiGraph. Loaded undirected, every edge looks reciprocated and the "friends"
# network ends up identical to the social network.
social_network = load_subnetwork(social_file_path, reply_mention_retweet_nodes, directed=True)

# get_lcc converts to undirected itself, so the directed graph can be passed in.
lcc_social_network = get_lcc(social_network)

# Mutual follows only: run the reciprocity filter directly on the directed
# graph so that edge direction is still available when (v, u) is looked up.
friends_network = extract_bidirectional_subnetwork(social_network)

del social_network
lcc_friends_network = get_lcc(friends_network)
del friends_network

nx.write_edgelist(lcc_friends_network, '../output/higgs-friends-lcc.edgelist')
nx.write_edgelist(lcc_social_network, '../output/higgs-social-lcc.edgelist')

In [21]:
# Restrict every interaction edgelist to the nodes of the social LCC ...
social_reply_network = load_subnetwork(reply_file_path, lcc_social_network.nodes)
social_mention_network = load_subnetwork(mention_file_path, lcc_social_network.nodes)
social_retweet_network = load_subnetwork(retweet_file_path, lcc_social_network.nodes)

# ... and to the nodes of the friends LCC.
friends_reply_network = load_subnetwork(reply_file_path, lcc_friends_network.nodes)
friends_mention_network = load_subnetwork(mention_file_path, lcc_friends_network.nodes)
friends_retweet_network = load_subnetwork(retweet_file_path, lcc_friends_network.nodes)

paths = [
    '../output/higgs-social-reply.edgelist',
    '../output/higgs-social-mention.edgelist',
    '../output/higgs-social-retweet.edgelist',
    '../output/higgs-friends-reply.edgelist',
    '../output/higgs-friends-mention.edgelist',
    '../output/higgs-friends-retweet.edgelist',
]
networks = [
    social_reply_network,
    social_mention_network,
    social_retweet_network,
    friends_reply_network,
    friends_mention_network,
    friends_retweet_network,
]

# Write "source target day" lines in a single pass. Passing data=["day"]
# makes networkx emit the bare attribute value, so there is no need to write
# the dict repr ("{'day': ...}"), read the file back, strip the braces with
# string replacements and rewrite it.
for path, network in zip(paths, networks):
    nx.write_edgelist(network, path, data=["day"])
        
