In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import networkx as nx
import os
import random
# Temporarily switch the working directory so that `import utilities` can find
# the module in ../Network-analysis/ (presumably relies on the current
# directory being on sys.path -- TODO confirm).
os.chdir('../Network-analysis/')
import utilities as util
# NOTE: this path is resolved relative to ../Network-analysis/, so the notebook
# ends up in that directory's sibling `src` folder. The relative '../output/...'
# paths used in this notebook are resolved from there.
os.chdir('../src/')

In [None]:
# Edge-list inputs, all relative to the current working directory.
# One file per interaction type (retweet, mention, reply) ...
friends_retweet_path = "../output/higgs-friends-retweet.edgelist"
friends_mention_path = "../output/higgs-friends-mention.edgelist"
friends_reply_path = "../output/higgs-friends-reply.edgelist"
# ... plus the friends edge list itself ("lcc" presumably stands for largest
# connected component -- TODO confirm against the script that produces it).
friends_file_path = "../output/higgs-friends-lcc.edgelist"

In [None]:
# Load the reply edge list; the single data column of each line is parsed as
# an integer and stored as the edge attribute 'day'.
G_reply = nx.read_edgelist(friends_reply_path, data=(("day", int),))

# Reduce the graph with the project helper (see utilities.get_subgraph for the
# exact meaning of the 5000 argument).
subgraph_reply = util.get_subgraph(G_reply, 5000)

# Flatten the edges into a table with one (source, target, day) row per edge.
subgraph_reply_df = pd.DataFrame(
    [(u, v, attrs['day']) for u, v, attrs in subgraph_reply.edges(data=True)],
    columns=['source', 'target', 'day'],
)

# The data contains days above 7; keep only the edges with day <= 7.
within_first_week = subgraph_reply_df['day'] <= 7
subgraph_reply_df = subgraph_reply_df[within_first_week]

In [None]:
# First implementation of temporal spreading.
#
# Model: start from one random seed node. On each day, every edge of that day
# that touches a node infected *before* that day infects both of its endpoints.
# Nodes infected during a day therefore only start spreading on later days.
#
# NOTE(review): the average outbreak size looks low. Two likely reasons:
#   * most interactions may happen on day 1, and a node infected on day 1 cannot
#     spread along day-1 edges under the rule above;
#   * seeds are drawn from all nodes of `subgraph_reply`, including nodes that
#     have no edge left in `subgraph_reply_df`; such seeds never spread.
# Allowing same-day spreading should push the count towards the size of the
# seed's day-1 connected component.

subgraph_reply_df = subgraph_reply_df.sort_values(by=['day']).reset_index(drop=True)

iterations = 100
spreading_days = range(1, 8)
# Fixed seed so that Restart & Run All reproduces the same curve (this assumes
# the node order of `subgraph_reply` is itself deterministic).
spreading_rng = random.Random(42)


def _infected_counts_from_seed(edges_by_day, seed_node, days):
  """Return the cumulative number of infected nodes after each day in `days`.

  edges_by_day maps a day to a DataFrame with 'source' and 'target' columns;
  days missing from the mapping are treated as having no edges. The infection
  starts from `seed_node` alone. On each day only edges touching a node that was
  already infected at the start of that day transmit, and both endpoints of
  such an edge become infected.
  """
  infected = {seed_node}
  counts = []
  for day in days:
    day_edges = edges_by_day.get(day)
    if day_edges is not None:
      # Decide which edges transmit using the infected set as it was at the
      # start of the day, before adding anything new.
      transmitting = day_edges[day_edges['source'].isin(infected) | day_edges['target'].isin(infected)]
      infected = infected.union(transmitting['source'], transmitting['target'])
    counts.append(len(infected))
  return counts


# Loop-invariant work: split the edge table by day and list the candidate
# seeds once, instead of once per simulated outbreak.
edges_by_day = {int(day): day_edges for day, day_edges in subgraph_reply_df.groupby('day')}
candidate_seeds = list(subgraph_reply.nodes())

# Day 0 is the initial state: only the seed is infected in every run.
infected_nodes_time_dict = {0: [1] * iterations}
infected_nodes_time_dict.update({day: [] for day in spreading_days})
seed_nodes = []
for _ in range(iterations):
  seed_node = spreading_rng.choice(candidate_seeds)
  seed_nodes.append(seed_node)
  daily_counts = _infected_counts_from_seed(edges_by_day, seed_node, spreading_days)
  for day, count in zip(spreading_days, daily_counts):
    infected_nodes_time_dict[day].append(count)

# Take the average of the infected nodes for each timestamp
infected_nodes_time_dict = {k: sum(v)/len(v) for k, v in infected_nodes_time_dict.items()}

fig, ax = plt.subplots()
ax.plot(list(infected_nodes_time_dict.keys()), list(infected_nodes_time_dict.values()), marker='o')
ax.set_xlabel('Day')
ax.set_ylabel('Average number of infected nodes')
ax.set_title(f'Temporal spreading on the reply subgraph ({iterations} random seeds)')
plt.show()
