In [177]:
import pickle
import pandas as pd
import numpy as np
from scipy import stats
from collections import defaultdict
import random
import copy
import math
from scipy.optimize import linprog
import time
from collections import Counter
import matplotlib.pyplot as plt
import matplotlib as mpl
import networkx as nx

# 1. Load input

In [178]:
# Run configuration: which extracted trace to analyse and how large it is.
dataset = 'russian'
lines = 500_000  # trace size; embedded in every input file name loaded below
random.seed(10)  # fixed seed for the stdlib random module

Load episode set $E$ with the users that retweeted each original tweet in the trace. 

Each episode $E_{s}$ includes the users that retweeted $s$, ordered chronologically, as they appear in the trace. The first user in each episode is the user that originally posted the tweet, and is denoted by $r_{s}$. Subsequent users in $E_{s}$ are users that retweeted $s$, either directly from user $r_{s}$ or from another user that appears in $E_{s}$ before them.

In [179]:
# Load D: per tweet, a time-keyed mapping of user groups (consumed by create_serpent_graph).
# The context manager closes the file handle that a bare open() would leak.
# NOTE: unpickling can execute arbitrary code -- only load files you trust.
with open("./Extracted/" + dataset + "/D" + str(lines) + ".p", "rb") as fh:
    D = pickle.load(fh)

In [180]:
# Load the episode set E (tweet -> users in trace order).
# The context manager closes the file handle that a bare open() would leak.
with open("./Extracted/" + dataset + "/E" + str(lines) + ".p", "rb") as fh:
    E = pickle.load(fh)

In [181]:
# Drop repeated users from every episode, keeping each user's first position
# (dict.fromkeys preserves insertion order). Only values are replaced, so the
# mapping's key set is untouched while it is being iterated.
for tweet, users in E.items():
    E[tweet] = list(dict.fromkeys(users))

Load the set of original tweets denoted by $S$. 

The set of original tweets is denoted by $S$, where $|S|$ is the total number of original tweets.

In [182]:
# Load S, the original tweets.
# The context manager closes the file handle that a bare open() would leak.
with open("./Extracted/" + dataset + "/S" + str(lines) + ".p", "rb") as fh:
    S = pickle.load(fh)

Load the set $U$ of unique users

In [183]:
# Load the unique users and materialise them as a list.
# The context manager closes the file handle that a bare open() would leak.
with open("./Extracted/" + dataset + "/U" + str(lines) + ".p", "rb") as fh:
    U = list(pickle.load(fh))

Load $M_{ij}$ variables that count number of episodes where the ordered pair (i,j) appears

In [184]:
# Load the M_ij pair counts.
# The context manager closes the file handle that a bare open() would leak.
with open("./Extracted/" + dataset + "/M_d" + str(lines) + ".p", "rb") as fh:
    M = pickle.load(fh)

Load $Q_{ij}$ results for the ordered pair (i,j) derived from the constrained algorithm

In [185]:
# Load the Q_ij results of the constrained algorithm.
# The context manager closes the file handle that a bare open() would leak.
with open("./Extracted/" + dataset + "/results/Q_constrained_" + str(lines) + ".p", "rb") as fh:
    Q = pickle.load(fh)

Load $s_{ij}$ derived from the constrained algorithm

In [186]:
# Load the s_ij results of the constrained algorithm.
# The context manager closes the file handle that a bare open() would leak.
with open("./Extracted/" + dataset + "/results/s_constrained_" + str(lines) + ".p", "rb") as fh:
    s = pickle.load(fh)

Load $k_{ij}$ derived from Saito

In [187]:
# Load the k_ij results of the Saito baseline.
# The context manager closes the file handle that a bare open() would leak.
with open("./Extracted/" + dataset + "/results/k_saito_" + str(lines) + ".p", "rb") as fh:
    k = pickle.load(fh)

Load $Q_{ij}$ derived from Newman's method

In [188]:
# Load the Q_ij results of the Newman baseline.
# The context manager closes the file handle that a bare open() would leak.
with open("./Extracted/" + dataset + "/results/Q_newman_" + str(lines) + ".p", "rb") as fh:
    Q_newman = pickle.load(fh)

## Necessary functions

In [190]:
def flatten(obj):
    """Flatten one level of nesting.

    * exact ``list``: concatenates the items of every inner iterable;
    * exact ``dict`` or ``defaultdict``: concatenates the values of every
      inner mapping, in key order.

    Any other type (including subclasses of ``list``) falls through and
    yields ``None``.
    """
    kind = type(obj)
    if kind == list:
        flat = []
        for inner in obj:
            flat.extend(inner)
        return flat
    if kind == dict or kind == defaultdict:
        flat = []
        for key in obj:
            flat.extend(obj[key].values())
        return flat

In [191]:
def create_serpent_graph(U,D):
    """Build the "serpent" graph: users of one time slot point to users of the next.

    Parameters
    ----------
    U : iterable
        Users; every one becomes a node even if it ends up without edges.
    D : mapping
        ``D[tweet]`` is a mapping from a time key to an iterable of users.
        Consecutive keys are taken in the mapping's iteration order
        (presumably chronological -- confirm against the extraction step).

    Returns
    -------
    networkx.DiGraph
        One edge ``u1 -> u2`` for every ``u1`` in a slot and ``u2`` in the
        slot immediately after it, over all tweets.
    """
    G = nx.DiGraph()
    G.add_nodes_from(U)
    for tweet in D:
        slots = D[tweet]
        # Materialise the keys once per tweet; the previous
        # list(keys).index(...) lookup inside the loop was quadratic in the
        # number of slots and its loop variable shadowed the `time` module.
        times = list(slots.keys())
        for current, following in zip(times, times[1:]):
            for u1 in slots[current]:
                for u2 in slots[following]:
                    G.add_edge(u1, u2)
    return G

In [192]:
def create_saito_graph(U, k, threshold=0.5):
    """Build the directed graph implied by the Saito scores ``k``.

    Parameters
    ----------
    U : iterable
        Users; every one becomes a node even if it ends up without edges.
    k : mapping of mappings
        ``k[i][j]`` is the score of the ordered pair ``(i, j)``.
    threshold : float, optional
        An edge ``i -> j`` is kept when ``k[i][j] > threshold``. Defaults to
        0.5, the cut-off that used to be hard-coded.

    Returns
    -------
    networkx.DiGraph
    """
    G = nx.DiGraph()
    G.add_nodes_from(U)
    G.add_edges_from(
        (i, j)
        for i, targets in k.items()
        for j, score in targets.items()
        if score > threshold
    )
    return G

In [193]:
def create_star_graph(U,E):
    """Build the star graph: the first user of each episode points to every later user.

    ``U`` supplies the node set; ``E`` maps a tweet to its list of users.
    Episodes with fewer than two users contribute no edges.
    """
    G = nx.DiGraph()
    G.add_nodes_from(U)
    for episode in E.values():
        # The generator is lazy, so episode[0] is only read when there is at
        # least one later user.
        G.add_edges_from((episode[0], follower) for follower in episode[1:])
    return G

In [194]:
def create_our_graph(U, Q, threshold=0.5):
    """Build the directed graph implied by the constrained-algorithm scores ``Q``.

    Parameters
    ----------
    U : iterable
        Users; every one becomes a node even if it ends up without edges.
    Q : mapping of mappings
        ``Q[i][j]`` is the score of the ordered pair ``(i, j)``.
    threshold : float, optional
        An edge ``i -> j`` is kept when ``Q[i][j] > threshold``. Defaults to
        0.5, the cut-off that used to be hard-coded.

    Returns
    -------
    networkx.DiGraph
    """
    G = nx.DiGraph()
    G.add_nodes_from(U)
    G.add_edges_from(
        (i, j)
        for i, targets in Q.items()
        for j, score in targets.items()
        if score > threshold
    )
    return G

In [195]:
def create_newman_graph(U, Q, threshold=0.5):
    """Build the directed graph implied by the Newman scores ``Q``.

    Parameters
    ----------
    U : iterable
        Users; every one becomes a node even if it ends up without edges.
    Q : mapping of mappings
        ``Q[i][j]`` is the score of the ordered pair ``(i, j)``.
    threshold : float, optional
        An edge ``i -> j`` is kept when ``Q[i][j] > threshold``. Defaults to
        0.5, the cut-off that used to be hard-coded.

    Returns
    -------
    networkx.DiGraph
    """
    G = nx.DiGraph()
    G.add_nodes_from(U)
    G.add_edges_from(
        (i, j)
        for i, targets in Q.items()
        for j, score in targets.items()
        if score > threshold
    )
    return G

In [196]:
def wedge_metric(graph):
    """Aggregate a per-node ratio over predecessors and successors.

    For each node, with ``L`` predecessors, ``F`` successors and ``B`` nodes
    that are both, the function accumulates

        numerator   += L*F - B
        denominator += (L+F)**2 - L - F - 2*B

    and returns ``2 * numerator / denominator``. Nodes are skipped when they
    have fewer than two neighbours in total, or when their only neighbour is
    a single node linked in both directions. Returns 0 when the denominator
    is zero.
    """
    wedge_sum = 0
    pair_sum = 0

    for node in graph.nodes:
        preds = set(graph.predecessors(node))
        succs = set(graph.successors(node))
        n_in, n_out = len(preds), len(succs)

        if n_in + n_out < 2:
            continue
        if n_in == 1 and preds == succs:
            continue

        n_both = len(preds & succs)
        wedge_sum += n_in * n_out - n_both
        pair_sum += (n_in + n_out) ** 2 - n_in - n_out - 2 * n_both

    return 2 * wedge_sum / pair_sum if pair_sum != 0 else 0

In [197]:
def f_check(E, Q):
    '''
    Count the episodes that the inferred edges cannot fully explain.

    For every episode the function counts the ordered pairs (i, j) with i
    appearing before j in the episode and Q[i][j] > 0.5. An episode is
    infeasible when that count is smaller than its number of retweets,
    len(E[s]) - 1.

    Parameters
    ----------
    E : mapping, tweet -> list of users in trace order
    Q : mapping of mappings, Q[i][j] is the score of the ordered pair (i, j);
        missing i or j simply means "no edge"

    Returns
    -------
    int : number of infeasible episodes

    NOTE(review): the comparison is between totals, so a user with several
    qualifying predecessors can offset a user with none -- confirm that this
    is the intended feasibility criterion.
    '''
    infeasible_episodes = 0
    for s in E:
        episode = E[s]

        # First position of each user, computed once. This replaces a
        # list.index() call per user (quadratic) and keeps its semantics
        # when a user appears more than once.
        first_pos = {}
        for pos, user in enumerate(episode):
            first_pos.setdefault(user, pos)

        feasible_edges = 0
        for j in episode:
            for i in episode[:first_pos[j]]:
                if i in Q and j in Q[i] and Q[i][j] > 0.5:
                    feasible_edges += 1

        if (len(episode) - 1) - feasible_edges > 0:
            infeasible_episodes += 1

    return infeasible_episodes
        
#     print('Total feasbile edges:', len(total_feasible_edges), 'Number of retweets', retweets)
#     print('Total infeasible edges:', total_inf)

In [198]:
def longest_path(E, U, Q, k, graph_type):
    """Find the longest path over all per-episode graphs.

    A separate directed graph is built for each episode and
    ``nx.dag_longest_path`` is evaluated on it; the longest result wins
    (the first one found on ties).

    Parameters
    ----------
    E : mapping, tweet -> list of users in trace order
    U : iterable of users, added as nodes to every per-episode graph
    Q : mapping of mappings, scores used for ``'ours'`` and ``'newman'``
    k : mapping of mappings, scores used for ``'saito'``
    graph_type : str
        ``'ours'``/``'newman'`` (edges i -> j with i before j and
        Q[i][j] > 0.5), ``'saito'`` (same rule on ``k``) or ``'star'``
        (first user -> every later user). Any other value builds nothing.

    Returns
    -------
    (int, list)
        Node count of the longest path and the path itself. When nothing is
        evaluated (unknown ``graph_type`` or empty ``E``) this is ``(0, 0)``.
    """
    max_l = 0
    max_path = 0

    # One code path for the three score-based variants instead of three
    # copy-pasted loops; only the score table differs.
    if graph_type == 'ours' or graph_type == 'newman':
        weights = Q
    elif graph_type == 'saito':
        weights = k
    elif graph_type == 'star':
        weights = None
    else:
        return max_l, max_path

    for s in E:
        episode = E[s]
        G = nx.DiGraph()
        # Kept from the original so that results are unchanged.
        G.add_nodes_from(U)

        if weights is None:
            G.add_edges_from((episode[0], j) for j in episode[1:])
        else:
            # First position of each user, computed once instead of calling
            # list.index() for every user.
            first_pos = {}
            for pos, user in enumerate(episode):
                first_pos.setdefault(user, pos)
            for j in episode[1:]:
                for i in episode[:first_pos[j]]:
                    # `i in weights` guard: a user without outgoing scores
                    # means "no edge" (as in f_check), not a KeyError.
                    if i in weights and j in weights[i] and weights[i][j] > 0.5:
                        G.add_edge(i, j)

        # Evaluate the longest path once per episode rather than up to
        # three times.
        path = nx.dag_longest_path(G)
        if len(path) > max_l:
            max_l = len(path)
            max_path = path

    return max_l, max_path


In [199]:
def plot_tweetgraph(s, E, S, Q, k, pos, graph_type):
    """Draw the propagation graph of a single tweet.

    Parameters
    ----------
    s : tweet identifier (key of ``E``, ``S`` and, for ``'serpent'``, ``D``)
    E : mapping, tweet -> list of users in trace order
    S : mapping indexed by tweet; the node equal to ``S[s]`` is drawn green,
        all other nodes yellow
    Q : mapping of mappings, scores used for ``'ours'`` and ``'newman'``
    k : mapping of mappings, scores used for ``'saito'``
    pos : accepted for interface compatibility; a fresh spring layout is
        always computed, so the value passed in is not used
    graph_type : ``'ours'``, ``'newman'``, ``'saito'``, ``'star'`` or
        ``'serpent'``

    Notes
    -----
    The ``'serpent'`` variant reads the notebook-level ``D``.
    Edge labels are drawn only for ``'ours'`` and ``'saito'``, although
    ``'newman'`` edges carry labels too.
    """
    G = nx.DiGraph()

    if graph_type == 'ours' or graph_type == 'newman' or graph_type == 'saito':
        weights = k if graph_type == 'saito' else Q
        t = 0  # running counter used as the edge label
        for j in E[s][1:]:
            G.add_node(j)
            indx = E[s].index(j)
            for i in E[s][:indx]:
                G.add_node(i)
                # `i in weights` guard: a user without outgoing scores means
                # "no edge", not a KeyError.
                if i in weights and j in weights[i] and weights[i][j] > 0.5:
                    G.add_edge(i, j, label=f'{t}')
                    t += 1

    elif graph_type == 'star':
        G.add_node(E[s][0])
        for j in E[s][1:]:
            G.add_edge(E[s][0], j)
            G.add_node(j)

    elif graph_type == 'serpent':
        # Uses the tweet passed in as `s`. The previous code read a leftover
        # notebook variable `tweet` here, so it plotted whatever tweet an
        # earlier loop happened to finish on.
        times = list(D[s].keys())
        for current, following in zip(times, times[1:]):
            for u1 in D[s][current]:
                for u2 in D[s][following]:
                    G.add_edge(u1, u2)

    # Same fix as above: look the highlighted node up with `s`, not the
    # leaked `tweet`.
    root = S[s]
    color_map = ['green' if node == root else 'yellow' for node in G]

    pos = nx.spring_layout(G)
    nx.draw_networkx_nodes(G, pos, node_color = color_map, cmap=plt.get_cmap('jet'), node_size = 300)
    nx.draw_networkx_edges(G, pos, edge_color='r', arrows=True)
    nx.draw_networkx_labels(G, pos, font_size=8)
    if graph_type=='ours' or graph_type=='saito':
        nx.draw_networkx_edge_labels(G, pos, font_size=8)
    plt.show()
    

# Statistics

## 1. number of infeasible episodes

In [23]:
# Results table, filled in one column per statistic by the cells below.
# Every column lists its values in this row order.
data = {
    'Graph Type with Lines: ' + str(lines): ['Ours', 'Saito', 'Star', 'Serpent', 'Newman'],
}

In [24]:
# Infeasible-episode counts for the three score-based methods, evaluated in
# the order ours, Saito, Newman.
inf_ep_ours, inf_ep_saito, inf_ep_newman = (
    f_check(E, scores) for scores in (Q, k, Q_newman)
)

# Star and Serpent are entered as 0 without being checked.
data['Infeasible Episodes'] = [inf_ep_ours, inf_ep_saito, 0, 0, inf_ep_newman]

## 2. Number of edges

In [200]:
#Create graphs
G_star = create_star_graph(U,E)
G_ours = create_our_graph(U,Q)
G_saito = create_saito_graph(U,k)
G_newman = create_newman_graph(U,Q_newman)
G_serpent = create_serpent_graph(U,D)

In [26]:
# Edge count of each graph.
edges_ours, edges_saito, edges_star, edges_serpent, edges_newman = (
    G.number_of_edges()
    for G in (G_ours, G_saito, G_star, G_serpent, G_newman)
)

data['Number of edges'] = [edges_ours, edges_saito, edges_star, edges_serpent, edges_newman]

## 3. Average out degree

In [27]:
# Mean out-degree of each graph: total out-degree divided by the node count.
av_ours, av_saito, av_star, av_serpent, av_newman = (
    sum(deg for _, deg in G.out_degree()) / float(len(G))
    for G in (G_ours, G_saito, G_star, G_serpent, G_newman)
)

data['Average out degree'] = [av_ours, av_saito, av_star, av_serpent, av_newman]

In [28]:
# Largest out-degree of each graph.
# The degrees are read straight from the (node, degree) view. Packing the
# pairs into a NumPy array first forces node ids and degrees into one dtype,
# so with string ids the degrees become strings and max() compares them
# lexicographically ('9' > '10').
max_degree_our = max(deg for _, deg in G_ours.out_degree())
max_degree_saito = max(deg for _, deg in G_saito.out_degree())
max_degree_star = max(deg for _, deg in G_star.out_degree())
max_degree_serpent = max(deg for _, deg in G_serpent.out_degree())
max_degree_newman = max(deg for _, deg in G_newman.out_degree())

data['Max out degree'] = [max_degree_our, max_degree_saito, max_degree_star, max_degree_serpent, max_degree_newman]


In [31]:
# Largest in-degree of each graph. Note that the max_degree_* names are
# reused here and no longer hold the out-degree maxima afterwards.
# The degrees are read straight from the (node, degree) view. Packing the
# pairs into a NumPy array first forces node ids and degrees into one dtype,
# so with string ids the degrees become strings and max() compares them
# lexicographically ('9' > '10').
max_degree_our = max(deg for _, deg in G_ours.in_degree())
max_degree_saito = max(deg for _, deg in G_saito.in_degree())
max_degree_star = max(deg for _, deg in G_star.in_degree())
max_degree_serpent = max(deg for _, deg in G_serpent.in_degree())
max_degree_newman = max(deg for _, deg in G_newman.in_degree())

data['Max in degree'] = [max_degree_our, max_degree_saito, max_degree_star, max_degree_serpent, max_degree_newman]

## 5. Graph diameter

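The diameter of a graph $G$ is the largest shortest-path distance between any pair of its vertices. It is computed below separately for every strongly connected component with more than one node, and the maximum over these components is reported.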

In [30]:
def diameter_scc(G, graph_type):
    """Return the largest diameter among the strongly connected components of ``G``.

    Components consisting of a single node are ignored; the result is 0 when
    there is no larger component. ``graph_type`` is not used and is kept only
    so existing calls keep working.
    """
    maxv = 0
    for nodes in nx.strongly_connected_components(G):
        if len(nodes) > 1: # skip single-node components
            # nx.diameter is the expensive step, so evaluate it once per
            # component instead of once for the test and again for the
            # assignment.
            component_diameter = nx.diameter(G.subgraph(nodes))
            if component_diameter > maxv:
                maxv = component_diameter
    return maxv
            
# Largest strongly-connected-component diameter of each graph.
gd_ours, gd_saito, gd_star, gd_serpent, gd_newman = (
    diameter_scc(G, label)
    for G, label in (
        (G_ours, 'Ours'),
        (G_saito, 'Saito'),
        (G_star, 'Star'),
        (G_serpent, 'Serpent'),
        (G_newman, 'Newman'),
    )
)

data['Max graph diameter'] = [gd_ours, gd_saito, gd_star, gd_serpent, gd_newman]

KeyboardInterrupt: 

In [None]:
def avg_sh_path(G, graph_type):
    """Return the largest average shortest-path length among the strongly connected components of ``G``.

    Components consisting of a single node are ignored; the result is 0 when
    there is no larger component. ``graph_type`` is not used and is kept only
    so existing calls keep working.
    """
    maxv = 0
    for nodes in nx.strongly_connected_components(G):
        if len(nodes) > 1: # skip single-node components
            # The average shortest-path length is the expensive step, so
            # evaluate it once per component instead of twice.
            component_avg = nx.average_shortest_path_length(G.subgraph(nodes))
            if component_avg > maxv:
                maxv = component_avg
    return maxv

# Largest per-component average shortest-path length of each graph.
avg_ours, avg_saito, avg_star, avg_serpent, avg_newman = (
    avg_sh_path(G, label)
    for G, label in (
        (G_ours, 'Ours'),
        (G_saito, 'Saito'),
        (G_star, 'Star'),
        (G_serpent, 'Serpent'),
        (G_newman, 'Newman'),
    )
)

data['Max average shortest path'] = [avg_ours, avg_saito, avg_star, avg_serpent, avg_newman]

## 6. Number of connected components

In [285]:
def number_cc(G, graph_type):
    """Count the non-trivial connected components of ``G``.

    Returns ``(scc, wcc)``: the number of strongly and of weakly connected
    components that contain more than one node. ``graph_type`` is not used
    and is kept only so existing calls keep working.
    """
    # The strong-component filter used to be `len(C) > 0`, which is always
    # true and therefore counted every isolated node, contradicting both the
    # "skip one nodes" comment and the weak-component branch.
    scc = sum(1 for nodes in nx.strongly_connected_components(G) if len(nodes) > 1)
    wcc = sum(1 for nodes in nx.weakly_connected_components(G) if len(nodes) > 1)
    return scc, wcc

In [284]:
# Strongly / weakly connected component counts for our graph; the bare
# tuple on the last line is displayed as the cell output.
scc_ours, wcc_ours = number_cc(G_ours, 'Ours')
scc_ours, wcc_ours

(17, 542)

In [75]:
# Strongly / weakly connected component counts for the Saito graph.
scc_saito, wcc_saito = number_cc(G_saito, 'Saito')

In [76]:
# Strongly / weakly connected component counts for the star graph.
scc_star, wcc_star = number_cc(G_star, 'Star')

In [77]:
# Strongly / weakly connected component counts for the serpent graph.
scc_serpent, wcc_serpent = number_cc(G_serpent, 'Serpent')

In [78]:
# Strongly / weakly connected component counts for the Newman graph.
scc_newman, wcc_newman = number_cc(G_newman, 'Newman')

In [131]:
# Add both component-count columns to the results table.
data.update({
    'Number of scc': [scc_ours, scc_saito, scc_star, scc_serpent, scc_newman],
    'Number of wcc': [wcc_ours, wcc_saito, wcc_star, wcc_serpent, wcc_newman],
})

In [80]:
# Display the strongly-connected-component counts in table row order.
scc_ours, scc_saito, scc_star, scc_serpent, scc_newman

(6, 0, 8, 2, 0)

In [82]:
# Display the weakly-connected-component counts in table row order.
wcc_ours, wcc_saito, wcc_star, wcc_serpent, wcc_newman

(153, 1593, 153, 153, 54)

In [None]:
# Turn the collected columns into a table and append it to the shared CSV.
# mode='a' with header=True writes a header row on every run; the first
# column name contains `lines`, so each appended block identifies its run.
df = pd.DataFrame(data)
df.to_csv('../Results.csv', mode='a', index = False, header=True)

## 7. Users

In [129]:
def user_analytics(G):
    """Return the maximum in-degree, out-degree and closeness centrality found in ``G``.

    The result is a 3-tuple in that order; each entry is the largest value
    over all nodes of the corresponding networkx centrality.
    """
    centralities = (
        nx.in_degree_centrality(G),
        nx.out_degree_centrality(G),
        nx.closeness_centrality(G),
    )
    return tuple(max(scores.values()) for scores in centralities)

# bet_centrality = nx.betweenness_centrality(G, normalized = True, endpoints = False)
    # pr = nx.pagerank(G, alpha = 0.8)

In [125]:
# import operator
# in_deg_centrality = nx.in_degree_centrality(G)
# max(in_deg_centrality.items(), key=operator.itemgetter(1))

# import operator
# out_deg_centrality = nx.out_degree_centrality(G)
# max(out_deg_centrality.items(), key=operator.itemgetter(1))

In [126]:
# dict(sorted(dict(G.out_degree()).items(), key=lambda item: item[1], reverse=True))
# dict(sorted(out_deg_centrality.items(), key=lambda item: item[1], reverse=True))

In [128]:
# G.degree(1381900111)

In [175]:
# Report the three maximum centralities of the serpent graph.
in_d, out_d, close = user_analytics(G_serpent)
for label, value in (
    ('-Max in degree centrality:', in_d),
    ('-Max out degre centrality:', out_d),
    ('-Max closeness centrality:', close),
):
    print(label, value)

-Max in degree centrality: 0.005998571768626518
-Max out degre centrality: 0.005332063794334682
-Max closeness centrality: 0.22641026660105865
