In [None]:
import pickle
import numpy as np
from collections import defaultdict
import random

# 1. Load input

Load episode set $E$ with the users that retweeted each original tweet in the trace. 

Each episode $E_{s}$ includes the users that retweeted $s$, ordered chronologically, as they appear in the trace. The first user in each episode is the user that originally tweeted the tweet, and is denoted by $r_{s}$. Subsequent users in $E_{s}$ are users that retweeted $s$, either directly from user $r_{s}$ or from another user that appears in $E_{s}$ before them.

In [None]:
# Suffix that selects which extracted pickle files are read and written below
# (presumably the number of trace lines handled by the preprocessing step — confirm).
lines = 500_000

In [None]:
# Load the episode set E. The context manager closes the file handle as soon as
# unpickling finishes instead of leaving it open until garbage collection.
# NOTE: pickle can execute arbitrary code while loading; only open files you produced yourself.
with open("./extracted/E"+ str(lines) + ".p", "rb") as f:
    E = pickle.load(f)

In [None]:
# Remove repeated users from every episode: dict.fromkeys keeps the first
# occurrence of each user and preserves the original order.
for s, users in E.items():
    E[s] = list(dict.fromkeys(users))

Load the set of original tweets denoted by $S$. 

The set of original tweets is denoted by $S$, and its size $|S|$ (written simply as S in the printed output below) is the total number of original tweets.

In [None]:
# Load the set of original tweets S. The context manager closes the file handle
# as soon as unpickling finishes.
# NOTE: pickle can execute arbitrary code while loading; only open files you produced yourself.
with open("./extracted/S"+ str(lines) + ".p", "rb") as f:
    S = pickle.load(f)

Load the set $U$ of unique users.

In [None]:
# Load the collection of unique users and turn it into a list. The context
# manager closes the file handle as soon as unpickling finishes.
# NOTE: pickle can execute arbitrary code while loading; only open files you produced yourself.
with open("./extracted/U"+ str(lines) + ".p", "rb") as f:
    U = list(pickle.load(f))

Load the dictionary $D$ (the $D_{ij}$ values created in the trace-preprocessing notebook).

In [None]:
# Load the dictionary D consumed by saito(). The context manager closes the
# file handle as soon as unpickling finishes.
# NOTE: pickle can execute arbitrary code while loading; only open files you produced yourself.
with open("./extracted/D"+ str(lines) + ".p", "rb") as f:
    D = pickle.load(f)

# 2. Find important quantities

In [None]:
# N is the number of entries in the user list U loaded earlier.
N = len(U)
print('Number of unique users N =', N)

In [None]:
# E holds one entry per episode, so its length is the number of episodes.
print('Number of Episodes (original tweets) S =',len(E))

# 3. Saito's Algorithm

In [None]:
def flatten(obj):
    """
    Flatten one level of nesting into a plain list.

    Parameters
    ----------
        obj : list or dict
            Either a list of iterables, or a dict (including subclasses such as
            defaultdict) whose values are themselves dicts.

    Returns
    ----------
        list
            For a list: the items of every inner iterable, in order.
            For a dict: the values of every inner dict, in iteration order.

    Raises
    ----------
        TypeError
            If obj is neither a list nor a dict.
    """
    # isinstance also accepts subclasses (defaultdict, OrderedDict, ...), so a
    # separate branch per dict flavour is not needed.
    if isinstance(obj, list):
        return [item for inner in obj for item in inner]
    if isinstance(obj, dict):
        return [value for inner in obj.values() for value in inner.values()]
    # Fail loudly instead of silently returning None for unsupported input.
    raise TypeError("flatten() expects a list or a dict, got " + type(obj).__name__)

In [None]:
def _level_pairs(levels):
    """
    Yield (parents, children) for every key of one episode mapping that is not 0.

    `levels` is one D[s] mapping. `children` is the collection stored under the
    key, and `parents` is a list copy of the collection stored under the key that
    immediately precedes it in the mapping's iteration order (an empty list when
    no key precedes it).
    """
    keys = list(levels)  # built once per episode instead of once per user
    for pos, t in enumerate(keys):
        if t != 0:
            parents = list(levels[keys[pos - 1]]) if pos > 0 else []
            yield parents, levels[t]


def saito(eps, D):
    """
    This function is the main algorithm for path inference with constraints according to Saito et al. [1].

    Every user w stored under a non-zero key of D[s] is paired with every user u
    stored under the preceding key of the same episode s. Each such pair (u, w)
    gets a randomly initialised value k[u][w], which is then updated repeatedly
    until the values stop changing.

    Parameters
    ----------
        eps : float
            Convergence criterion. Iteration stops once the Euclidean norms of the
            change in the k values and in the P values between two consecutive
            iterations are both below eps.
        D : dict
            Dictionary with D_{ij} values that was created in the trace-preprocessing.ipynb notebook.
            It is read as D[s][t] -> collection of users, for episode s and key t.

    Returns
    ----------
        k : defaultdict(dict)
            Dictionary that includes the influence probabilities k_{ij} for each (i,j) pair.
            Every user that appears anywhere in D has a key; users that are never
            a parent map to an empty dict.


    [1] K. Saito, R. Nakano, and M. Kimura, ``Prediction of Information Diffusion Probabilities
    for Independent Cascade Model'', in International Conference on Knowledge-Based and Intelligent
    Information and Engineering Systems, vol. 5179, 2008, pp. 67-75.
    """

    # ======================== INITIALIZE ========================

    k = defaultdict(dict)       # k[u][w]: current estimate for the pair (u, w)
    Splus = defaultdict(dict)   # Splus[u][w]: episodes in which w directly follows u
    # Only the size of the "minus" sets enters the update, so keep counts rather
    # than lists of episodes.
    minus_count = defaultdict(lambda: defaultdict(int))

    for s, levels in D.items():
        for parents, children in _level_pairs(levels):
            for w in children:
                for u in parents:
                    # A fresh draw on every occurrence keeps the sequence of
                    # random numbers (and so seeded results) unchanged.
                    k[u][w] = random.uniform(0, 1)
                    Splus[u].setdefault(w, []).append(s)

    for s, levels in D.items():
        for u in [user for members in levels.values() for user in members]:
            # Looking up k[u] on the defaultdict also registers users that are
            # never a parent (with an empty dict); this is part of the result.
            for w in k[u]:
                if s not in Splus[u][w]:
                    minus_count[u][w] += 1

    # ======================== START ========================

    prev_k = prev_P = None
    while True:
        # Step 1 ==== UPDATE VALUES ====
        # P[w][s] = 1 - prod(1 - k[u][w]) over the parents u of w in episode s.
        P = defaultdict(dict)
        for s, levels in D.items():
            for parents, children in _level_pairs(levels):
                for w in children:
                    pr = 1
                    for u in parents:
                        pr *= (1 - k[u][w])
                    P[w][s] = 1 - pr

        for u, targets in k.items():
            for w in targets:
                plus = Splus[u][w]
                targets[w] = (1 / (len(plus) + minus_count[u][w])) * sum(targets[w] / P[w][s] for s in plus)

        # Step 2 ==== CHECK CONVERGENCE ====
        cur_k = np.array([value for inner in k.values() for value in inner.values()])
        cur_P = np.array([value for inner in P.values() for value in inner.values()])
        if prev_k is not None:
            # The first pass only records a baseline; comparisons start with the second.
            changek = np.linalg.norm(cur_k - prev_k)
            changeP = np.linalg.norm(cur_P - prev_P)
            if changek < eps and changeP < eps:
                break
        prev_k, prev_P = cur_k, cur_P
    # ======================== END ========================
    return k

In [None]:
# Seed Python's `random` module, which saito() uses to draw the initial k values,
# so that repeated runs start from the same initialisation.
random.seed(10)
# Convergence threshold passed to saito().
eps = 10**-3
k = saito(eps, D)

In [None]:
# Persist the inferred values. The context manager flushes and closes the file
# even if pickling raises, so the output is never left half-written and open.
with open("./extracted/k_saito_"+ str(lines) + ".p", "wb") as f:
    pickle.dump(k, f)