In [1]:
import pickle
import pandas as pd
import numpy as np
import scipy
from scipy import stats
import collections
from collections import defaultdict
import random
import copy
import math
import itertools
from scipy.optimize import linprog
from pulp import LpMaximize, LpProblem, LpStatus, lpSum, LpVariable
import time
import pulp
from collections import Counter
import matplotlib.pyplot as plt
import matplotlib as mpl
import networkx as nx

In [2]:
lines = 224667
dataset = 'covid'
random.seed(10)

In [3]:
E = pickle.load(open("./Extracted/"+dataset+"/E"+ str(lines) + ".p", "rb"))

In [4]:
for tweet in E:
    E[tweet] = list(dict.fromkeys(E[tweet]))

In [5]:
D = pickle.load(open("./Extracted/"+dataset+"/D"+ str(lines) + ".p", "rb"))

In [7]:
S = pickle.load(open("./Extracted/"+dataset+"/S"+ str(lines) + ".p", "rb"))

In [8]:
U = pickle.load(open("./Extracted/"+dataset+"/U"+ str(lines) + ".p", "rb"))
U = list(U)

In [30]:
M = pickle.load(open("./Extracted/"+dataset+"/M_d"+ str(lines) + ".p", "rb"))

In [31]:
N = len(U)
print('Number of unique users N =', N)

Number of unique users N = 44602


In [11]:
print('Number of Episodes (original tweets) S =',len(E))

Number of Episodes (original tweets) S = 9722


In [12]:
active_pairs_n = 0 
for i in M:
    active_pairs_n+=len(M[i])

In [14]:
print('Number of active user pairs in the trace:', active_pairs_n, 'out of the', N*(N-1), 'possible pairs')

Number of active user pairs in the trace: 41020780 out of the 1989293802 possible pairs


In [16]:
def update_s(model, M, Q, x, a, b, s, active_pairs, lam):
    """ 
    This function updates the s_{ij} parameters of the optimization problemm

    Parameters
    ----------
    M : dict
        Dictionary with M_{ij} values tha show how many times an ordered pair (i,j) appears in the trace.
    Q : dict
        Dictionary with the Q_{ij} values
    W : dict
        Dictionary with the coefficients of the problem 
    x : dict
        Dictionary with decision variables
    active_pairs : list
        Active pairs of the problem
    s : dict
        Dictionary with the updated s_{ij} parameters
        
    Returns
    ----------
    s : dict
        Dictionary with the updated s_{ij} parameters
    """
    
    W = defaultdict(dict)

    for pair in active_pairs:
        i = pair[0]
        j = pair[1]
        W[i][j] = M[i][j]*((Q[i][j]*math.log(a/(1-a)))+ (1-Q[i][j])*math.log(b/(1-b))) + random.uniform(0,0.001)

    s = pulp_solve(model, active_pairs, W, x, s, lam)
    return(s, W)

In [34]:
def update_s(model, M, Q, x, a, b, s, active_pairs, lam, l_dual):
    """ 
    This function updates the s_{ij} parameters of the optimization problemm

    Parameters
    ----------
    M : dict
        Dictionary with M_{ij} values tha show how many times an ordered pair (i,j) appears in the trace.
    Q : dict
        Dictionary with the Q_{ij} values
    W : dict
        Dictionary with the coefficients of the problem 
    x : dict
        Dictionary with decision variables
    active_pairs : list
        Active pairs of the problem
    s : dict
        Dictionary with the updated s_{ij} parameters
        
    Returns
    ----------
    s : dict
        Dictionary with the updated s_{ij} parameters
    """
    
    W = defaultdict(dict)

    for pair in active_pairs:
        i = pair[0]
        j = pair[1]
        W[i][j] = M[i][j]*((Q[i][j]*math.log(a/(1-a)))+ (1-Q[i][j])*math.log(b/(1-b))) + random.uniform(0,0.001)

    s, l_dual = pulp_solve(model, active_pairs, W, x, s, lam, l_dual)
    return(s, W, l_dual)

In [18]:
def pulp_solve(model, active_pairs, W, x, s, lam, l_dual):
    """ 
    This function solves the optimization problem with PULP.
    
    Parameters
    ----------
    model : PulpModel
        Model initialized from pulp
    active_pairs : list
        Active pairs of the problem
    W : dict
        Dictionary with the coefficients of the problem 
    x : dict
        Dictionary with decision variables
    lam : float
        Lambda value by which we decreace the W's
    
    Returns
    ----------
    s : dict
        Dictionary with found s_{ij} parameters
    
    """
    
    maxv = max((set(max(list(i.values()) for i in W.values()))))
    c = maxv + 0.0001
    
    sm = dict()
    for i in W:
        sm[i] = dict() 
        for j in W[i]:
            if j not in sm[i]: sm[i][j]=0
            sm[i][j]+=(W[i][j]-c + sum(l_dual[j][e] for e in l_dual[j]))
    
    for i in sm:
        for j in sm[i]:
            if sm[i][j] > 0: s[i][j] = 1
            else: s[i][j] = 0
    
    sm2 = dict()
    for s in D:
        for users_list in list(D[s].values()):
            index_now = list(D[s].values()).index(users_list)
            if index_now != 0:
                for j in users_list:
                        u_before_l = list(D[s].values())[:index_now]
                        u_before = [item for sublist in u_before_l for item in sublist]
                        if j in u_before: u_before.remove(j)
                        for i in u_before:
                            sm2[j][s]+=(s[i][j]-1)
    for j in sm2:
        for s in sm2[j]:
            if sm2[j][s] > 0: l_dual[j][s] = 1
            else: l_dual[j][s] = 1                       
    
    return s, l_dual
                    

In [37]:
def pulp_create_reduct(E, M):
    """ 
    This function initializes the optimization problem with reduced constraints. Runs the first time

    Returns
    ----------
    model : LpProblem
        Pulp model of the problem 
    x : Dict
        Dictionary with decision variables
    """
    
    # Initialize the maximization problem
    model = LpProblem(name="constr-newman", sense=LpMaximize)
    
    # For each episode in $E$, each line in constraints_list includes:
    # - for each user j in $E$, the users before them in pos 0 and 
    # - the user itself in pos 1
    
    sij = defaultdict(dict)
    constraints_list = []
    
    l_dual = dict()
    for s in D:
        for users_list in list(D[s].values()):
            index_now = list(D[s].values()).index(users_list)
            if index_now == 0:
                for u in list(D[s].values())[1]:
                    sij[D[s][0][0]][u] = 1
            else:
                for j in users_list:
                    i_before_l = list(D[s].values())[:index_now]
                    i_before = [item for sublist in i_before_l for item in sublist]
                    if j in i_before: i_before.remove(j)
                    constraints_list.append([i_before, j])
                    if j not in l_dual: l_dual[j] = dict()
                    l_dual[j][s] = random.uniform(0,50)

    len_b = len(constraints_list)
    print('constraints before', len(constraints_list))

    # phase I: delete all constraints that include a pair with sij = 1
    
#     for c in list(constraints_list):
#         j = c[1]
#         for i in c[0]:
#             if i in sij and j in sij[i] and sij[i][j]==1:
#                 constraints_list.remove(c)
#                 break
                
    # For each episode in $E$, the constraints_dictionary includes:
    # - the user j as keys
    # - the users that come before them for each constraint that they appear in pos j as values
    
    constraints_list.sort()
    constraints_list = list(constraints_list for constraints_list,_ in itertools.groupby(constraints_list))
    constraints_dict = dict()

    ind = 0 
    for c in constraints_list:
            j = c[1]
            if j in constraints_dict:
                constraints_dict[j].append([c[0],ind])
            else:
                constraints_dict[j] = []
                constraints_dict[j].append([c[0],ind])
            ind+=1

#     for j in constraints_dict:
#         for constraint1 in constraints_dict[j]:
#             for constraint2 in constraints_dict[j]:
#                 if constraint1[1]!=constraint2[1]:
#                     if set(constraint1[0]).issubset(constraint2[0]):
#                         if constraint1[0]!=constraint2[0] and collections.Counter(constraint1[0]) != collections.Counter(constraint2[0]): 
#                             if [constraint2[0],j] in constraints_list: # we may have already deleted it
#                                 constraints_list.remove([constraint2[0],j])
                    
#     len_a = len(constraints_list)          
    
#     print('constraints after', len(constraints_list))
#     print('change of constraints:', abs(len_a - len_b))
    
    # Add decision variables 
    # Create active pairs and sij's randomly 
    
    active_pairs = []
    for i in M:
        for j in M[i]:
            if i not in sij or j not in sij[i]:
                active_pairs.append((i,j))
                sij[i][j] = random.uniform(0, 1)

    active_pairs = list(set(active_pairs))
        
    print('Active pairs:', len(active_pairs))
        
    return model, x, active_pairs, sij, l_dual

In [19]:
def newman(eps, N, M, active_pairs_n, lam):
        """ 
        This function is the main algorithm for path inference with constraints. 

        Parameters
        ----------
        rep : int 
            Number of times to run the algorithm.
        N : int
            Number of users in the trace.
        M : dict
            Dictionary with M_{ij} values tha show how many times an ordered pair (i,j) appears in the trace.

        Returns
        ----------

        """

        iterat = 1
        # ======================== INITIALIZE ========================
        print('Initialize...')

        # 0.1 -- Initialize Q dictionary 

        Q = defaultdict(dict)

        # 0.2 -- Initialize a, b, r $ parameters
        a = random.uniform(0.5, 1)
        b = random.uniform(0, 0.5)
        r = random.uniform(0, 1)

#         a= 0.9999 
#         b= 0.0001 
#         r= 0.003

        print('Initial a=', a,'b=', b, 'r=', r)

        print('Create pulp model with reduced constraints and variables...')
        # Create pulp model, one time
        model, x, active_pairs, s, l_dual = pulp_create_reduct(E, M)
        
        # ======================== START algorithm ========================

        while True: # Repeat until convergence
            # Step 1 ==== UPDATE VALUES ====

            # 1.1 -- Update Q dictionary
            
            Q = update_Q(M, Q, s, a, b, r)
            # 1.2 -- Update a,b,r,s_{ij} parameters according to Equations
            
            a = update_a(M, Q, s)
            b = update_b(M, Q, s)
            r = update_r(Q, N, active_pairs_n)

            if a ==1: 
                a = 0.9999999
                flag = True
            if b ==0: 
                b = 0.0001
                flag = True
            
            s, W  = update_s(model, M, Q, x, a, b, s, active_pairs, lam, l_dual)

            # Step 2 ==== CHECK CONVERGENCE ====
            
            if iterat > 1:
                new_q = np.array(flatten(Q))
                new_a = a
                new_b = b
                
                cost = sum(s[i][j]*M[i][j]*(Q[i][j]*math.log(a/(1-a))+(1-Q[i][j])*math.log(b/(1-b))) for i in s for j in s[i])
                print('Objective cost:', cost)
                new_cost = cost
                
                changeq = np.linalg.norm(new_q - old_q)  
                changea = abs(new_a-old_a)
                changeb = abs(new_b-old_b)
                changecost = abs(new_cost - old_cost)

                if changeq<eps:
                    print('-----')
                    print('- Change q:', changeq, 'Change a:', changea, 'Change b:', changeb)
                    print('- Changecost:', changecost)
                    print('- a=', a,'b=',b,'r=', r)
                    print('cost:', cost)
                
                    Q_old = copy.deepcopy(Q)
                    old_q = np.array(flatten(Q_old))
                    old_a = a
                    old_b = b
                    old_cost = cost        
                    break
                else: 
                    print('-----')
                    print('- Change q:', changeq, 'Change a:', changea, 'Change b:', changeb)
                    print('- Changecost:', changecost)
                    print('- a=', a,'b=',b,'r=', r)
                    print('cost:', cost)
                
                    Q_old = copy.deepcopy(Q)
                    old_q = np.array(flatten(Q_old))
                    old_a = a
                    old_b = b
                    old_cost = cost
                    
                    iterat += 1
                    
            if iterat == 1:
                flag = False
                old_q = np.array(flatten(Q))
                old_a = a
                old_b = b
                
                cost = sum(s[i][j]*M[i][j]*(Q[i][j]*math.log(a/(1-a))+(1-Q[i][j])*math.log(b/(1-b))) for i in s for j in s[i])
                old_cost = cost
                
                iterat+=1
                
        # ======================== END algorithm ========================
    
        return iterat, W, a, b, r, s, Q, cost

In [20]:
start = time.time()
random.seed(10)
print('------------------SEED:', i)
eps = 0.001
lam = 1
iterat, W, a, b, r, s, Q, cost = newman(eps, N, M, active_pairs_n, lam)
end = time.time()
print('Time:', end-start)

------------------SEED: 1197612772160344064
Initialize...
Initial a= 0.7857012973449568 b= 0.2144445273375573 r= 0.5780913011344704
Create pulp model with reduced constraints and variables...


NameError: name 'pulp_create_reduct' is not defined