In [1]:
## Imports

import pandas as pd
import networkx as nx
import itertools
import numpy as np

In [2]:
## Helper function used to parse wikipedia data

def custom_parsing(path):
    """Read a vote file and return its signed edges as a DataFrame.

    The file is scanned line by line. Lines are split on whitespace and
    dispatched on their first token:

    * ``U`` lines: the second token is stored as the current nominated user.
    * ``V`` lines: the second token is the vote sign and the third token is
      the voter id. Votes with sign 0 are skipped; every other vote yields
      one ``[voter, nominated user, sign]`` row.
    * Blank lines and lines starting with any other token are ignored.

    Parameters
    ----------
    path : str
        Location of the file to parse.

    Returns
    -------
    pandas.DataFrame
        Columns ``FromNodeId`` (voter), ``ToNodeId`` (nominated user) and
        ``Sign``, in file order.
    """
    edges = []

    ## Nominated user the upcoming "V" lines refer to (None until a "U" line is seen)
    nominee = None

    ## encoding="iso8859_16" is used because plain "UTF-8" produced decoding errors
    with open(path, 'r', encoding="iso8859_16") as handle:
        for raw_line in handle:
            fields = raw_line.split()

            ## Nothing to do for blank lines
            if not fields:
                continue

            tag = fields[0]

            if tag == 'U':
                ## "U" line: remember the id of the nominated user
                nominee = int(fields[1])
            elif tag == 'V':
                ## "V" line: the sign of the vote comes first, then the voter id
                vote = int(fields[1])
                voter = int(fields[2])

                ## Neutral votes are not kept
                if vote != 0:
                    edges.append([voter, nominee, vote])

            ## Every other kind of line is ignored

    ## Turn the collected rows into a dataframe with the expected column names
    return pd.DataFrame(edges, columns=['FromNodeId', 'ToNodeId', 'Sign'])

In [3]:
## Load the three signed edge lists; each one is ordered by (FromNodeId, ToNodeId)
## and re-indexed from 0

## Tab-separated file, "#" lines are comments, no header row
epinions_df = (
    pd.read_csv("data/soc-sign-epinions.txt", sep="\t", header=None,
                comment="#", names=['FromNodeId', 'ToNodeId', 'Sign'])
    .sort_values(by=["FromNodeId", "ToNodeId"])
    .reset_index(drop=True)
)

## Same layout as the Epinions file
slashdot_df = (
    pd.read_csv("data/soc-sign-Slashdot090221.txt", sep="\t", header=None,
                comment="#", names=['FromNodeId', 'ToNodeId', 'Sign'])
    .sort_values(by=["FromNodeId", "ToNodeId"])
    .reset_index(drop=True)
)

## This file is not a plain table, so it goes through the custom parser
wikipedia_df = (
    custom_parsing("data/wikiElec.ElecBs3.txt")
    .sort_values(by=["FromNodeId", "ToNodeId"])
    .reset_index(drop=True)
)

In [4]:
## Build one directed graph per dataset (same order as the names on the left);
## the vote sign is stored on each edge under the "Sign" attribute
epinions_graph, slashdot_graph, wikipedia_graph = (
    nx.from_pandas_edgelist(edge_list, source="FromNodeId", target="ToNodeId",
                            edge_attr="Sign", create_using=nx.DiGraph)
    for edge_list in (epinions_df, slashdot_df, wikipedia_df)
)

In [26]:
## Row labels of the census table: 't1' ... 't16'
c_links_names = ['t' + str(type_number) for type_number in range(1, 17)]

## All-zero table with one row per label and one column per sign
census = pd.DataFrame(0, columns=['+', '-'], index=pd.Index(c_links_names))

In [27]:
## Display the freshly initialised (all-zero) census table
census

Unnamed: 0,+,-
t1,0,0
t2,0,0
t3,0,0
t4,0,0
t5,0,0
t6,0,0
t7,0,0
t8,0,0
t9,0,0
t10,0,0


In [53]:
RELEVANT_TRIADS = ['030T', '120U', '120C', '210', '300']

## Lookup table from a pair of signed edges to its type name ('t1' ... 't16').
##
## A key is the sorted tuple of two (direction, sign) items:
##   - the first item describes the edge between v and w ('vw' or 'wv'),
##   - the second item describes the edge between u and w ('wu' or 'uw'),
##   - a two-letter direction 'xy' stands for the edge x -> y, sign is 1 or -1.
## Types are numbered row by row: the v/w edge picks the group of four,
## the u/w edge picks the position inside that group.
C_LINKS_TYPES = {
    tuple(sorted([vw_side, uw_side])): f't{4 * group + offset + 1}'
    for group, vw_side in enumerate([('vw', 1), ('vw', -1), ('wv', 1), ('wv', -1)])
    for offset, uw_side in enumerate([('wu', 1), ('wu', -1), ('uw', 1), ('uw', -1)])
}


def compute_c_link_types(graph, v, u, w):
    """Return the type name(s) of the two-edge context that w forms with v and u.

    For each of the pairs (v, w) and (u, w) the existing edge direction(s)
    are collected:

    * both directions exist -> both are candidates,
    * only the edge towards w exists -> that edge is the single candidate,
    * otherwise -> the edge leaving w is taken as the single candidate
      (it is assumed to exist; if it does not, the sign lookup below raises
      ``KeyError``).

    Every way of picking one candidate per pair is then looked up in
    ``C_LINKS_TYPES``, so the result holds 1, 2 or 4 names.

    Parameters
    ----------
    graph : mapping of mappings
        ``graph[a]`` gives the successors of ``a`` and ``graph[a][b]['Sign']``
        the sign of the edge a -> b.
    v, u, w : nodes of ``graph``

    Returns
    -------
    list of str
        Type names, ordered with the (v, w) choice varying slowest.
    """
    ## Which of the four possible edges are present
    has_vw = w in graph[v]
    has_wv = v in graph[w]
    has_uw = w in graph[u]
    has_wu = u in graph[w]

    ## One list of candidate edges (source, target, direction label) per pair
    pair_options = []
    for towards_w, from_w, has_towards, has_from in (
        ((v, w, 'vw'), (w, v, 'wv'), has_vw, has_wv),
        ((u, w, 'uw'), (w, u, 'wu'), has_uw, has_wu),
    ):
        if has_towards and has_from:
            pair_options.append([towards_w, from_w])
        elif has_towards:
            pair_options.append([towards_w])
        else:
            pair_options.append([from_w])

    ## Each element of the product picks one edge for (v, w) and one for (u, w);
    ## the sorted (direction, sign) items form the key into the lookup table
    return [
        C_LINKS_TYPES[tuple(sorted(
            (label, graph[source][target]['Sign'])
            for source, target, label in chosen_edges
        ))]
        for chosen_edges in itertools.product(*pair_options)
    ]


def c_links_census(graph):
    """Count, per context type and per sign, the contexts found around edges.

    For every edge v -> u and every third node w adjacent (in either
    direction) to both v and u, ``compute_c_link_types`` gives the type
    name(s) of the context; each name adds one to the row of that type, in
    column '+' when the sign of v -> u equals 1 and in column '-' otherwise.

    Only edges with u enumerated after v, and third nodes w enumerated after
    u, are considered (positions are taken from the iteration order of
    ``graph``).
    NOTE(review): this ordering filter skips every edge/context that fails
    the test - confirm that this is intended for a per-edge census.

    Parameters
    ----------
    graph : directed graph
        Must provide iteration over nodes, ``graph.succ[x]``,
        ``graph.pred[x]`` and ``graph[a][b]['Sign']``.

    Returns
    -------
    census : pandas.DataFrame
        Index 't1' ... 't16', integer columns '+' and '-'.
    census_edges : dict
        Maps each type name to the set of (v, u) edges counted for it.
    """
    c_links_names = [f't{i}' for i in range(1, 17)]

    ## Plain dict counters: the table is built once at the end instead of being
    ## updated through chained indexing (`frame.loc[row][col] += 1`), which can
    ## write to a temporary object and leave the table unchanged
    counts = {name: {'+': 0, '-': 0} for name in c_links_names}
    census_edges = {name: set() for name in c_links_names}

    ## Position of each node in the iteration order of the graph
    m = {v: i for i, v in enumerate(graph)}

    for v in graph:
        v_successors = set(graph.succ[v])
        v_adjacent = v_successors | set(graph.pred[v])

        for u in v_successors:
            if m[u] <= m[v]:
                continue

            ## The sign of v -> u is the same for every context of this edge
            sign_idx = '+' if graph[v][u]['Sign'] == 1 else '-'

            ## Third nodes linked (in any direction) to both v and u
            u_adjacent = set(graph.succ[u]) | set(graph.pred[u])
            neighbors = (v_adjacent & u_adjacent) - {u, v}

            for w in neighbors:
                if m[u] < m[w]:
                    for c_link_type in compute_c_link_types(graph, v, u, w):
                        counts[c_link_type][sign_idx] += 1
                        census_edges[c_link_type].add((v, u))

    census = pd.DataFrame(
        [[counts[name]['+'], counts[name]['-']] for name in c_links_names],
        index=pd.Index(c_links_names),
        columns=['+', '-'],
    )

    return census, census_edges

In [54]:
## Run the census on the Epinions graph: the first result is the per-type table of
## '+' / '-' counts, the second maps each type to the set of (v, u) edges counted for it
c_link_census_epinions, edges_per_c_link = c_links_census(epinions_graph)

In [55]:
## Display the Epinions census table
c_link_census_epinions

Unnamed: 0,+,-
t1,1308071,18558
t2,26717,31654
t3,1302381,25752
t4,33647,5131
t5,49359,82704
t6,7582,13710
t7,21420,67587
t8,63042,21874
t9,1879820,94867
t10,22217,45328


In [56]:
def compute_baseline(graph, list_of_c_links, gen_or_rec):
    """Sum, over a list of edges, the share of positive edges at one endpoint.

    For each (v, u) in ``list_of_c_links``:

    * ``'generative'``: the share of edges leaving v whose sign equals 1,
    * ``'receptive'``: the share of edges entering u whose sign equals 1.

    The share of a node is computed once and reused for every edge that
    refers to the same node, and edges are counted without building
    intermediate lists.

    Parameters
    ----------
    graph : directed graph
        Must provide ``graph.succ[x]``, ``graph.pred[x]`` and
        ``graph[a][b]['Sign']``.
    list_of_c_links : iterable of (v, u) pairs
    gen_or_rec : {'generative', 'receptive'}

    Returns
    -------
    int or float
        Sum of the per-edge shares (``0`` when the iterable is empty).

    Raises
    ------
    ValueError
        If ``gen_or_rec`` is neither ``'generative'`` nor ``'receptive'``.
    ZeroDivisionError
        If the relevant endpoint has no outgoing (generative) or incoming
        (receptive) edge.
    """
    if gen_or_rec not in ('generative', 'receptive'):
        raise ValueError('Impossible value for gen_or_rec argument !')

    use_out_edges = gen_or_rec == 'generative'

    ## node -> share of positive edges, filled on first use
    positive_share = {}

    sum_of_baselines = 0

    for c_link in list_of_c_links:
        ## Source of the edge for the generative view, target for the receptive view
        node = c_link[0] if use_out_edges else c_link[1]

        if node not in positive_share:
            if use_out_edges:
                neighbours = graph.succ[node]
                total_positive_edges = sum(
                    1 for successor in neighbours
                    if graph[node][successor]['Sign'] == 1
                )
            else:
                neighbours = graph.pred[node]
                total_positive_edges = sum(
                    1 for predecessor in neighbours
                    if graph[predecessor][node]['Sign'] == 1
                )

            positive_share[node] = total_positive_edges / len(neighbours)

        sum_of_baselines += positive_share[node]

    return sum_of_baselines

## One row per context type, one (initially empty) column per kind of baseline
c_links_names = list(edges_per_c_link.keys())
baselines = pd.DataFrame(index = pd.Index(c_links_names), columns = ['generative', 'receptive']) 

for c_link_type in edges_per_c_link.keys():    
    list_of_c_links = list(edges_per_c_link[c_link_type])
    
    print(80 * '=')
    print(c_link_type)
    print('Beginning generative baseline ...')
    
    ## A single `.loc[row, column]` call writes into `baselines` itself; the chained
    ## form `.loc[row][column] = ...` may assign to a temporary object instead
    baselines.loc[c_link_type, 'generative'] = compute_baseline(epinions_graph, list_of_c_links, 'generative')
    
    print('Generative baselines finished !')
    print('Beginning receptive baseline ...')
    
    baselines.loc[c_link_type, 'receptive'] = compute_baseline(epinions_graph, list_of_c_links, 'receptive')
    
    print('Receptive baseline finished !')
    print(80 * '=')

t1
Beginning generative baseline ...
Generative baselines finished !
Beginning receptive baseline ...
Receptive baseline finished !
t2
Beginning generative baseline ...
Generative baselines finished !
Beginning receptive baseline ...
Receptive baseline finished !
t3
Beginning generative baseline ...
Generative baselines finished !
Beginning receptive baseline ...
Receptive baseline finished !
t4
Beginning generative baseline ...
Generative baselines finished !
Beginning receptive baseline ...
Receptive baseline finished !
t5
Beginning generative baseline ...
Generative baselines finished !
Beginning receptive baseline ...
Receptive baseline finished !
t6
Beginning generative baseline ...
Generative baselines finished !
Beginning receptive baseline ...
Receptive baseline finished !
t7
Beginning generative baseline ...
Generative baselines finished !
Beginning receptive baseline ...
Receptive baseline finished !
t8
Beginning generative baseline ...
Generative baselines finished !
Beginni

In [89]:
## Cast both baseline columns to float (the frame was created empty, without an explicit dtype)
baselines = baselines.astype(float)

In [97]:
## Work on a copy: adding columns to the census frame itself would silently change
## the table displayed earlier and make the result depend on how often cells were run
datafr = c_link_census_epinions.copy()

## Total number of counted contexts per type, and the fraction of them with sign '+'
datafr['count'] = datafr['+'] + datafr['-']
datafr['p(+)'] = datafr['+'] / datafr['count']
datafr

Unnamed: 0,+,-,count,p(+)
t1,1308071,18558,1326629,0.986011
t2,26717,31654,58371,0.45771
t3,1302381,25752,1328133,0.98061
t4,33647,5131,38778,0.867683
t5,49359,82704,132063,0.373753
t6,7582,13710,21292,0.356096
t7,21420,67587,89007,0.240655
t8,63042,21874,84916,0.742404
t9,1879820,94867,1974687,0.951958
t10,22217,45328,67545,0.328921


In [98]:
## Append the baseline columns. Any baseline column already present is dropped first,
## so re-running this cell replaces the columns instead of duplicating them
datafr = pd.concat(
    [datafr.drop(columns=baselines.columns, errors='ignore'), baselines],
    axis=1,
)
datafr

Unnamed: 0,+,-,count,p(+),generative,receptive
t1,1308071,18558,1326629,0.986011,111321.119852,117893.7749
t2,26717,31654,58371,0.45771,15729.166791,15039.086481
t3,1302381,25752,1328133,0.98061,97317.230153,103410.663906
t4,33647,5131,38778,0.867683,10812.877387,11945.291535
t5,49359,82704,132063,0.373753,12637.912432,21845.810968
t6,7582,13710,21292,0.356096,3798.576457,6545.755823
t7,21420,67587,89007,0.240655,7864.566209,14372.535243
t8,63042,21874,84916,0.742404,10262.996304,16267.418979
t9,1879820,94867,1974687,0.951958,122365.296161,133796.271486
t10,22217,45328,67545,0.328921,15190.799857,15501.347066


In [100]:
## s_g / s_r: (observed number of '+' contexts - baseline sum), divided by
## sqrt(baseline * (1 - baseline / count)), rounded to one decimal.
## NOTE(review): 'generative' / 'receptive' were summed over the distinct (v, u)
## edges kept per type, while 'count' tallies every counted context - confirm that
## both terms are meant to cover the same population.
datafr['s_g'] = (
    (datafr['count'] * datafr['p(+)'] - datafr['generative'])
    / np.sqrt(datafr['generative'] * (1 - (datafr['generative'] / datafr['count'])))
).round(1)

datafr['s_r'] = (
    (datafr['count'] * datafr['p(+)'] - datafr['receptive'])
    / np.sqrt(datafr['receptive'] * (1 - (datafr['receptive'] / datafr['count'])))
).round(1)

In [101]:
## Predicted sign (+1 / -1) for t1 ... t16, one list per column added below.
## The "B" / "S" prefixes presumably stand for balance and status theory - TODO confirm.

## B lists: the same 8-value pattern for t1-t8 and t9-t16, identical for "g" and "r"
pred_B_g = [1, -1, 1, -1, -1, 1, -1, 1] * 2
pred_B_r = [1, -1, 1, -1, -1, 1, -1, 1] * 2

## S, "g" variant: the same 4-value pattern inside every group of four types
pred_S_g = [1, -1, -1, 1] * 4

## S, "r" variant: constant inside each group of four types
pred_S_r = [1] * 4 + [-1] * 8 + [1] * 4

## Store the four prediction lists as columns (row i receives element i of each list)
datafr['pred_B_g'], datafr['pred_B_r'] = pred_B_g, pred_B_r
datafr['pred_S_g'], datafr['pred_S_r'] = pred_S_g, pred_S_r


In [103]:
## A flag is True when the predicted sign and the matching surprise value have the
## same sign, i.e. when their product is strictly positive
datafr['B_g'] = (datafr['pred_B_g'] * datafr['s_g']).gt(0)
datafr['B_r'] = (datafr['pred_B_r'] * datafr['s_r']).gt(0)
datafr['S_g'] = (datafr['pred_S_g'] * datafr['s_g']).gt(0)
datafr['S_r'] = (datafr['pred_S_r'] * datafr['s_r']).gt(0)

In [104]:
## Keep only the columns of the summary table, in display order
final = datafr.loc[:, [
    'count', 'p(+)',
    's_g', 's_r',
    'B_g', 'B_r', 'S_g', 'S_r',
]]

In [105]:
## Display the summary table
final

Unnamed: 0,count,p(+),s_g,s_r,B_g,B_r,S_g,S_r
t1,1326629,0.986011,3747.5,3631.4,True,True,True,True
t2,58371,0.45771,102.5,110.5,False,False,False,True
t3,1328133,0.98061,4012.7,3882.6,True,True,False,True
t4,38778,0.867683,258.6,238.7,False,False,True,True
t5,132063,0.373753,343.5,203.8,False,False,True,False
t6,21292,0.356096,67.7,15.4,True,True,False,False
t7,89007,0.240655,160.1,64.2,False,False,False,False
t8,84916,0.742404,555.6,407.9,True,True,True,False
t9,1974687,0.951958,5187.4,4943.8,True,True,True,False
t10,67545,0.328921,64.8,61.4,False,False,False,False


In [10]:
# new_index = pd.Index([f't{i}' for i in range(1, 17)])

# datafr.set_index(new_index, inplace=True, drop=True)
