In [2]:
## Imports

import pandas as pd
import networkx as nx
import itertools
import numpy as np

### Load the data

In [3]:
## Title hyperlinks
title_links = pd.read_csv("Data/soc-redditHyperlinks-title.tsv",'\t')
## Body hyperlinks
body_links = pd.read_csv("Data/soc-redditHyperlinks-body.tsv",'\t')

### Take a look at the data

In [5]:
title_links.head()

Unnamed: 0,SOURCE_SUBREDDIT,TARGET_SUBREDDIT,POST_ID,TIMESTAMP,LINK_SENTIMENT,PROPERTIES
0,rddtgaming,rddtrust,1u4pzzs,2013-12-31 16:39:18,1,"25.0,23.0,0.76,0.0,0.44,0.12,0.12,4.0,4.0,0.0,..."
1,xboxone,battlefield_4,1u4tmfs,2013-12-31 17:59:11,1,"100.0,88.0,0.78,0.02,0.08,0.13,0.07,16.0,16.0,..."
2,ps4,battlefield_4,1u4tmos,2013-12-31 17:59:40,1,"100.0,88.0,0.78,0.02,0.08,0.13,0.07,16.0,16.0,..."
3,fitnesscirclejerk,leangains,1u50xfs,2013-12-31 19:01:56,1,"49.0,43.0,0.775510204082,0.0,0.265306122449,0...."
4,fitnesscirclejerk,lifeprotips,1u51nps,2013-12-31 21:02:28,1,"14.0,14.0,0.785714285714,0.0,0.428571428571,0...."


In [6]:
body_links.head()

Unnamed: 0,SOURCE_SUBREDDIT,TARGET_SUBREDDIT,POST_ID,TIMESTAMP,LINK_SENTIMENT,PROPERTIES
0,leagueoflegends,teamredditteams,1u4nrps,2013-12-31 16:39:58,1,"345.0,298.0,0.75652173913,0.0173913043478,0.08..."
1,theredlion,soccer,1u4qkd,2013-12-31 18:18:37,-1,"101.0,98.0,0.742574257426,0.019801980198,0.049..."
2,inlandempire,bikela,1u4qlzs,2014-01-01 14:54:35,1,"85.0,85.0,0.752941176471,0.0235294117647,0.082..."
3,nfl,cfb,1u4sjvs,2013-12-31 17:37:55,1,"1124.0,949.0,0.772241992883,0.0017793594306,0...."
4,playmygame,gamedev,1u4w5ss,2014-01-01 02:51:13,1,"715.0,622.0,0.777622377622,0.00699300699301,0...."


## the data is presented is as follows

- SOURCE_SUBREDDIT: the subreddit where the link originates
- TARGET_SUBREDDIT: the subreddit where the link ends
- POST_ID: the post in the source subreddit that starts the link
- TIMESTAMP: time time of the post
- POST_LABEL: label indicating if the source post is explicitly negative towards the target post. The value is -1 if the source is negative towards the target, and 1 if it is neutral or positive. The label is created using crowd-sourcing and training a text based classifier, and is better than simple sentiment analysis of the posts. Please see the reference paper for details.
- POST_PROPERTIES: a vector representing the text properties of the source post, listed as a list of comma separated numbers. The vector elements are the following:
    1. Number of characters
    2. Number of characters without counting white space
    3. Fraction of alphabetical characters
    4. Fraction of digits
    5. Fraction of uppercase characters
    6. Fraction of white spaces
    7. Fraction of special characters, such as comma, exclamation mark, etc.
    8. Number of words
    9. Number of unique works
    10. Number of long words (at least 6 characters)
    11. Average word length
    12. Number of unique stopwords
    13. Fraction of stopwords
    14. Number of sentences
    15. Number of long sentences (at least 10 words)
    16. Average number of characters per sentence
    17. Average number of words per sentence
    18. Automated readability index
    19. Positive sentiment calculated by VADER
    20. Negative sentiment calculated by VADER
    21. Compound sentiment calculated by VADER
    22. LIWC_Funct
    23. LIWC_Pronoun
    24. LIWC_Ppron
    25. LIWC_I
    26. LIWC_We
    27. LIWC_You
    28. LIWC_SheHe
    29. LIWC_They
    30. LIWC_Ipron
    31. LIWC_Article
    32. LIWC_Verbs
    33. LIWC_AuxVb
    34. LIWC_Past
    35. LIWC_Present
    36. LIWC_Future
    37. LIWC_Adverbs
    38. LIWC_Prep
    39. LIWC_Conj
    40. LIWC_Negate
    41. LIWC_Quant
    42. LIWC_Numbers
    43. LIWC_Swear
    44. LIWC_Social
    45. LIWC_Family
    46. LIWC_Friends
    47. LIWC_Humans
    48. LIWC_Affect
    49. LIWC_Posemo
    50. LIWC_Negemo
    51. LIWC_Anx
    52. LIWC_Anger
    53. LIWC_Sad
    54. LIWC_CogMech
    55. LIWC_Insight
    56. LIWC_Cause
    57. LIWC_Discrep
    58. LIWC_Tentat
    59. LIWC_Certain
    60. LIWC_Inhib
    61. LIWC_Incl
    62. LIWC_Excl
    63. LIWC_Percept
    64. LIWC_See
    65. LIWC_Hear
    66. LIWC_Feel
    67. LIWC_Bio
    68. LIWC_Body
    69. LIWC_Health
    70. LIWC_Sexual
    71. LIWC_Ingest
    72. LIWC_Relativ
    73. LIWC_Motion
    74. LIWC_Space
    75. LIWC_Time
    76. LIWC_Work
    77. LIWC_Achiev
    78. LIWC_Leisure
    79. LIWC_Home
    80. LIWC_Money
    81. LIWC_Relig
    82. LIWC_Death
    83. LIWC_Assent
    84. LIWC_Dissent
    85. LIWC_Nonflu
    86. LIWC_Filler    

###  Note that POST_PROPERTIES were constructed in order to assign a sign (-1 or 1) to the directed link betweet the SOURCE_SUBREDDIT and TARGET_SUBREDDIT.
### These signs are contained in POST_LABEL. Thus, we will basically need only those three features


In [7]:
## Drop unwanted features 
title_links = title_links[["SOURCE_SUBREDDIT","TARGET_SUBREDDIT","LINK_SENTIMENT"]]
body_links = body_links[["SOURCE_SUBREDDIT","TARGET_SUBREDDIT","LINK_SENTIMENT"]]

## Print some characteristics for Title links
print(20*'=')
print('Title links :')
title_source_subreddits = title_links.SOURCE_SUBREDDIT.values.tolist()
print(f'The number of links in the title links dataset:\t {len(title_source_subreddits)}')
title_source_set = set(title_source_subreddits)
print(f'The number of unique source subreddits: \t {len(title_source_set)}')
title_subreddits = set(title_links.SOURCE_SUBREDDIT.values.tolist() + title_links.TARGET_SUBREDDIT.values.tolist())
print(f'the set of all subreddits in the title dataset:\t {len(title_subreddits)}')
print(20*'=')

## Print some characteristics for Body links
print(20*'=')
print("Body links :")
body_source_subreddits = body_links.SOURCE_SUBREDDIT.values.tolist()
print(f'The number of links in the body links dataset: \t {len(body_source_subreddits)}')
body_source_set = set(body_source_subreddits)
print(f'The number of unique source subreddits : \t {len(body_source_set)}')
body_subreddits = set(body_links.SOURCE_SUBREDDIT.values.tolist() + body_links.TARGET_SUBREDDIT.values.tolist())
print(f'the set of all subreddits in the body dataset :\t {len(body_subreddits)}')
print(20*'=')

Title links :
The number of links in the title links dataset:	 571927
The number of unique source subreddits: 	 43695
the set of all subreddits in the title dataset:	 54075
Body links :
The number of links in the body links dataset: 	 286561
The number of unique source subreddits : 	 27863
the set of all subreddits in the body dataset :	 35776


In [11]:
## Here we concatenate the both dataframes to create a unique dataset of connexions between subreddits.
Data = pd.concat([title_links,body_links])

print("All subreddit links :")
source_subreddits = Data.SOURCE_SUBREDDIT.values.tolist()
print(f'The number of links in all data: \t {len(source_subreddits)}')
source_set = set(source_subreddits)
print(f'The number of unique source subreddits:  {len(source_set)}')
subreddits = set(Data.SOURCE_SUBREDDIT.values.tolist() + Data.TARGET_SUBREDDIT.values.tolist())
print(f'the set of all subreddits :\t\t {len(subreddits)}')

All subreddit links :
The number of links in all data: 	 858488
The number of unique source subreddits:  55863
the set of all subreddits :		 67180


In [12]:
## Let's see the number and pourcentages of positive and negatives links
number_pos_links = Data.where(Data.LINK_SENTIMENT==1).count()[0]
number_neg_links = Data.where(Data.LINK_SENTIMENT==-1).count()[0]
print(f'There is {number_pos_links} positive links and {number_neg_links} negative links')

print(f'REDDIT : Pourcentage of (+)edges is {round(number_pos_links*100/len(Data),1)}% and pourcentage of (-)edges is {round(number_neg_links*100/len(Data),1)}%')

There is 776278 positive links and 82210 negative links
REDDIT : Pourcentage of (+)edges is 90.4% and pourcentage of (-)edges is 9.6%


### Comparing these pourcentages to the ones of Epinions or Slashdot datasets, We can see that Reddit has a lot less negatives links than the others. We can state (hypothetize ?) that the interactions betweet subreddits are mostly positive and that negative links are especially for conflicts rather than just negative opinion/vote and of course, conflicts are much less likely.

Now let's create a graph of subreddits links

In [13]:
complete_graph = nx.from_pandas_edgelist(Data,source='SOURCE_SUBREDDIT', target="TARGET_SUBREDDIT", edge_attr = "LINK_SENTIMENT",create_using=nx.DiGraph)

nbr_nodes = complete_graph.number_of_nodes()
nbr_edges = complete_graph.number_of_edges()
print("REDDIT:    Nodes =",nbr_nodes," Edges =",nbr_edges)

sum_of_pos = sum(1 if w["LINK_SENTIMENT"]==1 else 0 for (_,_,w) in complete_graph.edges(data=True))
sum_of_neg = sum(1 if w["LINK_SENTIMENT"]==-1 else 0 for (_,_,w) in complete_graph.edges(data=True))
pourc_of_pos = round(100 *sum_of_pos/nbr_edges,1)
pourc_of_neg = round(100 *sum_of_neg/nbr_edges,1)
print("REDDIT : Pourcentage of (+)edges is {pos}% and pourcentage of (-)edges is {neg}%".format(pos = pourc_of_pos, neg= pourc_of_neg))

REDDIT:    Nodes = 67180  Edges = 339643
REDDIT : Pourcentage of (+)edges is 92.5% and pourcentage of (-)edges is 7.5%


We can see that this way of creating the graph led us to dropping an huge number of edges (from 858488 to 339643) while keeping the same number of nodes.

### Since our dataset contains a lot of duplicates edges (Some of those edges may be all positive or all negative or a combination of positive and negative signs). Creating our directed graph directly from those edges, will ommit these duplicates and will take into account only their last occurences. Thus , it will take only the last sign of the link betweet those two subreddits. This will alter our perspectives since in a case where all links but the LAST ONE , between subreddit A and subreddit B were negative, this generated graph will take only the positive link and we will be dropping very important information.

Let's try and generate a multiple directed graph (MultiDiGraph)

In [18]:
complete_multi_graph = nx.from_pandas_edgelist(Data,source='SOURCE_SUBREDDIT', target="TARGET_SUBREDDIT", edge_attr = "LINK_SENTIMENT",create_using=nx.MultiDiGraph)

nbr_nodes = complete_multi_graph.number_of_nodes()
nbr_edges = complete_multi_graph.number_of_edges()
print("REDDIT:    Nodes =",nbr_nodes," Edges =",nbr_edges)

sum_of_pos = sum(1 if w["LINK_SENTIMENT"]==1 else 0 for (_,_,w) in complete_multi_graph.edges(data=True))
sum_of_neg = sum(1 if w["LINK_SENTIMENT"]==-1 else 0 for (_,_,w) in complete_multi_graph.edges(data=True))
pourc_of_pos = round(100 *sum_of_pos/nbr_edges,1)
pourc_of_neg = round(100 *sum_of_neg/nbr_edges,1)
print("REDDIT : Pourcentage of (+)edges is {pos}% and pourcentage of (-)edges is {neg}%".format(pos = pourc_of_pos, neg= pourc_of_neg))

REDDIT:    Nodes = 67180  Edges = 858488
REDDIT : Pourcentage of (+)edges is 90.4% and pourcentage of (-)edges is 9.6%


### We can see now that this graph contains more informations about the data than the other one; It complete and have all edges and nodes

### Since one of our objectives is to compare the similarities and differences between the individuals datasets (Epinions, slashdot and wikipedia) and the communities dataset (Reddit) , apart from this methode of creating a multiple edges directed graph, we decides to generate another graph that , for each multiple signed edges between same two nodes, will create a single edge with the mean of all those signs as unique weight.

In [19]:
## Here we group by (source,target) but instead of keeping all weights, we sum them into one weight
groupedBySource_Target_mean = Data.groupby(['SOURCE_SUBREDDIT','TARGET_SUBREDDIT']).LINK_SENTIMENT.apply(lambda x: np.mean(x)).to_frame()
## The groupby function makes the pair (source,target) as index.
# To construct the graph, we need to have list of all sources , targets and signs in a dataframe 
listed_source = [ elem[0] for elem in groupedBySource_Target_mean.index]
listed_target = [ elem[1] for elem in groupedBySource_Target_mean.index]
listed_sign = [ elem[0] for elem in groupedBySource_Target_mean.values ]

data_mean = pd.DataFrame({"source":listed_source,"target":listed_target,"sign":listed_sign})

graph_mean = nx.from_pandas_edgelist(data_mean,source='source', target="target", edge_attr = "sign",create_using=nx.DiGraph)

In [29]:
print(f'This meaned graph contains {len(data_mean)} signed edges.')
print(f'The number of edges which mean is equal to 1 is {len(data_mean[data_mean.sign==1])}')
print(f'The number of edges which mean is equal to -1 is {len(data_mean[data_mean.sign==-1])}')

This meaned graph contains 339643 signed edges.
The number of edges which mean is equal to 1 is 298473
The number of edges which mean is equal to -1 is 18104


### We can see that the mean sign of the edges by majority 1 or -1. 
### From this, we decided to define "FRIENDS Communities" by the communities which only have **multiple** positive edges. and "ENEMIES communities" by the communities which only have multiple negative edges.

for this, we need to isolate the communities that have multiple edges between them : ie: we can't consider that two communities are enemies just by having only one negative link between them. so we just keep multiple linked communities

In [35]:
## We groupby the data using pairs (source,target) and collect all link signs between same pair in one attribute (a dictionary)
groupedBySource_Target_pair = Data.groupby(['SOURCE_SUBREDDIT','TARGET_SUBREDDIT']).LINK_SENTIMENT.apply(lambda x: list(x)).to_frame()
groupedBySource_Target_pair.LINK_SENTIMENT = groupedBySource_Target_pair.LINK_SENTIMENT.apply(lambda x: dict(zip(np.arange(len(x)), x)))
## The groupby function makes the pair (source,target) as index.
# To construct the graph, we need to have list of all sources , targets and signs in a dataframe 
listed_source_dict = [ elem[0] for elem in groupedBySource_Target_pair.index]
listed_target_dict = [ elem[1] for elem in groupedBySource_Target_pair.index]
listed_sign_dict = [ elem[0] for elem in groupedBySource_Target_pair.values ]

data_dict_df = pd.DataFrame({"source":listed_source_dict,"target":listed_target_dict,"sign":listed_sign_dict})

In [47]:
indices =[]
for i,elem in zip(range(len(data_dict_df.sign.values)),data_dict_df.sign.values):
    if len(elem)>=2: indices.append(i)

In [52]:
data_multiple_edges = data_dict_df.loc[indices]

In [53]:
data_multiple_edges

Unnamed: 0,source,target,sign
1,07scape,osrstranscripts,"{0: 1, 1: 1}"
6,0magick,occult,"{0: 1, 1: 1}"
10,0x02,writingprompts,"{0: 1, 1: 1}"
12,0x10c,techcompliant,"{0: 1, 1: 1}"
22,100daysofketo,keto,"{0: 1, 1: 1, 2: 1, 3: 1, 4: 1}"
...,...,...,...
339622,zurich,switzerland,"{0: 1, 1: 1}"
339629,zxspectrum,retrogaming,"{0: 1, 1: 1}"
339634,zylooxwrites,writingprompts,"{0: 1, 1: 1, 2: 1, 3: 1}"
339635,zyramains,leagueoflegends,"{0: 1, 1: 1, 2: 1, 3: 1, 4: 1, 5: 1, 6: 1, 7: ..."


In [55]:
multiple_edges_meaned_data = data_mean.loc[indices].copy()

In [57]:
multiple_edges_meaned_data[multiple_edges_meaned_data.sign==-1]

Unnamed: 0,source,target,sign
1624,5555555,relationship_advice,-1.0
1986,9gag,askreddit,-1.0
2615,abuseinterrupted,changemyview,-1.0
2689,abuseinterrupted,theredpill,-1.0
2696,abuseinterrupted,upliftingnews,-1.0
...,...,...,...
335653,writingprompts,savedyouaclick,-1.0
336280,wtfdidijustread,mensrights,-1.0
337522,yankees,nfl,-1.0
337825,yishansucks,iama,-1.0


In [None]:
def compute_count_wanted_triads(graph):
    triads = nx.triadic_census(graph)
    total_triads = triads['030T']+triads['030C']
    total_triads= total_triads+(2*(triads['120U']+triads['120D']+triads['120C']))
    total_triads=total_triads+(4*triads['210'])+(8*triads['300'])
    return total_triads

total_triads = compute_count_wanted_triads(complete_graph)

In [2]:
## Helper function used to parse wikipedia data

def custom_parsing(path):
    ## Initializing everything before parsing
    
    result_list = []

    to_node = None
    from_node = None
    sign = None

    ## Opening the file
    ## Chose encoding="iso8859_16" as simple "UTF-8" gave me errors
    with open(path, 'r', encoding="iso8859_16") as f:
        
        ## For each line ... 
        for line in f:
            ## Split the line by " "
            splitted = line.split()

            ## If empty line, continue
            if(len(splitted) == 0):
                continue

            ## If this is a "U" line ...
            elif(splitted[0] == 'U'):
                ## Take the id of the nominated user
                to_node = int(splitted[1])

            ## If this is a "V" line ...
            elif(splitted[0] == 'V'):
                ## Take the sign of the vote
                sign = int(splitted[1])
                
                ## Take the id of voter
                from_node = int(splitted[2])
                
                ## If the vote was neutral, don't take it (continue)
                ## Else store the line in the intermediary list
                if(sign == 0):
                    continue
                else:
                    result_list.append([from_node, to_node, sign])
            
            ## If this is any other kind of line, continue
            else:
                continue

    ## Converting the intermediary list into a dataframe and name columns correctly
    result_df = pd.DataFrame(result_list, columns=['FromNodeId', 'ToNodeId', 'Sign'])
    
    return result_df

In [3]:
## Loading data and sorting

epinions_df = pd.read_csv("data/soc-sign-epinions.txt", sep="\t", header=None, 
                          comment="#", names=['FromNodeId', 'ToNodeId', 'Sign'])
epinions_df = epinions_df.sort_values(by=["FromNodeId", "ToNodeId"]).reset_index(drop=True)

slashdot_df = pd.read_csv("data/soc-sign-Slashdot090221.txt", sep="\t", header=None, 
                          comment="#", names=['FromNodeId', 'ToNodeId', 'Sign'])
slashdot_df = slashdot_df.sort_values(by=["FromNodeId", "ToNodeId"]).reset_index(drop=True)

wikipedia_df = custom_parsing("data/wikiElec.ElecBs3.txt")
wikipedia_df = wikipedia_df.sort_values(by=["FromNodeId", "ToNodeId"]).reset_index(drop=True)

In [4]:
epinions_graph = nx.from_pandas_edgelist(epinions_df, source="FromNodeId", target="ToNodeId", 
                                         edge_attr="Sign", create_using=nx.DiGraph)

slashdot_graph = nx.from_pandas_edgelist(slashdot_df, source="FromNodeId", target="ToNodeId", 
                                         edge_attr="Sign", create_using=nx.DiGraph)

wikipedia_graph = nx.from_pandas_edgelist(wikipedia_df, source="FromNodeId", target="ToNodeId", 
                                          edge_attr="Sign", create_using=nx.DiGraph)

In [26]:
c_links_names = [f't{i}' for i in range(1, 17)]
    
census = pd.DataFrame(0, index = pd.Index(c_links_names), columns = ['+', '-']) 

In [27]:
census

Unnamed: 0,+,-
t1,0,0
t2,0,0
t3,0,0
t4,0,0
t5,0,0
t6,0,0
t7,0,0
t8,0,0
t9,0,0
t10,0,0


In [1]:
RELEVANT_TRIADS = ['030T', '120U', '120C', '210', '300']

C_LINKS_TYPES = {
    tuple(sorted([('vw', 1), ('wu', 1)])) : 't1',
    tuple(sorted([('vw', 1), ('wu', -1)])) : 't2',
    tuple(sorted([('vw', 1), ('uw', 1)])) : 't3',
    tuple(sorted([('vw', 1), ('uw', -1)])) : 't4',
    tuple(sorted([('vw', -1), ('wu', 1)])) : 't5',
    tuple(sorted([('vw', -1), ('wu', -1)])) : 't6',
    tuple(sorted([('vw', -1), ('uw', 1)])) : 't7',
    tuple(sorted([('vw', -1), ('uw', -1)])) : 't8', 
    tuple(sorted([('wv', 1), ('wu', 1)])) : 't9',
    tuple(sorted([('wv', 1), ('wu', -1)])) : 't10',
    tuple(sorted([('wv', 1), ('uw', 1)])) : 't11',
    tuple(sorted([('wv', 1), ('uw', -1)])) : 't12',
    tuple(sorted([('wv', -1), ('wu', 1)])) : 't13',
    tuple(sorted([('wv', -1), ('wu', -1)])) : 't14',
    tuple(sorted([('wv', -1), ('uw', 1)])) : 't15',
    tuple(sorted([('wv', -1), ('uw', -1)])) : 't16',
}


def compute_c_link_types(graph, v, u, w): 
    edge_dist = {
        'vw': w in graph[v],
        'wv': v in graph[w],
        'uw': w in graph[u],
        'wu': u in graph[w]
    }
    
    duplicates = []
    non_duplicates = []
    
    ## If an edge v->w exists in the graph ...
    if(edge_dist['vw']):
        ## AND an edge w->v exists in the graph ...
        if(edge_dist['wv']):
            ## (the edges formed by the pair (v, w) are stored in the "duplicates" list)
            duplicates.append([(v, w, 'vw'), (w, v, 'wv')])
        else:
            ## Only the edge v->w exists, store it in the "non_duplicates" list
            non_duplicates.append((v, w, 'vw'))
    else:
        ## Only the w->v exists, store it in the "non_duplicates" list
        non_duplicates.append((w, v, 'wv'))

    ## If an edge u->w exists in the graph ...
    if(edge_dist['uw']):
        ## AND an edge w->u exists in the graph ...
        if(edge_dist['wu']):
            ## (the edges formed by the pair (u, w) are stored in the "duplicates" list)
            duplicates.append([(u, w, 'uw'), (w, u, 'wu')])
        else:
            ## Only the edge u->w exists, store it in the "non_duplicates" list
            non_duplicates.append((u, w, 'uw'))
    else:
        ## Only the w->u exists, store it in the "non_duplicates" list
        non_duplicates.append((w, u, 'wu'))
    
    dup_len = len(duplicates)
    non_dup_len = len(non_duplicates)

    ## Computing the different combination possible from the duplicates list
    ## If only one pair was added to the duplicates list, there would be 2 combinations
    ## If two pairs were added to the duplicates list, there would be 4 combinations
    ## If three pairs were added to the duplicates list, there would be 8 combinations
    duplicates_combs = itertools.product(*duplicates)
    
    result = []
    
    ## combi is of the form []
    for combi in duplicates_combs:
        c_link_index = []
        
        for i in range(dup_len):
            edge_direction = combi[i][2]
            edge_sign = graph[combi[i][0]][combi[i][1]]['Sign']
            c_link_index.append((edge_direction, edge_sign))
            
        for i in range(non_dup_len):
            edge_direction = non_duplicates[i][2]
            edge_sign = graph[non_duplicates[i][0]][non_duplicates[i][1]]['Sign']
            c_link_index.append((edge_direction, edge_sign))
        
        c_link_type = C_LINKS_TYPES[tuple(sorted(c_link_index))]
        
        result.append(c_link_type)
    
    return result


def c_links_census(graph):
    # Initialize the count for each triad to be zero.
    t1 = 0
    t2 = 0
    t3 = 0
    t4 = 0
    t5 = 0
    t6 = 0
    t7 = 0
    t8 = 0
    t9 = 0
    t10 = 0
    t11 = 0
    t12 = 0
    t13 = 0
    t14 = 0
    t15 = 0
    t16 = 0
    
    census = {
        f't{i}': {
            '+': 0,
            '-': 0
    } for i in range(1, 17)}
    
    census_edges = {
        f't{i}' : set() for i in range(1, 17)
    }
    
    c_links_names = [f't{i}' for i in range(1, 17)]
    
    census = pd.DataFrame(0, index = pd.Index(c_links_names), columns = ['+', '-']) 
    
    n = len(graph)
    
    m = {v: i for i, v in enumerate(graph)}
    
    for v in graph:
        vnbrs = set(graph.succ[v])
        
        for u in vnbrs:
            if m[u] <= m[v]:
                continue

            neighbors_old = (vnbrs | set(graph.pred[v]) | set(graph.succ[u]) | set(graph.pred[u])) - {u, v}

            neighbors1 = (set(graph.pred[v]) & set(graph.pred[u]))
            neighbors2 = (set(graph.pred[v]) & set(graph.succ[u]))
            neighbors3 = (vnbrs & set(graph.pred[u]))
            neighbors4 = (vnbrs & set(graph.succ[u])) 
            
            #neighbors = (neighbors1 | neighbors2 | neighbors3 | neighbors4) - {u, v}
            
#             printable = ''
            
#             if(len(list(neighbors_old)) < len(list(neighbors))):
#                 printable = 'TAHCHELEK !'
#             else:
#                 printable = 'BON BON BON !'
            
#             print(f'old : {len(list(neighbors_old))} / new : {len(list(neighbors))} ! {printable}')
            
            for w in neighbors1:
                ## Here triads are : v <--x--> u (either t9 or t10 , or t13 or t14)
                sign_xv= '-' if graph[x][v]['sign']==-1 else '+'
                sign_xu= '-' if graph[x][u]['sign']==-1 else '+'
                typ = sign_xv+sign_xu
                if typ=='++': t9 +=1
                elif typ=='+-': t10+=1
                elif typ=='-+':t13+=1
                else : t14+=1
                
            for w in neighbors2:
                ## Here triads are : v--> u --> x --> v (either t11 or t12 , or t15 or t16)
                sign_xv= '-' if graph[x][v]['sign']==-1 else '+'
                sign_ux= '-' if graph[u][x]['sign']==-1 else '+'
                typ = sign_xv+sign_ux
                if typ=='++': t11 +=1
                elif typ=='+-': t12+=1
                elif typ=='-+':t15+=1
                else : t16+=1
                
                
            for w in neighbors3:
                ## Here triads are : v--> x --> u (either t11 or t12 , or t15 or t13)
                sign_vx= '-' if graph[v][x]['sign']==-1 else '+'
                sign_xu= '-' if graph[x][u]['sign']==-1 else '+'
                typ = sign_vx+sign_xu
                if typ=='++': t1 +=1
                elif typ=='+-': t2+=1
                elif typ=='-+':t5+=1
                else : t6+=1
                    
            
            for w in neighbors4:
                ## Here triads are : v--> x <-- u (either t3 or t4 , or t7 or t8)
                sign_vx= '-' if graph[v][x]['sign']==-1 else '+'
                sign_ux= '-' if graph[u][x]['sign']==-1 else '+'
                typ = sign_vx+sign_ux
                if typ=='++': t3 +=1
                elif typ=='+-': t4+=1
                elif typ=='-+':t7+=1
                else : t8+=1
                    
                    
    return [t1,t2,t3,t4,t5,t6,t7,t8,t9,t10,t11,t12,t13,t14,t15,t16]

In [2]:
listed = c_links_census(epin_graph)

NameError: name 'epin_graph' is not defined

In [54]:
c_link_census_epinions, edges_per_c_link = c_links_census(epinions_graph)

In [55]:
c_link_census_epinions

Unnamed: 0,+,-
t1,1308071,18558
t2,26717,31654
t3,1302381,25752
t4,33647,5131
t5,49359,82704
t6,7582,13710
t7,21420,67587
t8,63042,21874
t9,1879820,94867
t10,22217,45328


In [56]:
def compute_baseline(graph, list_of_c_links, gen_or_rec):
    
    if(gen_or_rec != 'generative' and gen_or_rec != 'receptive'):
        raise ValueError('Impossible value for gen_or_rec argument !')
    
    sum_of_baselines = 0
    
    for c_link in list_of_c_links:
        
        if(gen_or_rec == 'generative'):
            v = c_link[0]
            succ = graph.succ[v]

            list_of_edges = [(v, successor) for successor in succ]
            list_of_positive_edges = [
                (v, successor) for successor in succ if graph[v][successor]['Sign'] == 1
            ]
        else:
            u = c_link[1]
            pred = graph.pred[u]
            
            list_of_edges = [(predecessor, u) for predecessor in pred]
            list_of_positive_edges = [
                (predecessor, u) for predecessor in pred if graph[predecessor][u]['Sign'] == 1
            ]
        
        total_edges = len(list_of_edges)
        total_positive_edges = len(list_of_positive_edges)
        
        sum_of_baselines += total_positive_edges/total_edges
        
    return sum_of_baselines

c_links_names = [key for key in edges_per_c_link.keys()]
baselines = pd.DataFrame(index = pd.Index(c_links_names), columns = ['generative', 'receptive']) 

# baselines = {
#     key : {
#         'generative' : 0,
#         'receptive' : 0
#     } for key in edges_per_c_link.keys()
# }

for c_link_type in edges_per_c_link.keys():    
    list_of_c_links = list(edges_per_c_link[c_link_type])
    
    print(80 * '=')
    print(c_link_type)
    print('Beginning generative baseline ...')
    
    baselines.loc[c_link_type]['generative'] = compute_baseline(epinions_graph, list_of_c_links, 'generative')
    
    print('Generative baselines finished !')
    print('Beginning receptive baseline ...')
    
    baselines.loc[c_link_type]['receptive'] = compute_baseline(epinions_graph, list_of_c_links, 'receptive')
    
    print('Receptive baseline finished !')
    print(80 * '=')

t1
Beginning generative baseline ...
Generative baselines finished !
Beginning receptive baseline ...
Receptive baseline finished !
t2
Beginning generative baseline ...
Generative baselines finished !
Beginning receptive baseline ...
Receptive baseline finished !
t3
Beginning generative baseline ...
Generative baselines finished !
Beginning receptive baseline ...
Receptive baseline finished !
t4
Beginning generative baseline ...
Generative baselines finished !
Beginning receptive baseline ...
Receptive baseline finished !
t5
Beginning generative baseline ...
Generative baselines finished !
Beginning receptive baseline ...
Receptive baseline finished !
t6
Beginning generative baseline ...
Generative baselines finished !
Beginning receptive baseline ...
Receptive baseline finished !
t7
Beginning generative baseline ...
Generative baselines finished !
Beginning receptive baseline ...
Receptive baseline finished !
t8
Beginning generative baseline ...
Generative baselines finished !
Beginni

In [89]:
baselines = baselines.astype(float)

In [97]:
datafr = c_link_census_epinions
datafr['count'] = datafr['+'] + datafr['-']
datafr['p(+)'] = datafr['+'] / datafr['count']
datafr

Unnamed: 0,+,-,count,p(+)
t1,1308071,18558,1326629,0.986011
t2,26717,31654,58371,0.45771
t3,1302381,25752,1328133,0.98061
t4,33647,5131,38778,0.867683
t5,49359,82704,132063,0.373753
t6,7582,13710,21292,0.356096
t7,21420,67587,89007,0.240655
t8,63042,21874,84916,0.742404
t9,1879820,94867,1974687,0.951958
t10,22217,45328,67545,0.328921


In [98]:
datafr = pd.concat([datafr, baselines], axis=1)
datafr

Unnamed: 0,+,-,count,p(+),generative,receptive
t1,1308071,18558,1326629,0.986011,111321.119852,117893.7749
t2,26717,31654,58371,0.45771,15729.166791,15039.086481
t3,1302381,25752,1328133,0.98061,97317.230153,103410.663906
t4,33647,5131,38778,0.867683,10812.877387,11945.291535
t5,49359,82704,132063,0.373753,12637.912432,21845.810968
t6,7582,13710,21292,0.356096,3798.576457,6545.755823
t7,21420,67587,89007,0.240655,7864.566209,14372.535243
t8,63042,21874,84916,0.742404,10262.996304,16267.418979
t9,1879820,94867,1974687,0.951958,122365.296161,133796.271486
t10,22217,45328,67545,0.328921,15190.799857,15501.347066


In [100]:
datafr['s_g'] = datafr['count']*datafr['p(+)'] - datafr['generative']
datafr['s_g'] = datafr['s_g'] / np.sqrt(datafr['generative'] * (1 - (datafr['generative']/datafr['count'])))
datafr['s_g'] = round(datafr['s_g'], 1)

datafr['s_r'] = datafr['count']*datafr['p(+)'] - datafr['receptive']
datafr['s_r'] = datafr['s_r'] / np.sqrt(datafr['receptive'] * (1 - (datafr['receptive']/datafr['count'])))
datafr['s_r'] = round(datafr['s_r'], 1)

In [101]:
pred_B_g = [1, -1, 1, -1, -1, 1, -1, 1, 1, -1, 1, -1, -1, 1, -1, 1]
pred_B_r = [1, -1, 1, -1, -1, 1, -1, 1, 1, -1, 1, -1, -1, 1, -1, 1]
pred_S_g = [1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1]
pred_S_r = [1, 1, 1, 1, -1, -1, -1, -1, -1, -1, -1, -1, 1, 1, 1, 1]

datafr['pred_B_g'] = pred_B_g
datafr['pred_B_r'] = pred_B_r
datafr['pred_S_g'] = pred_S_g
datafr['pred_S_r'] = pred_S_r


In [103]:
datafr['B_g'] = datafr['pred_B_g'] * datafr['s_g'] > 0
datafr['B_r'] = datafr['pred_B_r'] * datafr['s_r'] > 0
datafr['S_g'] = datafr['pred_S_g'] * datafr['s_g'] > 0
datafr['S_r'] = datafr['pred_S_r'] * datafr['s_r'] > 0

In [104]:
final = datafr[['count', 'p(+)', 's_g', 's_r', 'B_g', 'B_r', 'S_g', 'S_r']]

In [105]:
final

Unnamed: 0,count,p(+),s_g,s_r,B_g,B_r,S_g,S_r
t1,1326629,0.986011,3747.5,3631.4,True,True,True,True
t2,58371,0.45771,102.5,110.5,False,False,False,True
t3,1328133,0.98061,4012.7,3882.6,True,True,False,True
t4,38778,0.867683,258.6,238.7,False,False,True,True
t5,132063,0.373753,343.5,203.8,False,False,True,False
t6,21292,0.356096,67.7,15.4,True,True,False,False
t7,89007,0.240655,160.1,64.2,False,False,False,False
t8,84916,0.742404,555.6,407.9,True,True,True,False
t9,1974687,0.951958,5187.4,4943.8,True,True,True,False
t10,67545,0.328921,64.8,61.4,False,False,False,False


In [10]:
# new_index = pd.Index([f't{i}' for i in range(1, 17)])

# datafr.set_index(new_index, inplace=True, drop=True)
