In [None]:
from IPython.core.display import HTML
from datascience import *

import matplotlib
matplotlib.use('Agg')
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
plt.style.use('fivethirtyeight')

import os
import pandas as pd
import networkx as nx

In [None]:
# Location of the gzipped Wiki Talk temporal edge list in the SNAP archive;
# the loading cell below passes this straight to pd.read_csv.
data_url = "https://snap.stanford.edu/data/wiki-talk-temporal.txt.gz"
# To make things faster in lecture, a pre-saved local copy can be used
# instead by uncommenting the next line (adjust the path for your machine):
#data_url = os.path.expanduser(os.path.join("~", "Dropbox", "data-explore", "wiki-talk", "wiki-talk-temporal.txt.gz"))

# Strength of weak ties in the Wiki Talk network

We'll look at the WikiTalk network, which is available at the [SNAP data archive](https://snap.stanford.edu/data/wiki-Talk.html).

According to the website,

> The network contains all the users and discussion from the inception of Wikipedia till January 2008. 
>  * Nodes in the network represent Wikipedia users 
>  * A directed edge from node i to node j means that user i edited a talk page of user j at least once.
    


### Read in the dataset

In [None]:
%%time

# Read the space-separated message log and name its three columns at load
# time: the editing user ("from"), the user whose talk page was edited
# ("to"), and the time of the edit ("time").
# Naming the columns here avoids a separate rename pass; in particular it
# avoids rename(index=str, ...), which needlessly converted every row label
# of this very large frame into a string.
raw_data = pd.read_csv(data_url, sep=" ", header=None, names=["from", "to", "time"])
raw_data.head()

NB: this next cell takes about a minute

In [None]:
%%time

# Build a directed multigraph from the message log: every row becomes its
# own edge, so repeated messages between the same ordered pair are kept as
# parallel edges, each carrying its 'time' value as an edge attribute.
raw_net = nx.from_pandas_edgelist(
    raw_data,
    source='from',
    target='to',
    edge_attr=['time'],
    create_using=nx.MultiDiGraph(),
)

Simplify the network structure to turn it into an undirected network.

In [None]:
%%time

# Keep only the directed edges whose reverse edge also exists (reciprocated
# connections), skipping self-loops if there are any.
# Iterating the edges of a MultiDiGraph yields (source, target, key)
# triples, so the third element is the parallel-edge key, not the edge data.
di_edges_recip = [
    (src, dst, key)
    for (src, dst, key) in raw_net.edges
    if (dst, src) in raw_net.edges and dst != src
]

len(di_edges_recip)

In [None]:
%%time

# Collapse the reciprocated directed edges into an undirected graph. Each
# directed edge between a pair (in either direction) adds one to the weight
# of the single undirected edge joining that pair.
new_net = nx.Graph()
for src, dst, _key in di_edges_recip:
    if (src, dst) not in new_net.edges:
        # first time this pair is seen: create the edge
        new_net.add_edge(src, dst, weight=1)
    else:
        # pair already present: count one more interaction
        new_net[src][dst]['weight'] += 1
        


### Some general info about the network

Number of nodes and edges:

In [None]:
# Headline size statistics. Each undirected edge adds one to the degree of
# both of its endpoints, hence the factor of 2 in the average degree.
n_nodes = new_net.number_of_nodes()
n_edges = new_net.number_of_edges()

print("Number of nodes: ", n_nodes)
print("Number of edges: ", n_edges)
print("Average degree: ", 2 * n_edges / n_nodes)

Extract the degrees into a Table, which we'll use to make a histogram of the degree distribution

In [None]:
# Two-column Table: node ids alongside each node's degree.
# new_net.degree yields (node, degree) pairs; only the degree is kept here.
degree_values = [deg for (_node, deg) in new_net.degree]

new_net_degrees = Table().with_columns([
    'id', new_net.nodes(),
    'degree', degree_values,
])

A very small number of nodes have very large degrees; for example, the largest degree is almost 1200

In [None]:
# Largest degree of any node in the network.
np.max(new_net_degrees.column('degree'))

... so to keep the histogram readable, let's only plot up to the 95th percentile

In [None]:
# 95th percentile of the node degrees.
np.percentile(new_net_degrees.column('degree'), 95)

In [None]:
# Histogram of node degrees, truncated at the 95th percentile so that the
# few very-high-degree nodes do not stretch the x axis. The cutoff is
# derived from the data rather than hard-coded, so the plot stays consistent
# with the percentile if the data changes.
degree_cutoff = int(np.ceil(np.percentile(new_net_degrees['degree'], 95)))

# one bin per integer degree: bin edges at 0, 1, ..., degree_cutoff
plt.hist(new_net_degrees['degree'], bins=np.arange(0, degree_cutoff + 1, 1))
plt.xlabel('degree')
plt.ylabel('number of nodes')
plt.title('Degree distribution (up to the 95th percentile)');

Look at the number of components and their size:

In [None]:
%%time

# List every connected component (each one a set of nodes), largest first,
# so that index 0 holds the giant component.
new_net_components = sorted(nx.connected_components(new_net), key=len, reverse=True)

# the giant component's node count is simply the size of its node set
giant_size = len(new_net_components[0])

print("Number of components:", len(new_net_components))
print("frac nodes in GC: ", giant_size / new_net.number_of_nodes())

Pick out the giant component

In [None]:
# Materialise the giant component as an independent graph.
# subgraph() on its own returns a view that shares its edge-attribute dicts
# with new_net, so the 'nbhd_overlap' attributes written onto the giant
# component later in the notebook would silently appear on new_net too.
# .copy() gives the component its own graph structure and attribute dicts.
new_net_gc = new_net.subgraph(new_net_components[0]).copy()

Recall that Onnela et al (2007) introduced the concept of *neighborhood overlap* as a way of quantifying how 'bridge-like' an edge is.  The *neighborhood overlap* of an edge joining a node A and a node B is defined to be

$$
\text{neighborhood overlap} = \frac{\text{number of nodes who are neighbors of both A and B}}{\text{number of nodes who are neighbors of at least one of A or B}}
$$

where the denominator does not count $A$ or $B$.

The neighborhood overlap ranges from 0 to 1. For an edge that is a *local bridge*, the neighborhood overlap would be 0, because its two endpoints have no neighbors in common. For an edge that joins two nodes who share all of their other neighbors (i.e., an edge that is not at all 'bridge-like'), the neighborhood overlap would be 1.

Let's calculate the neighborhood overlap of every edge in the giant component of our wiki talk network.

(This takes about a minute and a half to do)

In [None]:
%%time

def _neighborhood_overlap(graph, u, v):
    """Return the neighborhood overlap of the edge joining ``u`` and ``v``.

    The overlap is the number of nodes adjacent to both endpoints divided by
    the number of nodes adjacent to at least one endpoint; ``u`` and ``v``
    themselves are excluded from both counts.

    Parameters
    ----------
    graph : object with a ``neighbors(node)`` method (e.g. a networkx graph)
    u, v : the two endpoints of the edge

    Returns
    -------
    float
        A value between 0 and 1. Returns 0.0 when neither endpoint has any
        other neighbor, where the ratio would otherwise be 0/0 and raise
        ZeroDivisionError.
    """
    # neighbors of each endpoint, not counting the other endpoint
    u_nbrs = set(graph.neighbors(u)) - {v}
    v_nbrs = set(graph.neighbors(v)) - {u}

    # denominator: unique neighbors the two endpoints have between them
    either = u_nbrs | v_nbrs
    if not either:
        return 0.0

    # numerator: neighbors the two endpoints have in common
    return len(u_nbrs & v_nbrs) / len(either)


# calculate the neighborhood overlap for each edge and store it on the edge
# as the 'nbhd_overlap' attribute
for u, v in new_net_gc.edges:
    new_net_gc[u][v]['nbhd_overlap'] = _neighborhood_overlap(new_net_gc, u, v)

### Plot the relationship between tie strength and neighborhood overlap

As we saw in lecture, Onnela et al (2007) found that in their cell phone call network, edges that were bridge-like -- that is, edges with low values of neighborhood overlap -- tended to be weak ties.

Let's see if that pattern holds up in the giant component of the wiki talk data.

Recall that we added neighborhood overlap to the edges in the giant component. To make plotting easier, let's now convert the giant component back into an edge list:

In [None]:
# Flatten the giant component into a DataFrame with one row per edge: the
# two endpoints go in the 'source' and 'target' columns and each edge
# attribute becomes a column of its own.
net_df = nx.to_pandas_edgelist(new_net_gc, source='source', target='target')

net_df

In [None]:
# One point per edge: tie strength (weight) against neighborhood overlap.
net_df.plot.scatter(x='weight', y='nbhd_overlap')

It's hard to tell what's going on from the plot above; there's a point for each edge (about 340k in total).

We can plot each point with partial transparency to try to reveal where the mass of the data lies:

In [None]:
# Same scatter plot, but with mostly transparent points so that dense
# regions show up darker than sparse ones.
net_df.plot.scatter(x='weight', y='nbhd_overlap', alpha=0.05)

... but it's still pretty hard to really tell what's going on.

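Let's group the ties into (up to) 10 groups of roughly equal size, from lowest weight to highest weight. Then we'll calculate the average neighborhood overlap within each of these groups.  (You might remember that Onnela et al. did something similar in their analysis.) Note that when several of the decile boundaries fall on the same weight, those groups are merged (that is what `duplicates='drop'` does below), so we may end up with fewer than 10 groups, and they will not all be the same size.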

In [None]:
# Bin the edges by weight decile. If several decile boundaries coincide,
# duplicates='drop' merges those bins instead of raising an error, so fewer
# than 10 bins may come back.
weight_bins = pd.qcut(net_df['weight'], q=10, duplicates='drop')
net_df['tie_strength'] = weight_bins

Let's look at the categories that Pandas created for us:

In [None]:
# Mean neighborhood overlap within each tie-strength bin.
net_df['nbhd_overlap'].groupby(net_df['tie_strength']).mean()

In [None]:
# Mean neighborhood overlap per tie-strength bin, as a DataFrame. The bin
# labels live in the index; they are also copied into an ordinary column so
# the plotting call can refer to them by name.
overlap_by_strength = net_df.groupby('tie_strength')['nbhd_overlap'].mean()
net_agg = overlap_by_strength.to_frame()
net_agg['tie_strength'] = net_agg.index
net_agg

In [None]:
# Bar chart: one bar per tie-strength bin, height = mean neighborhood overlap.
net_agg.plot.bar(x='tie_strength', y='nbhd_overlap', color='royalblue');

This looks roughly consistent with the Onnela et al finding we talked about in lecture: weak ties tend to have lower neighborhood overlap, making them more like local bridges.

The analysis above roughly parallels the Onnela et al. paper. However, a student made an astute observation: **the strength of weak ties hypothesis really predicts that bridges will be weak ties; it does not necessarily imply that all weak ties are bridges**. This suggests that we should look at how the bridginess of a tie is related to average tie strength. In other words, it suggests that we want to flip the plot above so that neighborhood overlap is on the x axis and average tie strength is on the y axis.

Let's do that now:

In [None]:
# Bin the edges by neighborhood-overlap decile. If several decile
# boundaries coincide, duplicates='drop' merges those bins instead of
# raising an error, so fewer than 10 bins may come back.
overlap_bins = pd.qcut(net_df['nbhd_overlap'], q=10, duplicates='drop')
net_df['no_bin'] = overlap_bins

In [None]:
# Mean tie strength (weight) within each neighborhood-overlap bin.
net_df['weight'].groupby(net_df['no_bin']).mean()

In [None]:
# Mean weight per neighborhood-overlap bin, as a DataFrame. The bin labels
# live in the index; they are also copied into an ordinary column so the
# plotting call can refer to them by name.
weight_by_overlap = net_df.groupby('no_bin')['weight'].mean()
net_agg2 = weight_by_overlap.to_frame()
net_agg2['nbhd_overlap'] = net_agg2.index
net_agg2

In [None]:
# Bar chart: one bar per neighborhood-overlap bin, height = mean tie strength.
net_agg2.plot.bar(x='nbhd_overlap', y='weight', color='royalblue');

We see that, as the strength of weak ties hypothesis would predict, neighborhood overlap is positively associated with tie strength: the bins with larger neighborhood overlap values have higher average weights (tie strength).