# 0.Import Libraries

In [1]:
# Scratch list: the two literal lists written out as one
x = [10, 2, 5, 5, 8, 4]

In [3]:
import networkx as nx
import matplotlib.pyplot as plt
import nxviz as nv
import datetime
import pickle
# Import necessary modules
from itertools import combinations
from collections import defaultdict

# 1.Load Data

In [5]:

# Load the pickled GitHub collaboration graph: T
# A forward slash is used because "\g" is an invalid escape sequence in a
# normal string literal and a backslash is only a path separator on Windows;
# "Data/github_users.p" resolves to the same file on every platform.
# The `with` block closes the file handle as soon as the graph is loaded.
# NOTE: pickle.load can execute arbitrary code; only load files you trust.
with open("Data/github_users.p", "rb") as graph_file:
    T = pickle.load(graph_file)

# 2.Learning Number of Nodes and Edges

In [None]:
# Show how many nodes and edges the graph has, as a (nodes, edges) pair
T.number_of_nodes(), T.number_of_edges()

# 3.Plotting Degree Centrality

In [None]:
# Plot a histogram of the degree centrality values of every node in T
plt.hist(list(nx.degree_centrality(T).values()))
plt.show()

# 4.Getting a Subgraph and Plotting Betweenness Centrality

In [27]:

# Users whose neighborhoods are extracted into a subgraph below
nodes_of_interest = [
    'u8229',
    'u8231',
    'u8521',
    'u5814',
    'u3570',
    'u655',
    'u243',
    'u6104',
    'u8286',
    'u2258',
    'u4768',
]

# Define get_nodes_and_nbrs()
def get_nodes_and_nbrs(G, nodes_of_interest):
    """
    Return the subgraph of `G` made up of `nodes_of_interest` together with
    all of their neighbors.

    The node list handed to `G.subgraph` keeps each node of interest
    immediately followed by its neighbors, in the order `G.neighbors` yields
    them; a node may appear more than once.
    """
    selected = []

    for node in nodes_of_interest:
        # The node itself first, then everything adjacent to it
        selected.append(node)
        selected.extend(G.neighbors(node))

    return G.subgraph(selected)

# Build the subgraph around the nodes of interest and their neighbors: T_draw
T_draw = get_nodes_and_nbrs(T, nodes_of_interest)

In [None]:
# Plot a histogram of the betweenness centrality values of the nodes in the
# T_draw subgraph (not the full network)
plt.hist(list(nx.betweenness_centrality(T_draw).values()))
plt.show()

# 5.Connected Components

In [None]:
# Display the connected-components object for T without consuming it
nx.connected_components(T)

In [None]:
# Materialize every connected component of T into a list for display
list(nx.connected_components(T))

In [None]:
# Print every connected component of T that has exactly nine members
for g in nx.connected_components(T):
    if len(g) != 9:
        continue
    print(g)

# 6.Matrix Plot

In [None]:
# Pick the connected component at index -4 when components are sorted
# ascending by size. NOTE: despite its name, `largest_ccs` is the
# fourth-largest component here, not the largest.
largest_ccs = sorted(
    (T.subgraph(c) for c in nx.connected_components(T)),
    key=len,
)[-4]

# Create the Matrix plot of that component: h
h = nv.matrix(largest_ccs)

# Draw the Matrix plot to the screen
plt.show()

# 7.Arc Plot

In [None]:
# Walk the nodes of the selected component
for n in largest_ccs:

    # Store the node's degree in the full graph T as its 'degree' attribute
    largest_ccs.nodes[n]['degree'] = T.degree(n)

# Create the Arc plot, ordering nodes by the stored degree: a
a = nv.arc(largest_ccs, sort_by='degree')

# Draw the Arc plot to the screen
plt.show()

# 8.Circos Plot

In [79]:
# Iterate over all the nodes of T together with their attribute dicts
for n, d in T.nodes(data=True):
    # d is the node's own attribute dict, so writing to it stores the degree
    # on the graph itself
    d['degree'] = T.degree(n)


In [None]:
# Pick the connected component at index -2 when components are sorted
# ascending by size, i.e. the second-largest one: largest_ccs
largest_ccs = sorted(
    (T.subgraph(c) for c in nx.connected_components(T)),
    key=len,
)[-2]

# Create the Circos plot, ordering nodes by their 'degree' attribute: c
c = nv.circos(largest_ccs, sort_by='degree')

# Draw the Circos plot to the screen
plt.show()

In [None]:
# Pick the connected component at index -4 when components are sorted
# ascending by size, i.e. the fourth-largest one: largest_ccs
largest_ccs = sorted(
    (T.subgraph(c) for c in nx.connected_components(T)),
    key=len,
)[-4]

# Create the Circos plot, ordering nodes by their 'degree' attribute: c
c = nv.circos(largest_ccs, sort_by='degree')

# Draw the Circos plot to the screen
plt.show()

# 9.Finding Cliques

In [None]:
# Enumerate the maximal cliques of T: cliques
cliques = nx.find_cliques(T)

# Tally the cliques one by one and print how many there are
print(sum(1 for _ in cliques))

In [None]:
# Take the largest connected component of T: largest_ccs
largest_ccs = sorted(
    (T.subgraph(c) for c in nx.connected_components(T)),
    key=len,
)[-1]

# Pick the maximal clique at index -6 when that component's cliques are sorted
# ascending by size (one of the bigger cliques, not necessarily the biggest)
largest_clique = sorted(nx.find_cliques(largest_ccs), key=len)[-6]

# Create the subgraph of the selected clique: G_lc
G_lc = T.subgraph(largest_clique)

# Create the Circos plot: c
c = nv.circos(G_lc)

# Draw the Circos plot to the screen
plt.show()

In [None]:
# Take the largest connected component of T: largest_ccs
largest_ccs = sorted(
    (T.subgraph(c) for c in nx.connected_components(T)),
    key=len,
)[-1]

# Pick the maximal clique at index -50 when that component's cliques are sorted
# ascending by size (not the biggest clique, despite the variable name)
largest_clique = sorted(nx.find_cliques(largest_ccs), key=len)[-50]

# Create the subgraph of the selected clique: G_lc
G_lc = T.subgraph(largest_clique)

# Create the Circos plot: c
c = nv.circos(G_lc)

# Draw the Circos plot to the screen
plt.show()

In [None]:
# Take the largest connected component of T: largest_ccs
largest_ccs = sorted(
    (T.subgraph(c) for c in nx.connected_components(T)),
    key=len,
)[-1]

# Pick the maximal clique at index -50 when that component's cliques are sorted
# ascending by size (not the biggest clique, despite the variable name)
largest_clique = sorted(nx.find_cliques(largest_ccs), key=len)[-50]

# Create the subgraph of the selected clique: G_lc
G_lc = T.subgraph(largest_clique)

# Create the Arc plot of the same clique (this cell uses nv.arc): c
c = nv.arc(G_lc)

# Draw the Arc plot to the screen
plt.show()

# 10.Finding important collaborators 

In [None]:
# Degree centrality of every user in T: deg_cent
deg_cent = nx.degree_centrality(T)

# Highest degree centrality found in the graph: max_dc
max_dc = max(deg_cent.values())

# Every user whose score equals that maximum (ties are all kept):
# prolific_collaborators
prolific_collaborators = [user for user in deg_cent if deg_cent[user] == max_dc]

# Print the most prolific collaborator(s)
print(prolific_collaborators)

# 11.Finding the largest communities

In [None]:

# Identify the largest maximal clique (last entry when the cliques of T are
# sorted ascending by size): largest_max_clique
largest_max_clique = set(sorted(nx.find_cliques(T), key=len)[-1])

# Create an independent, editable copy of the clique's subgraph: G_lmc
G_lmc = T.subgraph(largest_max_clique).copy()

# Go out 1 degree of separation: link every clique member to each of its
# neighbors in T. add_edges_from also creates any endpoint that is not yet in
# G_lmc, so no separate add_nodes_from call is needed. The node list is
# snapshotted first because the graph grows while we iterate.
for node in list(G_lmc.nodes()):
    G_lmc.add_edges_from((node, nbr) for nbr in T.neighbors(node))

# Record each node's degree centrality score. The centrality dict is computed
# once up front; calling nx.degree_centrality inside the loop recomputed it
# for the whole graph on every node.
centrality = nx.degree_centrality(G_lmc)
for n in G_lmc.nodes():
    G_lmc.nodes[n]['degree centrality'] = centrality[n]

# Create the Arc plot, ordering nodes by their centrality score: a
a = nv.arc(G_lmc, sort_by='degree centrality')

# Draw the Arc plot to the screen
plt.show()

In [None]:

# Pick the maximal clique at index -10 when the cliques of T are sorted
# ascending by size (tenth from the top, not the largest despite the variable
# name): largest_max_clique
largest_max_clique = set(sorted(nx.find_cliques(T), key=len)[-10])

# Create an independent, editable copy of the clique's subgraph: G_lmc
G_lmc = T.subgraph(largest_max_clique).copy()

# Go out 1 degree of separation: link every clique member to each of its
# neighbors in T. add_edges_from also creates any endpoint that is not yet in
# G_lmc, so no separate add_nodes_from call is needed. The node list is
# snapshotted first because the graph grows while we iterate.
for node in list(G_lmc.nodes()):
    G_lmc.add_edges_from((node, nbr) for nbr in T.neighbors(node))

# Record each node's degree centrality score. The centrality dict is computed
# once up front; calling nx.degree_centrality inside the loop recomputed it
# for the whole graph on every node.
centrality = nx.degree_centrality(G_lmc)
for n in G_lmc.nodes():
    G_lmc.nodes[n]['degree centrality'] = centrality[n]

# Create the Arc plot, ordering nodes by their centrality score: a
a = nv.arc(G_lmc, sort_by='degree centrality')

# Draw the Arc plot to the screen
plt.show()

In [None]:

# Pick the maximal clique at index -2000 when the cliques of T are sorted
# ascending by size (far from the largest, despite the variable name). This
# raises IndexError if T has fewer than 2000 maximal cliques:
# largest_max_clique
largest_max_clique = set(sorted(nx.find_cliques(T), key=len)[-2000])

# Create an independent, editable copy of the clique's subgraph: G_lmc
G_lmc = T.subgraph(largest_max_clique).copy()

# Go out 1 degree of separation: link every clique member to each of its
# neighbors in T. add_edges_from also creates any endpoint that is not yet in
# G_lmc, so no separate add_nodes_from call is needed. The node list is
# snapshotted first because the graph grows while we iterate.
for node in list(G_lmc.nodes()):
    G_lmc.add_edges_from((node, nbr) for nbr in T.neighbors(node))

# Record each node's degree centrality score. The centrality dict is computed
# once up front; calling nx.degree_centrality inside the loop recomputed it
# for the whole graph on every node.
centrality = nx.degree_centrality(G_lmc)
for n in G_lmc.nodes():
    G_lmc.nodes[n]['degree centrality'] = centrality[n]

# Create the Arc plot, ordering nodes by their centrality score: a
a = nv.arc(G_lmc, sort_by='degree centrality')

# Draw the Arc plot to the screen
plt.show()

# 12.Recommending System

In [None]:
# Score candidate collaborations by counting "open triangles": for every node,
# each pair of its neighbors that is not already connected gets one point, so
# a pair's score is the number of neighbors the two users share.
recommended = defaultdict(int)

# Iterate over all the nodes in T
for n in T.nodes():

    # Iterate over every pair of neighbors of n
    for n1, n2 in combinations(T.neighbors(n), 2):

        # Pairs that already have an edge need no recommendation
        if T.has_edge(n1, n2):
            continue

        # The same two users can show up as (n1, n2) via one shared neighbor
        # and as (n2, n1) via another. Reuse whichever orientation was recorded
        # first so the pair's score is not split across two keys.
        pair = (n2, n1) if (n2, n1) in recommended else (n1, n2)
        recommended[pair] += 1

# All scores in ascending order (kept for inspection)
all_counts = sorted(recommended.values())

# Identify the top 10 pairs of users: rank by score, highest first, and take
# the first ten. The previous `count > all_counts[-10]` filter dropped the
# 10th pair and raised IndexError when fewer than ten pairs existed; slicing
# simply returns everything available in that case.
ranked_pairs = sorted(recommended.items(), key=lambda item: item[1], reverse=True)
top10_pairs = [pair for pair, count in ranked_pairs[:10]]
print(top10_pairs)
