In [5]:
import networkx as nx
import pyspark
import pandas as pd
from operator import itemgetter
import matplotlib.pyplot as plt
import collections
from community import community_louvain
from networkx.algorithms.community.centrality import girvan_newman
import itertools
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

In [8]:
from pyspark.sql import SparkSession

spark = SparkSession \
    .builder \
    .appName("Python Spark SQL basic example") \
    .config("spark.some.config.option", "some-value") \
    .getOrCreate()

In [6]:
# Helper function for printing various graph properties
def describe_graph(G):
    print(nx.info(G))
    """if nx.is_connected(G):
        print("Avg. Shortest Path Length: %.4f" %nx.average_shortest_path_length(G))
        print("Diameter: %.4f" %nx.diameter(G)) # Longest shortest path
    else:
        print("Graph is not connected")
        print("Diameter and Avg shortest path length are not defined!")"""
    print("Sparsity: %.4f" %nx.density(G))  # #edges/#edges-complete-graph
    # #closed-triplets(3*#triangles)/#all-triplets
    print("Global clustering coefficient aka Transitivity: %.4f" %nx.transitivity(G))

In [32]:
egos_followers= spark.read.csv('network/public_users_followers.csv',header=True)


In [33]:
egos_followers = public_users_followers.withColumnRenamed('ids', 'followers_ids')

In [34]:
egos_followers.toPandas().head()

Unnamed: 0,user_id,followers_ids,next_cursor
0,129007523,4703547618,0
1,129007523,774663607,0
2,129007523,175036533,0
3,129007523,2650183599,0
4,129007523,704185769,0


In [49]:
egos_followers.count()

45813

In [38]:
egos_followees= spark.read.csv('network/public_users_friends.csv',header=True)

In [39]:
egos_followees = public_users_friends.withColumnRenamed('ids', 'followees_ids')

In [40]:
egos_followees.toPandas().head()

Unnamed: 0,user_id,followees_ids,next_cursor
0,129007523,14502789,0
1,129007523,168653254,0
2,129007523,124871255,0
3,575831485,123737099,0
4,575831485,185757778,0



# Graph

In [54]:
#followers graph

ego_follower_G =nx.from_pandas_edgelist(egos_followers.toPandas(), source='followers_ids', target='user_id', edge_attr=None, create_using= nx.DiGraph())
describe_graph(ego_follower_G)

Name: 
Type: DiGraph
Number of nodes: 46179
Number of edges: 45813
Average in degree:   0.9921
Average out degree:   0.9921
Sparsity: 0.0000
Global clustering coefficient aka Transitivity: 0.0000


In [57]:
#followees graph

ego_followees_G =nx.from_pandas_edgelist(egos_followees.toPandas(), source='user_id', target='followees_ids', edge_attr=None, create_using= nx.DiGraph())
describe_graph(ego_followees_G)

Name: 
Type: DiGraph
Number of nodes: 55252
Number of edges: 58933
Average in degree:   1.0666
Average out degree:   1.0666
Sparsity: 0.0000
Global clustering coefficient aka Transitivity: 0.0000


In [58]:
ego_alter_G=nx.compose(ego_follower_G,ego_followees_G)

In [60]:
#Calculate # of betweenness
betweenness=nx.betweenness_centrality(ego_alter_G)

In [61]:
#Calculate clustering coefficient
clustering=nx.clustering(ego_alter_G)

In [62]:
#Calculate # of followees
number_of_followees=egos_followees.groupBy("user_id").agg({'followees_ids':'count'})


In [72]:
clustering

{'4703547618': 0,
 '129007523': 0,
 '774663607': 0,
 '175036533': 0,
 '2650183599': 0,
 '704185769': 0,
 '2229050017': 0,
 '1472618786': 0,
 '794109906': 0,
 '49447168': 0,
 '336730016': 0,
 '168653254': 0,
 '207290993': 0,
 '169588023': 0,
 '168552497': 0,
 '141599240': 0,
 '773299351640682497': 0,
 '32670411': 0,
 '238549856': 0,
 '34093917': 0,
 '923868942': 0,
 '290473066': 0,
 '2981923836': 0,
 '895940017834475520': 0,
 '555533734': 0,
 '1017167014153027584': 0,
 '770772211779117056': 0,
 '789058159096856576': 0,
 '895967149260492801': 0,
 '905022900280975364': 0,
 '182737850': 0,
 '4446154454': 0,
 '3030210088': 0,
 '747627370111107072': 0,
 '2310781728': 0,
 '784090449267068928': 0,
 '2978522518': 0,
 '832584146488680449': 0,
 '4563808215': 0,
 '830875548029218816': 0,
 '760816120333004801': 0,
 '805818032098422784': 0,
 '3283767175': 0,
 '807678542419021824': 0,
 '1485336902': 0,
 '749950844393648128': 0,
 '43197342': 0,
 '26129367': 0,
 '725313892340097024': 0,
 '7373462339672

In [68]:
dff = pd.DataFrame.from_dict(betweenness, orient="index")

In [71]:
dff.reset_index().to_csv('betweenness.csv')

In [81]:
pd.DataFrame.from_dict(clustering, orient="index").to_csv('clustering_coeff.csv')

In [82]:
number_of_followees.toPandas().to_csv('number_of_followees.csv')