In [0]:
from pyspark.sql.session import SparkSession
from pyspark.conf import SparkConf
from pyspark.sql import functions as F
from graphframes import *

In [None]:
# Load the input data from the sentiment mount. Parquet files carry their own
# schema, so the CSV-only "header" option (ignored by the Parquet reader) is dropped.
data_output = spark.read.parquet("dbfs:/mnt/group12/sentiment/")

In [0]:
# Preview only the users with a non-zero number of users in their conversation,
# showing each username next to the list of users who replied to them.
(
    data_output
    .where(F.col('n_users_in_conversation') != 0)
    .select('username', F.col('users_in_conversation').alias('repliedBy'))
    .show()
)

In [0]:
# For every row with at least one replier, merge the username with the users
# who replied to them into a single array column named "nameSum".
union_name = (
    data_output
    .where(F.col('n_users_in_conversation') != 0)
    .select(
        F.array_union(
            F.col('users_in_conversation'),
            F.array(F.col('username')),
        ).alias('nameSum')
    )
)

In [0]:
# Flatten the name arrays into one row per name, drop repeated names, and expose
# the result under the column name "id" as the vertex table of the network graph.
vertex = (
    union_name
    .select(F.explode(F.col('nameSum')).alias('id'))
    .distinct()
)

In [0]:
# Number of nodes: one row per distinct user name in the vertex table
vertex.count()

In [0]:
# Explode the replier array so that each replier becomes its own row, stored
# in the "dst" column next to the original username.
# The filter runs before the projection so it only references a column that is
# still present, rather than relying on Spark to resolve a column that the
# select has already dropped.
edges_1 = (
    data_output
    .filter(F.col('n_users_in_conversation') != 0)
    .select(F.col("username"), F.explode(F.col('users_in_conversation')).alias('dst'))
)
                   

In [0]:
# Rename username to "src" and tag every edge with a constant "relationship"
# column whose value is 'repliedBy'.
edges_1 = (
    edges_1
    .withColumnRenamed("username", "src")
    .withColumn('relationship', F.lit('repliedBy'))
)

In [0]:
# Number of edges, counting repeated (src, dst) pairs separately
edges_1.count()

In [0]:
# Collapse repeated (src, dst) pairs into one row each; "weight" is the number
# of times the pair occurs in edges_1.
edges_withWeight = (
    edges_1
    .groupBy('src', 'dst')
    .agg(F.count('*').alias('weight'))
)

In [0]:
# Weighted edges with the constant 'repliedBy' relationship label attached
edges_2 = (
    edges_withWeight
    .withColumn(
        'relationship',
        F.lit('repliedBy'),
    )
)

In [0]:
# Number of unique (src, dst) edges after aggregation into weights
edges_2.count()

In [0]:
# Build the reply network: vertices are user names, edges are the
# (unaggregated) 'repliedBy' rows in edges_1.
network_graph = GraphFrame(v=vertex, e=edges_1)

In [0]:
# Degree of each vertex in the graph
display(network_graph.degrees)

id,degree
Slammer64,962
WhiteMail3678,21
LegalLolicon,336
bbarian,154
BucketHead41391,91
CW1,175
leadguitar,154
greyhat63,16
Joshmeister,209
GertCornelis,4


In [0]:
# In-degree of each vertex (number of edges where the vertex is the dst)
display(network_graph.inDegrees)

id,inDegree
Hister,34
DeoVindice,6
LegalLolicon,122
Slammer64,841
TooChainz,2
WhiteMail3678,16
Evenstar,1
BradBarnhardSho,14
CW1,35
rm,316


In [0]:
# Out-degree of each vertex (number of edges where the vertex is the src)
display(network_graph.outDegrees)

id,outDegree
DeanNestor7,562
ProudPrimate,4
Hister,60
CW1,140
Slammer64,121
LegalLolicon,214
greyhat63,15
Perspicacious01,1
Tuxedojoe,15
qoheleth,109


In [0]:
# Inspect the edge table of the graph (src, dst, relationship)
display(network_graph.edges)

src,dst,relationship
Frank61,RDFloyd,repliedBy
DemonTwoSix,RockyBasterd,repliedBy
DemonTwoSix,DemonTwoSix,repliedBy
DemonTwoSix,RockyBasterd,repliedBy
DemonTwoSix,DemonTwoSix,repliedBy
DemonTwoSix,StoneSovryn,repliedBy
ManweSulimo828,ManweSulimo828,repliedBy
ManweSulimo828,wacko2,repliedBy
ManweSulimo828,ManweSulimo828,repliedBy
ManweSulimo828,FurDog54,repliedBy


In [0]:
# Checkpoint: persist the edge and vertex tables as Parquet, replacing any
# previous output at the same locations.
edges_1.write.parquet("dbfs:/mnt/group12/graph/edges", mode='overwrite')
vertex.write.parquet("dbfs:/mnt/group12/graph/vertex", mode='overwrite')

In [0]:
# Connected components.
# A checkpoint directory is set on the SparkContext before running the algorithm.
sc.setCheckpointDir("/tmp/graphframes-example-connected-components")
result_CC = network_graph.connectedComponents()
(
    result_CC
    .select("id", "component")
    .sort("component")
    .show()
)


In [0]:
# Number of vertices in each connected component
(
    result_CC
    .groupBy("component")
    .count()
    .show()
)

In [0]:
# Giant component filtering.
# Component ids are labels assigned by connectedComponents, so the giant
# component is not guaranteed to be id 0. Pick the component with the most
# vertices instead (ties broken by the smaller component id for determinism).
# NOTE: this runs a Spark job eagerly and assumes result_CC is non-empty.
giant_component_id = (
    result_CC
    .groupBy('component')
    .count()
    .orderBy(F.col('count').desc(), F.col('component').asc())
    .first()['component']
)
# Filter before projecting so the filter only references a column that still exists.
vertex_Filtered = (
    result_CC
    .filter(F.col('component') == giant_component_id)
    .select('id')
)

In [0]:
# Number of vertices kept after the component filter
vertex_Filtered.count()

In [0]:
# Keep only the edges whose source vertex is in the filtered vertex set.
# A left-semi join filters edges_1 without appending the vertex "id" column;
# the previous inner join carried that extra column into the edge table
# (and into the Parquet checkpoint written from it).
edges_Filtered = edges_1.join(vertex_Filtered, edges_1.src == vertex_Filtered.id, "left_semi")

In [0]:
# Number of edges kept after restricting to the filtered vertices
edges_Filtered.count()

In [0]:
# Checkpoint: persist the filtered vertex and edge tables as Parquet,
# replacing any previous output.
vertex_Filtered.write.parquet("dbfs:/mnt/group12/graph/vertex_filtered", mode='overwrite')
edges_Filtered.write.parquet("dbfs:/mnt/group12/graph/edges_filtered", mode='overwrite')


In [0]:
# Checkpoint reading: reload the filtered vertex and edge tables.
# Parquet stores its own schema, so the CSV-only "header" option is not passed.
vertex_Filtered = spark.read.parquet("dbfs:/mnt/group12/graph/vertex_filtered")
edges_Filtered = spark.read.parquet("dbfs:/mnt/group12/graph/edges_filtered")

In [0]:
# Rebuild the graph from the filtered vertex and edge tables
graph_Filtered = GraphFrame(v=vertex_Filtered, e=edges_Filtered)

In [0]:
# Triangle count: run GraphFrames' triangleCount on the filtered graph; the
# result has a "count" column per vertex id.
results = graph_Filtered.triangleCount()

In [0]:
# Summary statistics (count, mean, stddev, min, max) of the triangle-count result
display(results.describe())

summary,count,id
count,16813.0,16813
mean,336.7469220246238,5.866471030204233E17
stddev,2130.444869957944,3.1591913322235663E18
min,0.0,-DeepThought-
max,49427.0,zyalia6tz


In [0]:
# PageRank on the filtered graph, iterating until the scores converge within
# the tolerance "tol".
results_pageRank = graph_Filtered.pageRank(
    resetProbability=0.15,
    tol=0.01,
)

In [0]:
# Edge table of the PageRank result, restricted to the endpoints and "weight".
# Note: this rebinds edges_withWeight, which an earlier cell used for the
# groupBy-count edge weights.
edges_withWeight = results_pageRank.edges.select(
    F.col("src"),
    F.col("dst"),
    F.col("weight"),
)

In [0]:
# Per-vertex PageRank scores, in no particular order
results_withoutSort = results_pageRank.vertices.select(
    F.col("id"),
    F.col("pagerank"),
)

In [0]:
# Highest PageRank first
results_sortedByRank = results_withoutSort.orderBy(F.desc('pagerank'))

In [0]:
# PageRank checkpoint: persist the sorted scores as Parquet, replacing any
# previous output.
results_sortedByRank.write.parquet("dbfs:/mnt/group12/graph/pagerank", mode='overwrite')

In [0]:
# Label Propagation Algorithm (LPA) on the filtered graph, run for at most
# 20 iterations; the result has one "label" per vertex id.
results_LPA = graph_Filtered.labelPropagation(
    maxIter=20,
)
display(results_LPA)

id,label
LyaNichol,26
Nanatchi,29
rm,65
Imokurnt,19
Wineyards,54
BalaamsAss,0
Klatuu,22
DeoVindice,7
REVOLUTION2017,34
Tweetest_boi,50


In [0]:
# Number of distinct labels produced by LPA
(
    results_LPA
    .select('label')
    .distinct()
    .count()
)

In [0]:
# Number of vertices per label, largest first
results_LPA_Sorted = (
    results_LPA
    .groupBy('label')
    .count()
    .orderBy(F.desc('count'))
)

In [0]:
# Keep only the communities with more than 4 users (i.e. at least 5)
results_LPA_Filtered = (
    results_LPA_Sorted
    .where(F.col('count') >= 5)
    .select('label', 'count')
)

In [0]:
# Preview the retained labels and their sizes
results_LPA_Filtered.show()

In [0]:
# LPA checkpoint: persist the retained (label, count) rows as Parquet,
# replacing any previous output.
results_LPA_Filtered.write.parquet("dbfs:/mnt/group12/graph/result_LPA", mode='overwrite')

In [0]:
# Keep only the vertices whose label is among the retained labels.
# results_LPA_Filtered is derived from results_LPA, so a condition of the form
# results_LPA.label == results_LPA_Filtered.label references the same underlying
# column on both sides (an ambiguous self-join). Joining by column name with a
# left-semi join avoids that ambiguity and returns only results_LPA's columns.
results_LPA = (
    results_LPA
    .join(results_LPA_Filtered, on='label', how='left_semi')
    .select('id', 'label')
)

In [0]:
# Reading checkpoint data: reload the PageRank scores.
# Parquet stores its own schema, so the CSV-only "header" option is not passed.
results_sortedByRank = spark.read.parquet("dbfs:/mnt/group12/graph/pagerank")

In [0]:
# Attach each vertex's LPA label to its PageRank score; vertices without a
# retained label are dropped by the inner join.
results_joined = (
    results_sortedByRank
    .join(results_LPA, results_sortedByRank.id == results_LPA.id, "inner")
    .select(
        results_sortedByRank.id,
        "pagerank",
        "label",
    )
)

In [0]:
# Number of vertices that have both a PageRank score and a retained label
results_joined.count()

In [0]:
# Final vertex checkpoint: persist (id, pagerank, label) as Parquet, replacing
# any previous output.
results_joined.write.parquet("dbfs:/mnt/group12/graph/final_vertex", mode='overwrite')

In [0]:
# Checkpoint read: reload the final vertex table.
# Parquet stores its own schema, so the CSV-only "header" option is not passed.
results_joined = spark.read.parquet("dbfs:/mnt/group12/graph/final_vertex")

In [0]:
# Reload the filtered edge table from its Parquet checkpoint (no "header"
# option: it is a CSV option and has no effect on Parquet reads).
edges_Filtered = spark.read.parquet("dbfs:/mnt/group12/graph/edges_filtered")

In [0]:
# Reload the unfiltered edge table from its Parquet checkpoint (no "header"
# option: it is a CSV option and has no effect on Parquet reads).
original_edges = spark.read.parquet("dbfs:/mnt/group12/graph/edges")

In [0]:
# Number of edges in the reloaded filtered edge table
edges_Filtered.count()

In [0]:
# Keep only the edges whose two endpoints both appear in the final vertex table:
# the first join matches on src, the second on dst.
edges_Final = (
    original_edges
    .join(results_joined, original_edges.src == results_joined.id, "inner")
    .select("src", "dst", "relationship")
)
edges_Final = (
    edges_Final
    .join(results_joined, edges_Final.dst == results_joined.id, "inner")
    .select("src", "dst", "relationship")
)

In [0]:
# Only unique edges: one row per (src, dst) pair, with "weight" holding the
# number of times the pair occurred.
edges_Final = (
    edges_Final
    .groupBy('src', 'dst')
    .agg(F.count('*').alias('weight'))
)

In [0]:
# Number of unique edges in the final edge table
edges_Final.count()

In [0]:
# Row count of whatever `vertex` currently refers to.
# NOTE(review): `vertex` is rebound to the final vertex table in a later cell,
# so the value depends on cell execution order — confirm which table is intended.
vertex.count()

In [0]:
# Final edges checkpoint: persist (src, dst, weight) as Parquet, replacing any
# previous output.
edges_Final.write.parquet("dbfs:/mnt/group12/graph/final_edges", mode='overwrite')

In [0]:
# Print the column names and types of the final edge table
edges_Final.printSchema()

In [0]:
# Checkpoint reading: reload the final vertex table.
# Note: this rebinds `vertex`, which earlier cells used for the unfiltered
# vertex table. The CSV-only "header" option is dropped because it has no
# effect on Parquet reads.
vertex = spark.read.parquet("dbfs:/mnt/group12/graph/final_vertex")

In [0]:
# Export vertices and edges as JSON. coalesce(1) reduces each DataFrame to a
# single partition so that each output directory holds a single part file.
vertex.coalesce(1).write.json("dbfs:/mnt/group12/graph/vertex_json", mode='overwrite')
edges_Final.coalesce(1).write.json("dbfs:/mnt/group12/graph/edges_json", mode='overwrite')

In [0]:
# List the files written to the vertex JSON output directory
display(dbutils.fs.ls("mnt/group12/graph/vertex_json"))

path,name,size
dbfs:/mnt/group12/graph/vertex_json/_SUCCESS,_SUCCESS,0
dbfs:/mnt/group12/graph/vertex_json/_committed_6242222523956333587,_committed_6242222523956333587,115
dbfs:/mnt/group12/graph/vertex_json/_started_6242222523956333587,_started_6242222523956333587,0
dbfs:/mnt/group12/graph/vertex_json/part-00000-tid-6242222523956333587-74c2336f-a0eb-4337-8a40-23ae7431ea80-4548-1-c000.json,part-00000-tid-6242222523956333587-74c2336f-a0eb-4337-8a40-23ae7431ea80-4548-1-c000.json,50105


In [0]:
# List the files written to the edges JSON output directory
display(dbutils.fs.ls("mnt/group12/graph/edges_json"))

path,name,size
dbfs:/mnt/group12/graph/edges_json/_SUCCESS,_SUCCESS,0
dbfs:/mnt/group12/graph/edges_json/_committed_1177843133252683134,_committed_1177843133252683134,115
dbfs:/mnt/group12/graph/edges_json/_started_1177843133252683134,_started_1177843133252683134,0
dbfs:/mnt/group12/graph/edges_json/part-00000-tid-1177843133252683134-9b7f2d76-36bc-4a11-bf6d-d6cd0c797c32-4796-1-c000.json,part-00000-tid-1177843133252683134-9b7f2d76-36bc-4a11-bf6d-d6cd0c797c32-4796-1-c000.json,1584032
