In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, desc, count, explode, split, regexp_replace, collect_list, lead
from pyspark.sql import Window

import os

In [None]:
# Build (or reuse) the Spark session for this notebook. GraphFrames is requested
# through `spark.jars.packages`; the coordinate names GraphFrames 0.7.0 built for
# Spark 2.4 / Scala 2.11.
session_builder = SparkSession.builder.appName('Graph processing II')
session_builder = session_builder.config(
    "spark.jars.packages",
    "graphframes:graphframes:0.7.0-spark2.4-s_2.11",
)
spark = session_builder.getOrCreate()

In [None]:
# Import only the name this notebook uses instead of a wildcard import, so it is
# clear where `GraphFrame` comes from.
# NOTE(review): the import sits after the session is created, presumably because the
# Python package only becomes importable once `spark.jars.packages` has been
# resolved — confirm before moving it into the top import cell.
from graphframes import GraphFrame

In [None]:
# The notebook is expected to run from a directory three levels below the project
# root; the input data sets live under <project root>/data.
base_path = os.getcwd()

# Walk up three directories with os.path instead of splitting on '/': this honours the
# platform's path separator and never collapses to an empty string (which would
# silently turn the data paths into paths relative to the working directory).
project_path = os.path.abspath(
    os.path.join(base_path, os.pardir, os.pardir, os.pardir)
)

badges_input_path = os.path.join(project_path, 'data/badges')

users_input_path = os.path.join(project_path, 'data/users')

In [None]:
# Read both inputs with the reader's default data source (no format is specified);
# the location is passed through the 'path' option.
badgesDF = spark.read.option('path', badges_input_path).load()

# The users table is additionally marked for caching.
usersDF = spark.read.option('path', users_input_path).load().cache()

In [None]:
# Print the column names and types of the badges table.
badgesDF.printSchema()

In [None]:
# Graph vertices: one row per distinct, non-null badge name, exposed as column `id`.
named_badges = badgesDF.where(col('name').isNotNull())
verticesDF = named_badges.select(col('name').alias('id')).distinct()

In [None]:
# Per-user chronological ordering of badges. The badge name is a secondary sort key so
# that badges a user received at the same timestamp always come out in the same order;
# ordering by the timestamp alone leaves such ties in arbitrary order, so the derived
# edges could differ from run to run.
w = Window.partitionBy('user_id').orderBy('src_time', 'src')

# One edge per pair of consecutive badges of a user: src -> dst.
edgesDF = (
    badgesDF
    .withColumn('src', col('name'))
    .withColumn('src_time', col('date'))
    # lead(..., 1) reads the value from the user's next row in the window order
    .withColumn('dst', lead('src', 1).over(w))
    .withColumn('dst_time', lead('src_time', 1).over(w))
    # rows without a following badge (e.g. a user's last badge) produce no edge
    .filter(col('dst').isNotNull())
    # drop self-loops, i.e. the same badge name twice in a row
    .filter(col('src') != col('dst'))
    .select(
        'user_id',
        'src_time',
        'src',
        'dst',
        'dst_time'
    )
    # same tiebreaker as the window, so the displayed order is stable as well
    .orderBy('user_id', 'src_time', 'src')
)

In [None]:
# Preview the first rows of the vertex table.
verticesDF.show()

In [None]:
# Preview the first rows of the edge table.
edgesDF.show()

In [None]:
# Number of vertices (distinct badge names).
verticesDF.count()

In [None]:
# Number of edges (transitions between consecutive badges, over all users).
edgesDF.count()

In [None]:
# Assemble the graph: vertices carry `id`, edges carry `src` and `dst`.
badgesGraph = GraphFrame(v=verticesDF, e=edgesDF)

# Find frequent edges

In [None]:
# Count how often each (src, dst) transition occurs and list the five most frequent.
edge_frequencies = badgesGraph.edges.groupBy('src', 'dst').count()
edge_frequencies.orderBy(col('count').desc()).show(n=5)

In [None]:
# Five badges with the most incoming edges.
badgesGraph.inDegrees.orderBy(col('inDegree').desc()).show(n=5)

In [None]:
# Five badges with the most outgoing edges.
badgesGraph.outDegrees.orderBy(col('outDegree').desc()).show(n=5)

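For each badge, compare how many users went on to earn another badge afterwards (out-degree) with how many users reached it from a previous badge (in-degree). The lowest ratios are shown first.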

In [None]:
# For every badge, relate the number of outgoing edges to the number of incoming ones
# and show the ten lowest ratios.
# Joining on the column name (instead of an equality expression between two aliased
# frames) keeps a single `id` column in the result rather than two identical ones.
# The join is an inner join, so a badge is listed only if it has both incoming and
# outgoing edges.
(
    badgesGraph
    .inDegrees
    .join(badgesGraph.outDegrees, on='id')
    .withColumn('degreeRatio', col('outDegree') / col('inDegree'))
    .orderBy('degreeRatio')
).show(n=10)

In [None]:
# Motif search for chains of three consecutive edges: a -> b -> c -> d.
# The pattern itself places no condition on the edges' attributes, so the three edges
# of a match are not required to belong to the same user.
three_hop_motif = "(a)-[ab]->(b); (b)-[bc]->(c); (c)-[cd]->(d)"
badgesGraph.find(three_hop_motif).show()