In [4]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, desc, count, explode, split, regexp_replace, collect_list, lead
from pyspark.sql import Window

import os

In [2]:
spark = (
    SparkSession
    .builder
    .appName('Graph processing II')
    .config("spark.jars.packages", "graphframes:graphframes:0.7.0-spark2.4-s_2.11")
    .getOrCreate()
)

In [17]:
from graphframes import *

In [7]:
base_path = os.getcwd()

project_path = ('/').join(base_path.split('/')[0:-3]) 

badges_input_path = os.path.join(project_path, 'data/badges')

users_input_path = os.path.join(project_path, 'data/users')

In [23]:
badgesDF = (
    spark
    .read
    .option('path', badges_input_path)
    .load()
    .filter(col('user_id') == 2)
)

usersDF = (
    spark
    .read
    .option('path', users_input_path)
    .load()
).cache()

In [20]:
badgesDF.printSchema()

root
 |-- user_id: long (nullable = true)
 |-- date: timestamp (nullable = true)
 |-- name: string (nullable = true)
 |-- class: long (nullable = true)



In [24]:
verticesDF = (
    badgesDF
    .filter(col('name').isNotNull())
    .select(
        col('name').alias('id')
    )
    .distinct()
)

In [31]:
w = Window().partitionBy('user_id').orderBy('src_time')

edgesDF = (
    badgesDF
    .withColumn('src', col('name'))
    .withColumn('src_time', col('date'))
    .withColumn('dst', lead('src', 1).over(w))
    .withColumn('dst_time', lead('src_time', 1).over(w))
    .filter(col('dst').isNotNull())
    .filter(col('src') != col('dst'))
    .select(
    #    'user_id',
    #    'src_time',
        'src',
        'dst',
    #    'dst_time'
    )
    .orderBy('user_id', 'src_time')
)

In [26]:
verticesDF.show()

+--------------+
|            id|
+--------------+
|        Quorum|
|     Announcer|
|Autobiographer|
|     Supporter|
+--------------+



In [32]:
edgesDF.show()

+--------------+---------+
|           src|      dst|
+--------------+---------+
|Autobiographer|Supporter|
|     Supporter|   Quorum|
|        Quorum|Announcer|
+--------------+---------+



In [33]:
verticesDF.count()

4

In [34]:
edgesDF.count()

3

In [36]:
badgesGraph = GraphFrame(verticesDF, edgesDF)

Py4JJavaError: An error occurred while calling o339.createGraph.
: java.lang.NoSuchMethodError: scala.Predef$.refArrayOps([Ljava/lang/Object;)Lscala/collection/mutable/ArrayOps;
	at org.graphframes.GraphFrame$.apply(GraphFrame.scala:676)
	at org.graphframes.GraphFramePythonAPI.createGraph(GraphFramePythonAPI.scala:10)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Thread.java:748)


In [None]:
(
    
)