In [None]:
# Initialise findspark before anything from pyspark is imported; the pyspark
# imports in the following cell rely on this having run first.
# NOTE(review): presumably needed because pyspark is not on sys.path by default
# in this environment - confirm.
import findspark
findspark.init()

In [None]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, desc, count, explode, split, regexp_replace, collect_list

from pyspark.sql.types import StructType, StructField, StringType, LongType, TimestampType

import os

In [None]:
# Build (or reuse) the Spark session for this notebook. The GraphFrames
# package is requested through spark.jars.packages when the session starts.
spark = SparkSession.builder.appName('Graph processing I').config(
    "spark.jars.packages",
    "graphframes:graphframes:0.5.0-spark2.1-s_2.11",
).getOrCreate()

In [None]:
# Import only the name this notebook uses instead of a wildcard, so it is clear
# where GraphFrame comes from.
# NOTE(review): this import is kept after the SparkSession cell on purpose -
# presumably the graphframes module only becomes importable once the package
# requested via spark.jars.packages has been loaded; confirm before moving it.
from graphframes import GraphFrame

In [None]:
# Directory the notebook kernel is running in.
base_path = os.getcwd()

# Project root = two directory levels above the working directory.
# os.path.dirname is used instead of splitting on '/' by hand so the result is
# correct regardless of the platform's path separator and never collapses to
# an empty string for shallow directories.
project_path = os.path.dirname(os.path.dirname(base_path))

# Input datasets live under <project root>/data.
answers_input_path = os.path.join(project_path, 'data/answers')

users_input_path = os.path.join(project_path, 'data/users')

In [None]:
# Load the answers dataset (no format is specified, so the reader's default
# data source is used) and keep a cached 10% sample for the rest of the
# notebook.
answersDF = (
    spark
    .read
    .option('path', answers_input_path)
    .load()
    # DataFrame.sample's positional order is (withReplacement, fraction, seed).
    # The arguments are passed by keyword so that 0.1 is the fraction, False is
    # withReplacement and 24 is the seed, rather than relying on position.
    .sample(withReplacement=False, fraction=0.1, seed=24)
    .cache()
)

# Load the full users dataset (no sampling, no caching) using the reader's
# default data source.
usersDF = spark.read.load(users_input_path)

In [None]:
# Number of distinct user_id values in the users dataset.
usersDF.select('user_id').distinct().count()

In [None]:
# Peek at the first 5 sampled answers, truncating each cell to 8 characters.
answersDF.show(5, truncate=8)

In [None]:
# Graph vertices: one row per user who wrote at least one sampled answer,
# enriched with the display name from the users dataset. GraphFrames expects
# the vertex key column to be called 'id'.
# NOTE(review): the inner join drops answerers that are missing from usersDF,
# and would duplicate vertices if usersDF held several rows per user_id.
verticesDF = (
    answersDF
    .select('user_id')
    .where(col('user_id').isNotNull())
    .distinct()
    .join(usersDF, on='user_id', how='inner')
    .select(col('user_id').alias('id'), col('display_name'))
)

In [None]:
# Graph edges: self-join the answers on question_id so that every ordered pair
# of different users who answered the same question becomes a (src, dst) edge.
# Pairs are not de-duplicated, so users sharing several questions are linked
# by several edges.
edgesDF = (
    answersDF.alias('src')
    .join(answersDF.alias('dst'), on='question_id', how='inner')
    .select(
        col('src.user_id').alias('src'),
        col('dst.user_id').alias('dst'),
    )
    # Drop self-pairs; rows where either side is null are removed as well,
    # since the comparison evaluates to null for them.
    .where(col('src') != col('dst'))
)

In [None]:
# Sanity check: number of (src, dst) edges produced by the self-join.
edgesDF.count()

In [None]:
# Sanity check: number of vertices (answering users found in the users data).
verticesDF.count()

In [None]:
# Assemble the user graph from the vertex ('id') and edge ('src', 'dst') frames.
usersGraph = GraphFrame(v=verticesDF, e=edgesDF)

In [None]:
# Detect communities with label propagation, capped at 5 iterations; the result
# carries a 'label' column that the cells below group and filter on.
communities = usersGraph.labelPropagation(maxIter=5)

In [None]:
# Community sizes: number of users per label, largest communities first.
(
    communities
    .groupBy('label')
    .count()
    .withColumnRenamed('count', 'cnt')
    .orderBy(col('cnt').desc())
).show()

In [None]:
# List the members of one specific community.
# NOTE(review): 1492 is a hard-coded label, presumably copied from a previous
# run's output above - it may not exist if the sample or the graph changes.
communities.where(col('label') == 1492).show()

In [None]:
# Run PageRank on the user graph with tol=0.01; the vertices of the result are
# ranked by their 'pagerank' column in the next cell.
pr = usersGraph.pageRank(tol=0.01)

In [None]:
# Show the users with the highest PageRank score first.
pr.vertices.sort(col('pagerank').desc()).show()