# Graph processing using GraphFrames

In this notebook you will construct a graph from the answers and users datasets and use the GraphFrames library to run some algorithms on it.

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, desc, count, explode, split, regexp_replace, collect_list, greatest, least

import os

In [None]:
# Configure the session step by step: application name first, then the
# GraphFrames package requested through spark.jars.packages.
session_builder = SparkSession.builder.appName('Graph processing I')
session_builder = session_builder.config(
    "spark.jars.packages",
    "graphframes:graphframes:0.8.0-spark3.0-s_2.12",
)

# Reuse an already running session if there is one, otherwise start a new one.
spark = session_builder.getOrCreate()

In [None]:
# Import only the name this notebook uses instead of a wildcard, so it is clear
# where GraphFrame comes from.
# NOTE(review): this import sits after the session cell, presumably because the
# graphframes package is supplied through spark.jars.packages -- confirm before
# moving it into the top import cell.
from graphframes import GraphFrame

In [None]:
# Directory the notebook kernel was started in.
base_path = os.getcwd()

# The project root is taken to be three directory levels above the working
# directory. os.path is used instead of splitting on '/' so the result is
# correct for any path separator and never collapses to an empty string
# (which would turn the data paths into relative paths) for shallow paths.
project_path = os.path.normpath(
    os.path.join(base_path, os.pardir, os.pardir, os.pardir)
)

# Input datasets live under <project root>/data.
answers_input_path = os.path.join(project_path, 'data', 'answers')

users_input_path = os.path.join(project_path, 'data', 'users')

# Task

Create a graph from users and answers. The users will be represented as nodes in the graph, and two users will be connected by an edge if they answered the same question.

* run Label Propagation to find some communities / clusters of users
* run PageRank to find important nodes in the graph 
* take only a sample of the answers to reduce the size of the graph if you run in local mode

In [None]:
# Sampling parameters for local runs: keep 10% of the answers, reproducibly.
ANSWERS_SAMPLE_FRACTION = 0.1
ANSWERS_SAMPLE_SEED = 24

answersDF = (
    spark
    .read
    .option('path', answers_input_path)
    .load()
    # Keyword arguments on purpose: the positional order of DataFrame.sample is
    # (withReplacement, fraction, seed). Passing the fraction first makes
    # PySpark treat the next positional value as the seed and ignore the third
    # one, so the intended seed would never be applied.
    .sample(
        withReplacement=False,
        fraction=ANSWERS_SAMPLE_FRACTION,
        seed=ANSWERS_SAMPLE_SEED,
    )
    # Cached because the sampled answers are reused for both vertices and edges.
    .cache()
)

# Point a reader at the users dataset, load it, and keep the result cached.
users_reader = spark.read.option('path', users_input_path)
usersDF = users_reader.load().cache()

#### Create vertices:

Hint:
* select user_id
* deduplicate
* rename the col to id
* you may keep additional cols as metadata (joined from users)

In [None]:
# Distinct, non-null ids of the users who wrote at least one sampled answer.
answering_user_ids = (
    answersDF
    .where(col('user_id').isNotNull())
    .select('user_id')
    .distinct()
)

# Attach display_name from the users dataset and expose user_id under the
# vertex column name `id`.
verticesDF = (
    answering_user_ids
    .join(usersDF, 'user_id')
    .select(col('user_id').alias('id'), col('display_name'))
)

#### Create edges:

Hint:
* do a self-join of answers on question_id
* filter out records where the user_id from the left side is the same as the one from the right side
* rename user_id cols as src / dst 

In [None]:
# One edge per pair of answers to the same question written by two different
# users. The edge always points from the larger user_id (src) to the smaller
# one (dst).
edgesDF = (
    answersDF.alias('src')
    .join(
        answersDF.alias('dst'),
        'question_id'
    )
    # The self-join produces every pair in both orders, (a, b) and (b, a).
    # Keeping only rows with src.user_id > dst.user_id retains exactly one of
    # the two mirrored rows, so an edge is no longer emitted twice, and it also
    # drops self-pairs and rows with a null user_id.
    .filter(col('src.user_id') > col('dst.user_id'))
    .select(
        col('src.user_id').alias('src'),
        col('dst.user_id').alias('dst'),
    )
)

#### Create the graph:

Hint:
* use GraphFrame(vertices, edges) 

In [None]:
# Assemble the graph: users are the vertices, shared-question pairs are the edges.
usersGraph = GraphFrame(verticesDF, edgesDF)

#### See some properties of the graph:

Hint:
* count number of edges
* count number of vertices

In [None]:
# Number of edges in the graph (read back through the graph object).
usersGraph.edges.count()

In [None]:
# Number of vertices in the graph (read back through the graph object).
usersGraph.vertices.count()

#### Find frequent edges

In [None]:
# Count how often each (src, dst) pair occurs, then show the five most
# frequent pairs.
edge_frequencies = usersGraph.edges.groupBy('src', 'dst').count()
edge_frequencies.sort(col('count').desc()).show(5)

#### Find communities

Hint:
* use labelPropagation
* see how many users are in each community
    * group by label and count
* see what users are in a given community
    * filter on label col

In [None]:
# Run label propagation for five iterations; the result carries a `label`
# column identifying each vertex's community.
communities = usersGraph.labelPropagation(maxIter=5)

In [None]:
# Size of every community, largest first.
community_sizes = communities.groupBy('label').agg(count('*').alias('cnt'))
community_sizes.sort(col('cnt').desc()).show()

In [None]:
# Community labels depend on the sampled data and on the label-propagation
# run, so a hard-coded label can silently match nothing. Look up the label of
# the largest community from the current result instead; to inspect a
# different community, compare against its label from the size table.
largest_community = (
    communities
    .groupBy('label')
    .agg(count('*').alias('cnt'))
    .orderBy(desc('cnt'), 'label')  # secondary key keeps ties deterministic
    .first()
)

# first() returns None when the graph has no vertices at all.
if largest_community is not None:
    (
      communities
      .filter(col("label") == largest_community['label'])
    ).show()

#### Compute PageRank

* use pageRank method
* order the vertices by pagerank

In [None]:
# PageRank with a convergence tolerance of 0.01; the result is a graph whose
# vertices carry a `pagerank` column.
pr = usersGraph.pageRank(tol=0.01)

In [None]:
# Vertices ranked by PageRank score, highest first.
ranked_vertices = pr.vertices.sort(col('pagerank').desc())
ranked_vertices.show()

In [None]:
# Shut down the Spark session now that the notebook is finished.
spark.stop()