In [1]:
import findspark
findspark.init()

In [6]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, desc, count, explode, split, regexp_replace, collect_list

from pyspark.sql.types import StructType, StructField, StringType, LongType, TimestampType

import os

In [3]:
spark = (
    SparkSession
    .builder
    .appName('Graph processing I')
    .config("spark.jars.packages", "graphframes:graphframes:0.5.0-spark2.1-s_2.11")
    .getOrCreate()
)

In [4]:
from graphframes import *

In [7]:
base_path = os.getcwd()

project_path = ('/').join(base_path.split('/')[0:-2]) 

answers_input_path = os.path.join(project_path, 'data/answers')

users_input_path = os.path.join(project_path, 'data/users')

In [8]:
answersDF = (
    spark
    .read
    .option('path', answers_input_path)
    .load()
).sample(0.1, False, 24).cache()

usersDF = (
    spark
    .read
    .option('path', users_input_path)
    .load()
)

In [9]:
usersDF.dropDuplicates(['user_id']).count()

153439

In [10]:
answersDF.show(truncate=8, n=5)

+---------+-------------+--------+--------+-------+-----+-----------+
|answer_id|creation_date|    body|comments|user_id|score|question_id|
+---------+-------------+--------+--------+-------+-----+-----------+
|   322985|     2017-...|<p>Fr...|       0| 149024|    3|     322983|
|   348066|     2017-...|<p>Fo...|       0| 104696|    0|     348031|
|   442899|     2018-...|<p>Wo...|       0| 183259|    3|     442895|
|   218415|     2015-...|<p>Th...|       0|  98445|    1|     210431|
|   227701|     2016-...|<p>Ro...|       2|  55531|    0|     227696|
+---------+-------------+--------+--------+-------+-----+-----------+
only showing top 5 rows



In [44]:
verticesDF = (
    answersDF
    .filter(col('user_id').isNotNull())
    .select('user_id')
    .distinct()
    .join(usersDF, 'user_id')
    .select(
        col('user_id').alias('id'),
        col('display_name')
    )
)

In [36]:
edgesDF = (
    answersDF.alias('src')
    .join(
        answersDF.alias('dst'),
        'question_id'
    )
    .select(
        col('src.user_id').alias('src'), 
        col('dst.user_id').alias('dst')
    )
    .filter(col('src') != col('dst'))
)

In [37]:
edgesDF.count()

3010

In [45]:
verticesDF.count()

4936

In [51]:
usersGraph = GraphFrame(verticesDF, edgesDF)

In [47]:
communities = usersGraph.labelPropagation(5)

In [49]:
(
  communities
    .groupBy('label')
    .agg(count('*').alias('cnt'))
    .orderBy(desc('cnt'))
).show()

+------+---+
| label|cnt|
+------+---+
|  1492| 45|
|  1325| 38|
|  4864| 22|
|  2451| 18|
| 26969| 17|
|  4962| 15|
|  1236| 14|
|104696| 14|
|  8563| 13|
|   347| 13|
|  2525| 12|
|   520| 11|
|  1186| 11|
| 83398| 11|
| 26076| 11|
|  9887| 11|
|   392| 10|
|  5477|  9|
|    74|  9|
|  1257|  9|
+------+---+
only showing top 20 rows



In [52]:
(
  communities
  .filter(col("label") == 1492)
).show()

+------+-------------------+-----+
|    id|       display_name|label|
+------+-------------------+-----+
| 30247|             Martin| 1492|
| 18788|              Bzazz| 1492|
|180976|         David Shin| 1492|
|  3031|           Johannes| 1492|
| 42187|               R004| 1492|
| 38184|          user38184| 1492|
| 70245|               Buzz| 1492|
| 38145|        user1231247| 1492|
|111938|         Bill Dixon| 1492|
|  5070|         ganzewoort| 1492|
|  8861|      Mario Enrique| 1492|
|164567|                gan| 1492|
| 83476| Tamoghna Chowdhury| 1492|
| 92077|Former SSC employee| 1492|
| 65077|     Frankly Ernest| 1492|
|   861|           Mitchell| 1492|
| 62770|          Wen Chern| 1492|
| 72900|           Name YYY| 1492|
|  7993|      Mozibur Ullah| 1492|
| 78230|          Jaywalker| 1492|
+------+-------------------+-----+
only showing top 20 rows



In [31]:
pr = (
   usersGraph.pageRank(tol=0.01)
 )

In [32]:
(
  pr
  .vertices
  .orderBy(desc('pagerank'))
).show()

+------+------------------+
|    id|          pagerank|
+------+------------------+
|  1492|52.733122262268814|
|  1325| 37.50490707250536|
| 26969|27.687848380022732|
|  4864|24.958226457283317|
|  1236|19.612611237650622|
|  9887|18.927662130907382|
|  2451|17.789265325719093|
|  4962|15.779909176147259|
|104696|15.664027346489439|
|   124|14.755456827061133|
|  8563| 13.66305480430828|
|   520|13.619455474457679|
|   392|12.757972462486082|
|  4552|12.633922259370664|
|   347|11.099675147387822|
| 44126|10.743275206522132|
| 50583|10.662913817552354|
| 76162|10.371432035738277|
|  1257|10.257251985734136|
| 43351|10.112820758425817|
+------+------------------+
only showing top 20 rows

