## Mount Google Drive

In [1]:
from google.colab import drive

drive.mount('/content/drive')

Mounted at /content/drive


## Setup

In [2]:
!apt-get update -qq > /dev/null
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q https://dlcdn.apache.org/spark/spark-3.4.2/spark-3.4.2-bin-hadoop3.tgz
!tar xf spark-3.4.2-bin-hadoop3.tgz
!pip install -q findspark

In [3]:
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.4.2-bin-hadoop3"

import findspark
findspark.init()


from pyspark.sql import SparkSession
import random
spark = SparkSession.builder.appName("YourTest")\
.master("local[2]")\
.config('spark.ui.port', random.randrange(4000,5000))\
.config("spark.jars.packages","graphframes:graphframes:0.8.3-spark3.4-s_2.12")\
.getOrCreate()

In [4]:
from google.colab import output
ui_port=spark.sparkContext.uiWebUrl.split(":")[-1]
output.serve_kernel_port_as_window(ui_port,path='/jobs/index.html')

<IPython.core.display.Javascript object>

In [5]:
# from pyspark.sql.functions import coalesce, col, lit, sum, when
import pyspark.sql.functions as f
from graphframes import *
from graphframes.lib import Pregel

spark.sparkContext.setCheckpointDir("./checkpoint/")

## Personalized Page Rank

In [14]:
from graphframes.lib import Pregel
from pyspark.sql.functions import lit, col, abs as pyspark_abs

# Implement in_degree function
def in_Degrees(graph):
  in_degrees = graph.edges.groupBy("dst").count().select(col("dst").alias("id"), col("count").alias("in_degree"))

  # for 0 indegree
  return graph.vertices.join(in_degrees, "id", "left_outer").select("id", "in_degree").na.fill(0)

def personalized_page_rank(graph, sourceId, resetProbability=0.15, maxIter=None, tol=None):
    """
    Runs the Personalized PageRank algorithm on the graph.
    Note: Exactly one of fixed_num_iter or tolerance must be set.

    :param graph: The graph to run the algorithm on.
    :param sourceId: The source vertex for the personalized PageRank.
    :param resetProbability: Probability of resetting to the source vertex.
    :param maxIter: If set, the algorithm is run for a fixed number
            of iterations. This may not be set if the `tol` parameter is set.
    :param tol: If set, the algorithm is run until the given tolerance.
            This may not be set if the `numIter` parameter is set.
    :return:  GraphFrame with new vertices column "pagerank" and new edges column "weight"
    """

    num_vertex = graph.vertices.count()

    # init rank
    ranks = graph.vertices.withColumn('pagerank', f.when(graph.vertices.id == sourceId, f.lit(1.0)).otherwise(f.lit(0.0)))

    graph_indegrees = in_Degrees(graph)

    i = 0
    while 1:
        i += 1
        # calculate contributions for each edges
        contributions = graph.edges.join(graph_indegrees, graph.edges.src == graph_indegrees.id).join(ranks,
                                                                                                      ranks.id == graph_indegrees.id).select(
            col("src"), col("dst"), (col("pagerank") / col("in_degree")).alias("contributions"))

        # new pagerank
        new = contributions.select(col("dst").alias("id"), col("contributions").alias("pagerank")).groupby("id").sum(
            "pagerank").withColumnRenamed("sum(pagerank)", "pagerank").select(col("id"), (
                    resetProbability * f.when(col("id") == sourceId, f.lit(1.0)).otherwise(f.lit(0.0)) + (1 - resetProbability) * col("pagerank")).alias("pagerank")).orderBy("id")

        # reach maximum iterations
        if maxIter != None and i == maxIter:
            return GraphFrame(new, graph.edges)

        # difference is smaller than tol
        if tol != None:
            diff = new.join(ranks.withColumnRenamed('pagerank', 'previous_pagerank'), "id")
            diff = diff.withColumn('difference', pyspark_abs(diff['previous_pagerank'] - diff['pagerank']))

            smallest_difference = diff.orderBy(diff['difference'].desc()).select("difference").limit(1).first()[0]

            if smallest_difference < tol:
                return GraphFrame(new, graph.edges)
        ranks = new


## Simple Graph Test

In [15]:
edges = spark.createDataFrame([
    [0, 1],
    [1, 2],
    [2, 4],
    [2, 0],
    [3, 4],
    [4, 0],
    [4, 2],
    [0, 4]
    ], ["src", "dst"])

edges.cache()
vertices = spark.createDataFrame([[0], [1], [2], [3], [4]], ["id"])
vertices.show()

vertices.cache()
graph = GraphFrame(vertices, edges)

test = personalized_page_rank(graph, 0, maxIter = 1)
test.vertices.show()

+---+
| id|
+---+
|  0|
|  1|
|  2|
|  3|
|  4|
+---+

+---+--------+
| id|pagerank|
+---+--------+
|  0|    0.15|
|  1|   0.425|
|  2|     0.0|
|  4|   0.425|
+---+--------+

