**Mount Google Drive**

In [2]:
from google.colab import drive

# Directory inside the Colab VM under which Google Drive is attached.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


**Setup**

In [3]:
!apt-get update -qq > /dev/null
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q https://dlcdn.apache.org/spark/spark-3.4.2/spark-3.4.2-bin-hadoop3.tgz
!tar xf spark-3.4.2-bin-hadoop3.tgz
!pip install -q findspark

In [8]:
import os

# Tell the Spark tooling where the JDK and the unpacked Spark distribution live.
# These must be in the environment before findspark.init() runs below.
os.environ.update({
    "JAVA_HOME": "/usr/lib/jvm/java-8-openjdk-amd64",
    "SPARK_HOME": "/content/spark-3.4.2-bin-hadoop3",
})

import findspark

findspark.init()

from pyspark.sql import SparkSession
import random

# Local session with two worker threads. The UI port is drawn at random from
# 4000-4999, and the GraphFrames package is requested via spark.jars.packages.
ui_port = random.randrange(4000, 5000)
spark = (
    SparkSession.builder
    .appName("YourTest")
    .master("local[2]")
    .config('spark.ui.port', ui_port)
    .config("spark.jars.packages", "graphframes:graphframes:0.8.3-spark3.4-s_2.12")
    .getOrCreate()
)

# Kept after the session is created, as in the original statement order.
from graphframes import GraphFrame
from pyspark.sql.functions import lit, col, abs as pyspark_abs, when

In [9]:
# Read the edge list from the CSV file (the header row supplies the column names)
# and rename the two endpoint columns to the src/dst names used for the edges.
edge_list_path = "/content/drive/MyDrive/twitch_gamers/large_twitch_edges_test.csv"
twitch_gamers = (
    spark.read.format("csv")
    .option("header", "true")
    .load(edge_list_path)
    .withColumnRenamed("numeric_id_1", "src")
    .withColumnRenamed("numeric_id_2", "dst")
)

# Vertex set: every distinct id that appears at either end of an edge.
source_ids = twitch_gamers.select("src")
target_ids = twitch_gamers.select("dst")
vertices = source_ids.union(target_ids).distinct().withColumnRenamed("src", "id")

edges = twitch_gamers.select("src", "dst")

graph = GraphFrame(vertices, edges)

# Show graph
graph.vertices.show()
graph.edges.show()


+---+
| id|
+---+
|  3|
|  1|
|  4|
|  2|
+---+

+---+---+
|src|dst|
+---+---+
|  1|  2|
|  1|  3|
|  1|  4|
|  2|  1|
|  3|  1|
|  4|  1|
+---+---+


**In-degree function**

In [10]:
# Implement in_degree function
def in_Degrees(graph):
    """
    Count the incoming edges of every vertex of ``graph``.

    :param graph: object exposing ``edges`` (with a "dst" column) and
            ``vertices`` (with an "id" column).
    :return: DataFrame with columns "id" and "in_degree"; a vertex that is
            never an edge destination gets an in_degree of 0.
    """
    # Grouping edges by destination yields a count for each vertex that has
    # at least one incoming edge.
    incoming_counts = graph.edges.groupBy("dst").count()
    per_target = incoming_counts.select(col("dst").alias("id"), col("count").alias("in_degree"))

    # The left outer join keeps vertices missing from per_target; the
    # resulting nulls are replaced by 0.
    joined = graph.vertices.join(per_target, "id", "left_outer")
    return joined.select("id", "in_degree").na.fill(0)

In [11]:
# Display the in-degree of every vertex of the graph built above.
in_degree_table = in_Degrees(graph)
in_degree_table.show()

+---+---------+
| id|in_degree|
+---+---------+
|  3|        1|
|  1|        3|
|  4|        1|
|  2|        1|
+---+---------+


**PageRank**

In [24]:
# Implement Page Rank

def page_rank(graph, resetProbability=0.15, sourceId=None, maxIter=None, tol=None):
    """
    Runs the PageRank algorithm on the graph.

    In every iteration each vertex splits its current rank evenly over its
    outgoing edges, and the new rank of a vertex is

        reset + (1 - resetProbability) * (sum of the contributions it receives)

    where ``reset`` is ``resetProbability`` for every vertex, or, for a
    personalized run, ``resetProbability`` for ``sourceId`` and 0 for all
    other vertices. Ranks are not normalized: a non-personalized run starts
    every vertex at 1.0, a personalized run starts ``sourceId`` at 1.0 and
    everything else at 0.0. Vertices without outgoing edges pass no rank on.

    At least one of `maxIter` and `tol` must be set; if both are set the
    function returns as soon as either stopping condition is met.

    :param graph: GraphFrame whose vertices have an "id" column and whose
            edges have "src" and "dst" columns.
    :param resetProbability: Probability of resetting to a random vertex.
    :param sourceId: (optional) the source vertex for a personalized PageRank.
    :param maxIter: If set, the algorithm stops after this many iterations
            (must be at least 1).
    :param tol: If set, the algorithm stops once the largest absolute change
            of any vertex rank between two iterations is below this value.
    :return: GraphFrame whose vertices have the columns "id" and "pagerank"
            (one row per vertex of the input graph) and whose edges are the
            input edges, unchanged.
    :raises ValueError: if neither `maxIter` nor `tol` is set, or if
            `maxIter` is smaller than 1 (either would loop forever).
    """
    # Validate before touching Spark so a bad call fails immediately instead
    # of spinning in the loop below.
    if maxIter is None and tol is None:
        raise ValueError("At least one of maxIter or tol must be set.")
    if maxIter is not None and maxIter < 1:
        raise ValueError("maxIter must be at least 1.")

    damping = 1.0 - resetProbability
    vertex_ids = graph.vertices.select("id")

    # Reset term of the update rule and the matching initial ranks.
    if sourceId is not None:
        is_source = when(col("id") == sourceId, lit(1.0)).otherwise(lit(0.0))
        reset_term = resetProbability * is_source
        ranks = vertex_ids.withColumn("pagerank", is_source)
    else:
        reset_term = lit(float(resetProbability))
        ranks = vertex_ids.withColumn("pagerank", lit(1.0))

    # A vertex shares its rank over its OUTGOING edges, so every edge carries
    # the weight 1 / out_degree(src). This is loop-invariant and built once.
    out_degrees = graph.edges.groupBy("src").count().withColumnRenamed("count", "out_degree")
    weighted_edges = graph.edges.select("src", "dst").join(out_degrees, "src").select(
        col("src"), col("dst"), (lit(1.0) / col("out_degree")).alias("weight"))

    # NOTE(review): the query plan grows with every iteration; for many
    # iterations on large graphs consider caching or checkpointing the ranks.
    iteration = 0
    while True:
        iteration += 1

        # Rank sent along each edge: rank(src) * weight(edge).
        src_ranks = ranks.select(col("id").alias("src"), col("pagerank").alias("src_rank"))
        contributions = weighted_edges.join(src_ranks, "src").select(
            col("dst").alias("id"), (col("src_rank") * col("weight")).alias("contribution"))
        received = contributions.groupBy("id").sum("contribution").withColumnRenamed(
            "sum(contribution)", "received")

        # Left join from the full vertex set so that vertices without any
        # incoming edge stay in the result (they receive 0 and keep only the
        # reset term) instead of silently disappearing.
        new_ranks = vertex_ids.join(received, "id", "left_outer").na.fill(
            0.0, subset=["received"]).select(
            col("id"), (reset_term + damping * col("received")).alias("pagerank"))

        # reach maximum iterations
        if maxIter is not None and iteration >= maxIter:
            return GraphFrame(new_ranks, graph.edges)

        # largest per-vertex change is smaller than tol
        if tol is not None:
            previous = ranks.select(col("id"), col("pagerank").alias("previous_pagerank"))
            deltas = new_ranks.join(previous, "id").select(
                pyspark_abs(col("previous_pagerank") - col("pagerank")).alias("difference"))
            largest_change = deltas.agg({"difference": "max"}).first()[0]

            # The aggregate is None when the graph has no vertices at all;
            # there is nothing left to converge in that case.
            if largest_change is None or largest_change < tol:
                return GraphFrame(new_ranks, graph.edges)

        ranks = new_ranks

