In [1]:
import pyspark
from pyspark.sql import (
    SparkSession,
    functions as F,
    types as T
)

In [2]:
# Configure the session builder for the remote endpoint, then obtain the
# session (an already-existing session is reused rather than recreated).
session_builder = SparkSession.builder.remote('sc://localhost:15002').appName("p1_app")
spark = session_builder.getOrCreate()

In [3]:
# Register every source table as a temp view named after the table.
# All datasets are parquet directories under one S3 prefix, so the path
# convention lives in a single place instead of four copy-pasted reads.
WORKSPACE_ROOT = "s3://bsu-c535-fall2024-commons/arjun-workspace"
SOURCE_TABLES = ("linktarget", "page", "pagelinks", "redirect")

for table_name in SOURCE_TABLES:
    (
        spark.read
        .parquet(f"{WORKSPACE_ROOT}/{table_name}/")
        .createOrReplaceTempView(table_name)
    )

## Mutual Link Pairs

In [4]:
# Match each namespace-0 page with the namespace-0 linktarget row that has
# the same title, so every page carries the linktarget id other pages use
# when linking to it.
# view columns: page_id | page_title | lt_id | redirect
ns0_pages = spark.table("page").filter("page_namespace = 0")
ns0_linktargets = spark.table("linktarget").filter("lt_namespace = 0")
(
    ns0_pages
    .join(ns0_linktargets, on=F.expr("lt_title = page_title"), how="inner")
    .selectExpr("page_id", "page_title", "lt_id", "page_is_redirect as redirect")
    .createOrReplaceTempView("page_with_link_ids")
)

# Resolve each namespace-0 redirect row to the page whose title it names.
# Only non-redirect pages are accepted as destinations, so a redirect that
# points at another redirect is dropped here.
# view columns: rd_from (redirect source page id) | rd_dst (destination page id)
ns0_redirects = spark.table("redirect").filter("rd_namespace = 0")
(
    spark.table("page_with_link_ids")
    .filter("redirect = false")
    .join(ns0_redirects, on=F.expr("rd_title = page_title"), how="inner")
    .selectExpr("rd_from", "page_id as rd_dst")
    .createOrReplaceTempView("redirect_pages")
)

# Re-point the linktarget id of every redirect page at the redirect's
# destination page. A pagelinks row that targets a redirect's linktarget id
# will therefore resolve to the destination page id, not the redirect's own.
# view columns: page_id (redirect destination) | lt_id (linktarget id of the redirect source)
redirect_sources = spark.table("page_with_link_ids").filter("redirect = true")
(
    redirect_sources
    .join(spark.table("redirect_pages"), on=F.expr("page_id = rd_from"), how="inner")
    .selectExpr("rd_dst as page_id", "lt_id")
    .createOrReplaceTempView("redirect_with_link_ids")
)

# Build the full linktarget-id -> page-id lookup by stacking two sources:
#   * non-redirect pages, keyed by their own linktarget id
#   * redirect linktarget ids, already re-pointed at the destination page
# view columns: page_id | lt_id
direct_pages = (
    spark.table("page_with_link_ids")
    .filter("redirect = false")
    .select("page_id", "lt_id")
)
redirected_pages = spark.table("redirect_with_link_ids")
(
    direct_pages
    .union(redirected_pages)  # union matches columns by position: (page_id, lt_id) on both sides
    .select("page_id", "lt_id")
    .createOrReplaceTempView("all_pages_with_links")
)

# Resolve pagelinks to (source page, destination page) links, fold them into
# unordered page pairs, and keep only the pairs linked in both directions.
# view columns: page_a (larger page id) | page_b (smaller page id)
(
    spark.table("all_pages_with_links")
    .join(
        spark.table("pagelinks").filter("pl_from_namespace = 0"),
        F.expr("pl_target_id = lt_id"),
        "inner"
    )
    # one row per resolved link: pl_from (source) -> page_id (destination)
    .selectExpr("pl_from", "page_id")
    # canonical pair ordering so both directions of a link land in the same
    # group; `direction` is true when the link runs page_a -> page_b
    .selectExpr(
        "greatest(pl_from, page_id) as page_a",
        "least(pl_from, page_id) as page_b",
        "pl_from > page_id as direction"
    )
    .filter("page_a != page_b")  # discard links from a page to itself
    .groupBy("page_a", "page_b")
    .agg(
        F.expr("bool_or(direction) as a_to_b"),
        F.expr("bool_or(not direction) as b_to_a")
    )
    .filter("a_to_b and b_to_a")  # mutual: at least one link in each direction
    # the projection drops the helper flags; `direction` no longer exists
    # after the aggregation, so no explicit drop is needed
    .select("page_a", "page_b")
    .createOrReplaceTempView("final")
)

In [5]:
# Count the rows of the "final" view, i.e. the number of (page_a, page_b) pairs it holds.
spark.table("final").count()

174438855

In [11]:
# Count the rows of the "final" view again.
# NOTE(review): the recorded output here (192093553) differs from the one under
# In [5] (174438855), and the execution count jumps from 5 to 11 — presumably
# upstream cells were edited and re-run in between; confirm which figure
# matches the pipeline as written and drop the stale cell.
spark.table("final").count()

192093553

## Connected Components