In [1]:
import pyspark
from pyspark.sql import *
from pyspark.sql.functions import *
import numpy as np 
import pandas as pd

# Build the Spark configuration step by step: a local master with 4 worker
# threads, then the package, memory and timeout settings.
conf = pyspark.SparkConf()
conf.setMaster("local[4]")
conf.setAll([
    # spark-xml provides the 'com.databricks.spark.xml' data source
    ('spark.jars.packages', 'com.databricks:spark-xml_2.12:0.8.0'),
    ('spark.executor.memory', '4g'),
    ('spark.driver.memory', '2g'),
    ('spark.driver.maxResultSize', '5G'),
    ('spark.executor.heartbeatInterval', '3600s'),
    ('spark.network.timeout', '4000s'),
])

# Create the session from that configuration (or reuse an existing one).
spark = SparkSession.builder.config(conf=conf).getOrCreate()

# Keep a handle on the underlying SparkContext.
sc = spark.sparkContext

In [2]:
# Display the SparkSession created above as the cell output.
spark

In [3]:
# Path of the bz2-compressed XML dump loaded below; per its file name this is
# one multistream part of the 2022-02-20 commonswiki pages-articles dump.
COMMONS_DUMP_REDUCED = 'commonswiki-20220220-pages-articles-multistream1.xml-p1p1500000.bz2'

In [4]:
# Parse the dump with spark-xml (one row per <page> element) and keep only
# the pages whose namespace is 14.
commons_categories_raw = (
    spark.read
    .format('com.databricks.spark.xml')
    .options(rowTag='page')
    .load(COMMONS_DUMP_REDUCED)
    .filter("ns = '14'")
)

In [5]:
# Print the schema inferred for the page rows.
commons_categories_raw.printSchema()

root
 |-- id: long (nullable = true)
 |-- ns: long (nullable = true)
 |-- redirect: struct (nullable = true)
 |    |-- _VALUE: string (nullable = true)
 |    |-- _title: string (nullable = true)
 |-- revision: struct (nullable = true)
 |    |-- comment: struct (nullable = true)
 |    |    |-- _VALUE: string (nullable = true)
 |    |    |-- _deleted: string (nullable = true)
 |    |-- contributor: struct (nullable = true)
 |    |    |-- id: long (nullable = true)
 |    |    |-- ip: string (nullable = true)
 |    |    |-- username: string (nullable = true)
 |    |-- format: string (nullable = true)
 |    |-- id: long (nullable = true)
 |    |-- minor: string (nullable = true)
 |    |-- model: string (nullable = true)
 |    |-- parentid: long (nullable = true)
 |    |-- sha1: string (nullable = true)
 |    |-- text: struct (nullable = true)
 |    |    |-- _VALUE: string (nullable = true)
 |    |    |-- _bytes: long (nullable = true)
 |    |    |-- _xml:space: string (nullable = true)
 |  

In [6]:
import urllib

In [7]:
# Adapted from https://github.com/epfl-dlab/WikiPDA/blob/master/PaperAndCode/TopicsExtractionPipeline/GenerateDataframes.py
def normalize_title(title):
    """Normalize a namespaced page title.

    The title is URL-unquoted, the namespace prefix (everything up to and
    including the first ':') is removed, any '#anchor' suffix is dropped,
    underscores become spaces, surrounding whitespace is stripped and the
    first character is upper-cased.

    Args:
        title: Page title, e.g. 'Category:foo_bar#Section'.

    Returns:
        The normalized title, e.g. 'Foo bar'. A title without any ':' is
        normalized as a whole instead of raising IndexError.
    """
    # Imported here so the function does not depend on `urllib.parse` having
    # been loaded as a side effect of a bare `import urllib`.
    from urllib.parse import unquote

    title = unquote(title)

    # Only the first ':' separates the namespace; later colons belong to the
    # title itself (e.g. 'Category:Star Trek: The Next Generation').
    _namespace, separator, remainder = title.partition(':')
    if separator:
        title = remainder

    # Drop the anchor before trimming so no trailing blank is left behind.
    title = title.split('#', 1)[0]
    title = title.replace('_', ' ').strip()

    if title:
        title = title[0].upper() + title[1:]
    return title

In [9]:
from collections import defaultdict

In [10]:
from pyspark.accumulators import AccumulatorParam
class ParentsAccumulator(AccumulatorParam):
    """Accumulator parameter that merges mappings of key -> list of values."""

    def zero(self, value):
        """Return the empty starting value: a fresh ``defaultdict(list)``.

        The ``value`` argument is not used.
        """
        return defaultdict(list)

    def addInPlace(self, val1, val2):
        """Append every list in ``val2`` onto the matching key of ``val1``.

        ``val1`` is updated in place and returned.
        """
        for key in val2:
            val1[key] += val2[key]
        return val1

In [8]:
import re

In [11]:
# Raw strings keep the backslashes literal; in a normal string '\[', '\|' and
# '\]' are invalid escape sequences that Python warns about.
# Matches a [[Category:Name]] or [[Category:Name|sort key]] link and captures
# the name, i.e. the text before the first '|' or the closing ']]'.
categories_regex = re.compile(r'\[\[Category:([^\|]*?)(?:\|.*?)*\]\]')
# Matches the __HIDDENCAT__ marker anywhere in the text.
hiddencat_regex = re.compile(r'__HIDDENCAT__')

In [12]:
# Accumulator mapping a parent category name to the titles of the pages that
# list it; it is filled as a side effect of extract_category.
acc = sc.accumulator(defaultdict(list), ParentsAccumulator())

def extract_category(row):
    """Extract the details of a category page.

    Args:
        row: A page row from the dump; its ``id``, ``title``, ``redirect``
            and ``revision.text._VALUE`` fields are read.

    Returns:
        A Row with the page ``id``, the normalized ``title``, the
        ``redirect`` target title (None when the page is not a redirect),
        the ``parents`` found by ``categories_regex`` in the page text and
        the ``hiddencat`` flag (True when ``hiddencat_regex`` matches).

    Side effects:
        Adds ``{parent: [title]}`` for every parent to the ``acc``
        accumulator. This runs inside a transformation, so the update
        happens each time the mapped RDD is (re)computed.
    """
    title = normalize_title(row.title)

    # `revision` and `revision.text` are both nullable in the dump schema, so
    # walk down to the text defensively instead of chaining attributes.
    revision = row.revision
    text_struct = revision.text if revision is not None else None
    text = text_struct._VALUE if text_struct is not None else None

    parents = categories_regex.findall(text) if text else []
    if parents:
        # Accumulator.add is what `acc += ...` calls; using it directly needs
        # no `global` declaration.
        acc.add({parent: [title] for parent in parents})

    return Row(
        id=row.id,
        title=title,
        redirect=row.redirect._title if row.redirect is not None else None,
        parents=parents,
        hiddencat=hiddencat_regex.search(text) is not None if text else False
    )

In [13]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, BooleanType, ArrayType

# Schema of the rows produced by extract_category; every field is nullable.
# NOTE(review): the raw dump schema reports `id` as long while IntegerType is
# used here - confirm that all page ids fit in 32 bits.
schema = (
    StructType()
    .add("id", IntegerType(), True)
    .add("title", StringType(), True)
    .add("redirect", StringType(), True)
    .add("parents", ArrayType(StringType()), True)
    .add("hiddencat", BooleanType(), True)
)

In [14]:
# Map every raw page to a category Row, drop any None results and build a
# DataFrame with the explicit schema.
commons_categories = spark.createDataFrame(
    commons_categories_raw.rdd
    .map(extract_category)
    .filter(lambda category: category is not None),
    schema=schema,
)

In [35]:
# Build the parent -> children table from the DataFrame itself rather than
# from `acc.value`. The accumulator is only updated inside a lazy `map`, so it
# is empty until some action has run and gains duplicate entries every time
# the RDD is recomputed (for instance by each later `show()`).
from pyspark.sql import functions as F

parents = (
    commons_categories
    .select(
        # One output row per distinct parent listed on the page ...
        F.explode(F.array_distinct('parents')).alias('title'),
        # ... paired with the title of the page that lists it.
        F.col('title').alias('child'),
    )
    .groupBy('title')
    .agg(F.collect_list('child').alias('childs'))
)

In [37]:
# Inner-join each category with its list of children on the title, keeping
# all category columns plus `childs`.
categories = (
    commons_categories.alias('c')
    .join(parents, on=(commons_categories.title == parents.title))
    .select('c.*', 'childs')
)

In [39]:
# Preview the first rows of the joined categories table.
categories.show()

+-------+--------------------+--------+--------------------+---------+--------------------+
|     id|               title|redirect|             parents|hiddencat|              childs|
+-------+--------------------+--------+--------------------+---------+--------------------+
|  27767|        10th century|    null|[Centuries, 1st m...|    false|[Maps showing 10t...|
| 759228|         1215 births|    null|                  []|    false|     [Nicola Pisano]|
|1275954|         1267 births|    null|                  []|    false| [Giotto di Bondone]|
| 764453|12th dynasty of E...|    null|[Egyptian Middle ...|    false|     [Amenemhat III]|
| 775578|             13 June|    null|                  []|    false|[Saint Anthony of...|
| 775874|          13 October|    null|                  []|    false|[Edward the Confe...|
|1430116|      1397 paintings|    null|                  []|    false|    [Kyivan Psalter]|
|1340410|             14 July|    null|                  []|    false|[Fête de l