In [1]:
%pip install git+https://github.com/frasalvi/wikitextprocessor

Collecting git+https://github.com/frasalvi/wikitextprocessor
  Cloning https://github.com/frasalvi/wikitextprocessor to /tmp/pip-req-build-4sv5lnia
  Running command git clone -q https://github.com/frasalvi/wikitextprocessor /tmp/pip-req-build-4sv5lnia
  Resolved https://github.com/frasalvi/wikitextprocessor to commit 9248cb72998a0a9b56b44a0ff4f2fda80cd7d1ab
Note: you may need to restart the kernel to use updated packages.


## Imports

In [2]:
import numpy as np 
import pandas as pd
import urllib
import re
from collections import defaultdict

import os
import shutil

import wikitextprocessor as wtp

import pyspark
from pyspark.sql import *
from pyspark.sql.functions import *
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, BooleanType, ArrayType
from pyspark.accumulators import AccumulatorParam

conf = pyspark.SparkConf().setMaster("local[10]").setAll([
                                   ('spark.jars.packages', 'com.databricks:spark-xml_2.12:0.8.0'),
                                   ('spark.executor.memory', '8g'),
                                   ('spark.driver.memory','10g'),
                                   ('spark.driver.maxResultSize', '50G'),
                                   ('spark.executor.heartbeatInterval', '60s'),
                                   ('spark.network.timeout', '61s')
                                  ])
# create the session
spark = SparkSession.builder.config(conf=conf).getOrCreate()

# create the context
sc = spark.sparkContext

22/03/24 16:28:21 WARN Utils: Your hostname, iccluster111 resolves to a loopback address: 127.0.1.1; using 10.90.36.41 instead (on interface eno1)
22/03/24 16:28:21 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


:: loading settings :: url = jar:file:/home/salvi/.conda/envs/francesco/lib/python3.9/site-packages/pyspark/jars/ivy-2.5.0.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /home/salvi/.ivy2/cache
The jars for the packages stored in: /home/salvi/.ivy2/jars
com.databricks#spark-xml_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-0d95e5af-11b5-42e8-9100-ac3202700782;1.0
	confs: [default]
	found com.databricks#spark-xml_2.12;0.8.0 in central
	found commons-io#commons-io;2.6 in central
	found org.glassfish.jaxb#txw2;2.3.2 in central
:: resolution report :: resolve 328ms :: artifacts dl 16ms
	:: modules in use:
	com.databricks#spark-xml_2.12;0.8.0 from central in [default]
	commons-io#commons-io;2.6 from central in [default]
	org.glassfish.jaxb#txw2;2.3.2 from central in [default]
	---------------------------------------------------------------------
	|                  |            modules            ||   artifacts   |
	|       conf       | number| search|dwnlded|evicted|| number|dwnlded|
	---------------------------------------------------------------------
	|      default     |   3   |  

In [3]:
# sc.setLogLevel('DEBUG')

In [4]:
spark

In [5]:
# Inputs

COMMONS_DUMP = '/scratch/WikipediaImagesTaxonomy/commonswiki-20220220-pages-articles-multistream.xml.bz2'
# COMMONS_DUMP_REDUCED = '../../commonswiki-20220220-pages-articles-multistream1.xml-p1p1500000.bz2'
COMMONS_DUMP_REDUCED = '../../commonswiki-20220220-pages-articles-multistream6.xml-p114543930p115400363.bz2'
# list of chunks of the WIT dataset
WIT_DATASET = ['../../wit_v1.train.all-1percent_sample.tsv.gz']

# Outputs (ideally in scratch/, but I don't have permissions)
TEMPLATES_DUMP = '../../commonswiki-20220220-templates-modules.xml'
CATEGORIES_PATH = '../../commonswiki-20220220-category-network.parquet'
FILES_PATH = '../../commonswiki-20220220-files.parquet'

In [6]:
# Adapted from https://github.com/epfl-dlab/WikiPDA/blob/master/PaperAndCode/TopicsExtractionPipeline/GenerateDataframes.py
def normalize_title(title, dumps=True):
    """ Replace _ with space, remove anchor and namespace prefix, capitalize """
    title = urllib.parse.unquote(title)
    if(dumps):
        title = title.split(':')[1]
    title = title.strip()
    if len(title) > 0:
        title = title[0].upper() + title[1:]
    n_title = title.replace("_", " ")
    if '#' in n_title:
        n_title = n_title.split('#')[0]
    return n_title

## Templates

Templates and lua modules are extracted and processed by wikitextprocessor, storing them for later use in expansion.

In [7]:
# from https://github.com/tatuylonen/wikitextprocessor/blob/ee043cff190543fb94cb40d4827444d8982a30fe/wikitextprocessor/core.py#L490
def template_to_body(title, text):
        """Extracts the portion to be transcluded from a template body.  This
        returns an str."""
        assert isinstance(title, str)
        assert isinstance(text, str)
        # Remove all comments
        text = re.sub(r"(?s)<!\s*--.*?--\s*>", "", text)
        # Remove all text inside <noinclude> ... </noinclude>
        text = re.sub(r"(?is)<\s*noinclude\s*>.*?<\s*/\s*noinclude\s*>",
                      "", text)
        # Handle <noinclude> without matching </noinclude> by removing the
        # rest of the file
        text = re.sub(r"(?is)<\s*noinclude\s*>.*", "", text)
        text = re.sub(r"(?is)<\s*noinclude\s*/\s*>", "", text)
        # Apparently unclosed <!-- at the end of a template body is ignored
        text = re.sub(r"(?s)<!\s*--.*", "", text)
        # <onlyinclude> tags, if present, include the only text that will be
        # transcluded.  All other text is ignored.
        onlys = list(re.finditer(r"(?is)<\s*onlyinclude\s*>(.*?)"
                                 r"<\s*/\s*onlyinclude\s*>|"
                                 r"<\s*onlyinclude\s*/\s*>",
                                 text))
        if onlys:
            text = "".join(m.group(1) or "" for m in onlys)
        # Remove <includeonly>.  They mark text that is not visible on the page
        # itself but is included in transclusion.  Also text outside these tags
        # is included in transclusion.
        text = re.sub(r"(?is)<\s*(/\s*)?includeonly\s*(/\s*)?>", "", text)
        return text

In [8]:
initialize = 0

if initialize:
    commons_templates_modules = spark.read.format('com.databricks.spark.xml') \
                                        .options(rowTag='page').load(COMMONS_DUMP_REDUCED).filter("ns = '10' or ns = '828'")
    commons_templates_modules.coalesce(1).write.format("com.databricks.spark.xml").mode("overwrite")\
                                         .options(rowTag='page', rootTag='pages').save(TEMPLATES_DUMP)

    # Remove and rename .crc files
    os.rename(os.path.join(TEMPLATES_DUMP, 'part-00000'), (new_path:=os.path.join(os.path.dirname(TEMPLATES_DUMP), 'temp.xml')))
    shutil.rmtree(TEMPLATES_DUMP)
    os.rename(new_path, TEMPLATES_DUMP)

In [9]:
def page_handler(model, title, next):
    if not (title.startswith("Template:") or title.startswith("Module:")):
        return None

In [10]:
ctx = wtp.Wtp()
ctx.process(TEMPLATES_DUMP, page_handler, windows=False)

# otherwise "TypeError: cannot serialize '_io.FileIO' object" would be 
# raised later
ctx.tmp_file = None

UNSUPPORTED pages 2 {}
Analyzing which templates should be expanded before parsing


## Categories

In [11]:
commons_categories_raw = spark.read.format('com.databricks.spark.xml') \
                                .options(rowTag='page').load(COMMONS_DUMP_REDUCED).filter("ns = '14'")
# commons_categories_raw.persist()

                                                                                

In [12]:
commons_categories_raw.printSchema()

root
 |-- id: long (nullable = true)
 |-- ns: long (nullable = true)
 |-- redirect: struct (nullable = true)
 |    |-- _VALUE: string (nullable = true)
 |    |-- _title: string (nullable = true)
 |-- revision: struct (nullable = true)
 |    |-- comment: string (nullable = true)
 |    |-- contributor: struct (nullable = true)
 |    |    |-- id: long (nullable = true)
 |    |    |-- ip: string (nullable = true)
 |    |    |-- username: string (nullable = true)
 |    |-- format: string (nullable = true)
 |    |-- id: long (nullable = true)
 |    |-- minor: string (nullable = true)
 |    |-- model: string (nullable = true)
 |    |-- parentid: long (nullable = true)
 |    |-- sha1: string (nullable = true)
 |    |-- text: struct (nullable = true)
 |    |    |-- _VALUE: string (nullable = true)
 |    |    |-- _bytes: long (nullable = true)
 |    |    |-- _xml:space: string (nullable = true)
 |    |-- timestamp: string (nullable = true)
 |-- title: string (nullable = true)



In [13]:
# Build a dictionary of redirects (old_title -> redirect_title)
category_redirects = {normalize_title(r.title): normalize_title(r.redirect._title) 
                      for r in commons_categories_raw.filter('redirect is not null').collect()}
category_redirects

                                                                                

{'Scholars of Albania': 'Scholars from Albania',
 'IRT Substation 17': 'Dyckman-Hillside Substation (Substation 17)',
 'The Northern Mariana Islands': 'Northern Mariana Islands'}

In [14]:
categories_regex = re.compile('\[\[(Category:[^\|]*?)(?:\|.*?)*\]\]')
hiddencat_regex = re.compile('__HIDDENCAT__')

In [15]:
class ChildsAccumulator(AccumulatorParam):
    '''
    Accumulator for childs: a dictionary mapping each category to its childs
    '''
    def zero(self, value):
        return defaultdict(list)

    def addInPlace(self, val1, val2):
        for key, value in val2.items():
            val1[key] += value
        return val1

In [54]:
def extract_category(row):
    '''
    Extract the details of a category
    '''
    title = normalize_title(row.title)

    # Template expansion
    print('>>>>', title)
    try:
        text = ctx.expand(row.revision.text._VALUE, title=title)
    except AssertionError:
        text = ''

    # No template expansion
    # text = row.revision.text._VALUE
        
    parents = re.findall(categories_regex, text) if text else []
    parents = [category_redirects[normalize_title(parent)] if normalize_title(parent) 
               in category_redirects.keys() else normalize_title(parent) for parent in parents]
    global acc
    if parents:
        acc += {parent: [title] for parent in parents}
    return Row(
        id=row.id,
        title=title,
        parents=parents,
        hiddencat=re.search(hiddencat_regex, text) is not None if text else False
    )

In [55]:
# Schema of the processed categories DataFrame
schema_cat = StructType([StructField("id", IntegerType(), True),
                         StructField("title", StringType(), True),
                         StructField("parents", ArrayType(StringType()), True),
                         StructField("hiddencat", BooleanType(), True)])

In [56]:
# We ignore redirect categories, eventually remapping parents to their redirects
acc = sc.accumulator(defaultdict(list), ChildsAccumulator())
categories_clean = spark.createDataFrame(commons_categories_raw.filter('redirect is null')\
                                            .rdd.map(extract_category).filter(lambda r: r is not None), 
                                         schema=schema_cat)

# commons_categories_raw.unpersist()
# categories_clean.persist();

In [57]:
# Test with write instead of collect - Workaround for the fact that the value of acc is used before it is filled

TEMP_PATH = '../../dump.xml'

categories_clean.write.format("com.databricks.spark.xml").mode("overwrite")\
                                         .options(rowTag='page', rootTag='pages').save(TEMP_PATH)

# Remove files
shutil.rmtree(TEMP_PATH)

>>>>>>>>  Ailuronyx seychellensis on stampsTrimurti Temple, Baroli  (0 + 4) / 4]

>>>> Census-designated places in Pulaski County, Arkansas
>>>> Ailuronyx seychellensis in art
>>>> Pulpit of St. Peter und Paul (Zell)
>>>> Weekend Wikipédia à CAEB Porto-novo
>>>> Jessica Heller
>>>> Terpsiphone corvina on stamps
>>>> Terpsiphone corvina in art
>>>> Hawker Hunter at Museo Nacional Aeronáutico y del Espacio
>>>> Villages in Assam
>>>> Coasts of Isola del Giglio
>>>> Paul Pieper
>>>> Stadtschloss, Berlin in the 2010s
>>>> Wiki Loves Africa 2022 in Nigeria Templates
>>>> Stadtschloss, Berlin in the 21st century
>>>> >>>> August 2021 in Calgary
Accordions in France
>>>> Aéronautique militaire
>>>> 1647 drawings by country
>>>> January 2022 in Utah
>>>> Bus depots in Brescia
>>>> 2021 in Lahti
>>>> Paintings by Franz Marc in the LWL-Museum für Kunst und Kultur
>>>>>>>> 1964 in Brescia
 F/A-18 Hornet top views
>>>> 2022 in Utah by month
>>>> Laras
>>>> Nature of Isola del Giglio
>>>> 2020 in L

KeyboardInterrupt: 

In [21]:
# Workaround for the fact that the value of acc is used before it is filled
# categories_clean.collect();

In [22]:
schema_childs = StructType([StructField('title', StringType(), True),
                            StructField('childs', ArrayType(StringType(), True), True)])

In [23]:
childs_df = spark.createDataFrame(acc.value.items(), schema=schema_childs)

In [24]:
categories = categories_clean.alias('c').join(childs_df, categories_clean.title==childs_df.title).select('c.*', 'childs')
# categories_clean.unpersist();
# categories.persist();

In [28]:
categories.write.mode("overwrite").parquet(CATEGORIES_PATH)

22/03/24 16:39:14 WARN TaskSetManager: Stage 15 contains a task of very large size (1241 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

In [29]:
categories = spark.read.parquet(CATEGORIES_PATH)

In [30]:
categories.show()

+---------+--------------------+--------------------+---------+--------------------+
|       id|               title|             parents|hiddencat|              childs|
+---------+--------------------+--------------------+---------+--------------------+
|115179492|1 manat of Turkme...|[Currencies with ...|    false|[1 manat banknote...|
|115178624|10 manat of Turkm...|[Currencies with ...|    false|[10 manat banknot...|
|115179595|100 manat of Turk...|[Currencies with ...|    false|[100 manat bankno...|
|115075609|   1000 D of Germany|        [MaK 1000 D]|    false|[1000 D of Hafen-...|
|114952756|10th-century scul...|[Sculptures in Li...|    false|[10th-century scu...|
|114876524|10th-century work...|[Works in Baden-W...|    false|[10th-century pai...|
|115032212|11th-century BC w...|[11th-century BC ...|    false|[11th-century BC ...|
|114952761|11th-century scul...|[Sculptures in Li...|    false|[11th-century scu...|
|114717666|  1200s art in Italy|                  []|    false|[1

In [25]:
hidden_categories = categories.filter('hiddencat is True').select('title').rdd.flatMap(lambda x: x).collect()

22/03/24 16:36:42 WARN TaskSetManager: Stage 4 contains a task of very large size (1241 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

In [26]:
hidden_categories[:10]

['Valid SVG created with Text Editor',
 'Valid SVG created with Text Editor',
 'Valid SVG created with Text Editor',
 'Valid SVG created with Adobe Illustrator',
 'Washington Park Hist Dist Image by CLight',
 'Giornali storici digitalizzati by Gianluca T.',
 'Photographs by Gaudi Renanda',
 'Files by User',
 'Files by User',
 'Schereville Image by Chris Light']

## Files

In [31]:
commons_files_raw = spark.read.format('com.databricks.spark.xml') \
                                .options(rowTag='page').load(COMMONS_DUMP_REDUCED)\
                                .filter("ns = '6'")
# commons_files_raw.persist();

                                                                                

In [32]:
# Build a dictionary of redirects
file_redirects = {normalize_title(r.title): normalize_title(r.redirect._title)
                  for r in commons_files_raw.filter('redirect is not null').collect()}
# file_redirects

                                                                                

For now, we consider only the images that appear in en.wikipedia, discarding all the others. We can also ignore redirects.

In [33]:
wiki_image_names = []

for chunk in WIT_DATASET:
    wiki_image_names += pd.read_csv(chunk, sep="\t").query("language == 'en'")\
                            .image_url.apply(lambda r: normalize_title(r.split('/')[-1], False)).tolist()

In [34]:
# Keep only unique values
wiki_image_names = set(wiki_image_names)

# Remap redirects
wiki_image_names = {file_redirects[name] if name in file_redirects.keys() else name for name in wiki_image_names}

In [35]:
def extract_file(row):
    '''
    Extract the details of a file
    '''
    # Template expansion
    # try:
    #     text = ctx.expand(row.revision.text._VALUE, title=title)
    # except AssertionError:
    #     text = ''

    # No template expansion
    text = row.revision.text._VALUE

    categories = re.findall(categories_regex, text) if text else []
    
    # No way to do this with a list comprehension (nested conditions work only if there is always an else)
    # Remap categories to their redirect and filter hidden categories
    categories_nohidd = []
    for category in categories:
        category_norm = normalize_title(category)
        if(category_norm not in hidden_categories):
            if(category_norm in category_redirects.keys()):
                if((c:=category_redirects[category_norm]) not in hidden_categories):
                    categories_nohidd.append(c)
            else:
                categories_nohidd.append(category_norm)

    return Row(
        id=row.id,
        title=normalize_title(row.title),
        categories=categories_nohidd
    )

In [36]:
# Schema of the processed files DataFrame
schema_files = StructType([StructField("id", IntegerType(), True),
                           StructField("title", StringType(), True),
                           StructField("categories", ArrayType(StringType()), True)])

In [37]:
# Also for files, we ignore redirects
files = spark.createDataFrame(commons_files_raw.filter('redirect is null')\
                                        .rdd.map(extract_file).filter(lambda r: r is not None), 
                              schema=schema_files)
# commons_files_raw.unpersist();

In [38]:
files.write.mode("overwrite").parquet(FILES_PATH)

                                                                                

In [39]:
files = spark.read.parquet(FILES_PATH)

In [40]:
files.show()

+---------+--------------------+--------------------+
|       id|               title|          categories|
+---------+--------------------+--------------------+
|114792323|PATRICK MAGO (179...|[Players of Mount...|
|114792324|RETURN TO REDFERN...|[Photographs by N...|
|114792325|View from the Sev...|         [Arlingham]|
|114792326|RETURN TO REDFERN...|[Photographs by N...|
|114792328|RETURN TO REDFERN...|[Photographs by N...|
|114792329| Marina Goliasse.jpg|                  []|
|114792330|Downtown Ferndale...|                  []|
|114792331|MATT PLACE (18392...|[Players of Mount...|
|114792332|Benchmark on Citi...|[Richmond, North ...|
|114792334|Portrait photogra...|  [Joséphin Péladan]|
|114792335|SONNY BRISTOW (18...|[Players of Mount...|
|114792336|The River Ayr - g...|[Ayr (civil paris...|
|114792337|MATT PLACE (18370...|[Players of Mount...|
|114792338|Path to Coton - g...|[Coton, Cambridge...|
|114792339|MICHAEL MORRIS (1...|[Players of Mount...|
|114792340|On top of A'Bhuid

## Categories/2

In [41]:
# List of categories that appear in en.wikipedia
categories_in_wikipedia = files.rdd.flatMap(lambda x: x.categories).distinct().map(Row("title")).toDF()
categories_in_wikipedia = categories_in_wikipedia.withColumn('in_en_wiki', lit(True))

                                                                                

In [42]:
categories_in_wikipedia.show(5)

+--------------------+----------+
|               title|in_en_wiki|
+--------------------+----------+
| Santon, Isle of Man|      true|
|Christmas 2020 in...|      true|
|Pacific Sogo Depa...|      true|
|Players (men) by ...|      true|
|Songs of the Beatles|      true|
+--------------------+----------+
only showing top 5 rows



In [43]:
categories_in_wikipedia.count()

190009

In [44]:
categories = categories.alias('c').join(categories_in_wikipedia, 'title', 'left').select('c.*', categories_in_wikipedia.in_en_wiki)
categories = categories.na.fill(False, subset=["in_en_wiki"])

In [45]:
categories.filter('in_en_wiki == True').count()

1746

In [48]:
temp_path = '.'.join(CATEGORIES_PATH.split('.')[:-1]) + '-temp.parquet'
categories.write.mode("overwrite").parquet(temp_path)

shutil.rmtree(CATEGORIES_PATH)
os.rename(temp_path, CATEGORIES_PATH)

## Close

In [None]:
spark.stop()