In [1]:
%pip install git+https://github.com/frasalvi/wikitextprocessor

Collecting git+https://github.com/frasalvi/wikitextprocessor
  Cloning https://github.com/frasalvi/wikitextprocessor to /tmp/pip-req-build-8rocjp_3
  Running command git clone -q https://github.com/frasalvi/wikitextprocessor /tmp/pip-req-build-8rocjp_3
  Resolved https://github.com/frasalvi/wikitextprocessor to commit ad41854f13970334bdedfb1c4615bfd0374fad64
Note: you may need to restart the kernel to use updated packages.


In [1]:
import numpy as np 
import pandas as pd
import urllib
import re
from collections import defaultdict

import os
import shutil

import wikitextprocessor as wtp

import pyspark
from pyspark.sql import *
from pyspark.sql.functions import *
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, BooleanType, ArrayType
from pyspark.accumulators import AccumulatorParam

conf = pyspark.SparkConf().setMaster("local[10]").setAll([
                                   ('spark.jars.packages', 'com.databricks:spark-xml_2.12:0.8.0'),
                                   ('spark.executor.memory', '4g'),
                                   ('spark.driver.memory','3g'),
                                   ('spark.driver.maxResultSize', '5G'),
                                   ('spark.executor.heartbeatInterval', '3600s'),
                                   ('spark.network.timeout', '4000s')
                                  ])
# create the session
spark = SparkSession.builder.config(conf=conf).getOrCreate()

# create the context
sc = spark.sparkContext

22/03/21 23:33:01 WARN Utils: Your hostname, iccluster111 resolves to a loopback address: 127.0.1.1; using 10.90.36.41 instead (on interface eno1)
22/03/21 23:33:01 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


:: loading settings :: url = jar:file:/home/salvi/.conda/envs/francesco/lib/python3.9/site-packages/pyspark/jars/ivy-2.5.0.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /home/salvi/.ivy2/cache
The jars for the packages stored in: /home/salvi/.ivy2/jars
com.databricks#spark-xml_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-03d24434-980b-4ecd-bac3-42ddb67dd9d8;1.0
	confs: [default]
	found com.databricks#spark-xml_2.12;0.8.0 in central
	found commons-io#commons-io;2.6 in central
	found org.glassfish.jaxb#txw2;2.3.2 in central
:: resolution report :: resolve 234ms :: artifacts dl 13ms
	:: modules in use:
	com.databricks#spark-xml_2.12;0.8.0 from central in [default]
	commons-io#commons-io;2.6 from central in [default]
	org.glassfish.jaxb#txw2;2.3.2 from central in [default]
	---------------------------------------------------------------------
	|                  |            modules            ||   artifacts   |
	|       conf       | number| search|dwnlded|evicted|| number|dwnlded|
	---------------------------------------------------------------------
	|      default     |   3   |  

In [2]:
# sc.setLogLevel('DEBUG')

In [3]:
spark

In [4]:
# COMMONS_DUMP_REDUCED = 'commonswiki-20220220-pages-articles-multistream1.xml-p1p1500000.bz2'
COMMONS_DUMP_REDUCED = '../../commonswiki-20220220-pages-articles-multistream6.xml-p114543930p115400363.bz2'
TEMPLATES_DUMP = '../../commonswiki-20220220-templates-modules.xml'

In [5]:
# Adapted from https://github.com/epfl-dlab/WikiPDA/blob/master/PaperAndCode/TopicsExtractionPipeline/GenerateDataframes.py
def normalize_title(title, dumps=True):
    """ Replace _ with space, remove anchor and namespace prefix, capitalize """
    title = urllib.parse.unquote(title)
    if(dumps):
        title = title.split(':')[1]
    title = title.strip()
    if len(title) > 0:
        title = title[0].upper() + title[1:]
    n_title = title.replace("_", " ")
    if '#' in n_title:
        n_title = n_title.split('#')[0]
    return n_title

## Templates

In [6]:
# from https://github.com/tatuylonen/wikitextprocessor/blob/ee043cff190543fb94cb40d4827444d8982a30fe/wikitextprocessor/core.py#L490
def template_to_body(title, text):
        """Extracts the portion to be transcluded from a template body.  This
        returns an str."""
        assert isinstance(title, str)
        assert isinstance(text, str)
        # Remove all comments
        text = re.sub(r"(?s)<!\s*--.*?--\s*>", "", text)
        # Remove all text inside <noinclude> ... </noinclude>
        text = re.sub(r"(?is)<\s*noinclude\s*>.*?<\s*/\s*noinclude\s*>",
                      "", text)
        # Handle <noinclude> without matching </noinclude> by removing the
        # rest of the file
        text = re.sub(r"(?is)<\s*noinclude\s*>.*", "", text)
        text = re.sub(r"(?is)<\s*noinclude\s*/\s*>", "", text)
        # Apparently unclosed <!-- at the end of a template body is ignored
        text = re.sub(r"(?s)<!\s*--.*", "", text)
        # <onlyinclude> tags, if present, include the only text that will be
        # transcluded.  All other text is ignored.
        onlys = list(re.finditer(r"(?is)<\s*onlyinclude\s*>(.*?)"
                                 r"<\s*/\s*onlyinclude\s*>|"
                                 r"<\s*onlyinclude\s*/\s*>",
                                 text))
        if onlys:
            text = "".join(m.group(1) or "" for m in onlys)
        # Remove <includeonly>.  They mark text that is not visible on the page
        # itself but is included in transclusion.  Also text outside these tags
        # is included in transclusion.
        text = re.sub(r"(?is)<\s*(/\s*)?includeonly\s*(/\s*)?>", "", text)
        return text

In [7]:
initialize = 0

if initialize:
    commons_templates_modules = spark.read.format('com.databricks.spark.xml') \
                                        .options(rowTag='page').load(COMMONS_DUMP_REDUCED).filter("ns = '10' or ns = '828'")
    commons_templates_modules.coalesce(1).write.format("com.databricks.spark.xml").mode("overwrite")\
                                         .options(rowTag='page', rootTag='pages').save(TEMPLATES_DUMP)

    # Remove and rename .crc files
    os.rename(os.path.join(TEMPLATES_DUMP, 'part-00000'), (new_path:=os.path.join(os.path.dirname(TEMPLATES_DUMP), 'temp.xml')))
    shutil.rmtree(TEMPLATES_DUMP)
    os.rename(new_path, TEMPLATES_DUMP)

In [8]:
def page_handler(model, title, next):
    if not (title.startswith("Template:") or title.startswith("Module:")):
        return None

In [9]:
ctx = wtp.Wtp()
ctx.process(TEMPLATES_DUMP, page_handler, windows=False)

# otherwise "TypeError: cannot serialize '_io.FileIO' object" would be 
# raised later
ctx.tmp_file = None

UNSUPPORTED pages 2 {}
Analyzing which templates should be expanded before parsing


### Tests

In [9]:
templates_categories = spark.read.format("com.databricks.spark.xml")\
                                 .options(rowTag='page').load(TEMPLATES_DUMP)

                                                                                

In [36]:
aa = templates_categories.filter('ns="828"').collect()

In [12]:
list(map(lambda x: x.title, aa))

['Module:Navbox with collapsible groups',
 'Module:Navbox with collapsible groups/doc',
 'Module:Navboxes',
 'Module:PermissionTicket',
 'Module:PermissionTicket/doc',
 'Module:PermissionTicket/testcases',
 'Module:PermissionTicket/testcases/doc',
 'Module:RomanNumber',
 'Module:Contributor/sandbox']

In [16]:
text = 'ciao bello {{global maintenance category}} come stai?'

In [17]:
body_gmc = template_to_body('global maintenance category', '{{autotranslate|1={{{shortcut|{{{1|}}} }}}|purge={{{purge|}}}|base=Global maintenance category/i18n}}<includeonly>{{#ifeq:{{{hidden|}}}|n||__HIDDENCAT__}}</includeonly><noinclude>{{documentation}}</noinclude>')
body_autotrans = template_to_body('autotranslate', '<includeonly>{{#invoke:PermissionTicket|PermissionTicket|id=2022011910011071|nocat=1}}<!--  -->{{#ifeq: {{FULLPAGENAME}} |Template:{{{base|}}} |[[Category:Autotranslated templates|{{PAGENAME}}]]}}</includeonly><noinclude>{{Documentation}}</noinclude>')

In [18]:
templates_dict = {'global maintenance category': body_gmc, 'autotranslate': body_autotrans}

In [19]:
ctx.assign_templates(templates_dict)

In [15]:
ctx.expand('aaa{{#invoke:RomanNumber|roman|2570}}bbb', title='mua')

'aaaMMDLXXbbb'

In [20]:
ctx.expand(text, title='prova')

prova: ERROR: LUA error in #invoke ('PermissionTicket', 'PermissionTicket', 'id=2022011910011071', 'nocat=1') parent ('Template:autotranslate', {1: '', 'purge': '', 'base': 'Global maintenance category/i18n'}) at ['global maintenance category', 'autotranslate', '#invoke']
	Loading module failed in #invoke: Module:PermissionTicket
[string "<python>"]:77: module not found
stack traceback:
	[C]: in function 'assert'
	[string "<python>"]:77: in function 'new_require'
	[string "Module:PermissionTicket"]:22: in main chunk
	[C]: in function 'xpcall'
	[string "_sandbox_phase2"]:180: in function <[string "_sandbox_phase2"]:140>


'ciao bello <strong class="error">Lua execution error in Module:PermissionTicket function PermissionTicket</strong>__HIDDENCAT__ come stai?'

In [26]:
# from wikitextprocessor.luaexec import call_lua_sandbox
# from wikitextprocessor.parser import parse_encoded, NodeKind
# from wikitextprocessor.common import (MAGIC_FIRST, MAGIC_LAST, MAX_MAGICS, MAGIC_NOWIKI_CHAR,
#                      preprocess_text)
# from wikitextprocessor.dumpparser import process_dump

In [57]:
import pickle
import my_wtp
from importlib import reload
reload(my_wtp)

<module 'my_wtp' from '/home/salvi/wiki_image_classification/taxonomy/my_wtp.py'>

In [58]:
ctx = my_wtp.Wtp()

In [63]:
a = open('fs', 'wb')

In [69]:
ctx.tmp_file = None

In [70]:
pickle.Pickler(open('test', 'wb')).dump(ctx)

In [60]:
ctx.expand('aaa{{#invoke:RomanNumber|roman|2570}}bbb', title='mua')

'aaabbb'

## Categories

In [10]:
commons_categories_raw = spark.read.format('com.databricks.spark.xml') \
                                .options(rowTag='page').load(COMMONS_DUMP_REDUCED).filter("ns = '14'")
commons_categories_raw.persist()

                                                                                

DataFrame[id: bigint, ns: bigint, redirect: struct<_VALUE:string,_title:string>, revision: struct<comment:string,contributor:struct<id:bigint,ip:string,username:string>,format:string,id:bigint,minor:string,model:string,parentid:bigint,sha1:string,text:struct<_VALUE:string,_bytes:bigint,_xml:space:string>,timestamp:string>, title: string]

In [11]:
commons_categories_raw.printSchema()

root
 |-- id: long (nullable = true)
 |-- ns: long (nullable = true)
 |-- redirect: struct (nullable = true)
 |    |-- _VALUE: string (nullable = true)
 |    |-- _title: string (nullable = true)
 |-- revision: struct (nullable = true)
 |    |-- comment: string (nullable = true)
 |    |-- contributor: struct (nullable = true)
 |    |    |-- id: long (nullable = true)
 |    |    |-- ip: string (nullable = true)
 |    |    |-- username: string (nullable = true)
 |    |-- format: string (nullable = true)
 |    |-- id: long (nullable = true)
 |    |-- minor: string (nullable = true)
 |    |-- model: string (nullable = true)
 |    |-- parentid: long (nullable = true)
 |    |-- sha1: string (nullable = true)
 |    |-- text: struct (nullable = true)
 |    |    |-- _VALUE: string (nullable = true)
 |    |    |-- _bytes: long (nullable = true)
 |    |    |-- _xml:space: string (nullable = true)
 |    |-- timestamp: string (nullable = true)
 |-- title: string (nullable = true)



In [12]:
# Build a dictionary of redirects (old_title -> redirect_title)
category_redirects = {normalize_title(r.title): normalize_title(r.redirect._title) 
                      for r in commons_categories_raw.filter('redirect is not null').collect()}
category_redirects

                                                                                

{'Scholars of Albania': 'Scholars from Albania',
 'IRT Substation 17': 'Dyckman-Hillside Substation (Substation 17)',
 'The Northern Mariana Islands': 'Northern Mariana Islands'}

In [13]:
categories_regex = re.compile('\[\[(Category:[^\|]*?)(?:\|.*?)*\]\]')
hiddencat_regex = re.compile('__HIDDENCAT__')

In [14]:
class ChildsAccumulator(AccumulatorParam):
    '''
    Accumulator for childs: a dictionary mapping each category to its childs
    '''
    def zero(self, value):
        return defaultdict(list)

    def addInPlace(self, val1, val2):
        for key, value in val2.items():
            val1[key] += value
        return val1

In [15]:
def extract_category(row):
    '''
    Extract the details of a category
    '''
    title = normalize_title(row.title)
    # print('>>>>>', title)
    # try:
    #     text = ctx.expand(row.revision.text._VALUE, title=title)
    # except AssertionError:
    #     text = ''
    text = row.revision.text._VALUE
        
    parents = re.findall(categories_regex, text) if text else []
    parents = [category_redirects[normalize_title(parent)] if normalize_title(parent) 
               in category_redirects.keys() else normalize_title(parent) for parent in parents]
    global acc
    if parents:
        acc += {parent: [title] for parent in parents}
    return Row(
        id=row.id,
        title=title,
        parents=parents,
        hiddencat=re.search(hiddencat_regex, text) is not None if text else False
    )

In [16]:
# Schema of the processed categories DataFrame
schema_cat = StructType([StructField("id", IntegerType(), True),
                         StructField("title", StringType(), True),
                         StructField("parents", ArrayType(StringType()), True),
                         StructField("hiddencat", BooleanType(), True)])

In [17]:
# We ignore redirect categories, eventually remapping parents to their redirects
acc = sc.accumulator(defaultdict(list), ChildsAccumulator())
categories_clean = spark.createDataFrame(commons_categories_raw.filter('redirect is null')\
                                            .rdd.map(extract_category).filter(lambda r: r is not None), 
                                         schema=schema_cat)

commons_categories_raw.unpersist()
categories_clean.persist();

In [18]:
schema_childs = StructType([StructField('title', StringType(), True),
                            StructField('childs', ArrayType(StringType(), True), True)])

In [19]:
# Workaround for the fact that the value of acc is used before it is filled, need to fix this.
categories_clean.collect();

                                                                                

In [20]:
childs_df = spark.createDataFrame(acc.value.items(), schema=schema_childs)

In [21]:
categories = categories_clean.alias('c').join(childs_df, categories_clean.title==childs_df.title).select('c.*', 'childs')
categories_clean.unpersist();
categories.persist();

In [22]:
hidden_categories = categories.filter('hiddencat is True').select('title').rdd.flatMap(lambda x: x).collect()

22/03/21 23:36:02 WARN TaskSetManager: Stage 3 contains a task of very large size (1241 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

In [23]:
hidden_categories[:10]

['Photographers and artists who died in 1872',
 'Valid SVG created with Text Editor',
 'Valid SVG created with Text Editor',
 'Valid SVG created with Text Editor',
 'Valid SVG created with Text Editor',
 'Valid SVG created with Text Editor',
 'Valid SVG created with Text Editor',
 'Valid SVG created with Text Editor',
 'Valid SVG created with Text Editor',
 'Valid SVG created with Text Editor']

In [24]:
categories.show()

+---------+--------------------+--------------------+---------+--------------------+
|       id|               title|             parents|hiddencat|              childs|
+---------+--------------------+--------------------+---------+--------------------+
|115066573|1997 events in Ba...|                  []|    false|[May 1997 events ...|
|114987637|2021-22 Luge Worl...|[2021-22 Luge Wor...|    false|[Nations Cup at 2...|
|114918550|2022 in Sonoma Co...|[Sonoma County by...|    false|[2022 in Sonoma, ...|
|115365996|8th-century Buddh...|[8th-century Budd...|    false|[8th-century Budd...|
|114597220|     AGRI Party-list|[Agrarian parties...|    false|[Pakyaw contracto...|
|115384984|Aeolian erosion i...|[Erosion in Engla...|    false|   [Wind in England]|
|115388819|Aeroporto (Salvador)|[Neighborhoods in...|    false|[Avenida Tenente ...|
|114983586|Agestrata dehaan ...|[Agestrata dehaan...|    false|[Agestrata dehaan...|
|115269554|Amber, blue, cyan...|[Combinations of ...|    false|[A

## Files

In [25]:
commons_files_raw = spark.read.format('com.databricks.spark.xml') \
                                .options(rowTag='page').load(COMMONS_DUMP_REDUCED)\
                                .filter("ns = '6'")
commons_files_raw.persist();

                                                                                

In [26]:
# Build a dictionary of redirects
file_redirects = {normalize_title(r.title): normalize_title(r.redirect._title)
                  for r in commons_files_raw.filter('redirect is not null').collect()}
file_redirects

                                                                                

{'Harbour russell above.JPG': 'Harbour Opua above.jpg',
 'Print, trade-card (BM 1880,0911.1002).jpg': 'Trade card of C. Ingrey, 310, Strand.jpg',
 'PM 040753 E Tarragona.jpg': 'Tarragona, Catedral de Santa María-PM 40753.jpg',
 'PM 055315 E El Miracle.jpg': 'El Miracle, El retaule Barroc-PM 55315.jpg',
 'PM 055314 E El Miracle.jpg': 'El Miracle, El retaule Barroc-PM 55314.jpg',
 'PM 033460 P Vila Real Mateus.jpg': 'Mateus PM 33460.jpg',
 'PM 008469 B Gent.jpg': 'Gent B PM 008469.jpg',
 'Novartis Cambridge 2019.jpg': 'Novartis 100 Technology Square Cambridge 2019.jpg',
 'Novartis Cambridge November 2019.jpg': 'Novartis 100 Technology Square Cambridge November 2019.jpg',
 'IDF parade 997009327174305171.jpg': 'IDF parade (997009327174305171.jpg',
 'IDF parade 997009327174705171.jpg': 'IDF parade (997009327174705171.jpg',
 'IDF parade 997009327265505171.jpg': 'IDF parade (997009327265505171.jpg',
 'IDF parade 997009327265705171.jpg': 'IDF parade (997009327265705171.jpg',
 'IDF parade 99700

For now, we consider only the images that appear in en.wikipedia, discarding all the others. We can also ignore redirects.

In [27]:
# list of chunks of the dataset
WIT_DATASET = ['../../wit_v1.train.all-1percent_sample.tsv.gz']

In [28]:
wiki_image_names = []

for chunk in WIT_DATASET:
    wiki_image_names += pd.read_csv(chunk, sep="\t").query("language == 'en'")\
                            .image_url.apply(lambda r: normalize_title(r.split('/')[-1], False)).tolist()

In [29]:
# Keep only unique values
wiki_image_names = set(wiki_image_names)

# Remap redirects
wiki_image_names = {file_redirects[name] if name in file_redirects.keys() else name for name in wiki_image_names}

In [30]:
def extract_file(row):
    '''
    Extract the details of a file
    '''
    text = row.revision.text._VALUE
    categories = re.findall(categories_regex, text) if text else []
    # No way to do this with a list comprehension (nested conditions work only if there is always an else)
    categories_nohidd = []
    for category in categories:
        category_norm = normalize_title(category)
        if(category_norm not in hidden_categories):
            if(category_norm in category_redirects.keys()):
                if((c:=category_redirects[category_norm]) not in hidden_categories):
                    categories_nohidd.append(c)
            else:
                categories_nohidd.append(category_norm)

    return Row(
        id=row.id,
        title=normalize_title(row.title),
        categories = categories_nohidd
    )

In [31]:
# Schema of the processed files DataFrame
schema_files = StructType([StructField("id", IntegerType(), True),
                           StructField("title", StringType(), True),
                           StructField("categories", ArrayType(StringType()), True)])

In [32]:
# Also for files, we ignore redirects
files_clean = spark.createDataFrame(commons_files_raw.filter('redirect is null')\
                                            .rdd.map(extract_file).filter(lambda r: r is not None), 
                                    schema=schema_files)
commons_files_raw.unpersist();

In [33]:
files_clean.show()

ERROR:root:KeyboardInterrupt while sending command.                 (0 + 1) / 1]
Traceback (most recent call last):
  File "/home/salvi/.conda/envs/francesco/lib/python3.9/site-packages/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
  File "/home/salvi/.conda/envs/francesco/lib/python3.9/site-packages/py4j/clientserver.py", line 475, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
  File "/home/salvi/.conda/envs/francesco/lib/python3.9/socket.py", line 704, in readinto
    return self._sock.recv_into(b)
KeyboardInterrupt


KeyboardInterrupt: 

## Close

In [38]:
spark.stop()