In [None]:
!pip install git+https://github.com/frasalvi/wikitextprocessor

In [1]:
import numpy as np 
import pandas as pd
import urllib
import re
from collections import defaultdict

import wikitextprocessor as wtp

import pyspark
from pyspark.sql import *
from pyspark.sql.functions import *
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, BooleanType, ArrayType
from pyspark.accumulators import AccumulatorParam

# Local Spark configuration: 4 local cores, the spark-xml package (XML data source used to
# read the dump files) and the memory / timeout settings listed below.
conf = (
    pyspark.SparkConf()
    .setMaster("local[4]")
    .setAll([
        ('spark.jars.packages', 'com.databricks:spark-xml_2.12:0.8.0'),
        ('spark.executor.memory', '4g'),
        ('spark.driver.memory', '2g'),
        ('spark.driver.maxResultSize', '5G'),
        ('spark.executor.heartbeatInterval', '3600s'),
        ('spark.network.timeout', '4000s'),
    ])
)

# Session built from the configuration above, and its underlying SparkContext
spark = SparkSession.builder.config(conf=conf).getOrCreate()
sc = spark.sparkContext

In [None]:
# Show the SparkSession object as the cell output
spark

In [61]:
# Chunk of the Commons dump that is loaded by the cells below
# (the commented-out line is an alternative chunk of the same dump).
# COMMONS_DUMP_REDUCED = 'commonswiki-20220220-pages-articles-multistream1.xml-p1p1500000.bz2'
COMMONS_DUMP_REDUCED = 'commonswiki-20220220-pages-articles-multistream6.xml-p114543930p115400363.bz2'
# Path where the template/module pages filtered from the dump are written and later re-read
TEMPLATES_DUMP = 'commonswiki-20220220-templates-modules.xml'

In [4]:
# Adapted from https://github.com/epfl-dlab/WikiPDA/blob/master/PaperAndCode/TopicsExtractionPipeline/GenerateDataframes.py
def normalize_title(title, dumps=True):
    """
    Normalize a page title.

    The title is URL-decoded, its namespace prefix is removed (optional), surrounding
    whitespace is stripped, the first character is upper-cased, underscores are
    replaced with spaces and any anchor (``#...``) is dropped.

    Parameters
    ----------
    title : str
        Raw title, e.g. ``"Category:Foo_bar"``.
    dumps : bool, default True
        If True, the title is expected to start with a namespace prefix
        (``"File:"``, ``"Category:"``, ...), which is removed. A title without
        any colon is left untouched.

    Returns
    -------
    str
        The normalized title.
    """
    title = urllib.parse.unquote(title)
    if dumps:
        # Cut at the FIRST colon only: the title itself may contain colons
        # ("File:Symphony No. 5: Allegro.ogg"), and they must not be truncated.
        _, separator, remainder = title.partition(':')
        if separator:
            title = remainder
    title = title.strip()
    if title:
        title = title[0].upper() + title[1:]
    n_title = title.replace("_", " ")
    # Everything from the first '#' on is an anchor, not part of the title
    return n_title.split('#')[0]

## Templates

In [90]:
# from https://github.com/tatuylonen/wikitextprocessor/blob/ee043cff190543fb94cb40d4827444d8982a30fe/wikitextprocessor/core.py#L490
def template_to_body(title, text):
    """Return the part of a template's wikitext that gets transcluded.

    Comments and <noinclude> sections are dropped; if <onlyinclude> sections
    exist, only their content is kept; <includeonly> tags are removed while
    their content is preserved. Both arguments must be ``str``; ``title`` is
    only type-checked. Returns a ``str``.
    """
    assert isinstance(title, str)
    assert isinstance(text, str)

    # Applied in this exact order, each pattern is replaced by the empty string:
    removal_patterns = (
        # complete comments
        r"(?s)<!\s*--.*?--\s*>",
        # complete <noinclude> ... </noinclude> sections
        r"(?is)<\s*noinclude\s*>.*?<\s*/\s*noinclude\s*>",
        # <noinclude> without a closing tag: drop the rest of the text
        r"(?is)<\s*noinclude\s*>.*",
        # self-closing <noinclude/>
        r"(?is)<\s*noinclude\s*/\s*>",
        # unclosed <!-- at the end of the body is apparently ignored
        r"(?s)<!\s*--.*",
    )
    for pattern in removal_patterns:
        text = re.sub(pattern, "", text)

    # When <onlyinclude> tags are present, their content is the only text that
    # is transcluded (a self-closing tag contributes an empty string).
    only_parts = re.findall(r"(?is)<\s*onlyinclude\s*>(.*?)"
                            r"<\s*/\s*onlyinclude\s*>|"
                            r"<\s*onlyinclude\s*/\s*>",
                            text)
    if only_parts:
        text = "".join(only_parts)

    # <includeonly> markers are stripped, the text they enclose stays
    return re.sub(r"(?is)<\s*(/\s*)?includeonly\s*(/\s*)?>", "", text)

In [62]:
# Switch controlling whether the templates/modules dump is (re)generated by this cell
initialize = 1

if initialize:
    # Keep only the pages whose namespace is 10 or 828 (templates and modules)
    commons_templates_modules = (
        spark.read.format('com.databricks.spark.xml')
        .options(rowTag='page')
        .load(COMMONS_DUMP_REDUCED)
        .filter("ns = '10' or ns = '828'")
    )
    # Write them back as XML, replacing any previous output
    (
        commons_templates_modules.write.format("com.databricks.spark.xml")
        .mode("overwrite")
        .options(rowTag='page', rootTag='pages')
        .save(TEMPLATES_DUMP)
    )
    # On Windows, the CRC parts must then be merged manually

In [64]:
# Read back the template/module pages stored in TEMPLATES_DUMP
templates_categories = (
    spark.read.format("com.databricks.spark.xml")
    .options(rowTag='page')
    .load(TEMPLATES_DUMP)
)

In [45]:
def page_handler(model, title, next):
    """
    Page callback passed to ``ctx.process``.

    Pages whose title does not start with "Template:" or "Module:" are skipped.
    No work is done for the other pages either, so the result is always None.
    """
    handled_prefixes = ("Template:", "Module:")
    if not title.startswith(handled_prefixes):
        return None
    # Templates and modules: nothing to do, fall through and return None

In [72]:
# Create a wikitextprocessor context and feed it the templates/modules dump.
# NOTE(review): the recorded output of this cell is an unconsumed generator
# (Wtp.reprocess), so page_handler may never be invoked - confirm whether the
# returned generator has to be iterated.
ctx = wtp.Wtp()
ctx.process(TEMPLATES_DUMP, page_handler, windows=True)

UNSUPPORTED pages 2 {}
Analyzing which templates should be expanded before parsing


<generator object Wtp.reprocess at 0x00000258410BFF90>

### Tests

In [66]:
# Collect to the driver the pages of the templates dump that have ns = 828
aa = templates_categories.filter('ns="828"').collect()

In [74]:
# Titles of the collected pages
[page.title for page in aa]

['Module:Navbox with collapsible groups',
 'Module:Navbox with collapsible groups/doc',
 'Module:Navboxes',
 'Module:PermissionTicket',
 'Module:PermissionTicket/doc',
 'Module:PermissionTicket/testcases',
 'Module:PermissionTicket/testcases/doc',
 'Module:RomanNumber',
 'Module:Contributor/sandbox']

In [75]:
# Sample wikitext containing a single template call, used to try out the expansion below
text = 'ciao bello {{global maintenance category}} come stai?'

In [78]:
# Transcludable bodies of two templates, obtained from hard-coded wikitext with template_to_body
body_gmc = template_to_body('global maintenance category', '{{autotranslate|1={{{shortcut|{{{1|}}} }}}|purge={{{purge|}}}|base=Global maintenance category/i18n}}<includeonly>{{#ifeq:{{{hidden|}}}|n||__HIDDENCAT__}}</includeonly><noinclude>{{documentation}}</noinclude>')
body_autotrans = template_to_body('autotranslate', '<includeonly>{{#invoke:PermissionTicket|PermissionTicket|id=2022011910011071|nocat=1}}<!--  -->{{#ifeq: {{FULLPAGENAME}} |Template:{{{base|}}} |[[Category:Autotranslated templates|{{PAGENAME}}]]}}</includeonly><noinclude>{{Documentation}}</noinclude>')

In [81]:
# Mapping from template name to its transcludable body
templates_dict = {'global maintenance category': body_gmc, 'autotranslate': body_autotrans}

In [83]:
# Hand the two test templates to the wikitextprocessor context
ctx.assign_templates(templates_dict)

In [84]:
# Expand the templates contained in the sample text.
# NOTE(review): the recorded run of this cell failed with
# "AttributeError: module 'os' has no attribute 'pread'" - presumably because
# os.pread is unavailable on the platform used (Windows); confirm before relying on it.
ctx.expand(text, title='prova')

AttributeError: module 'os' has no attribute 'pread'

## Categories

In [None]:
# Load the pages with ns = 14 from the dump chunk and mark them for caching.
# NOTE(review): this cell has no execution count and the same code appears again
# under the second "Categories" header - one of the two copies looks redundant.
commons_categories_raw = spark.read.format('com.databricks.spark.xml') \
                                .options(rowTag='page').load(COMMONS_DUMP_REDUCED).filter("ns = '14'")
commons_categories_raw.persist()

In [6]:
# Print the schema of the raw category pages
commons_categories_raw.printSchema()

root
 |-- id: long (nullable = true)
 |-- ns: long (nullable = true)
 |-- redirect: struct (nullable = true)
 |    |-- _VALUE: string (nullable = true)
 |    |-- _title: string (nullable = true)
 |-- revision: struct (nullable = true)
 |    |-- comment: struct (nullable = true)
 |    |    |-- _VALUE: string (nullable = true)
 |    |    |-- _deleted: string (nullable = true)
 |    |-- contributor: struct (nullable = true)
 |    |    |-- id: long (nullable = true)
 |    |    |-- ip: string (nullable = true)
 |    |    |-- username: string (nullable = true)
 |    |-- format: string (nullable = true)
 |    |-- id: long (nullable = true)
 |    |-- minor: string (nullable = true)
 |    |-- model: string (nullable = true)
 |    |-- parentid: long (nullable = true)
 |    |-- sha1: string (nullable = true)
 |    |-- text: struct (nullable = true)
 |    |    |-- _VALUE: string (nullable = true)
 |    |    |-- _bytes: long (nullable = true)
 |    |    |-- _xml:space: string (nullable = true)
 |  

## Categories

In [5]:
# Load the pages with ns = 14 (category pages) from the dump chunk and mark them for caching
commons_categories_raw = (
    spark.read.format('com.databricks.spark.xml')
    .options(rowTag='page')
    .load(COMMONS_DUMP_REDUCED)
    .filter("ns = '14'")
)
commons_categories_raw.persist()

DataFrame[id: bigint, ns: bigint, redirect: struct<_VALUE:string,_title:string>, revision: struct<comment:struct<_VALUE:string,_deleted:string>,contributor:struct<id:bigint,ip:string,username:string>,format:string,id:bigint,minor:string,model:string,parentid:bigint,sha1:string,text:struct<_VALUE:string,_bytes:bigint,_xml:space:string>,timestamp:string>, title: string]

In [9]:
class ChildsAccumulator(AccumulatorParam):
    '''
    Accumulator parameter whose value is a dictionary mapping each category
    to the list of its children
    '''
    def zero(self, value):
        # The neutral element is an empty mapping, whatever the initial value is
        return defaultdict(list)

    def addInPlace(self, val1, val2):
        # Merge val2 into val1 by appending each list of children to the matching key
        for category, children in val2.items():
            val1[category].extend(children)
        return val1

In [10]:
def extract_category(row):
    '''
    Build the cleaned record of a category page: id, normalized title,
    parent categories (redirects replaced by their target) and hidden flag.
    As a side effect, the category is registered in the accumulator `acc`
    as a child of each of its parents.

    NOTE(review): categories_regex, category_redirects and hiddencat_regex are
    not defined in the visible part of the notebook - make sure the cell that
    defines them is run first.
    '''
    global acc
    title = normalize_title(row.title)
    text = row.revision.text._VALUE

    # Pages without text have no parents
    if text:
        parents = [category_redirects.get(parent, parent)
                   for parent in re.findall(categories_regex, text)]
    else:
        parents = []

    if parents:
        acc += {parent: [title] for parent in parents}

    hiddencat = (re.search(hiddencat_regex, text) is not None) if text else False
    return Row(
        id=row.id,
        title=title,
        parents=parents,
        hiddencat=hiddencat
    )

In [11]:
# Schema of the processed categories DataFrame
schema_cat = StructType([StructField("id", IntegerType(), True),
                         StructField("title", StringType(), True),
                         StructField("parents", ArrayType(StringType()), True),
                         StructField("hiddencat", BooleanType(), True)])

In [15]:
# We ignore redirect categories, eventually remapping parents to their redirects
acc = sc.accumulator(defaultdict(list), ChildsAccumulator())
categories_clean = spark.createDataFrame(commons_categories_raw.filter('redirect is null')\
                                            .rdd.map(extract_category).filter(lambda r: r is not None), 
                                         schema=schema_cat)

commons_categories_raw.unpersist();

In [16]:
# Schema of the children DataFrame: a category title and the list of its children
schema_childs = StructType([StructField('title', StringType(), True),
                            StructField('childs', ArrayType(StringType(), True), True)])

In [17]:
# Workaround for the fact that the value of acc is used before it is filled, need to fix this.
categories_clean.collect();

In [18]:
# Turn the accumulated mapping (category -> list of children) into a DataFrame
childs_df = spark.createDataFrame(acc.value.items(), schema=schema_childs)

In [19]:
# Attach to every category the list of its children.
# childs_df only contains categories that are the parent of at least one other category,
# so a left join is needed to keep the categories without children (their `childs` is null);
# an inner join would silently drop them, e.g. hidden categories with no subcategory.
categories = (
    categories_clean.alias('c')
    .join(childs_df, categories_clean.title == childs_df.title, how='left')
    .select('c.*', 'childs')
)

In [20]:
# Mark the joined categories DataFrame for caching
categories.cache()

DataFrame[id: int, title: string, parents: array<string>, hiddencat: boolean, childs: array<string>]

In [21]:
# Titles of the categories flagged as hidden, collected on the driver as a Python list
hidden_categories = [
    row.title
    for row in categories.filter('hiddencat is True').select('title').collect()
]

In [22]:
# Preview of the first ten hidden categories
hidden_categories[:10]

['GFDL',
 'Flickr review needed',
 'GPL',
 'PD CIA',
 'PD US',
 'PD US Military',
 'PD Germany',
 'PD Indonesia',
 'CC-BY-2.5',
 'PD OpenClipart']

## Files

In [30]:
# Load the pages with ns = 6 (file pages) from the dump chunk and mark them for caching
commons_files_raw = (
    spark.read.format('com.databricks.spark.xml')
    .options(rowTag='page')
    .load(COMMONS_DUMP_REDUCED)
    .filter("ns = '6'")
)
commons_files_raw.persist();

In [47]:
# Build a dictionary of redirects
file_redirects = {normalize_title(r.title): normalize_title(r.redirect._title)
                  for r in commons_files_raw.filter('redirect is not null').collect()}
file_redirects

{'GDR Army OF9 Generalleutnant.gif': 'GDR Air Force OF7 Generalleutnant.gif',
 'Empress-Dowager-Cixi2.jpg': 'Empress Dowager Cixi (c. 1890).jpg'}

For now, we consider only the images that appear in en.wikipedia, discarding all the others. We can also ignore redirects.

In [48]:
# list of chunks of the dataset
WIT_DATASET = ['wit_v1.train.all-1percent_sample.tsv.gz']

In [49]:
# Names of the images used in the English Wikipedia, taken from the WIT chunks
wiki_image_names = []

for chunk in WIT_DATASET:
    # Only `language` and `image_url` are used, so the other columns are not parsed at all.
    # The image name is the last component of the URL (no namespace prefix to strip).
    wiki_image_names += (
        pd.read_csv(chunk, sep="\t", usecols=['language', 'image_url'])
        .query("language == 'en'")
        .image_url
        .apply(lambda url: normalize_title(url.split('/')[-1], False))
        .tolist()
    )

In [50]:
# Keep only unique values
wiki_image_names = set(wiki_image_names)

# Remap redirects
wiki_image_names = {file_redirects[name] if name in file_redirects.keys() else name for name in wiki_image_names}

In [51]:
def extract_file(row):
    '''
    Build the cleaned record of a file page: id, normalized title and the list
    of its non-hidden categories, with redirects replaced by their target.
    '''
    text = row.revision.text._VALUE
    found = re.findall(categories_regex, text) if text else []

    # 1. discard the categories that are hidden
    not_hidden = (cat for cat in found if cat not in hidden_categories)
    # 2. replace the redirects by their target
    resolved = (category_redirects.get(cat, cat) for cat in not_hidden)
    # 3. discard the redirect targets that turn out to be hidden
    categories_nohidd = [cat for cat in resolved if cat not in hidden_categories]

    return Row(
        id=row.id,
        title=normalize_title(row.title),
        categories=categories_nohidd
    )

In [52]:
# Schema of the processed files DataFrame
schema_files = StructType([StructField("id", IntegerType(), True),
                           StructField("title", StringType(), True),
                           StructField("categories", ArrayType(StringType()), True)])

In [53]:
# Also for files, we ignore redirects
files_clean = spark.createDataFrame(commons_files_raw.filter('redirect is null')\
                                            .rdd.map(extract_file).filter(lambda r: r is not None), 
                                    schema=schema_files)
commons_files_raw.unpersist();

In [54]:
# Display the first rows of the processed files
files_clean.show()

+---+--------------------+--------------------+
| id|               title|          categories|
+---+--------------------+--------------------+
| 26|Two Gambel's Quai...|[Callipepla gambe...|
| 30|          Quail2.png|[Callipepla gambe...|
| 56|         Kane QC.png|[Computer diagram...|
| 62|The Death of Hyac...|[Paintings of nud...|
| 73|     BordUtrecht.jpg|[Utrecht Central ...|
| 76|         Bustaxi.jpg|[Public transport...|
| 80|      Buswachten.jpg|[Station Assen, B...|
| 81|          Lijn10.jpg|[Assen, Arriva Pe...|
| 82|          Lijn51.jpg|[Den Oudsten B88,...|
| 85|Groninger-museum.jpg|[Groninger Museum...|
| 87|   Groningen 003.jpg|[Der Aa-kerk, Kor...|
| 93|De Slegte, Gronin...|[Buildings in Gro...|
| 95|     Hunebed 001.jpg|[Hunebed D25 in B...|
| 96|     Hunebed 002.jpg|[Hunebed D25 in B...|
| 97|     Hunebed 003.jpg|[Hunebed D24 in B...|
| 98|     Hunebed 004.jpg|[Hunebed D21 in B...|
| 99|     Hunebed 005.jpg|[Hunebed D21 in B...|
|100|     Hunebed 006.jpg|[Hunebed D27 i

## Close

In [None]:
# Stop the Spark session once the notebook is done
spark.stop()