## Imports

In [1]:
import sys

sys.path.append("../../")

from src.config import *
from src.utilities import normalize_title

In [None]:
import os
import pickle
import re
import shutil
from collections import defaultdict

import numpy as np
import pandas as pd
import pyspark
from pyspark.accumulators import AccumulatorParam
from pyspark.sql import *
from pyspark.sql.functions import *
from pyspark.sql.types import (
    ArrayType,
    BooleanType,
    IntegerType,
    StringType,
    StructField,
    StructType,
)

# Spark settings for this notebook: the spark-xml reader package, memory limits and timeouts.
# NOTE(review): spark.driver.maxResultSize (20G) is larger than spark.driver.memory (5g),
# and the heartbeat interval (60s) is only one second below the network timeout (61s) —
# confirm both are intended.
spark_settings = {
    "spark.jars.packages": "com.databricks:spark-xml_2.12:0.8.0",
    "spark.executor.memory": "4g",
    "spark.driver.memory": "5g",
    "spark.driver.maxResultSize": "20G",
    "spark.executor.heartbeatInterval": "60s",
    "spark.network.timeout": "61s",
}
conf = pyspark.SparkConf().setMaster("local[10]").setAll(list(spark_settings.items()))

# Session first (reused if one already exists), then the context that belongs to it
spark = SparkSession.builder.config(conf=conf).getOrCreate()
sc = spark.sparkContext

In [2]:
# sc.setLogLevel('DEBUG')

In [3]:
spark

## Categories

In [None]:
# Parse each <page> element of the Commons dump into a row,
# then keep only the pages whose namespace (`ns`) is 14
commons_pages = (
    spark.read.format("com.databricks.spark.xml")
    .option("rowTag", "page")
    .load(COMMONS_DUMP)
)
commons_categories_raw = commons_pages.filter("ns = '14'")
# commons_categories_raw.persist()

In [7]:
commons_categories_raw.printSchema()

root
 |-- id: long (nullable = true)
 |-- ns: long (nullable = true)
 |-- redirect: struct (nullable = true)
 |    |-- _VALUE: string (nullable = true)
 |    |-- _title: string (nullable = true)
 |-- revision: struct (nullable = true)
 |    |-- comment: struct (nullable = true)
 |    |    |-- _VALUE: string (nullable = true)
 |    |    |-- _deleted: string (nullable = true)
 |    |-- contributor: struct (nullable = true)
 |    |    |-- _VALUE: string (nullable = true)
 |    |    |-- _deleted: string (nullable = true)
 |    |    |-- id: long (nullable = true)
 |    |    |-- ip: string (nullable = true)
 |    |    |-- username: string (nullable = true)
 |    |-- format: string (nullable = true)
 |    |-- id: long (nullable = true)
 |    |-- minor: string (nullable = true)
 |    |-- model: string (nullable = true)
 |    |-- parentid: long (nullable = true)
 |    |-- sha1: string (nullable = true)
 |    |-- text: struct (nullable = true)
 |    |    |-- _VALUE: string (nullable = true)
 |  

In [10]:
commons_categories_raw.write.mode("overwrite").parquet(CATEGORIES_DUMP)

                                                                                

In [73]:
commons_categories_raw = spark.read.parquet(CATEGORIES_DUMP)

In [6]:
# Build a dictionary of redirects (old_title -> redirect_title).
# Only the two fields that are actually used are selected before collect(), so the
# driver receives plain (title, redirect_title) pairs instead of whole page rows
# with their nested revision struct.
category_redirect_rows = (
    commons_categories_raw.filter("redirect is not null")
    .select("title", col("redirect._title").alias("redirect_title"))
    .collect()
)
category_redirects = {
    normalize_title(r.title): normalize_title(r.redirect_title)
    for r in category_redirect_rows
}
len(category_redirects)

                                                                                

751

In [7]:
# Matches [[Category:...]] / [[category:...]] links and captures the category name
# (the text before any "|"). Links directly preceded by wpTextbox1.value=' or
# wpTextbox1.value+=' are skipped by the two negative lookbehinds.
# Raw strings are used so that regex escapes such as \[ \| \. are not processed as
# (invalid) Python string escapes; the compiled patterns are unchanged.
categories_regex = re.compile(
    r"(?<!wpTextbox1\.value\+=')(?<!wpTextbox1\.value=')\[\[([cC]ategory:[^\|]*?)(?:\|.*?)*\]\]"
)
# Matches any of the markers this notebook uses to flag a category as hidden:
# the __HIDDENCAT__ magic word, a few templates, and two families of category links.
hiddencat_regex = re.compile(
    r"__HIDDENCAT__"
    r"|\{\{[hH]iddencat\}\}"
    r"|\{\{[uU]ser category.*?\}\}"
    r"|\{\{[dD]isambig.*?\}\}"
    r"|\{\{[gG]lobal maintenance category\}\}"
    r"|\[\[([cC]ategory:Categories for discussion[^\|]*?)(?:\|.*?)*\]\]"
    r"|\[\[([cC]ategory:Media contributed by[^\|]*?)(?:\|.*?)*\]\]"
)

In [8]:
class ChildsAccumulator(AccumulatorParam):
    """
    Accumulator parameter merging dictionaries that map a category to the list of its children
    """

    def zero(self, value):
        # The initial value is always a fresh empty mapping; `value` is not used
        return defaultdict(list)

    def addInPlace(self, val1, val2):
        # Extend each entry of val1 with the children listed under the same key in val2
        for category in val2:
            val1[category] += val2[category]
        return val1

In [9]:
def extract_category(row):
    """
    Build the cleaned record of a category page.

    The parents are the category links found in the page text, normalized and
    remapped through `category_redirects`. As a side effect, the page is added
    to the accumulator `acc` as a child of each of its parents.

    Returns a Row with the fields id, title, parents and hiddencat.
    """
    title = normalize_title(row.title)
    text = row.revision.text._VALUE

    parents = []
    is_hidden = False
    if text:
        for link in categories_regex.findall(text):
            parent = normalize_title(link)
            # Use the redirect target when the parent is a redirect, the parent itself otherwise
            parents.append(category_redirects.get(parent, parent))
        is_hidden = hiddencat_regex.search(text) is not None

    if parents:
        acc.add({parent: [title] for parent in parents})

    return Row(id=row.id, title=title, parents=parents, hiddencat=is_hidden)

In [10]:
# Columns of the processed categories DataFrame (all nullable):
# id, title, parents (array of strings) and hiddencat
schema_cat = (
    StructType()
    .add("id", IntegerType(), True)
    .add("title", StringType(), True)
    .add("parents", ArrayType(StringType()), True)
    .add("hiddencat", BooleanType(), True)
)

In [12]:
# Redirect pages are dropped here; parents pointing at a redirect are remapped
# to its target inside extract_category.
# `acc` collects the category -> children mapping while the rows are evaluated.
acc = sc.accumulator(defaultdict(list), ChildsAccumulator())
category_rows = (
    commons_categories_raw.filter("redirect is null")
    .rdd.map(extract_category)
    .filter(lambda r: r is not None)
)
categories_clean = spark.createDataFrame(category_rows, schema=schema_cat)

# commons_categories_raw.unpersist()
# categories_clean.persist();

In [13]:
# Workaround for the fact that the value of acc is used before it is filled:
# extract_category only updates `acc` when the rows are actually evaluated, and
# categories_clean is lazy. Running one action forces a full pass over the rows.
# A plain count is enough for that, so nothing has to be written to disk and
# deleted afterwards.
# NOTE(review): every further evaluation of categories_clean runs extract_category
# again and adds to `acc` again — read acc.value right after this cell.
categories_clean.count()

                                                                                

In [14]:
# Columns of the children DataFrame: a category title and the list of its child titles
schema_childs = (
    StructType()
    .add("title", StringType(), True)
    .add("childs", ArrayType(StringType(), True), True)
)

In [15]:
# Turn the accumulated {category: [children]} mapping into (title, childs) rows
child_rows = list(acc.value.items())
childs_df = spark.createDataFrame(child_rows, schema=schema_childs)

In [17]:
# Left join: every cleaned category is kept and receives its `childs` list when
# childs_df has a row with the same title
same_title = categories_clean.title == childs_df.title
categories_with_childs = categories_clean.alias("c").join(
    childs_df, on=same_title, how="left"
)
categories = categories_with_childs.select("c.*", "childs")

In [18]:
categories.write.mode("overwrite").parquet(CATEGORIES_PATH)

22/05/02 17:23:47 WARN TaskSetManager: Stage 4 contains a task of very large size (251897 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

In [74]:
categories = spark.read.parquet(CATEGORIES_PATH)

In [12]:
categories.show()

+--------------------+---------+--------------------+---------+------+----------+
|               title|       id|             parents|hiddencat|childs|in_en_wiki|
+--------------------+---------+--------------------+---------+------+----------+
|"Bioagra" plant i...| 37673751|[Goświnowice, Fac...|    false|  null|     false|
|"Dancing Dervishe...|113462211|[Kamal-ud-din Bih...|    false|  null|      true|
|"Dardanelles", Po...| 33938219|[Polkemmet collie...|    false|  null|      true|
|"Der Verwalter", ...| 12707820|[Buildings in Dor...|    false|  null|      true|
|"Doktorhaus" (Got...|112648762|[Cultural propert...|    false|  null|     false|
|"Evaluarea impact...|102833104|[Photos from Parl...|    false|  null|     false|
|"Forever alive" m...| 66089937|                  []|    false|  null|     false|
|"Gracias" rainbow...| 89849682|[Rainbows, COVID-19]|    false|  null|     false|
|"Kniende" (Karl T...| 18536700|[Statues in Berli...|    false|  null|     false|
|     "Krym" roa

In [13]:
categories.count()

11029650

In [14]:
# Titles of all the hidden categories, collected on the driver.
# Kept as a set because extract_file runs `in hidden_categories` for every
# category of every file: a list would be scanned linearly on each test.
# len() and `in` behave as before.
hidden_categories = {
    row.title
    for row in categories.filter("hiddencat is True").select("title").collect()
}

                                                                                

In [15]:
len(hidden_categories)

137448

## Files

In [76]:
# Read the dump again, this time for the pages whose namespace (`ns`) is 6.
# The schema of commons_categories_raw is reused, so no schema inference is requested.
commons_files_raw = (
    spark.read.format("com.databricks.spark.xml")
    .options(rowTag="page", inferSchema="false")
    .schema(commons_categories_raw.schema)
    .load(COMMONS_DUMP)
    .filter("ns = '6'")
)
# commons_files_raw.persist();

In [None]:
# Build a dictionary of redirects (old_title -> redirect_title).
# Only the two fields that are used are selected before collect(), so the driver
# receives (title, redirect_title) pairs instead of whole page rows with their
# nested revision struct.
file_redirect_rows = (
    commons_files_raw.filter("redirect is not null")
    .select("title", col("redirect._title").alias("redirect_title"))
    .collect()
)
file_redirects = {
    normalize_title(r.title): normalize_title(r.redirect_title)
    for r in file_redirect_rows
}

In [28]:
with open(FILE_REDIRECTS, "wb") as f:
    pickle.dump(file_redirects, f)

In [77]:
with open(FILE_REDIRECTS, "rb") as f:
    file_redirects = pickle.load(f)

In [78]:
len(file_redirects)

1903071

For now, we consider only the images that appear in en.wikipedia, discarding all the others. We can also ignore redirects.

In [31]:
# One DataFrame per WIT chunk; they are concatenated once at the end instead of
# re-copying the growing result on every iteration.
wit_chunks = []

# Filter for images in english, normalizing titles and remapping redirects
for i, wit_chunk in enumerate(WIT_DATASET):
    print(f"Processing chunk {i}...")
    # Only the two columns used by the filter and the extraction are loaded
    image_urls = (
        pd.read_csv(wit_chunk, sep="\t", usecols=["language", "image_url"])
        .query(
            "language == 'en' and image_url.str.contains('/commons/')",
            engine="python",
        )
        .image_url
    )
    # Title = normalized last URL segment, replaced by its redirect target if it has one
    titles = image_urls.map(lambda url: normalize_title(url.split("/")[-1], False))
    wit_chunks.append(
        pd.DataFrame(
            {
                "image_path": image_urls.map(lambda url: url.split("commons/")[1]),
                "title": titles.map(lambda name: file_redirects.get(name, name)),
            }
        )
    )

# With no chunk at all, fall back to an empty frame
wit_images = pd.concat(wit_chunks) if wit_chunks else pd.DataFrame()

Processing chunk 0...
Processing chunk 1...
Processing chunk 2...
Processing chunk 3...
Processing chunk 4...
Processing chunk 5...
Processing chunk 6...
Processing chunk 7...
Processing chunk 8...
Processing chunk 9...


In [34]:
# Keep a single row per title (the first one encountered)
wit_images = wit_images.drop_duplicates(subset="title")

In [36]:
# Hand the pandas frame over to Spark with two nullable string columns
wit_names_schema = (
    StructType()
    .add("image_path", StringType(), True)
    .add("title", StringType(), True)
)
wit_images_df = spark.createDataFrame(wit_images, wit_names_schema)

In [43]:
wit_images_df.write.mode("overwrite").parquet(WIT_NAMES_PATH)

22/05/08 17:29:28 WARN TaskSetManager: Stage 2 contains a task of very large size (32447 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

In [79]:
wit_images_df = spark.read.parquet(WIT_NAMES_PATH)

In [80]:
wit_images_df.count()

3915597

In [81]:
# Keep only the Commons files whose normalized title is one of the WIT titles
# (inner join on the normalized title)
normalize_title_udf = udf(normalize_title, StringType())
files_with_norm_title = commons_files_raw.withColumn(
    "title_norm", normalize_title_udf(commons_files_raw.title)
)
wit_files_raw = files_with_norm_title.join(
    wit_images_df, on=col("title_norm") == wit_images_df.title
)

In [82]:
def extract_file(row):
    """
    Build the cleaned record of a file page.

    The categories are the category links found in the page text, normalized and
    remapped through `category_redirects`. A category is left out when it, or the
    redirect target it maps to, is listed in `hidden_categories`.

    Returns a Row with the fields id, title, url and categories.
    """
    text = row.revision.text._VALUE
    linked = categories_regex.findall(text) if text else []

    visible = []
    for link in linked:
        name = normalize_title(link)
        if name in hidden_categories:
            continue
        # Follow the redirect if there is one: the target can be hidden even
        # when the redirecting name is not
        target = category_redirects.get(name, name)
        if target not in hidden_categories:
            visible.append(target)

    return Row(
        id=row.id,
        title=normalize_title(row.title),
        url=row.image_path,
        categories=visible,
    )

In [83]:
# Columns of the processed files DataFrame (all nullable):
# id, title, url and categories (array of strings)
schema_files = (
    StructType()
    .add("id", IntegerType(), True)
    .add("title", StringType(), True)
    .add("url", StringType(), True)
    .add("categories", ArrayType(StringType()), True)
)

In [84]:
# Redirect pages are skipped for files too; every remaining page goes through extract_file
file_rows = (
    wit_files_raw.filter("redirect is null")
    .rdd.map(extract_file)
    .filter(lambda r: r is not None)
)
wit_files = spark.createDataFrame(file_rows, schema=schema_files)



In [85]:
wit_files.write.mode("overwrite").parquet(FILES_PATH)

                                                                                

In [86]:
wit_files = spark.read.parquet(FILES_PATH)

In [87]:
wit_files.show()

+--------+--------------------+--------------------+--------------------+
|      id|               title|                 url|          categories|
+--------+--------------------+--------------------+--------------------+
|41179998|" Moyan di mandi"...|b/b4/%22_Moyan_di...|    [Anandpur Sahib]|
|21613157|"1zetem" Chocianó...|7/77/%221zetem%22...|[Historic urban l...|
|58385806|"A Yellow Room" b...|8/89/%22A_Yellow_...|[William Bruce El...|
| 2554419|"Baron Renfrew" 1...|4/45/%22Baron_Ren...|[Baron of Renfrew...|
|54619021|"Big John" Helper...|9/9e/%22Big_John%...|[Big John (Muffle...|
| 5653154|"Blue condominium...|5/51/%22Blue_cond...|[Apartment buildi...|
|17065596|"Cheyenne, Wyo., ...|5/5a/%22Cheyenne%...|[Cheyenne, Wyomin...|
|28832857|"Fuente de los Cá...|f/f3/%22Fuente_de...|[Fuente de los Cá...|
|57969172|"Harewood," Samue...|d/da/%22Harewood%...|[Photographs by F...|
|91404757|"Island Number Te...|1/19/%22Island_Nu...|[Abraham Lincoln ...|
|17073306|"Kelly Miller" - ...|2/2c/%2

In [88]:
wit_files.count()

3891446

## Categories/2

In [41]:
# Distinct categories attached to at least one WIT file, each flagged with in_en_wiki = True
distinct_file_categories = wit_files.rdd.flatMap(lambda x: x.categories).distinct()
categories_in_wikipedia = (
    distinct_file_categories.map(Row("title"))
    .toDF()
    .withColumn("in_en_wiki", lit(True))
)

                                                                                

In [42]:
categories_in_wikipedia.show(5)

+--------------------+----------+
|               title|in_en_wiki|
+--------------------+----------+
|        John McLenan|      true|
|Postcards of brid...|      true|
|    Mortier (organs)|      true|
|  2000s black sedans|      true|
|Rail transport ma...|      true|
+--------------------+----------+
only showing top 5 rows



                                                                                

In [43]:
categories_in_wikipedia.count()

                                                                                

2983495

In [49]:
# Not all categories found within files are existing categories - Red links.
# The anti join keeps the titles that have no matching row in `categories`.
red_links = categories_in_wikipedia.join(categories, on="title", how="left_anti")
print(red_links.count())

red_links.take(10)

                                                                                

62091


                                                                                

[Row(title='(145453) 2005 RR43', in_en_wiki=True),
 Row(title='(275809) 2001 QY297', in_en_wiki=True),
 Row(title='(416400) 2003 UZ117', in_en_wiki=True),
 Row(title='100.000 dollari per Ringo', in_en_wiki=True),
 Row(title='1000 Islands', in_en_wiki=True),
 Row(title='104 in logos', in_en_wiki=True),
 Row(title='1079 in art', in_en_wiki=True),
 Row(title='1252 in Genoa', in_en_wiki=True),
 Row(title='12th Night', in_en_wiki=True),
 Row(title='13-023', in_en_wiki=True)]

In [50]:
# Flag each category with in_en_wiki: True when it appears in
# categories_in_wikipedia, False otherwise (no match in the left join).
# `categories` may come from a parquet file that already holds an in_en_wiki
# column from a previous run; it is dropped first so that re-running this cell
# does not produce two columns with the same name. drop() leaves the frame
# untouched when the column is absent.
categories = (
    categories.drop("in_en_wiki")
    .alias("c")
    .join(categories_in_wikipedia, "title", "left")
    .select("c.*", categories_in_wikipedia.in_en_wiki)
    .na.fill(False, subset=["in_en_wiki"])
)

In [51]:
categories.filter("in_en_wiki == True").count()

                                                                                

2921404

In [52]:
categories.count()

                                                                                

11029650

In [53]:
# Replace the saved categories with the enriched version. The result is written
# to a sibling "-temp" path first and swapped in afterwards, so the previous
# output is only removed once the new one is fully written.
# splitext strips only the final extension of the last path component, so dots
# in parent directories or a path without extension no longer truncate the
# temporary path; a trailing separator is ignored.
categories_root, _ = os.path.splitext(CATEGORIES_PATH.rstrip(os.sep))
temp_path = categories_root + "-temp.parquet"
categories.write.mode("overwrite").parquet(temp_path)

shutil.rmtree(CATEGORIES_PATH)
os.rename(temp_path, CATEGORIES_PATH)

                                                                                

## Close

In [70]:
spark.stop()