# Article -> Country (Cultural)
This notebook computes what countries are associated with each Wikipedia article based on various cultural properties on Wikidata. It complements a notebook that computes countries via geolocation of the latitude-longitude coordinates extracted from Wikidata.

In [16]:
import csv
import json
import os
import sys

import requests

import wmfdata

In [2]:
spark = wmfdata.spark.create_session(app_name='pyspark reg; regions; isaacj',
                                  type='yarn-regular', # local, yarn-regular, yarn-large
                                  )  

SPARK_HOME: /usr/lib/spark3
Using Hadoop client lib jars at 3.2.0, provided by Spark.
PYSPARK_PYTHON=/opt/conda-analytics/bin/python3


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/04/26 15:11:11 WARN SparkConf: Note that spark.local.dir will be overridden by the value set by the cluster manager (via SPARK_LOCAL_DIRS in mesos/standalone/kubernetes and LOCAL_DIRS in YARN).
24/04/26 15:11:11 WARN Utils: Service 'sparkDriver' could not bind on port 12000. Attempting port 12001.
24/04/26 15:11:11 WARN Utils: Service 'sparkDriver' could not bind on port 12001. Attempting port 12002.
24/04/26 15:11:11 WARN Utils: Service 'sparkDriver' could not bind on port 12002. Attempting port 12003.
24/04/26 15:11:11 WARN Utils: Service 'sparkDriver' could not bind on port 12003. Attempting port 12004.
24/04/26 15:11:11 WARN Utils: Service 'sparkDriver' could not bind on port 12004. Attempting port 12005.
24/04/26 15:11:11 WARN Utils: Service 'sparkDriver' could not bind on port 12005. Attempting port 12006.
24/04/26 15:11:11 WARN Utils: Service 'spar

## Load in supporting data files from Github

In [3]:
QID_TO_REGION = {}
COUNTRY_PROPERTIES = {
    "P17": "country",
    "P19": "place of birth",
    "P27": "country of citizenship",
    "P131": "located in the administrative territorial entity",
    "P183": "endemic to",
    "P361": "part of",
    "P495": "country of origin",
    "P1269": "facet of",
    "P1532": "country for sport",
    "P3842": "located in present-day administrative territorial entity",
    }

In [4]:
data_dir = './data'
aggregation_tsv = os.path.join(data_dir, 'country_aggregation.tsv')
aggregation_url = "https://github.com/geohci/wiki-region-groundtruth/raw/main/resources/country_aggregation.tsv"
aggregation_header = ['Aggregation', 'From', 'QID To', 'QID From']
agg_qid_to_idx = aggregation_header.index("QID To")
agg_qid_from_idx = aggregation_header.index("QID From")

countries_tsv = os.path.join(data_dir, "countries.tsv")
countries_url = "https://raw.githubusercontent.com/wikimedia-research/canonical-data/master/country/countries.tsv"
countries_header = ['name', 'iso_code', 'iso_alpha3_code', 'wikidata_id', 'is_protected', 'data_risk_score', 'data_risk_classification',
                    'maxmind_continent', 'un_continent', 'un_subcontinent', 'un_m49_code', 'wikimedia_region', 'grant_committee_region',
                    'form_990_region', 'economic_region', 'emerging_classification', 'is_eu', 'is_un_member', 'is_un_data_entity',
                    'is_imf_data_entity', 'is_world_bank_data_entity', 'is_penn_world_table_data_entity', 'market_research_classification']
country_qid_idx = countries_header.index("wikidata_id")
country_name_idx = countries_header.index("name")

In [17]:
if not os.path.exists(countries_tsv):
    response = requests.get(countries_url)
    with open(countries_tsv, mode="wb") as fout:
        fout.write(response.content)
        
if not os.path.exists(aggregation_tsv):
    response = requests.get(aggregation_url)
    with open(aggregation_tsv, mode="wb") as fout:
        fout.write(response.content)

In [19]:
# load in canonical mapping of QID -> region name for labeling
with open(countries_tsv, 'r') as fin:
    assert next(fin).strip().split('\t') == countries_header
    for line in fin:
        row = line.strip().split("\t")
        qid = row[country_qid_idx]
        region_name = row[country_name_idx]
        QID_TO_REGION[qid] = region_name
print(f"Loaded {len(QID_TO_REGION)} QID-region pairs for matching against Wikidata -- e.g., Q31: {QID_TO_REGION['Q31']}")
    
with open(aggregation_tsv, 'r') as fin:
    assert next(fin).strip().split("\t") == aggregation_header
    for line in fin:
        row = line.strip().split("\t")
        qid_to = row[agg_qid_to_idx]
        qid_from = row[agg_qid_from_idx]
        if qid_to in QID_TO_REGION:
            # map new QID to valid country
            # e.g., QID for West Bank -> Palestine
            QID_TO_REGION[qid_from] = QID_TO_REGION[qid_to]
print(f"Now {len(QID_TO_REGION)} QID-region pairs after adding aggregations -- e.g., Q40362: {QID_TO_REGION['Q40362']}")

Loaded 250 QID-region pairs for matching against Wikidata -- e.g., Q31: Belgium
Now 304 QID-region pairs after adding aggregations -- e.g., Q40362: Western Sahara


In [23]:
# Create table with QID -> Region mapping for making data in end result more readable
spark.createDataFrame(pd.DataFrame(QID_TO_REGION.items(), columns=['qid', 'region'])).createOrReplaceTempView('qid_to_region')
spark.sql("SELECT * FROM qid_to_region LIMIT 10").show(50, False)

[Stage 0:>                                                          (0 + 1) / 1]

+---------+-------------------+
|qid      |region             |
+---------+-------------------+
|Q889     |Afghanistan        |
|Q5689    |Åland              |
|Q222     |Albania            |
|Q262     |Algeria            |
|Q16641   |American Samoa     |
|Q228     |Andorra            |
|Q916     |Angola             |
|Q25228   |Anguilla           |
|Q21590062|Antarctica         |
|Q781     |Antigua and Barbuda|
+---------+-------------------+



                                                                                

In [24]:
# value info in wikidata entity table (https://wikitech.wikimedia.org/wiki/Analytics/Data_Lake/Edits/Wikidata_entity)
# is a string as opposed to struct (because it has a variable schema)
# this UDF extracts the QID value (or null if doesn't exist)
def getWikidataValue(obj):
    try:
        d = json.loads(obj)
        return d.get('id')
    except Exception:
        return None
    
spark.udf.register('getWikidataValue', getWikidataValue, 'String')

<function __main__.getWikidataValue(obj)>

In [25]:
print("Wikidata snapshots:")
spark.sql('SHOW PARTITIONS wmf.wikidata_item_page_link').show(50, False)

print("\nMediawiki snapshots:")
spark.sql('SHOW PARTITIONS wmf_raw.mediawiki_project_namespace_map').show(50, False)

Wikidata snapshots:
+-------------------+
|partition          |
+-------------------+
|snapshot=2024-02-26|
|snapshot=2024-03-04|
|snapshot=2024-03-11|
|snapshot=2024-03-18|
|snapshot=2024-03-25|
|snapshot=2024-04-01|
|snapshot=2024-04-08|
|snapshot=2024-04-15|
+-------------------+


Mediawiki snapshots:
+------------------------+
|partition               |
+------------------------+
|snapshot=2016-12_private|
|snapshot=2017-07_private|
|snapshot=2023-10        |
|snapshot=2023-11        |
|snapshot=2023-12        |
|snapshot=2024-01        |
|snapshot=2024-02        |
|snapshot=2024-03        |
+------------------------+



In [26]:
mediawiki_snapshot = '2024-03'
wikidata_snapshot = '2024-04-01'
prop_list = tuple(COUNTRY_PROPERTIES)
print(prop_list)
tablename = 'isaacj.qid_to_country'

('P17', 'P19', 'P27', 'P131', 'P183', 'P361', 'P495', 'P1269', 'P1532', 'P3842')


In [33]:
create_table_query = f"""
    CREATE TABLE IF NOT EXISTS {tablename} (
        qid              STRING  COMMENT 'Wikidata ID of item with at least one Wikipedia sitelink -- e.g., Q42',
        property         STRING  COMMENT 'Wikidata property (e.g., P625 for coordinates) from which country was derived',
        country          STRING  COMMENT 'Region name'
    )
    PARTITIONED BY (
        snapshot         STRING  COMMENT 'Wikidata snapshot processed'
    )
    """

print(create_table_query)
spark.sql(create_table_query)


    CREATE TABLE IF NOT EXISTS isaacj.qid_to_country (
        qid              STRING  COMMENT 'Wikidata ID of item with at least one Wikipedia sitelink -- e.g., Q42',
        property         STRING  COMMENT 'Wikidata property (e.g., P625 for coordinates) from which country was derived',
        country          STRING  COMMENT 'Region name'
    )
    PARTITIONED BY (
        snapshot         STRING  COMMENT 'Wikidata snapshot processed'
    )
    


24/04/26 15:35:08 WARN ResolveSessionCatalog: A Hive serde table will be created as there is no table provider specified. You can set spark.sql.legacy.createHiveTableByDefault to false so that native data source table will be created instead.
24/04/26 15:35:08 WARN SessionState: METASTORE_FILTER_HOOK will be ignored, since hive.security.authorization.manager is set to instance of HiveAuthorizerFactory.


DataFrame[]

In [39]:
"""
Full query is below that grabs country data from a set of ~10 properties
AND grabs country data based on an item's coordinates

Explanation of CTEs:
* relevant_wikis: get list of Wikipedia wiki_dbs (e.g., enwiki) so as to limit the
    the Wikidata items considered to just those with Wikipedia sitelinks
* relevant_qids: get set of Wikidata item IDs that have at least one Wikipedia sitelink
* exploded_statements: explode Wikidata entity data to one Wikidata claim per row
* relevant_statements: get all Wikidata claims that might have a country value
* INSERT: property countries into single table
"""

query = f"""
WITH relevant_wikis AS (
    SELECT
      DISTINCT(database_code) AS wiki_db
    FROM canonical_data.wikis
    WHERE
      database_group = 'wikipedia'
      AND status = 'open'
      AND visibility = 'public'
      AND editability = 'public'
),
relevant_qids AS (
    SELECT
      DISTINCT(item_id) AS item_id
    FROM wmf.wikidata_item_page_link wd
    INNER JOIN relevant_wikis db
      ON (wd.wiki_db = db.wiki_db)
    WHERE
      snapshot = '{wikidata_snapshot}'
      AND page_namespace = 0
),
exploded_statements AS (
    SELECT
      id AS item_id,
      explode(claims) AS claim
    FROM wmf.wikidata_entity w
    INNER JOIN relevant_qids q
      ON (w.id = q.item_id)
    WHERE
      w.snapshot = '{wikidata_snapshot}'
),
relevant_statements AS (
    SELECT
      item_id,
      claim.mainSnak.property AS property,
      getWikidataValue(claim.mainSnak.dataValue.value) as value
    FROM exploded_statements
    WHERE
      claim.mainSnak.property IN {prop_list}
)
INSERT OVERWRITE TABLE {tablename}
PARTITION(snapshot='{wikidata_snapshot}')
SELECT
  item_id,
  property,
  q.region AS country
FROM relevant_statements r
INNER JOIN qid_to_region q
  ON (r.value = q.qid)
"""

print(query)
result = spark.sql(query)


WITH relevant_wikis AS (
    SELECT
      DISTINCT(database_code) AS wiki_db
    FROM canonical_data.wikis
    WHERE
      database_group = 'wikipedia'
      AND status = 'open'
      AND visibility = 'public'
      AND editability = 'public'
),
relevant_qids AS (
    SELECT
      DISTINCT(item_id) AS item_id
    FROM wmf.wikidata_item_page_link wd
    INNER JOIN relevant_wikis db
      ON (wd.wiki_db = db.wiki_db)
    WHERE
      snapshot = '2024-04-01'
      AND page_namespace = 0
),
exploded_statements AS (
    SELECT
      id AS item_id,
      explode(claims) AS claim
    FROM wmf.wikidata_entity w
    INNER JOIN relevant_qids q
      ON (w.id = q.item_id)
    WHERE
      w.snapshot = '2024-04-01'
),
relevant_statements AS (
    SELECT
      item_id,
      claim.mainSnak.property AS property,
      getWikidataValue(claim.mainSnak.dataValue.value) as value
    FROM exploded_statements
    WHERE
      claim.mainSnak.property IN ('P17', 'P19', 'P27', 'P131', 'P183', 'P361', 'P495', '

24/04/26 15:43:58 ERROR YarnScheduler: Lost executor 41 on an-worker1138.eqiad.wmnet: Container killed by YARN for exceeding physical memory limits. 8.9 GB of 8.8 GB physical memory used. Consider boosting spark.executor.memoryOverhead.
24/04/26 15:43:58 WARN TaskSetManager: Lost task 16.0 in stage 6.0 (TID 982) (an-worker1138.eqiad.wmnet executor 41): ExecutorLostFailure (executor 41 exited caused by one of the running tasks) Reason: Container killed by YARN for exceeding physical memory limits. 8.9 GB of 8.8 GB physical memory used. Consider boosting spark.executor.memoryOverhead.
24/04/26 15:43:58 WARN YarnSchedulerBackend$YarnSchedulerEndpoint: Requesting driver to remove executor 41 for reason Container killed by YARN for exceeding physical memory limits. 8.9 GB of 8.8 GB physical memory used. Consider boosting spark.executor.memoryOverhead.
24/04/26 15:43:59 WARN TaskSetManager: Lost task 7.0 in stage 6.0 (TID 973) (an-worker1138.eqiad.wmnet executor 41): ExecutorLostFailure (exe

In [40]:
# check output (this was before coordinate data was added)
spark.sql(f"""
WITH enwiki_qids AS (
    SELECT
      DISTINCT(item_id)
    FROM wmf.wikidata_item_page_link
    WHERE
      snapshot = "{wikidata_snapshot}"
      AND wiki_db = "enwiki"
)
SELECT
  country,
  COUNT(DISTINCT(qid)) AS num_qids,
  COUNT(1) AS num_rows
FROM {tablename} all
INNER JOIN enwiki_qids e
  ON (all.qid = e.item_id)
WHERE
  snapshot = "{wikidata_snapshot}"
GROUP BY
  country
ORDER BY
  num_qids DESC
""").show(500, False)




+---------------------------------------------+--------+--------+
|country                                      |num_qids|num_rows|
+---------------------------------------------+--------+--------+
|United States                                |1131578 |1169109 |
|United Kingdom                               |376172  |398002  |
|France                                       |164963  |172442  |
|India                                        |158797  |162277  |
|Canada                                       |152921  |156768  |
|Australia                                    |128825  |132925  |
|Germany                                      |125051  |133869  |
|Japan                                        |98163   |102044  |
|Poland                                       |85451   |87406   |
|Italy                                        |84769   |88790   |
|Russia                                       |72605   |78628   |
|Iran                                         |69848   |70733   |
|Spain    

24/04/26 15:45:53 WARN YarnSchedulerBackend$YarnSchedulerEndpoint: Requesting driver to remove executor 32 for reason Container killed by YARN for exceeding physical memory limits. 8.8 GB of 8.8 GB physical memory used. Consider boosting spark.executor.memoryOverhead.
24/04/26 15:45:53 ERROR YarnScheduler: Lost executor 32 on an-worker1136.eqiad.wmnet: Container killed by YARN for exceeding physical memory limits. 8.8 GB of 8.8 GB physical memory used. Consider boosting spark.executor.memoryOverhead.
----------------------------------------
Exception occurred during processing of request from ('127.0.0.1', 57590)
Traceback (most recent call last):
  File "/home/isaacj/.conda/envs/2024-01-17T14.42.46_isaacj/lib/python3.10/socketserver.py", line 316, in _handle_request_noblock
    self.process_request(request, client_address)
  File "/home/isaacj/.conda/envs/2024-01-17T14.42.46_isaacj/lib/python3.10/socketserver.py", line 347, in process_request
    self.finish_request(request, client_ad