In [1]:
# Confirm the kernel runs from the custom conda environment; shapely (installed into it
# in the next cell) is needed for the coordinate -> country look-up.
!which python

/home/isaacj/.conda/envs/2021-03-18T15.28.24_isaacj/bin/python


In [2]:
# --yes: a notebook shell command cannot answer conda's interactive "Proceed ([y]/n)?" prompt
!conda install --yes -c conda-forge shapely

Collecting package metadata (current_repodata.json): done
Solving environment: done

## Package Plan ##

  environment location: /home/isaacj/.conda/envs/2021-03-18T15.28.24_isaacj

  added / updated specs:
    - shapely


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    geos-3.9.1                 |       h9c3ff4c_2         1.1 MB  conda-forge
    libblas-3.9.0              |       8_openblas          11 KB  conda-forge
    libcblas-3.9.0             |       8_openblas          11 KB  conda-forge
    libgfortran-ng-9.3.0       |      hff62375_18          22 KB  conda-forge
    libgfortran5-9.3.0         |      hff62375_18         2.0 MB  conda-forge
    liblapack-3.9.0            |       8_openblas          11 KB  conda-forge
    libopenblas-0.3.12         |pthreads_h4812303_1         8.9 MB  conda-forge
    numpy-1.20.2               |   py37h038b26d_0         5.8 MB  conda-forge
    ope

In [2]:
import csv
import json
import os
import sys

import pandas as pd
from shapely.geometry import shape, Point

import wmfdata

In [3]:
# Start a large YARN-backed Spark session and ship the local conda environment to the executors.
# NOTE(review): the session log printed below reports that an already-packed conda archive is
# reused. If shapely was installed after that archive was created, the executors may not have
# it -- the log suggests setting force=True in conda_pack_kwargs to repack (confirm how this
# wmfdata version accepts that option).
spark = wmfdata.spark.get_session(app_name='pyspark large; regions; isaacj',
                                  type='yarn-large', # local, yarn-regular, yarn-large
                                  ship_python_env=True,  # this will include shapely library
                                  )  

A conda environment is already packed at conda-2021-03-18T15.28.24_isaacj.tgz. If you have recently installed new packages into your conda env, set force=True in conda_pack_kwargs and it will be repacked for you.
Will ship conda-2021-03-18T15.28.24_isaacj.tgz to remote Spark executors.
PySpark executors will use conda-2021-03-18T15.28.24_isaacj/bin/python3.


## Load supporting data files from GitHub

In [4]:
# Local scratch directory that the resource files are downloaded into.
data_dir = './data'

# Every resource is a (local filename, download URL) pair; all of them live in the same
# folder of the geohci/wiki-region-groundtruth repository.
_resources_url = 'https://github.com/geohci/wiki-region-groundtruth/raw/main/resources/'
region_geoms, properties_tsv, aggregation_tsv, region_tsv = [
    (_filename, _resources_url + _filename)
    for _filename in ('ne_10m_admin_0_map_units.geojson',
                      'country_properties.tsv',
                      'country_aggregation.tsv',
                      'base_regions_qids.tsv')
]

In [5]:
# start from a clean directory; -f keeps rm quiet when the directory does not exist yet
!rm -Rf {data_dir}
!mkdir -p {data_dir}
# -P downloads straight into data_dir, so no `cd` is needed (after a failed `cd` the files
# would otherwise end up in the notebook's working directory)
!wget -q -P {data_dir} {region_geoms[1]} {properties_tsv[1]} {aggregation_tsv[1]} {region_tsv[1]}
!ls -lht {data_dir}

rm: cannot remove './data': No such file or directory
total 25M
-rw-r--r-- 1 isaacj wikidev 4.4K Aug 30 21:30 base_regions_qids.tsv
-rw-r--r-- 1 isaacj wikidev 2.7K Aug 30 21:30 country_aggregation.tsv
-rw-r--r-- 1 isaacj wikidev  281 Aug 30 21:29 country_properties.tsv
-rw-r--r-- 1 isaacj wikidev  25M Aug 30 21:29 ne_10m_admin_0_map_units.geojson


In [6]:
def get_region_properties(properties_tsv):
    """List of properties used for directly linking Wikidata items to regions.

    e.g., P19: place of birth
    These are compiled based on knowledge of Wikidata and Marc Miquel's excellent work:
    https://github.com/marcmiquel/WDO/blob/e482a2df2b41d389945f3b82179b8b7ca338b8d5/src_data/wikipedia_diversity.py

    Args:
        properties_tsv: path to a tab-separated file whose header row is
            ['Property', 'Label'].

    Returns:
        List of (property, label) tuples in file order.

    Raises:
        AssertionError: if the header row is not the expected one.
    """
    expected_header = ['Property', 'Label']
    region_properties = []
    # Explicit encoding so parsing does not depend on the machine's locale;
    # newline='' is what the csv module asks for when it is handed a file object.
    with open(properties_tsv, 'r', encoding='utf-8', newline='') as fin:
        tsvreader = csv.reader(fin, delimiter='\t')
        assert next(tsvreader) == expected_header
        for line in tsvreader:
            if not line:
                # csv yields [] for blank lines (e.g., an extra newline at the end of the file)
                continue
            prop_id, label = line[0], line[1]
            region_properties.append((prop_id, label))
    return region_properties

def get_aggregation_logic(aggregates_tsv):
    """Mapping of QIDs -> regions not directly associated with them.

    e.g., Sahrawi Arab Democratic Republic (Q40362) -> Western Sahara (Q6250)

    Args:
        aggregates_tsv: path to a tab-separated file whose header row is
            ['Aggregation', 'From', 'QID To', 'QID From'].

    Returns:
        Dict mapping each non-empty 'QID From' value to its 'QID To' value.
        Rows with fewer columns than the header are reported and ignored.

    Raises:
        AssertionError: if the header row is not the expected one.
    """
    expected_header = ['Aggregation', 'From', 'QID To', 'QID From']
    aggregation = {}
    # Explicit encoding so parsing does not depend on the machine's locale.
    with open(aggregates_tsv, 'r', encoding='utf-8', newline='') as fin:
        tsvreader = csv.reader(fin, delimiter='\t')
        assert next(tsvreader) == expected_header
        for line in tsvreader:
            if len(line) < len(expected_header):
                # Malformed row: report it and move on. Falling through here would
                # re-use the QIDs left over from the previous row.
                print("Skipped:", line)
                continue
            qid_to, qid_from = line[2], line[3]
            if qid_from:
                aggregation[qid_from] = qid_to
    return aggregation

def get_region_data(region_qids_tsv, region_geoms_geojson, aggregation_tsv):
    """Load the QID -> region-name mapping and the geometries of those regions.

    Args:
        region_qids_tsv: path to a tab-separated file with header ['Region', 'QID'].
        region_geoms_geojson: path to a GeoJSON file whose 'features' carry
            'WIKIDATAID' and 'NAME' properties plus a 'geometry'.
        aggregation_tsv: path handed to get_aggregation_logic; its QIDs are added
            to the mapping under the region name of the QID they aggregate to.

    Returns:
        Tuple (region_shapes, qid_to_region):
          * region_shapes: dict of QID -> shapely geometry, restricted to QIDs
            present in qid_to_region.
          * qid_to_region: dict of QID -> region name, including aggregated QIDs.

    Progress and mismatches between the two sources are printed along the way.
    """
    # load in canonical mapping of QID -> region name for labeling
    qid_to_region = {}
    # explicit encoding: region names contain non-ASCII characters
    with open(region_qids_tsv, 'r', encoding='utf-8', newline='') as fin:
        tsvreader = csv.reader(fin, delimiter='\t')
        assert next(tsvreader) == ['Region', 'QID']
        for line in tsvreader:
            if not line:
                # csv yields [] for blank lines
                continue
            region = line[0]
            qid = line[1]
            qid_to_region[qid] = region
    # .get(): the example QID is only illustrative and must not abort loading if absent
    print("\nLoaded {0} QID-region pairs for matching against Wikidata -- e.g., Q31: {1}".format(
        len(qid_to_region), qid_to_region.get('Q31')))

    # load in additional QIDs that should be mapped to a more canonical region name
    aggregation = get_aggregation_logic(aggregation_tsv)
    for qid_from, qid_to in aggregation.items():
        if qid_to in qid_to_region:
            qid_to_region[qid_from] = qid_to_region[qid_to]
        else:
            print("-- Skipping aggregation for {0} to {1}".format(qid_from, qid_to))
    print("Now {0} QID-region pairs after adding aggregations -- e.g., Q40362: {1}".format(
        len(qid_to_region), qid_to_region.get('Q40362')))

    # load in geometries for the regions identified via Wikidata
    with open(region_geoms_geojson, 'r', encoding='utf-8') as fin:
        regions = json.load(fin)['features']
    region_shapes = {}
    skipped = []
    for feature in regions:
        qid = feature['properties']['WIKIDATAID']
        if qid in qid_to_region:
            region_shapes[qid] = shape(feature['geometry'])
        else:
            skipped.append('{0} ({1})'.format(feature['properties']['NAME'], qid))
    print("\nLoaded {0} region geometries. Skipped {1}: {2}".format(
        len(region_shapes), len(skipped), skipped))

    # check alignment between QID list and region geometries
    # Count how many QIDs share each region name once, instead of rescanning the whole
    # mapping for every QID that lacks a geometry.
    qids_per_region = {}
    for region in qid_to_region.values():
        qids_per_region[region] = qids_per_region.get(region, 0) + 1
    in_common = 0
    for qid, region in qid_to_region.items():
        if qid in region_shapes:
            in_common += 1
        elif qids_per_region[region] == 1:
            # no other QID stands in for this region name, so it is reachable via
            # property values only
            print('Prop-only: {0} ({1})'.format(region, qid))
    print("{0} QIDs in common between prop-values and geometries.".format(in_common))
    return region_shapes, qid_to_region

In [7]:
# Load the downloaded resources.
# A few regions that do have coordinates are skipped on purpose -- see
# https://github.com/geohci/wiki-region-groundtruth/blob/main/resources/REGIONS.md
# A few other regions exist only as Wikidata property values and have no geometry:
#  * Abkhazia / South Ossetia just aren't in the geographic data
#  * the Kingdoms are agglomerations of regions, but without them Wikidata items that
#    e.g., link to UK but not England specifically would be missed
properties_path = os.path.join(data_dir, properties_tsv[0])
region_qids_path = os.path.join(data_dir, region_tsv[0])
region_geoms_path = os.path.join(data_dir, region_geoms[0])
aggregation_path = os.path.join(data_dir, aggregation_tsv[0])

region_properties = get_region_properties(properties_path)
region_shapes, qid_to_region = get_region_data(region_qids_path, region_geoms_path, aggregation_path)



Loaded 253 QID-region pairs for matching against Wikidata -- e.g., Q31: Belgium
Now 304 QID-region pairs after adding aggregations -- e.g., Q40362: Western Sahara

Loaded 279 region geometries. Skipped 16: ['Dhekelia (Q9206745)', 'UNDOF Zone (Q1428532)', 'Korean DMZ (south) (Q331990)', 'Korean DMZ (north) (Q331990)', 'Iraqi Kurdistan (Q205047)', 'USNB Guantanamo Bay (Q762570)', 'N. Cyprus (Q23681)', 'Cyprus U.N. Buffer Zone (Q116970)', 'Siachen Glacier (Q333946)', 'Akrotiri (Q9143535)', 'Paracel Is. (Q274388)', 'Spratly Is. (Q215664)', 'Clipperton I. (Q161258)', 'Bajo Nuevo Bank (Q1257783)', 'Serranilla Bank (Q1169008)', 'Scarborough Reef (Q628716)']
Prop-only: United Kingdom (Q145)
279 QIDs in common between prop-values and geometries.


In [8]:
# Expose the QID -> region-name mapping to Spark SQL as a temporary view so that the
# final table can carry readable region names, then preview a few rows.
qid_region_pairs = pd.DataFrame(qid_to_region.items(), columns=['qid', 'region'])
spark.createDataFrame(qid_region_pairs).createOrReplaceTempView('qid_to_region')
spark.sql("SELECT * FROM qid_to_region LIMIT 10").show(50, False)

+------+-------------------+
|qid   |region             |
+------+-------------------+
|Q889  |Afghanistan        |
|Q5689 |Åland Islands      |
|Q222  |Albania            |
|Q262  |Algeria            |
|Q16641|American Samoa     |
|Q228  |Andorra            |
|Q916  |Angola             |
|Q25228|Anguilla           |
|Q51   |Antarctica         |
|Q781  |Antigua and Barbuda|
+------+-------------------+



In [9]:
def pointInCountry(lon, lat):
    """Look up the region whose geometry contains a lon/lat coordinate.

    Relies on shapely and on the region_shapes dictionary (QID -> shapely geometry).
    Returns the QID of the first region that contains the point, the string "N/A"
    when no region contains it, and None if anything goes wrong along the way.
    """
    try:
        location = Point(lon, lat)
        containing_qids = (region_qid
                           for region_qid, geometry in region_shapes.items()
                           if geometry.contains(location))
        return next(containing_qids, "N/A")
    except Exception:
        return None
    
# make the function callable from Spark SQL as pointInCountry(lon, lat), declared to return a String
spark.udf.register('pointInCountry', pointInCountry, 'String')

<function __main__.pointInCountry(lon, lat)>

In [10]:
# value info in wikidata entity table (https://wikitech.wikimedia.org/wiki/Analytics/Data_Lake/Edits/Wikidata_entity)
# is a string as opposed to struct (because it has a variable schema)
# this UDF extracts the QID value (or null if doesn't exist)
def getWikidataValue(obj):
    """Return the 'id' field of a JSON-encoded claim value.

    Yields None when the field is missing or when the input cannot be decoded
    into something that supports `.get` (invalid JSON, null input, non-object JSON).
    """
    try:
        return json.loads(obj).get('id')
    except Exception:
        return None
    
# make the function callable from Spark SQL as getWikidataValue(value), declared to return a String
spark.udf.register('getWikidataValue', getWikidataValue, 'String')

# specific functions for getting lat and lon out of the P625 property in the Wikidata entity
def getLat(obj):
    """Extract the latitude from a JSON-encoded Wikidata coordinate (P625) value.

    Args:
        obj: JSON string expected to decode to an object with a 'latitude' key.

    Returns:
        The latitude as a Python float, or None when the input cannot be decoded,
        has no latitude, or the latitude is not numeric.
    """
    try:
        # Always hand back a real float: whole-degree coordinates decode to int, and
        # PySpark does not coerce an int returned from a Float-typed UDF (it becomes NULL).
        return float(json.loads(obj)['latitude'])
    except Exception:
        return None
    
def getLon(obj):
    """Extract the longitude from a JSON-encoded Wikidata coordinate (P625) value.

    Args:
        obj: JSON string expected to decode to an object with a 'longitude' key.

    Returns:
        The longitude as a Python float, or None when the input cannot be decoded,
        has no longitude, or the longitude is not numeric.
    """
    try:
        # Always hand back a real float: whole-degree coordinates decode to int, and
        # PySpark does not coerce an int returned from a Float-typed UDF (it becomes NULL).
        return float(json.loads(obj)['longitude'])
    except Exception:
        return None
    
# make both extractors callable from Spark SQL, each declared to return a Float
spark.udf.register('getLat', getLat, 'Float')
spark.udf.register('getLon', getLon, 'Float')

<function __main__.getLon(obj)>

In [13]:
# List the available snapshots (partitions) of the two source tables so that the
# snapshot constants used further down can be picked from what actually exists.
for heading, source_table in (("Wikidata snapshots:", 'wmf.wikidata_item_page_link'),
                              ("\nMediawiki snapshots:", 'wmf_raw.mediawiki_project_namespace_map')):
    print(heading)
    spark.sql('SHOW PARTITIONS ' + source_table).show(50, False)

Wikidata snapshots:
+-------------------+
|partition          |
+-------------------+
|snapshot=2021-07-12|
|snapshot=2021-07-19|
|snapshot=2021-07-26|
|snapshot=2021-08-02|
|snapshot=2021-08-09|
|snapshot=2021-08-16|
+-------------------+


Mediawiki snapshots:
+------------------------+
|partition               |
+------------------------+
|snapshot=2016-12_private|
|snapshot=2017-07_private|
|snapshot=2021-02        |
|snapshot=2021-03        |
|snapshot=2021-04        |
|snapshot=2021-05        |
|snapshot=2021-06        |
|snapshot=2021-07        |
+------------------------+



In [17]:
# Snapshots to read from (chosen from the partition listings) and the properties to scan.
mediawiki_snapshot = '2021-07'
wikidata_snapshot = '2021-08-02'
# destination table is named after the Wikidata snapshot, with dashes turned into underscores
tablename = 'isaacj.qid_to_country_' + wikidata_snapshot.replace('-', '_')
# only the property IDs are needed for the query; the labels are dropped
prop_list = tuple(prop_id for prop_id, _label in region_properties)
print(prop_list)

('P19', 'P17', 'P27', 'P495', 'P131', 'P1532', 'P3842', 'P361', 'P1269', 'P183')


In [15]:
# Set to False to skip creating the table.
do_execute = True

# DDL template; {0} is filled in with the fully-qualified destination table name.
create_table_template = """
    CREATE TABLE IF NOT EXISTS {0} (
        qid              STRING  COMMENT 'Wikidata ID of item with at least one Wikipedia sitelink -- e.g., Q42',
        property         STRING  COMMENT 'Wikidata property (e.g., P625 for coordinates) from which country was derived',
        country          STRING  COMMENT 'Region name'
    )
    """
create_table_query = create_table_template.format(tablename)

if do_execute:
    print(create_table_query)
    spark.sql(create_table_query)


    CREATE TABLE IF NOT EXISTS isaacj.qid_to_country_2021_08_02 (
        qid              STRING  COMMENT 'Wikidata ID of item with at least one Wikipedia sitelink -- e.g., Q42',
        property         STRING  COMMENT 'Wikidata property (e.g., P625 for coordinates) from which country was derived',
        country          STRING  COMMENT 'Region name'
    )
    


In [18]:
"""
Full query is below that grabs country data from a set of ~10 properties
AND grabs country data based on an item's coordinates

Explanation of CTEs:
* relevant_wikis: get list of Wikipedia wiki_dbs (e.g., enwiki) so as to limit
    the Wikidata items considered to just those with Wikipedia sitelinks
* relevant_qids: get set of Wikidata item IDs that have at least one Wikipedia sitelink
* exploded_statements: explode Wikidata entity data to one Wikidata claim per row
* lat_lon_coords: extract lat/lon values from claims to be geolocated
* geolocated: pass lat/lon values to UDF to identify which country they are in.
* coordinate_countries: map country QIDs for geolocation to country names
* relevant_statements: get all Wikidata claims that might have a country value
* property_countries: extract any country values from these claims
* INSERT: union together coordinate countries and property countries into single table
"""
# print_for_hive: print the query collapsed onto a single line instead of as written
print_for_hive = False
# do_execute: run the query through Spark after printing it
do_execute = True

# Render the property IDs as a SQL list explicitly. Interpolating the Python tuple directly
# only works by accident of its repr: a single-element tuple renders as ('P19',), which is
# not valid SQL.
prop_list_sql = ', '.join(f"'{prop_id}'" for prop_id in prop_list)

# Builds the region table in one statement: country values come both from an item's
# coordinates (P625, geolocated via the pointInCountry UDF) and from the region properties.
query = f"""
WITH relevant_wikis AS (
    SELECT
      DISTINCT(dbname) AS wiki_db
    FROM wmf_raw.mediawiki_project_namespace_map
    WHERE
      snapshot = '{mediawiki_snapshot}'
      AND hostname LIKE '%.wikipedia.org'
),
relevant_qids AS (
    SELECT
      DISTINCT(item_id) AS item_id
    FROM wmf.wikidata_item_page_link wd
    INNER JOIN relevant_wikis db
      ON (wd.wiki_db = db.wiki_db)
    WHERE
      snapshot = '{wikidata_snapshot}'
      AND page_namespace = 0
),
exploded_statements AS (
    SELECT
      id AS item_id,
      explode(claims) AS claim
    FROM wmf.wikidata_entity w
    INNER JOIN relevant_qids q
      ON (w.id = q.item_id)
    WHERE
      w.snapshot = '{wikidata_snapshot}'
),
lat_lon_coords AS (
    SELECT
      item_id,
      getLat(claim.mainSnak.dataValue.value) as lat,
      getLon(claim.mainSnak.dataValue.value) as lon
    FROM exploded_statements
    WHERE
      claim.mainSnak.property = 'P625'
),
geolocated AS (
    SELECT
      item_id,
      pointInCountry(lon, lat) AS country_qid
    FROM lat_lon_coords
    WHERE
      lat IS NOT NULL
      AND lon IS NOT NULL
),
coordinate_countries AS (
    SELECT
      item_id AS item_id,
      q.region AS country
    FROM geolocated g
    INNER JOIN qid_to_region q
      ON (g.country_qid = q.qid)
),
relevant_statements AS (
    SELECT
      item_id,
      claim.mainSnak.property AS property,
      getWikidataValue(claim.mainSnak.dataValue.value) as value
    FROM exploded_statements
    WHERE
      claim.mainSnak.property IN ({prop_list_sql})
),
property_countries AS (
    SELECT
      item_id,
      property,
      q.region AS country
    FROM relevant_statements r
    INNER JOIN qid_to_region q
      ON (r.value = q.qid)
)
INSERT OVERWRITE TABLE {tablename}
SELECT
  item_id,
  'P625',
  country
FROM coordinate_countries
UNION ALL
SELECT
  item_id,
  property,
  country
FROM property_countries
"""

if print_for_hive:
    # Collapse the query onto one line for pasting into a Hive shell: split on any run of
    # whitespace and re-join with single spaces. (This notebook never imports `re`, so the
    # regex-based version raised NameError as soon as print_for_hive was switched on.)
    print(' '.join(query.split()))
else:
    print(query)

if do_execute:
    result = spark.sql(query)


WITH relevant_wikis AS (
    SELECT
      DISTINCT(dbname) AS wiki_db
    FROM wmf_raw.mediawiki_project_namespace_map
    WHERE
      snapshot = '2021-07'
      AND hostname LIKE '%.wikipedia.org'
),
relevant_qids AS (
    SELECT
      DISTINCT(item_id) AS item_id
    FROM wmf.wikidata_item_page_link wd
    INNER JOIN relevant_wikis db
      ON (wd.wiki_db = db.wiki_db)
    WHERE
      snapshot = '2021-08-02'
      AND page_namespace = 0
),
exploded_statements AS (
    SELECT
      id AS item_id,
      explode(claims) AS claim
    FROM wmf.wikidata_entity w
    INNER JOIN relevant_qids q
      ON (w.id = q.item_id)
    WHERE
      w.snapshot = '2021-08-02'
),
lat_lon_coords AS (
    SELECT
      item_id,
      getLat(claim.mainSnak.dataValue.value) as lat,
      getLon(claim.mainSnak.dataValue.value) as lon
    FROM exploded_statements
    WHERE
      claim.mainSnak.property = 'P625'
),
geolocated AS (
    SELECT
      item_id,
      pointInCountry(lon, lat) AS country_qid
    FROM l

## Distribution of countries on English Wikipedia

In [5]:
# Number of distinct English Wikipedia articles (namespace 0) per region. The sitelinks are
# read from the 2021-10-04 snapshot while the region table was built from the 2021-08-02 one.
# NOTE(review): presumably the later snapshot is used because the older one was no longer
# available -- confirm the mismatch is intended.
enwiki_region_counts_query = """
WITH relevant_qids AS (
    SELECT
      DISTINCT(item_id) AS item_id
    FROM wmf.wikidata_item_page_link
    WHERE
      snapshot = '2021-10-04'
      AND page_namespace = 0
      AND wiki_db = 'enwiki'
)
SELECT
  country,
  COUNT(DISTINCT(c.qid)) AS num_articles
FROM isaacj.qid_to_country_2021_08_02 c
INNER JOIN relevant_qids e
  ON (c.qid = e.item_id)
GROUP BY
  country
ORDER BY
  num_articles DESC
"""
spark.sql(enwiki_region_counts_query).show(500, False)

+--------------------------------------------+------------+
|country                                     |num_articles|
+--------------------------------------------+------------+
|United States of America                    |1056601     |
|United Kingdom                              |335816      |
|France                                      |152942      |
|India                                       |147087      |
|Canada                                      |142981      |
|Germany                                     |121420      |
|Australia                                   |120993      |
|England                                     |87638       |
|Japan                                       |87325       |
|Poland                                      |83114       |
|Italy                                       |77674       |
|Iran                                        |74215       |
|Russia                                      |66430       |
|Spain                                  