### Person Occupation Taxonomy
Map a person's occupations in Wikidata to a standard taxonomy of occupations -- e.g., science fiction writer -> writer. This notebook has two steps: pre-caching a mapping of every occupation value on Wikidata to the taxonomy and then applying that to a Wikidata dump.

In [56]:
import re
import time

import mwapi
import pandas as pd
from SPARQLWrapper import SPARQLWrapper, JSON
import wmfdata

In [2]:
!which python

/home/isaacj/.conda/envs/2021-03-18T15.28.24_isaacj/bin/python


In [3]:
spark = wmfdata.spark.get_session(app_name='pyspark reg; occupations',
                                  type='yarn-regular')  # local, yarn-regular, yarn-large

PySpark executors will use /usr/lib/anaconda-wmf/bin/python3.


In [4]:
print("Wikidata:")
spark.sql('show partitions wmf.wikidata_item_page_link').show(50, False)

print("\nMediawiki:")
spark.sql("show partitions wmf_raw.mediawiki_project_namespace_map").show(50, False)

Wikidata:
+-------------------+
|partition          |
+-------------------+
|snapshot=2021-12-13|
|snapshot=2021-12-20|
|snapshot=2021-12-27|
|snapshot=2022-01-03|
|snapshot=2022-01-10|
|snapshot=2022-01-17|
+-------------------+



In [74]:
wd_snapshot = '2022-01-03'
mw_snapshot = '2021-12'
taxonomy_tablename = 'isaacj.occupation_taxonomy'
data_tablename = 'isaacj.occupation_wikidata'

# Occupations which are overly generic
# Worker, Person, Individual, Researcher, Academic, Official, White-collar worker, Creator, Position, Profession, Group of humans, Organization, Author
PERSON_STOPPOINTS = ['Q327055', 'Q215627', 'Q795052', 'Q1650915', 'Q3400985', 'Q599151', 'Q255274', 'Q2500638', 'Q4164871', 'Q28640', 'Q16334295', 'Q43229']

# the high-level occupation categories we want to map every occupation to
PERSON_CATEGORIES = {
    'Q15253558':'activist',
    'Q19261760':'agricultural worker',
    'Q483501':'artist',
    'Q2066131':'athlete',
    'Q864503':'biologist',
    'Q43845':'businessperson',
    'Q593644':'chemist',
    'Q212238':'civil servant',
    'Q3315492':'clergyperson',
    'Q82594':'computer scientist',
    'Q5322166':'designer',
    'Q11424604':'earth scientist',
    'Q974144':'educator',
    'Q81096':'engineer',
    'Q11974939':'health professional',
    'Q16727193':'humanities scholar',
    'Q1662485':'information professional',
    'Q1930187':'journalist',
    'Q185351':'jurist',
    'Q14467526':'linguist',
    'Q170790':'mathematician',
    'Q15980804':'media professional',
    'Q47064':'military personnel',
    'Q639669':'musician',
    'Q713200':'performing artist',
    'Q169470':'physicist',
    'Q82955':'politician',
    'Q901':'scientist',
    'Q15319501':'social scientist',
    'Q50995749':'sportsperson (non-athlete)',
    'Q56148021':'transportation occupation',
    'Q36180':'writer'
}

## Get all potential occupation values

In [9]:
def querySparql(query):
    sparql = SPARQLWrapper("https://query.wikidata.org/sparql")
    sparql.setQuery(query)
    sparql.setReturnFormat(JSON)
    results = sparql.queryAndConvert()
    return results['results']['bindings']

In [16]:
all_occupation_values_query = """
SELECT DISTINCT ?value
WHERE
{
  ?item wdt:P106 ?value
}
"""

occ_values = [o['value']['value'].split('/')[-1] for o in querySparql(all_occupation_values_query)]
print(f'{len(occ_values)} distinct occupation values.')

17162 distinct occupation values.


In [50]:
class OccupationTaxonomy():
    
    def __init__(self, base_occupations, stoppoints, endpoints, maxiter=8):
        self.occupation_categories = endpoints
        self.person_taxonomy = {q:set([q]) for q in self.occupation_categories}  # eventual mapping of Occupation QID -> Stop-point Label
        for sp in stoppoints:  # too generic -- doesn't map to anything
            self.person_taxonomy[sp] = set()
        self.session = mwapi.Session('https://www.wikidata.org', user_agent='Isaac (WMF); occupation taxonomy')
        self.maxiter = maxiter
        self.build_person_taxonomy(base_occupations)
        
    def build_person_taxonomy(self, base_occupations):
        for i, occ in enumerate(base_occupations, start=1):
            try:
                self.leaf_to_roots(occ, 0)
            except Exception:
                print(f"Failed for {occ}")
                continue
            if i % 100 == 0:
                print(f'{i} occupations processed -- taxonomy now has {len(self.person_taxonomy)} entries.')

    def leaf_to_roots(self, qid, iter_num=0):
        # Uses subclass-of (P279) as that seems optimal for occupation. Potentially could be tweaked to include other properties.
        if qid in self.person_taxonomy:  # already found
            return
        elif iter_num == self.maxiter:  # hit max-depth, bummer but stop search
            return
        elif qid not in self.person_taxonomy:  # make sure qid has entry in taxonomy
            self.person_taxonomy[qid] = set()
        
        # no quick solution -- get super-classes for QID and recurse from there
        scs = self.get_superclasses(qid)
        for sc in scs:
            # not found: process superclass
            if sc not in self.person_taxonomy:
                self.leaf_to_roots(sc, iter_num+1)            
            # sc in taxonomy already or was just updated: update taxonomy
            if sc in self.person_taxonomy:
                self.person_taxonomy[qid].update(self.person_taxonomy[sc])                
                

    def get_superclasses(self, qid):
        # e.g., input qid for chemical engineer -> ['engineer', 'chemist', 'engineering professionals'] (though represented as QIDs)
        try:
            result = self.session.get(
                action="wbgetclaims",
                entity=qid,
                property='P279',
                format='json',
                formatversion=2
            )
            scs = []
            for sc in result['claims'].get('P279', []):
                if sc.get('mainsnak', {}).get('datatype') == 'wikibase-item':
                    try:
                        scs.append(sc['mainsnak']['datavalue']['value']['id'])
                    except KeyError:
                        continue
        except Exception:
            scs = []
        time.sleep(0.25)
        return scs
    
    def print_taxonomy(self):
        occurences = {}
        for q in self.person_taxonomy:
            print(f'{q}:\t{[self.occupation_categories[o] for o in self.person_taxonomy[q]]}')
            for o in self.person_taxonomy[q]:
                occurences[o] = occurences.get(o, 0) + 1
        print("\nStats of occurrences:")
        sorted_occs = sorted(occurences, key=occurences.get, reverse=True)
        for o in sorted_occs:
            print(f'{self.occupation_categories.get(o, o)}:\t{occurences[o]} occurences.')

In [None]:
ot = OccupationTaxonomy(occ_values, PERSON_STOPPOINTS, PERSON_CATEGORIES)

In [58]:
tax_df = []
for o,t in ot.person_taxonomy.items():
    if len(t) > 0:
        for q in t:
            tax_df.append((o, ot.occupation_categories[q]))
tax_df = pd.DataFrame(tax_df, columns=['qid', 'root_occupation'])
tax_df

Unnamed: 0,qid,root_occupation
0,Q15253558,activist
1,Q19261760,agricultural worker
2,Q483501,artist
3,Q2066131,athlete
4,Q864503,biologist
...,...,...
7198,Q110776779,humanities scholar
7199,Q110779725,educator
7200,Q110779725,businessperson
7201,Q110779750,health professional


In [61]:
spark.createDataFrame(tax_df).createOrReplaceTempView('person_taxonomy')
spark.sql('''
SELECT
  root_occupation,
  count(distinct(qid)) as num_leaves
FROM person_taxonomy
GROUP BY
  root_occupation
ORDER BY
  num_leaves DESC''').show(50, False)


In [68]:
do_execute = True
create_table_query = f"""
    CREATE TABLE IF NOT EXISTS {taxonomy_tablename} (
        qid              STRING  COMMENT 'Potential value for occupation',
        root_occupation  STRING  COMMENT 'High-level occupation that occupation value maps up to'
    )
    """

if do_execute:
    spark.sql(create_table_query)

In [70]:
spark.sql(f'''
INSERT OVERWRITE TABLE {taxonomy_tablename}
SELECT
  qid,
  root_occupation
FROM person_taxonomy
''')

spark.sql(f'select root_occupation, count(distinct(qid)) as num_leaves from {taxonomy_tablename} group by root_occupation order by num_leaves DESC').show(50, False)


+--------------------------+----------+
|root_occupation           |num_leaves|
+--------------------------+----------+
|politician                |614       |
|artist                    |512       |
|civil servant             |488       |
|businessperson            |432       |
|musician                  |399       |
|health professional       |377       |
|athlete                   |353       |
|humanities scholar        |331       |
|military personnel        |331       |
|clergyperson              |266       |
|scientist                 |265       |
|educator                  |258       |
|biologist                 |243       |
|jurist                    |215       |
|performing artist         |211       |
|writer                    |208       |
|sportsperson (non-athlete)|205       |
|engineer                  |205       |
|media professional        |177       |
|activist                  |160       |
|social scientist          |159       |
|designer                  |131       |


## Build Dataset

In [72]:
do_execute = True
create_table_query = f"""
    CREATE TABLE IF NOT EXISTS {data_tablename} (
        item_id          STRING  COMMENT 'Wikidata ID',
        root_occupation  STRING  COMMENT 'High-level occupation that occupation value maps up to'
    )
    """

if do_execute:
    spark.sql(create_table_query)

In [73]:
# value info in wikidata entity table (https://wikitech.wikimedia.org/wiki/Analytics/Data_Lake/Edits/Wikidata_entity)
# is a string as opposed to struct (because it has a variable schema)
# this UDF extracts the QID value (or null if doesn't exist)
def getValue(obj):
    try:
        d =  eval(obj)
        return d.get('id')
    except Exception:
        return None
    
spark.udf.register('getValue', getValue, 'string')

<function __main__.getValue(obj)>

In [79]:
print_for_hive = False
do_execute = True

query = f"""
WITH wikipedia_projects AS (
    SELECT DISTINCT
      dbname
    FROM wmf_raw.mediawiki_project_namespace_map
    WHERE
      snapshot = '{mw_snapshot}'
      AND hostname LIKE '%wikipedia%'
),
relevant_qids AS (
    SELECT DISTINCT
      item_id
    FROM wmf.wikidata_item_page_link wd
    INNER JOIN wikipedia_projects wp
      ON (wd.wiki_db = wp.dbname)
    WHERE
      snapshot = '{wd_snapshot}'
      AND page_namespace = 0
),
exploded_statements AS (
    SELECT
      id as item_id,
      explode(claims) as claim
    FROM wmf.wikidata_entity w
    INNER JOIN relevant_qids q
      ON (w.id = q.item_id)
    WHERE
      snapshot = '{wd_snapshot}'
),
relevant_statements AS (
    SELECT
      item_id,
      getValue(claim.mainSnak.dataValue.value) as value
    FROM exploded_statements
    WHERE
      claim.mainSnak.property = 'P106'
)
INSERT OVERWRITE TABLE {data_tablename}
SELECT
  item_id,
  root_occupation
FROM relevant_statements s
INNER JOIN {taxonomy_tablename} t
  ON (s.value = t.qid)
"""

if print_for_hive:
    print(re.sub(' +', ' ', re.sub('\n', ' ', query)).strip())
else:
    print(query)

if do_execute:
    result = spark.sql(query)


WITH wikipedia_projects AS (
    SELECT DISTINCT
      dbname
    FROM wmf_raw.mediawiki_project_namespace_map
    WHERE
      snapshot = '2021-12'
      AND hostname LIKE '%wikipedia%'
),
relevant_qids AS (
    SELECT DISTINCT
      item_id
    FROM wmf.wikidata_item_page_link wd
    INNER JOIN wikipedia_projects wp
      ON (wd.wiki_db = wp.dbname)
    WHERE
      snapshot = '2022-01-03'
      AND page_namespace = 0
),
exploded_statements AS (
    SELECT
      id as item_id,
      explode(claims) as claim
    FROM wmf.wikidata_entity w
    INNER JOIN relevant_qids q
      ON (w.id = q.item_id)
    WHERE
      snapshot = '2022-01-03'
),
relevant_statements AS (
    SELECT
      item_id,
      getValue(claim.mainSnak.dataValue.value) as value
    FROM exploded_statements
    WHERE
      claim.mainSnak.property = 'P106'
)
INSERT OVERWRITE TABLE isaacj.occupation_wikidata
SELECT
  item_id,
  root_occupation
FROM relevant_statements s
INNER JOIN isaacj.occupation_taxonomy t
  ON (s.value

In [80]:
spark.sql(f'SELECT root_occupation, COUNT(1) AS num_rows, COUNT(DISTINCT(qid)) AS num_items FROM {data_tablename} GROUP BY root_occupation ORDER BY num_items DESC').show(100, False)

+--------------------------+--------+---------+
|root_occupation           |num_rows|num_items|
+--------------------------+--------+---------+
|athlete                   |890148  |851553   |
|politician                |571663  |563141   |
|writer                    |490914  |373651   |
|artist                    |459051  |352631   |
|performing artist         |467580  |321120   |
|musician                  |422295  |270888   |
|businessperson            |282655  |267607   |
|educator                  |178171  |170680   |
|humanities scholar        |189215  |164500   |
|military personnel        |155565  |149702   |
|media professional        |181579  |147408   |
|journalist                |139273  |132188   |
|jurist                    |143297  |120415   |
|sportsperson (non-athlete)|109498  |101627   |
|civil servant             |100170  |97637    |
|social scientist          |96400   |88838    |
|health professional       |99636   |86907    |
|clergyperson              |91696   |844

In [82]:
# studs terkel
spark.sql(f'SELECT * FROM {data_tablename} WHERE qid = "Q529276"').show(100, False)

+-------+------------------+
|qid    |root_occupation   |
+-------+------------------+
|Q529276|journalist        |
|Q529276|humanities scholar|
|Q529276|media professional|
|Q529276|writer            |
|Q529276|writer            |
|Q529276|jurist            |
|Q529276|journalist        |
|Q529276|performing artist |
+-------+------------------+



In [52]:
# out of order but output too large so put at end of notebook
ot = OccupationTaxonomy(occ_values, PERSON_STOPPOINTS, PERSON_CATEGORIES)
ot.print_taxonomy()

100 occupations processed -- taxonomy now has 590 entries.
200 occupations processed -- taxonomy now has 945 entries.
300 occupations processed -- taxonomy now has 1278 entries.
400 occupations processed -- taxonomy now has 1638 entries.
500 occupations processed -- taxonomy now has 1935 entries.
600 occupations processed -- taxonomy now has 2177 entries.
700 occupations processed -- taxonomy now has 2364 entries.
800 occupations processed -- taxonomy now has 2600 entries.
900 occupations processed -- taxonomy now has 2792 entries.
1000 occupations processed -- taxonomy now has 3018 entries.
1100 occupations processed -- taxonomy now has 3213 entries.
1200 occupations processed -- taxonomy now has 3403 entries.
1300 occupations processed -- taxonomy now has 3578 entries.
1400 occupations processed -- taxonomy now has 3743 entries.
1500 occupations processed -- taxonomy now has 3909 entries.
1600 occupations processed -- taxonomy now has 4082 entries.
1700 occupations processed -- taxon