In [1]:
# Display the Spark session object. NOTE(review): `spark` is not created anywhere in this
# notebook, so it is presumably pre-defined by the kernel/environment -- confirm before re-running.
spark

In [17]:
# Value info in the wikidata entity table
# (https://wikitech.wikimedia.org/wiki/Analytics/Data_Lake/Edits/Wikidata_entity)
# is a string as opposed to a struct (because it has a variable schema).
# This UDF extracts the QID value (or null if it doesn't exist).
def getValue(obj):
    """Return the 'id' entry of a serialized claim value, or None.

    Parameters
    ----------
    obj : str or None
        Serialized mapping, either JSON (``{"id": "Q5"}``) or a Python dict
        literal (``{'id': 'Q5'}``).

    Returns
    -------
    The value stored under the ``'id'`` key, or None when the input is
    missing, cannot be parsed, is not a mapping, or has no ``'id'`` key.

    The input is only ever *parsed* (json.loads / ast.literal_eval); unlike
    eval() it is never executed, so arbitrary expressions embedded in the data
    cannot run code and simply yield None.
    """
    # Imported inside the function because this notebook has no import cell;
    # this keeps the function usable on its own.
    import ast
    import json

    # Try strict JSON first (handles true/false/null), then fall back to a
    # Python literal so single-quoted dict strings keep working.
    for parse in (json.loads, ast.literal_eval):
        try:
            d = parse(obj)
        except Exception:
            # Not parseable by this parser (or obj is None); try the next one.
            continue
        if isinstance(d, dict):
            return d.get('id')
        # Parsed fine but is not a mapping (e.g. a plain string or number).
        return None
    return None
    
# Make getValue callable from Spark SQL under the name 'getValue', with a string result type.
spark.udf.register('getValue', getValue, returnType='string')

<function __main__.getValue>

In [34]:
# Destination table for the extracted statements, and the wiki whose pages are covered.
tablename = 'isaacj.gender_wikidata'
wiki_db = 'enwiki'

# DDL template: {0} is the table name, {1} is the wiki database (used only in a column comment).
create_table_template = """
    CREATE TABLE IF NOT EXISTS {0} (
        item_id    STRING  COMMENT 'QID',
        page_id    INT     COMMENT 'Page ID in {1}',
        property   STRING  COMMENT 'Property -- always P21 sex-or-gender',
        value      STRING  COMMENT 'Value -- generally either Q6581097 (male) or Q6581072 (female)'
    )
    """
create_table_query = create_table_template.format(tablename, wiki_db)

# Echo the rendered statement before running it.
print(create_table_query)
spark.sql(create_table_query)


    CREATE TABLE IF NOT EXISTS isaacj.gender_wikidata (
        item_id    STRING  COMMENT 'QID',
        page_id    INT     COMMENT 'Page ID in enwiki',
        property   STRING  COMMENT 'Property -- always P21 sex-or-gender',
        value      STRING  COMMENT 'Value -- generally either Q6581097 (male) or Q6581072 (female)'
    )
    


In [35]:
# `re` is needed by the print_for_hive branch below and is not imported elsewhere in this notebook.
import re

# print_for_hive: print the query collapsed onto a single line instead of the multi-line form.
# do_execute: actually run the query through Spark (set False to only print it).
print_for_hive = False
do_execute = True

# Snapshot used for BOTH source tables; kept in one place so the two filters cannot drift apart.
snapshot = '2020-07-06'

# Placeholders: {0} = wiki_db, {1} = destination table, {2} = snapshot.
# Steps:
#   relevant_qids       -- namespace-0 pages of the wiki and their linked items
#   exploded_statements -- one row per claim of each of those items
#   relevant_statements -- only P31 / P21 claims, with the value id extracted by the getValue UDF
#   humans              -- items having a P31 claim whose value is Q5
# The final SELECT overwrites the destination table with the P21 rows of those items.
query = """
WITH relevant_qids AS (
    SELECT page_id,
           item_id
      FROM wmf.wikidata_item_page_link
     WHERE snapshot = '{2}'
           AND page_namespace = 0
           AND wiki_db = '{0}'
),
exploded_statements AS (
    SELECT id as item_id,
           q.page_id as page_id,
           explode(claims) as claim
      FROM wmf.wikidata_entity w
     INNER JOIN relevant_qids q
           ON (w.id = q.item_id)
     WHERE w.snapshot = '{2}'
),
relevant_statements AS (
    SELECT item_id,
           page_id,
           claim.mainSnak.property AS property,
           getValue(claim.mainSnak.dataValue.value) as value
     FROM exploded_statements
    WHERE claim.mainSnak.property IN ('P31', 'P21')
),
humans AS (
    SELECT DISTINCT item_id
      FROM relevant_statements
     WHERE property = 'P31'
           AND value = 'Q5'
)
INSERT OVERWRITE TABLE {1}
SELECT s.item_id,
       s.page_id,
       s.property,
       s.value
  FROM relevant_statements s
 INNER JOIN humans h
       ON (h.item_id = s.item_id)
 WHERE s.property = 'P21'
""".format(wiki_db, tablename, snapshot)

if print_for_hive:
    # Replace newlines with spaces, then squeeze runs of spaces, to get a one-line statement.
    print(re.sub(' +', ' ', re.sub('\n', ' ', query)).strip())
else:
    print(query)

if do_execute:
    result = spark.sql(query)


WITH relevant_qids AS (
    SELECT page_id,
           item_id
      FROM wmf.wikidata_item_page_link
     WHERE snapshot = '2020-07-06'
           AND page_namespace = 0
           AND wiki_db = 'enwiki'
),
exploded_statements AS (
    SELECT id as item_id,
           q.page_id as page_id,
           explode(claims) as claim
      FROM wmf.wikidata_entity w
     INNER JOIN relevant_qids q
           ON (w.id = q.item_id)
     WHERE w.snapshot = '2020-07-06'
),
relevant_statements AS (
    SELECT item_id,
           page_id,
           claim.mainSnak.property AS property,
           getValue(claim.mainSnak.dataValue.value) as value
     FROM exploded_statements
    WHERE claim.mainSnak.property IN ('P31', 'P21')
),
humans AS (
    SELECT DISTINCT item_id
      FROM relevant_statements
     WHERE property = 'P31'
           AND value = 'Q5'
)
INSERT OVERWRITE TABLE isaacj.gender_wikidata
SELECT s.item_id,
       s.page_id,
       s.property,
       s.value
  FROM relevant_statements s


In [30]:
# Count how many rows the table holds for each distinct value, showing up to 100 of them.
value_counts_query = """SELECT value, count(1) as num_items FROM {0} GROUP BY value""".format(tablename)
spark.sql(value_counts_query).show(n=100)

+---------+---------+
|    value|num_items|
+---------+---------+
|   Q48270|      175|
| Q6581072|   329414|
|Q27679684|        2|
|     null|      111|
|Q15145778|        5|
|  Q859614|        2|
| Q1289754|        1|
| Q3277905|        1|
|  Q189125|        5|
|  Q179294|       74|
| Q2449503|      186|
| Q1097630|       79|
|  Q505371|        2|
|Q27679766|        1|
|Q15145779|       15|
|Q18116794|       20|
|  Q207959|        1|
| Q6581097|  1439869|
|  Q301702|        2|
| Q1052281|      523|
|Q93954933|        1|
|Q12964198|       10|
|Q52261234|        5|
+---------+---------+

