```
Copyright 2021 Twitter, Inc.
SPDX-License-Identifier: Apache-2.0
```

## Collect analysis data from Wikidata

Run the query described in the paper on https://query.wikidata.org/ and save its JSON output as `dataset.json` in the `data/` directory one level above this notebook (`../data/dataset.json`).

In [1]:
!pip install --upgrade pip
!pip install torch torchvision torchaudio
!pip install transformers
!pip install py-feat
!pip install scikit-image pandas matplotlib statsmodels requests dash notebook jupyterlab  



In [2]:
import sys
import json
from pathlib import Path
import pandas as pd

In [3]:
# Base directory one level above the working directory; `src/` and `data/`
# are looked up underneath it.
HOME_DIR = Path("../").expanduser()
# Make `<HOME_DIR>/src` importable. The membership check keeps a re-run of
# this cell from appending the same entry to sys.path again.
if str(HOME_DIR / "src") not in sys.path:
    sys.path.append(str(HOME_DIR / "src"))
data_dir = HOME_DIR / Path("./data/")
# Displayed as the cell output: confirms the data directory is present.
data_dir.exists()

True

In [4]:
# Load the saved query result. The encoding is given explicitly so that
# non-ASCII characters are decoded the same way on every platform instead of
# depending on the locale's default encoding.
with open(data_dir / "./dataset.json", encoding="utf-8") as fp:
    wikidata_data = json.load(fp)

# Number of result rows (bindings) in the file.
len(wikidata_data["results"]["bindings"])

1

In [5]:
# Show the first binding to see the raw structure of one result row.
wikidata_data["results"]["bindings"][0]

{'human': {'type': 'uri', 'value': 'http://www.wikidata.org/entity/Q335552'},
 'image': {'type': 'uri',
  'value': 'http://commons.wikimedia.org/wiki/Special:FilePath/Jack_Dorsey_2014.jpg'},
 'sex_or_gender': {'type': 'uri',
  'value': 'http://www.wikidata.org/entity/Q6581097'},
 'ethnic_group': {'type': 'uri',
  'value': 'http://www.wikidata.org/entity/Q1075293'},
 'occupation': {'type': 'uri',
  'value': 'http://www.wikidata.org/entity/Q131524'},
 'loc_aid': {'type': 'literal', 'value': 'n2011042258'},
 'url': 'http://commons.wikimedia.org/wiki/Special:FilePath/Jack_Dorsey_2014.jpg'}

In [6]:
# List the top-level keys stored under "results".
wikidata_data["results"].keys()

dict_keys(['bindings'])

In [7]:
# List the fields present on the first binding.
wikidata_data["results"]["bindings"][0].keys()

dict_keys(['human', 'image', 'sex_or_gender', 'ethnic_group', 'occupation', 'loc_aid', 'url'])

In [8]:
# Splitting a URI value once from the right on "/" separates the base URL
# from the trailing identifier.
wikidata_data["results"]["bindings"][0]["human"]["value"].rsplit("/", 1)

['http://www.wikidata.org/entity', 'Q335552']

In [9]:
# Fields that every binding is expected to carry, in output column order.
REQUIRED_COLS = [
    "human", "image", "sex_or_gender",
    "ethnic_group", "occupation", "loc_aid",
]


def parse_row(row):
    """Flatten one result binding into a plain dict of strings.

    For each name in ``REQUIRED_COLS`` the binding's ``"value"`` is copied;
    when the binding's ``"type"`` is ``"uri"`` only the text after the last
    ``"/"`` is kept. Two more keys are added afterwards:

    * ``"url"``: ``row["url"]`` unchanged.
    * ``"local_path"``: the parsed ``"human"`` value followed by the file
      suffix of the last path segment of ``row["url"]``.

    Raises ``KeyError`` if a required field, its ``"value"``/``"type"``
    entry, or ``"url"`` is missing from ``row``.
    """
    parsed = {}
    for column in REQUIRED_COLS:
        binding = row[column]
        raw_value = binding["value"]
        is_uri = binding["type"] == "uri"
        # URIs are shortened to their final path segment; literals pass through.
        parsed[column] = raw_value.rsplit("/", 1)[-1] if is_uri else raw_value

    source_url = row["url"]
    file_name = source_url.rsplit("/", 1)[-1]
    suffix = Path(file_name).suffix
    parsed["url"] = source_url
    parsed["local_path"] = f"{parsed['human']}{suffix}"
    return parsed

In [10]:
# Try the parser on the first binding to check the flattened output.
parse_row(wikidata_data["results"]["bindings"][0])

{'human': 'Q335552',
 'image': 'Jack_Dorsey_2014.jpg',
 'sex_or_gender': 'Q6581097',
 'ethnic_group': 'Q1075293',
 'occupation': 'Q131524',
 'loc_aid': 'n2011042258',
 'url': 'http://commons.wikimedia.org/wiki/Special:FilePath/Jack_Dorsey_2014.jpg',
 'local_path': 'Q335552.jpg'}

In [11]:
# Parse every binding into a flat record, then build the frame in one go.
parsed_rows = [parse_row(binding) for binding in wikidata_data["results"]["bindings"]]
df = pd.DataFrame(parsed_rows)
df.head()

Unnamed: 0,human,image,sex_or_gender,ethnic_group,occupation,loc_aid,url,local_path
0,Q335552,Jack_Dorsey_2014.jpg,Q6581097,Q1075293,Q131524,n2011042258,http://commons.wikimedia.org/wiki/Special:File...,Q335552.jpg


# Gather images for all rows in `df`

Place the image for each Wikidata ID in `df` into `OUTPUT_DIR`, using the file name given in that row's `local_path` column (the `url` column holds the image's source location).

In [12]:
# Folder in which the images are expected; created if it does not exist yet.
OUTPUT_DIR = Path(data_dir / "./images/")
OUTPUT_DIR.mkdir(exist_ok=True, parents=True)

# Record, per row, whether the file named by `local_path` is already on disk.
df["file_exists"] = df["local_path"].apply(
    lambda file_name: OUTPUT_DIR.joinpath(file_name).exists()
)
df["file_exists"].value_counts()

file_exists
False    1
Name: count, dtype: int64

In [13]:
# Number of rows whose image file is missing. `.get(False, 0)` is used instead
# of `[False]` because `value_counts()` has no `False` entry once every file
# exists, and plain label indexing would then raise KeyError.
df["file_exists"].value_counts().get(False, 0)

1

## After placing all images in `OUTPUT_DIR`, run the next cell to refresh the `file_exists` column

In [14]:
# Re-check the disk so `file_exists` reflects any images added since the
# column was last computed.
df["file_exists"] = df["local_path"].apply(lambda x: (OUTPUT_DIR / x).exists())
# Number of rows still missing their image. `.get(False, 0)` returns 0 when
# every file is present; `[False]` would raise KeyError in that case because
# `value_counts()` then contains no `False` entry.
df["file_exists"].value_counts().get(False, 0)

1

In [15]:
# Count the entries directly inside OUTPUT_DIR without building a list first.
sum(1 for entry in OUTPUT_DIR.glob("./*"))

2

In [16]:
# Tally of rows by whether their image file was found on disk.
df["file_exists"].value_counts()

file_exists
False    1
Name: count, dtype: int64

In [17]:
# Number of rows per `ethnic_group` identifier.
df["ethnic_group"].value_counts()

ethnic_group
Q1075293    1
Name: count, dtype: int64

In [18]:
# Persist the frame (including the `file_exists` column) as a tab-separated
# file in the data directory, without writing the row index.
df.to_csv(path_or_buf=data_dir / "./dataset.tsv", index=False, sep="\t")