<a href="https://colab.research.google.com/github/dhan16/colabs/blob/master/covid19opendata/WikiData.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## WikiData

In [19]:
# Sparql functions
import requests
import pandas as pd

ENDPOINT = "https://query.wikidata.org/sparql"


def wiki_data(sparql, timeout=60):
  """Run a SPARQL query against ENDPOINT and return the decoded JSON response.

  Args:
    sparql: the SPARQL query text.
    timeout: seconds to wait for the server before giving up, so that a
      stalled connection cannot hang the notebook indefinitely.

  Returns:
    The response body parsed as JSON.

  Raises:
    requests.HTTPError: if the endpoint answers with an error status.
    requests.Timeout: if no response arrives within `timeout` seconds.
  """
  res = requests.get(
      ENDPOINT,
      params={'format': 'json', 'query': sparql},
      timeout=timeout,
  )
  # Fail here with the HTTP status instead of letting res.json() choke on a
  # non-JSON error page.
  res.raise_for_status()
  return res.json()


def wikidata_to_dataframe(json):
  """Flatten a SPARQL JSON response into a DataFrame, one row per binding.

  Args:
    json: decoded response dict with a ``results.bindings`` list and,
      optionally, a ``head.vars`` list of the query's variable names.

  Returns:
    A DataFrame with one column per variable. Columns follow ``head.vars``
    when present, followed by any variable that appears only in the bindings.
    A variable that is absent from a given binding is stored as None. An
    empty result set yields an empty DataFrame instead of raising.
  """
  results = json["results"]["bindings"]
  # Start from the declared variables so the columns exist even when there
  # are no rows at all.
  cols = list(json.get("head", {}).get("vars", []))
  # Pick up any variable not declared in the head (or all of them when the
  # head is missing), in first-seen order.
  for result in results:
    for var in result:
      if var not in cols:
        cols.append(var)
  # Look values up by name so every row lines up with `cols` even when a
  # binding omits a variable.
  rows = [
      [result.get(var, {}).get("value") for var in cols]
      for result in results
  ]
  return pd.DataFrame(rows, columns=cols)


In [26]:
# Query: every ?place that is an instance of wd:Q12479774 or of any of its
# (transitive) subclasses, paired with the ?class it is a direct instance of,
# where ?class is itself a direct subclass of wd:Q12479774.
# wdt:P31 is Wikidata's "instance of" and wdt:P279 is "subclass of".
# The wikibase:label service fills in the English ?placeLabel / ?classLabel.
# NOTE(review): Q12479774 is presumably the class of Indonesian second-level
# administrative divisions (regencies and cities) - confirm on Wikidata.
sparql = """
SELECT ?place ?placeLabel ?class ?classLabel
WHERE
{
  ?place wdt:P31/wdt:P279* wd:Q12479774.
  ?place wdt:P31 ?class.
  ?class wdt:P279 wd:Q12479774.
  SERVICE wikibase:label {
    bd:serviceParam wikibase:language "en" .
  }
}
"""
# Run the query and flatten the JSON bindings into one DataFrame row per result.
res = wiki_data(sparql)
wiki_raw = wikidata_to_dataframe(res)
wiki_raw
# res

Unnamed: 0,class,place,placeLabel,classLabel
0,http://www.wikidata.org/entity/Q3191695,http://www.wikidata.org/entity/Q12488339,Banggai Laut,regency of Indonesia
1,http://www.wikidata.org/entity/Q3191695,http://www.wikidata.org/entity/Q11214749,Sidenreng Rappang,regency of Indonesia
2,http://www.wikidata.org/entity/Q3191695,http://www.wikidata.org/entity/Q4201892,Katingan,regency of Indonesia
3,http://www.wikidata.org/entity/Q3191695,http://www.wikidata.org/entity/Q4201768,Maluku Tengah,regency of Indonesia
4,http://www.wikidata.org/entity/Q3191695,http://www.wikidata.org/entity/Q14622,Sinjai,regency of Indonesia
...,...,...,...,...
512,http://www.wikidata.org/entity/Q3199141,http://www.wikidata.org/entity/Q7253,Padang,city of Indonesia
513,http://www.wikidata.org/entity/Q3199141,http://www.wikidata.org/entity/Q7248,Bukittinggi,city of Indonesia
514,http://www.wikidata.org/entity/Q3199141,http://www.wikidata.org/entity/Q5989,Tebing Tinggi,city of Indonesia
515,http://www.wikidata.org/entity/Q3199141,http://www.wikidata.org/entity/Q5987,Tanjungbalai,city of Indonesia


In [27]:
# Collapse the Wikidata class labels onto the two region types used for matching.
kabkota_to_regiontype = {
    'regency of Indonesia' : 'Regency',
    'administrative regency of Indonesia' : 'Regency',
    'city of Indonesia': 'City',
    'administrative city of Indonesia': 'City',
}

# Fail fast, naming the offending labels, if the query returned a class that
# has no mapping yet.
unknown_labels = set(wiki_raw['classLabel'].unique()) - set(kabkota_to_regiontype)
if unknown_labels:
  raise KeyError(f"classLabel values with no region type mapping: {unknown_labels}")

# Work on a copy: a plain `wiki_df = wiki_raw` is only an alias, so adding the
# column would also modify the raw query result.
wiki_df = wiki_raw.copy()
wiki_df['regiontype'] = wiki_df['classLabel'].map(kabkota_to_regiontype)
wiki_df

Unnamed: 0,class,place,placeLabel,classLabel,regiontype
0,http://www.wikidata.org/entity/Q3191695,http://www.wikidata.org/entity/Q12488339,Banggai Laut,regency of Indonesia,Regency
1,http://www.wikidata.org/entity/Q3191695,http://www.wikidata.org/entity/Q11214749,Sidenreng Rappang,regency of Indonesia,Regency
2,http://www.wikidata.org/entity/Q3191695,http://www.wikidata.org/entity/Q4201892,Katingan,regency of Indonesia,Regency
3,http://www.wikidata.org/entity/Q3191695,http://www.wikidata.org/entity/Q4201768,Maluku Tengah,regency of Indonesia,Regency
4,http://www.wikidata.org/entity/Q3191695,http://www.wikidata.org/entity/Q14622,Sinjai,regency of Indonesia,Regency
...,...,...,...,...,...
512,http://www.wikidata.org/entity/Q3199141,http://www.wikidata.org/entity/Q7253,Padang,city of Indonesia,City
513,http://www.wikidata.org/entity/Q3199141,http://www.wikidata.org/entity/Q7248,Bukittinggi,city of Indonesia,City
514,http://www.wikidata.org/entity/Q3199141,http://www.wikidata.org/entity/Q5989,Tebing Tinggi,city of Indonesia,City
515,http://www.wikidata.org/entity/Q3199141,http://www.wikidata.org/entity/Q5987,Tanjungbalai,city of Indonesia,City


## Sheet

In [None]:
!pip install --upgrade gspread

In [28]:
# Authenticate the Colab user first; the credentials fetched below depend on it.
from google.colab import auth
auth.authenticate_user()
import gspread
from oauth2client.client import GoogleCredentials
# Build a gspread client from the application-default credentials.
gc = gspread.authorize(GoogleCredentials.get_application_default())

sheet_url = 'https://docs.google.com/spreadsheets/d/1FJJXiGuOb5nXrjJeV3QcHNhTo38YdcsTIFl29mWDIqI/edit#gid=2006070746'
# Read every value of the 'Kode Kota' worksheet.
worksheet = gc.open_by_url(sheet_url).worksheet('Kode Kota')
rows = worksheet.get_all_values()

# Convert to a DataFrame.
import pandas as pd
# rows[1] supplies the column names and the data starts at rows[2]; rows[0] is
# skipped (presumably a title/banner row - confirm against the sheet).
sheet_raw = pd.DataFrame.from_records(rows[2:], columns=rows[1])


In [43]:
# Sheet 'KabKota' codes -> region type shared with the Wikidata frame.
# 'zTam' maps to itself; those rows are filtered out of the comparison later.
kabkota_to_regiontype = {
    'Kab.' : 'Regency',
    'Kota': 'City',
    'zTam' : 'zTam'
}

# Fail fast, naming the offending codes, if the sheet contains a KabKota value
# that has no mapping.
unknown_kabkota = set(sheet_raw['KabKota'].unique()) - set(kabkota_to_regiontype)
if unknown_kabkota:
  raise KeyError(f"KabKota values with no region type mapping: {unknown_kabkota}")

# Sheet spelling -> spelling to match against the Wikidata label; names that
# are not listed are kept unchanged.
spellings = {
    'Labuhanbatu': 'Labuhan Batu',
    'Batu Bara' : 'Batubara',
    'Pematang Siantar' : 'Pematangsiantar',
    'Tapanuli Tengah' : 'Central Tapanuli',
    'Toba Samosir' : 'Toba Regency',
}

# Work on a copy: a plain `sheet_df = sheet_raw` is only an alias, so adding
# columns would also modify the raw sheet data.
sheet_df = sheet_raw.copy()
sheet_df['regiontype'] = sheet_df['KabKota'].map(kabkota_to_regiontype)
sheet_df['Kota_respelled'] = sheet_df['Kota'].map(lambda kota: spellings.get(kota, kota))
# sheet_df

## Sheet vs WikiData

In [52]:
# Wikidata places without a counterpart in the sheet.
# Left-join on (name, region type); a Wikidata row that found no partner has
# no value in the sheet-side 'Kota' column after the join.
df = wiki_df.merge(
    sheet_df,
    how='left',
    left_on=['placeLabel', 'regiontype'],
    right_on=['Kota_respelled', 'regiontype'],
)
missing = df.loc[df['Kota'].isna()]
missing
# len(missing)

34

In [53]:
# Sheet rows without a counterpart in Wikidata.
# Left-join on (name, region type); a sheet row that found no partner has no
# value in the Wikidata-side 'placeLabel' column. Rows whose KabKota is 'zTam'
# are left out of the report.
df = sheet_df.merge(
    wiki_df,
    how='left',
    left_on=['Kota_respelled', 'regiontype'],
    right_on=['placeLabel', 'regiontype'],
)
missing = df.loc[df['placeLabel'].isna() & ~df['KabKota'].isin(['zTam'])]
missing
# len(missing)

31