<a href="https://colab.research.google.com/github/dhan16/colabs/blob/master/covid19opendata/WikiData.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Setup

In [1]:

import requests
import pandas as pd

# SPARQL endpoint that wiki_data() sends its queries to.
ENDPOINT = "https://query.wikidata.org/sparql"

def wikidata_to_dataframe(json):
  """Convert a decoded SPARQL JSON result document into a DataFrame of values.

  Args:
    json: dict with a ``results.bindings`` list; each binding maps a variable
      name to a dict that has a ``"value"`` entry. An optional ``head.vars``
      list of variable names is honoured when present.

  Returns:
    pd.DataFrame with one row per binding and one column per variable.
    Variables that a binding does not carry are left missing in that row.
  """
  bindings = json["results"]["bindings"]
  # Start from the declared variables (if any), then add every variable seen in
  # any binding. Taking the keys of only the longest binding would silently
  # drop variables that appear solely in other bindings.
  columns = list(dict.fromkeys(json.get("head", {}).get("vars", [])))
  known = set(columns)
  for binding in bindings:
    for var in binding:
      if var not in known:
        known.add(var)
        columns.append(var)
  rows = [
      [binding[var]["value"] if var in binding else None for var in columns]
      for binding in bindings
  ]
  return pd.DataFrame(rows, columns=columns)

def wiki_data(sparql, timeout=120):
  """Run a SPARQL query against ENDPOINT and return the result as a DataFrame.

  Args:
    sparql: query text, sent as the ``query`` request parameter.
    timeout: value passed to requests.get as its ``timeout`` (seconds).

  Returns:
    The DataFrame built by wikidata_to_dataframe from the JSON response.

  Raises:
    requests.HTTPError: the endpoint answered with a 4xx/5xx status.
    requests.Timeout: the endpoint did not answer within ``timeout``.
  """
  res = requests.get(
      ENDPOINT,
      params={'format': 'json', 'query': sparql},
      timeout=timeout)
  # Fail on the HTTP status itself; otherwise an error response would only
  # surface as a confusing JSON decoding failure on the next line.
  res.raise_for_status()
  return wikidata_to_dataframe(res.json())

In [2]:
# Install/upgrade gspread, which the Sheet section imports.
# NOTE(review): the version is not pinned, so re-runs may pick up a different
# release; consider `%pip install gspread==<version>` for reproducible runs.
!pip install --upgrade gspread

Requirement already up-to-date: gspread in /usr/local/lib/python3.7/dist-packages (3.7.0)


## Indonesia

### Wikidata

In [21]:
# Fetch every place that is an instance of (a subclass of) wd:Q12479774 and has
# a P2588 value, together with its P131 parent and its direct P31 class(es).
# The two OPTIONAL clauses pick up a pq:P582 qualifier on the P131 / P31
# statements when one exists; the next cell filters on those two columns.
sparql = """
SELECT ?place ?subregion1Label ?placeLabel ?classLabel ?indonesia_admincode ?subregion1_endtime ?class_endtime
WHERE
{
  ?place wdt:P31/wdt:P279* wd:Q12479774. # P31=instance of, P279=subclass of
  ?place wdt:P131 ?subregion1. # P131=located in the administrative territorial entity
  OPTIONAL { ?place p:P131 [ps:P131 ?subregion1; pq:P582 ?subregion1_endtime ]. }
  ?place wdt:P2588 ?indonesia_admincode.
  ?place wdt:P31 ?class.
  OPTIONAL { ?place p:P31 [ps:P31 ?class; pq:P582 ?class_endtime ]. }
  SERVICE wikibase:label {
    bd:serviceParam wikibase:language "en" .
  }
}
"""
wiki_raw = wiki_data(sparql)
# NOTE(review): the bare `wiki_raw` below is not the last expression of the
# cell, so only len(wiki_raw) is actually displayed.
wiki_raw
len(wiki_raw)

679

In [58]:
def city_or_regency(classLabel: str) -> str:
  """Classify a Wikidata class label as 'regency', 'city' or 'Other'.

  Matching is a case-insensitive substring test, and 'regency' is checked
  first, so a label containing both words is reported as 'regency'.

  Note: a later cell (Sheet section) defines another function with this same
  name for the sheet's KabKota codes; re-run this cell before re-running the
  Wikidata filtering that follows it.
  """
  lowered = classLabel.lower()
  for keyword in ('regency', 'city'):
    if keyword in lowered:
      return keyword
  return 'Other'

# Keep only rows where neither *_endtime column is set, reduce the class label
# to city/regency/Other, and collapse rows that then become identical.
# isna() replaces a row-wise `is None` test so that missing values are
# recognised whether pandas stores them as None or as NaN, and assign() avoids
# writing a new column onto a filtered slice.
no_end_time = wiki_raw['subregion1_endtime'].isna() & wiki_raw['class_endtime'].isna()
wiki_df = (
    wiki_raw[no_end_time]
    .assign(city_or_regency=lambda frame: frame['classLabel'].map(city_or_regency))
    .drop(columns=['classLabel'])
    .drop_duplicates()
)

len(wiki_df)

528

In [59]:
# Drop rows whose parent label is the raw id 'Q28725381', then list the rows
# whose (placeLabel, city_or_regency) pair still occurs more than once.
# NOTE(review): presumably that id appears as a label because the entity has
# no English label — confirm.
wiki_df = wiki_df.loc[wiki_df['subregion1Label'].ne('Q28725381')]
duplicate = wiki_df.loc[
    wiki_df.duplicated(subset=['placeLabel', 'city_or_regency'], keep=False)
]
duplicate


Unnamed: 0,place,class_endtime,indonesia_admincode,subregion1Label,placeLabel,subregion1_endtime,city_or_regency


### OpenCovid

In [36]:
# Load the covid-19-open-data metadata table and keep the rows whose
# country_code is "ID".
meta = (
    pd.read_csv('https://raw.githubusercontent.com/GoogleCloudPlatform/covid-19-open-data/main/src/data/metadata.csv')
    .query('(country_code == "ID")')
)

In [10]:
#  wiki_cities = wiki_cities.merge(meta, how='inner', left_on=['subregion1Label'], right_on=['subregion1_name']) 
#  wiki_regencies = wiki_regencies.merge(meta, how='inner', left_on=['subregion1Label'], right_on=['subregion1_name']) 
 

### Translations

In [37]:
def indonesian_direction_to_english(place: str):
  """Move a trailing Indonesian direction word to the front, in English.

  Only names of two or more words are touched, and only when the last word is
  one of the known direction words, e.g. 'Jakarta Barat' -> 'West Jakarta'.
  Words are always re-joined with single spaces.
  """
  directions = {
      'Pusat': 'Central',
      'Tengah': 'Central',
      'Utara': 'North',
      'Selatan': 'South',
      'Timur': 'East',
      'Barat': 'West',
  }
  words = place.split()
  # In Indonesian the direction comes last; in English it leads the name.
  if len(words) > 1 and words[-1] in directions:
    words = [directions[words[-1]]] + words[:-1]
  return ' '.join(words)

def indonesian_islands_to_english(place: str):
  """Translate 'Kepulauan'/'Pulau' to 'Islands'/'Island' in a place name.

  Every occurrence of either word is translated in place. If the name has two
  or more words and the first one is now 'Islands' or 'Island', that word is
  moved to the end, e.g. 'Kepulauan Tengah' -> 'Tengah Islands'.
  Words are always re-joined with single spaces.
  """
  translations = {
      'Kepulauan': 'Islands',
      'Pulau': 'Island',
  }
  words = [translations.get(word, word) for word in place.split()]
  # In English the island word follows the name, so rotate it to the back.
  if len(words) > 1 and words[0] in translations.values():
    words = words[1:] + words[:1]
  return ' '.join(words)

# Quick check of both helpers; displays ['Central whatever', 'Tengah Islands'].
[
  indonesian_direction_to_english('whatever Tengah'),
  indonesian_islands_to_english('Kepulauan Tengah')
]

['Central whatever', 'Tengah Islands']

In [38]:
# Sheet spelling (key) -> name to match against Wikidata (value).
# translate_sheet_place_to_wikidata() returns these values verbatim and skips
# the generic direction/islands translation for any key listed here.
spellings = {
    # cities
    'Bau-Bau' : 'Baubau',
    'Sungaipenuh' : 'Sungai Penuh',
    'Palangka Raya' : 'Palangkaraya',
    'Tidore Kepulauan' : 'Tidore', # https://www.wikidata.org/wiki/Q19153
    'Lubuk Linggau' : 'Lubuklinggau', # https://www.wikidata.org/wiki/Q8129
    'Pematang Siantar': 'Pematangsiantar', # https://www.wikidata.org/wiki/Q5979
    'Tanjung Balai' : 'Tanjungbalai', # https://www.wikidata.org/wiki/Q5987
    # regencies
    'Kupang' : 'Kupang Regency', # https://www.wikidata.org/wiki/Q14141
    'Toba Samosir' : 'Toba Regency', # https://www.wikidata.org/wiki/Q5911
    'Tojo Una-Una' : 'Tojo Una Una',
    'Toli-Toli': 'Tolitoli',
    'Muko Muko': 'Mukomuko', # https://www.wikidata.org/wiki/Q8033
    'Kepulauan Seribu' : 'Thousand Islands', # https://translate.google.com/?sl=id&tl=en&text=Kepulauan%20Seribu&op=translate
    'Penajam Paser Utara' : 'Penajam North Paser',
    'Pasangkayu (Mamuju Utara)': 'Pasangkayu', # https://en.wikipedia.org/wiki/Pasangkayu_Regency
    'Labuhanbatu' : 'Labuhan Batu', # https://www.wikidata.org/wiki/Q5814
    'Batu Bara': 'Batubara', # https://www.wikidata.org/wiki/Q5797
    'Limapuluh Kota': 'Lima Puluh Kota', # https://www.wikidata.org/wiki/Q6032
    'Batang Hari': 'Batanghari', # https://www.wikidata.org/wiki/Q7370
    'Kepulauan Sangihe': 'Sangihe', # https://www.wikidata.org/wiki/Q15839
    'Kepulauan Sitaro': 'Kepulauan Siau Tagulandang Biaro', # https://www.wikidata.org/wiki/Q15840
}
def translate_sheet_place_to_wikidata(place: str) -> str:
  """Map a place name from the sheet to the name to look up in Wikidata.

  An entry in `spellings` wins outright; otherwise the name goes through the
  direction translation and then the islands translation.
  """
  if place not in spellings:
    return indonesian_islands_to_english(indonesian_direction_to_english(place))
  return spellings[place]

# Quick check; displays ['Central Islands', 'Tenga Islands', 'West Jakarta'].
[
 translate_sheet_place_to_wikidata('Kepulauan Tengah'),
 translate_sheet_place_to_wikidata('Kepulauan Tenga'),
 translate_sheet_place_to_wikidata('Jakarta Barat'),
] 
 

['Central Islands', 'Tenga Islands', 'West Jakarta']

### Sheet

In [39]:
# Authenticate the Colab user first, then authorise gspread with the
# application default credentials (the order of these calls matters).
from google.colab import auth
auth.authenticate_user()
import gspread
from oauth2client.client import GoogleCredentials
gc = gspread.authorize(GoogleCredentials.get_application_default())

# Read the whole 'Kode Kota' worksheet as a list of rows.
sheet_url = 'https://docs.google.com/spreadsheets/d/1FJJXiGuOb5nXrjJeV3QcHNhTo38YdcsTIFl29mWDIqI/edit#gid=2006070746'
worksheet = gc.open_by_url(sheet_url).worksheet('Kode Kota')
rows = worksheet.get_all_values()
# The second sheet row (rows[1]) supplies the column names; data starts at rows[2].
sheet_raw = pd.DataFrame.from_records(rows[2:], columns=rows[1])

In [62]:
def city_or_regency(KabKota: str) -> str:
  """Translate the sheet's KabKota code: 'Kab.' -> 'regency', 'Kota' -> 'city'.

  Any other value is returned unchanged.

  Note: this replaces the earlier `city_or_regency` from the Wikidata section,
  which classifies Wikidata class labels instead.
  """
  english = {'Kab.': 'regency', 'Kota': 'city'}
  return english.get(KabKota, KabKota)

# Drop the 'zTam' rows and add KabKota_eng, the English form of KabKota.
# assign() builds the new column on a fresh frame instead of writing onto a
# filtered slice, and Series.map replaces the row-wise apply.
sheet_df = (
    sheet_raw[sheet_raw['KabKota'] != 'zTam']
    .assign(KabKota_eng=lambda frame: frame['KabKota'].map(city_or_regency))
)
# Sanity check: the distinct KabKota_eng values and the number of rows kept.
[set(sheet_df.KabKota_eng), len(sheet_df)]

[{'city', 'regency'}, 514]

In [74]:
# Match rows in sheet with rows in wiki, in two passes: first on the sheet's
# own name (Kota), then, for rows still unmatched, on the translated name.
sheet_df['Kota_translated'] = sheet_df['Kota'].map(translate_sheet_place_to_wikidata)

def _merge_with_wiki(sheet_rows, name_column):
  """Left-join sheet rows onto wiki_df by (type, name); keys must be unique on both sides."""
  return sheet_rows.merge(
      wiki_df,
      how='left',
      left_on=['KabKota_eng', name_column],
      right_on=['city_or_regency', 'placeLabel'],
      validate="1:1")

df = _merge_with_wiki(sheet_df, 'Kota')

df1 = df[df['placeLabel'].notna()]
# Retry the unmatched rows, keeping only the sheet columns so the second merge
# starts from a clean frame.
df2 = df.loc[df['placeLabel'].isna(), list(sheet_df.columns)]
df2 = _merge_with_wiki(df2, 'Kota_translated')
df = pd.concat([df1, df2])

# Rows that are still unmatched can be inspected with df[df['placeLabel'].isnull()].
df

Unnamed: 0,ID Provinsi,Provinsi,ID Kota,KabKota,Kota,KabKota_eng,Kota_translated,place,class_endtime,indonesia_admincode,subregion1Label,placeLabel,subregion1_endtime,city_or_regency
0,1,Aceh (NAD),258,Kab.,Aceh Barat,regency,West Aceh,http://www.wikidata.org/entity/Q5778,,11.05,Aceh,Aceh Barat,,regency
1,1,Aceh (NAD),259,Kab.,Aceh Barat Daya,regency,Aceh Barat Daya,http://www.wikidata.org/entity/Q5775,,11.12,Aceh,Aceh Barat Daya,,regency
2,1,Aceh (NAD),260,Kab.,Aceh Besar,regency,Aceh Besar,http://www.wikidata.org/entity/Q5662,,11.06,Aceh,Aceh Besar,,regency
3,1,Aceh (NAD),261,Kab.,Aceh Jaya,regency,Aceh Jaya,http://www.wikidata.org/entity/Q5667,,11.14,Aceh,Aceh Jaya,,regency
4,1,Aceh (NAD),262,Kab.,Aceh Selatan,regency,South Aceh,http://www.wikidata.org/entity/Q5759,,11.01,Aceh,Aceh Selatan,,regency
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31,34,Sumatera Utara,472,Kab.,Labuhanbatu,regency,Labuhan Batu,http://www.wikidata.org/entity/Q5814,,12.10,North Sumatra,Labuhan Batu,,regency
32,34,Sumatera Utara,486,Kota,Pematang Siantar,city,Pematangsiantar,http://www.wikidata.org/entity/Q5979,,12.72,North Sumatra,Pematangsiantar,,city
33,34,Sumatera Utara,491,Kota,Tanjung Balai,city,Tanjungbalai,http://www.wikidata.org/entity/Q5987,,12.74,North Sumatra,Tanjungbalai,,city
34,34,Sumatera Utara,493,Kab.,Tapanuli Tengah,regency,Central Tapanuli,http://www.wikidata.org/entity/Q5800,,12.01,North Sumatra,Central Tapanuli,,regency


In [75]:
# Attach the metadata columns by matching the Wikidata parent label against
# meta's subregion1_name, then show the rows that found no metadata match.
# Note: `df` is overwritten, so re-running this cell merges `meta` in again.
df = df.merge(meta, how='left', left_on='subregion1Label', right_on='subregion1_name')
missing = df.loc[df['subregion1_name'].isna()]
missing


Unnamed: 0,ID Provinsi,Provinsi,ID Kota,KabKota,Kota,KabKota_eng,Kota_translated,place,class_endtime,indonesia_admincode,subregion1Label,placeLabel,subregion1_endtime,city_or_regency,key,country_code,country_name,subregion1_code,subregion1_name,subregion2_code,subregion2_name,locality_code,locality_name,match_string,aggregate_report_offset
48,5,DI Yogyakarta,35,Kab.,Bantul,regency,Bantul,http://www.wikidata.org/entity/Q11477,,34.02,Special Region of Yogyakarta,Bantul,,regency,,,,,,,,,,,
49,5,DI Yogyakarta,36,Kab.,Gunung Kidul,regency,Gunung Kidul,http://www.wikidata.org/entity/Q11478,,34.03,Special Region of Yogyakarta,Gunung Kidul,,regency,,,,,,,,,,,
50,5,DI Yogyakarta,37,Kab.,Kulon Progo,regency,Kulon Progo,http://www.wikidata.org/entity/Q11479,,34.01,Special Region of Yogyakarta,Kulon Progo,,regency,,,,,,,,,,,
51,5,DI Yogyakarta,38,Kab.,Sleman,regency,Sleman,http://www.wikidata.org/entity/Q11480,,34.04,Special Region of Yogyakarta,Sleman,,regency,,,,,,,,,,,
52,5,DI Yogyakarta,39,Kota,Yogyakarta,city,Yogyakarta,http://www.wikidata.org/entity/Q7568,,34.71,Special Region of Yogyakarta,Yogyakarta,,city,,,,,,,,,,,


In [None]:
# Build per-place keys of the form "<metadata key>_<ID Kota>" and print the
# selected columns as CSV.
# `df` already carries the metadata columns (key, country_code, ...) from the
# merge with `meta` in the previous cell, so no further merge is done here; the
# earlier version merged with `provs`, a name that is not defined anywhere in
# the visible notebook.
# Rows without a sheet name or without a metadata key are dropped *before* the
# key is built: once the key has been converted to a string, a missing value
# can no longer be detected with isnull().
df = df[df['Kota'].notna() & df['key'].notna()].copy()
df['key'] = df['key'].astype(str) + '_' + df['ID Kota'].astype(str)
cols = ['key', 'country_code', 'country_name', 'subregion1_code', 'subregion1_name', 'ID Kota', 'placeLabel']

print(df[cols].to_csv())
