<a href="https://colab.research.google.com/github/dhan16/colabs/blob/master/covid19opendata/WikiData.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Setup

In [34]:

import requests
import pandas as pd

ENDPOINT = "https://query.wikidata.org/sparql"

def wikidata_to_dataframe(json):
  results = json["results"]["bindings"]
  rows = []
  for result in results:
      values = [ result[val]["value"] for val in result ]
      rows.append(values)
  return pd.DataFrame(rows, columns=[val for val in results[0]])

def wiki_data(sparql):
  res = requests.get(ENDPOINT, params = {'format': 'json', 'query': sparql})
  return wikidata_to_dataframe(res.json())

In [41]:
!pip install --upgrade gspread

Requirement already up-to-date: gspread in /usr/local/lib/python3.7/dist-packages (3.7.0)


## Indonesia

### Wikidata

In [48]:
# indonesia level2 areas
# city=Q3199141, Q3191695 = regency
sparql_cities = """
SELECT ?place ?subregion1Label ?placeLabel ?class ?classLabel
WHERE
{
  ?place wdt:P31/wdt:P279* wd:Q3199141. # P31=instance of, P279=subclass of
  ?place wdt:P131 ?subregion1. # P131=located in the administrative territorial entity
  SERVICE wikibase:label {
    bd:serviceParam wikibase:language "en" .
  }
}
"""
wiki_cities = wiki_data(sparql_cities)

sparql_regencies = """
SELECT ?place ?subregion1Label ?placeLabel ?class ?classLabel
WHERE
{
  ?place wdt:P31/wdt:P279* wd:Q3191695. # P31=instance of, P279=subclass of
  ?place wdt:P131 ?subregion1. # P131=located in the administrative territorial entity
  SERVICE wikibase:label {
    bd:serviceParam wikibase:language "en" .
  }
}
"""
wiki_regencies = wiki_data(sparql_regencies)

# wiki_cities
wiki_regencies


Unnamed: 0,place,subregion1Label,placeLabel
0,http://www.wikidata.org/entity/Q1795,North Sumatra,Asahan
1,http://www.wikidata.org/entity/Q5662,Aceh,Aceh Besar
2,http://www.wikidata.org/entity/Q5666,Aceh,Gayo Lues
3,http://www.wikidata.org/entity/Q5667,Aceh,Aceh Jaya
4,http://www.wikidata.org/entity/Q5672,Aceh,Aceh Singkil
...,...,...,...
428,http://www.wikidata.org/entity/Q46483,West Java,Pangandaran
429,http://www.wikidata.org/entity/Q676084,North Maluku,Pulau Taliabu
430,http://www.wikidata.org/entity/Q848672,West Papua,Manokwari
431,http://www.wikidata.org/entity/Q3182322,Bengkulu,Kaur


### OpenCovid

In [49]:
meta = pd.read_csv('https://raw.githubusercontent.com/GoogleCloudPlatform/covid-19-open-data/main/src/data/metadata.csv')
meta = provs.query('(country_code == "ID")')
meta



Unnamed: 0,key,country_code,country_name,subregion1_code,subregion1_name,subregion2_code,subregion2_name,locality_code,locality_name,match_string,aggregate_report_offset
10605,ID,ID,Indonesia,,,,,,,,-1.0
10606,ID_AC,ID,Indonesia,AC,Aceh,,,,,,0.0
10607,ID_BA,ID,Indonesia,BA,Bali,,,,,,0.0
10608,ID_BB,ID,Indonesia,BB,Bangka Belitung Islands,,,,,Kepulauan Bangka Belitung,
10609,ID_BE,ID,Indonesia,BE,Bengkulu,,,,,,
10610,ID_BT,ID,Indonesia,BT,Banten,,,,,,
10611,ID_GO,ID,Indonesia,GO,Gorontalo,,,,,,
10612,ID_JA,ID,Indonesia,JA,Jambi,,,,,,
10613,ID_JB,ID,Indonesia,JB,West Java,,,,,Jawa Barat,
10614,ID_JI,ID,Indonesia,JI,East Java,,,,,Jawa Timur,


### Translations

In [56]:
def indonesian_direction_to_english(place: str):
  # place is in lower
  to_english = {
      'pusat': 'central',
      'tengah' : 'central',
      'utara' : 'north',
      'selatan' : 'south',
      'timur' : 'east',
      'barat' : 'west',
  }
  bits = place.split()
  if len(bits) > 1:
    for indo_dir, eng_dir in to_english.items():
      # in indonesian the dir is at the end 
      if bits[-1] == indo_dir:
        bits.pop()
        bits.insert(0, eng_dir)
        break
  place = ' '.join(bits)
  return place

def indonesian_to_english(place: str):
  # place is in lower
  to_english = {
      'kepulauan' : 'islands',
      'pulau' : 'island',
  }
  bits = place.split()
  if len(bits) > 1:
    for indo, eng in to_english.items():
      # in indonesian the island is at the beginning while in english its at the end
      if bits[0] == indo:
        bits.pop(0)
        bits.append(eng)
        break
  place = ' '.join(bits)
  return place

[
  indonesian_direction_to_english('whatever tengah'),
  indonesian_to_english('kepulauan tengah')
]

['central whatever', 'tengah islands']

In [57]:
spellings = {
    'Toba Samosir' : 'Toba Regency', # https://www.wikidata.org/wiki/Q5911
    'Penajam Paser Utara' : 'Penajam North Paser',
    'Kupang' : 'Kupang Regency', # https://www.wikidata.org/wiki/Q14141
    'Kepulauan Seribu' : 'Thousand Islands', # https://translate.google.com/?sl=id&tl=en&text=Kepulauan%20Seribu&op=translate
    'Pangkajene Kepulauan' : 'Pangkajene Islands', # https://translate.google.com/?sl=id&tl=en&text=Pangkajene%20Kepulauan&op=translate
    'Pasangkayu (Mamuju Utara)': 'Pasangkayu', # https://en.wikipedia.org/wiki/Pasangkayu_Regency
    # due to dashes
    'Bau-Bau' : 'Baubau',
    'Tojo Una-Una' : 'Tojo Una Una',
    'Toli-Toli': 'Tolitoli',
     # verify below
    # 'Kepulauan Sangihe' : 'Sangihe',
    # 'Kepulauan Sitaro' : 'Kepulauan Siau Tagulandang Biaro',
}
def standard_place_spelling(place: str) -> str:
   # other spellings if still required
  place = spellings.get(place, place)
  # all lowercase
  place = place.lower()
  # apply translations
  place = indonesian_direction_to_english(place)
  place = indonesian_to_english(place)
  # remove spaces
  # place = place.replace(' ', '')
  return place


wiki_df['place_standardised'] = wiki_df.apply(lambda r: standard_place_spelling(r.placeLabel), axis=1)


### Sheet

In [58]:
from google.colab import auth
auth.authenticate_user()
import gspread
from oauth2client.client import GoogleCredentials
gc = gspread.authorize(GoogleCredentials.get_application_default())

sheet_url = 'https://docs.google.com/spreadsheets/d/1FJJXiGuOb5nXrjJeV3QcHNhTo38YdcsTIFl29mWDIqI/edit#gid=2006070746'
worksheet = gc.open_by_url(sheet_url).worksheet('Kode Kota')
rows = worksheet.get_all_values()

# Convert to a DataFrame and render.
import pandas as pd
sheet_df = pd.DataFrame.from_records(rows[2:], columns=rows[1])

sheet_df['KabKota_standardized'] = sheet_df.apply(lambda r: standard_place_spelling(r.Kota), axis=1)

# 'Kab.' : 'Regency', 
# 'Kota': 'City', 
sheet_cities = sheet_df[sheet_df.KabKota == 'Kota']
sheet_regencies = sheet_df[sheet_df.KabKota == 'Kab.']
sheet_cities
# sheet_regencies

Unnamed: 0,ID Provinsi,Provinsi,ID Kota,KabKota,Kota,KabKota_standardized
11,1,Aceh (NAD),269,Kota,Banda Aceh,bandaaceh
15,1,Aceh (NAD),273,Kota,Langsa,langsa
16,1,Aceh (NAD),274,Kota,Lhokseumawe,lhokseumawe
20,1,Aceh (NAD),278,Kota,Sabang,sabang
22,1,Aceh (NAD),280,Kota,Subulussalam,subulussalam
...,...,...,...,...,...,...
549,34,Sumatera Utara,484,Kota,Padang Sidempuan,padangsidempuan
551,34,Sumatera Utara,486,Kota,Pematang Siantar,pematangsiantar
554,34,Sumatera Utara,489,Kota,Sibolga,sibolga
556,34,Sumatera Utara,491,Kota,Tanjung Balai,tanjungbalai


In [None]:
# Find rows in wiki_df with no matches in sheet_df
df = wiki_df.merge(sheet_df, how='left', right_on=['Kota_standardised', 'regiontype'], left_on=['place_standardised', 'regiontype']) 
missing = df[df['Kota'].isnull()]
missing
# len(missing)



Unnamed: 0,class,place,admin_code,subregion1,subregion1Label,placeLabel,classLabel,regiontype,place_standardised,ID Provinsi,Provinsi,ID Kota,KabKota,Kota,Kota_standardised
475,http://www.wikidata.org/entity/Q3199141,http://www.wikidata.org/entity/Q19153,82.72,http://www.wikidata.org/entity/Q5094,North Maluku,Tidore,city of Indonesia,City,tidore,,,,,,


In [None]:
# Matching rows with subregions
df = df.merge(provs, how='inner', left_on='subregion1Label', right_on='subregion1_name')
df['key'] = df.apply(lambda r: str(r.key) + '_' + str(r["ID Kota"]), axis=1)
df = df[~df['Kota'].isnull() & ~df['key'].isnull()]
cols = ['key', 'country_code', 'country_name', 'subregion1_code', 'subregion1_name', 'ID Kota', 'placeLabel']

print(df[cols].to_csv())


,key,country_code,country_name,subregion1_code,subregion1_name,ID Kota,placeLabel
0,ID_JI_161,ID,Indonesia,JI,East Java,161,Tulungagung
1,ID_JI_160,ID,Indonesia,JI,East Java,160,Tuban
2,ID_JI_159,ID,Indonesia,JI,East Java,159,Trenggalek
3,ID_JI_157,ID,Indonesia,JI,East Java,157,Sumenep
4,ID_JI_156,ID,Indonesia,JI,East Java,156,Situbondo
5,ID_JI_155,ID,Indonesia,JI,East Java,155,Sidoarjo
6,ID_JI_154,ID,Indonesia,JI,East Java,154,Sampang
7,ID_JI_152,ID,Indonesia,JI,East Java,152,Probolinggo
8,ID_JI_151,ID,Indonesia,JI,East Java,151,Ponorogo
9,ID_JI_149,ID,Indonesia,JI,East Java,149,Pasuruan
10,ID_JI_148,ID,Indonesia,JI,East Java,148,Pamekasan
11,ID_JI_147,ID,Indonesia,JI,East Java,147,Pacitan
12,ID_JI_146,ID,Indonesia,JI,East Java,146,Ngawi
13,ID_JI_145,ID,Indonesia,JI,East Java,145,Nganjuk
14,ID_JI_143,ID,Indonesia,JI,East Java,143,Mojokerto
15,ID_JI_141,ID,Indonesia,JI,East Java,141,Malang
16,ID_JI_140,ID,Indonesia,JI,East Java,140,Magetan
17,ID_JI_138,ID,Indonesia,JI,East Java,138,Mad

In [None]:
# Find rows in sheet_df with no matches in wiki_df
df = sheet_df.merge(wiki_df, how='left', left_on=['Kota_standardised', 'regiontype'], right_on=['place_standardised', 'regiontype']) 
# missing = df[df['placeLabel'].isnull() & ~df['KabKota'].isin(['zTam'])]
missing = df[df['placeLabel'].isnull()]
missing
# len(missing)

Unnamed: 0,ID Provinsi,Provinsi,ID Kota,KabKota,Kota,regiontype,Kota_standardised,class,place,admin_code,subregion1,subregion1Label,placeLabel,classLabel,place_standardised
23,1,Aceh (NAD),1011,zTam,Luar Provinsi Aceh,zTam,luarprovinsiaceh,,,,,,,,
24,1,Aceh (NAD),1013,zTam,Luar Negeri (Aceh),zTam,luarnegeri(aceh),,,,,,,,
25,1,Aceh (NAD),1012,zTam,Belum Diverifikasi (Aceh),zTam,belumdiverifikasi(aceh),,,,,,,,
26,2,Bali,1021,zTam,WNA (Bali),zTam,wna(bali),,,,,,,,
27,2,Bali,1022,zTam,Luar Provinsi Bali,zTam,luarprovinsibali,,,,,,,,
28,2,Bali,1023,zTam,Belum Diverifikasi (Bali),zTam,belumdiverifikasi(bali),,,,,,,,
42,3,Banten,21,Kota,Serang,City,serang,,,,,,,,
46,3,Banten,1031,zTam,Belum Diverifikasi (Banten),zTam,belumdiverifikasi(banten),,,,,,,,
47,4,Bengkulu,1041,zTam,Belum Diverifikasi (Bengkulu),zTam,belumdiverifikasi(bengkulu),,,,,,,,
66,5,DI Yogyakarta,1051,zTam,Luar Provinsi DIY,zTam,luarprovinsidiy,,,,,,,,
