<a href="https://colab.research.google.com/github/dhan16/colabs/blob/master/covid19opendata/WikiData.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Setup

In [None]:

import requests
import pandas as pd

# URL of the Wikidata SPARQL endpoint that wiki_data() sends its queries to.
ENDPOINT = "https://query.wikidata.org/sparql"

def wikidata_to_dataframe(json):
  """Convert a decoded SPARQL JSON response into a DataFrame of plain values.

  Args:
    json: Decoded response body. Must contain ``results.bindings``, a list of
      ``{variable: {"value": ...}}`` dicts; ``head.vars`` is used when present.

  Returns:
    A DataFrame with one row per binding and one column per variable that is
    bound in at least one row. A variable that is unbound in a given row
    becomes NaN in that row. An empty result yields an empty DataFrame whose
    columns are taken from ``head.vars`` (no columns if that is absent).
  """
  bindings = json["results"]["bindings"]
  if not bindings:
    # Nothing matched: there is no first row to read column names from.
    return pd.DataFrame(columns=json.get("head", {}).get("vars", []))
  # Key every value by its variable name so that a binding which omits an
  # unbound variable cannot shift the remaining values into the wrong column.
  records = [
      {var: cell["value"] for var, cell in binding.items()}
      for binding in bindings
  ]
  return pd.DataFrame(records)

def wiki_data(sparql, timeout=60):
  """Run a SPARQL query against ENDPOINT and return the result as a DataFrame.

  Args:
    sparql: The SPARQL query text.
    timeout: Seconds to wait for the server before giving up, so a stalled
      request cannot hang the notebook indefinitely.

  Returns:
    The DataFrame produced by wikidata_to_dataframe() for the JSON response.

  Raises:
    requests.HTTPError: If the endpoint answers with a 4xx/5xx status.
    requests.Timeout: If no response arrives within ``timeout`` seconds.
  """
  res = requests.get(
      ENDPOINT,
      params={'format': 'json', 'query': sparql},
      timeout=timeout,
  )
  # Fail here with the HTTP status rather than later with a JSON decode error
  # when the service returns an error page (e.g. rate limiting, bad query).
  res.raise_for_status()
  return wikidata_to_dataframe(res.json())

In [None]:
!pip install --upgrade gspread

## Indonesia

### Wikidata

In [140]:
# Indonesia level-2 areas. Each kind of area is spread over two Wikidata
# classes, so every class is queried separately and the results are stacked.
CITY_CLASSES = ['Q3199141', 'Q4272761']
REGENCY_CLASSES = ['Q3191695', 'Q11127777']

# %s is replaced by the class QID. The earlier copies of this query also
# selected ?class and ?classLabel, but nothing in the WHERE clause binds them,
# so they never produced a column and are left out here.
PLACES_OF_CLASS_SPARQL = """
SELECT ?place ?subregion1Label ?placeLabel
WHERE
{
  ?place wdt:P31 wd:%s. # P31=instance of, P279=subclass of
  ?place wdt:P131 ?subregion1. # P131=located in the administrative territorial entity
  SERVICE wikibase:label {
    bd:serviceParam wikibase:language "en" .
  }
}
"""

def wiki_places(class_qids):
  """Fetch all places that are an instance of any of the given Wikidata classes.

  Args:
    class_qids: Iterable of Wikidata class identifiers such as 'Q3199141'.

  Returns:
    One DataFrame with the rows of every per-class query stacked in the order
    the classes were given. The per-query row index is kept as is, so index
    values repeat across classes.
  """
  return pd.concat([wiki_data(PLACES_OF_CLASS_SPARQL % qid) for qid in class_qids])

wiki_cities = wiki_places(CITY_CLASSES)
wiki_regencies = wiki_places(REGENCY_CLASSES)
wiki_cities



Unnamed: 0,place,subregion1Label,placeLabel
0,http://www.wikidata.org/entity/Q18975,Maluku,Tual
1,http://www.wikidata.org/entity/Q19151,North Maluku,Ternate
2,http://www.wikidata.org/entity/Q19153,North Maluku,Tidore
3,http://www.wikidata.org/entity/Q26837,West Papua,Sorong
4,http://www.wikidata.org/entity/Q27860,Papua,Jayapura
...,...,...,...
0,http://www.wikidata.org/entity/Q10109,Jakarta,Central Jakarta
1,http://www.wikidata.org/entity/Q10111,Jakarta,East Jakarta
2,http://www.wikidata.org/entity/Q10113,Jakarta,North Jakarta
3,http://www.wikidata.org/entity/Q10114,Jakarta,South Jakarta


### OpenCovid

In [None]:
# Location metadata published by the covid-19-open-data project, restricted to
# Indonesia (country_code == "ID").
METADATA_URL = 'https://raw.githubusercontent.com/GoogleCloudPlatform/covid-19-open-data/main/src/data/metadata.csv'
# The filter has to run on the frame that was just loaded; `provs` did not
# exist at this point, so the previous `provs.query(...)` raised a NameError.
meta = pd.read_csv(METADATA_URL).query('(country_code == "ID")')
# The final export cell merges against this frame under the name `provs`.
provs = meta
meta



### Translations

In [82]:
def indonesian_direction_to_english(place: str):
  """Move a trailing Indonesian direction word to the front, in English.

  Only names of two or more words are touched, and only when the last word is
  one of the known direction words. Words are re-joined with single spaces.
  """
  directions = {
      'Pusat': 'Central',
      'Tengah' : 'Central',
      'Utara' : 'North',
      'Selatan' : 'South',
      'Timur' : 'East',
      'Barat' : 'West',
  }
  words = place.split()
  # In Indonesian the direction comes last; in English it leads the name.
  if len(words) > 1 and words[-1] in directions:
    words = [directions[words[-1]]] + words[:-1]
  return ' '.join(words)

def indonesian_to_english(place: str):
  """Translate known Indonesian words in a place name, word by word.

  Word order is left untouched; words without a translation pass through
  unchanged. Words are re-joined with single spaces.
  """
  glossary = {
      'Kepulauan' : 'Islands',
      'Pulau' : 'Island',
  }
  translated = []
  for word in place.split():
    translated.append(glossary[word] if word in glossary else word)
  return ' '.join(translated)

# Quick sanity check of both helpers; the cell displays the two results.
[
  indonesian_direction_to_english('whatever Tengah'),
  indonesian_to_english('Kepulauan Tengah')
]

['Central whatever', 'Islands Tengah']

In [147]:
# Manual spelling overrides, keyed by the place name to replace. They are
# looked up by translate_sheet_place_to_wikidata() before the generic
# direction/word translations run, so a value here is still passed through
# those translations afterwards.
spellings = {
    # 'Pangkajene Kepulauan' : 'Pangkajene Islands', # https://translate.google.com/?sl=id&tl=en&text=Pangkajene%20Kepulauan&op=translate
    # cities
    'Bau-Bau' : 'Baubau',
    'Sungaipenuh' : 'Sungai Penuh',
    'Palangka Raya' : 'Palangkaraya',
    'Tidore Kepulauan' : 'Tidore', # https://www.wikidata.org/wiki/Q19153
    'Lubuk Linggau' : 'Lubuklinggau', # https://www.wikidata.org/wiki/Q8129
    'Pematang Siantar': 'Pematangsiantar', # https://www.wikidata.org/wiki/Q5979
    'Tanjung Balai' : 'Tanjungbalai', # https://www.wikidata.org/wiki/Q5987
    # regencies
    'Kupang' : 'Kupang Regency', # https://www.wikidata.org/wiki/Q14141
    'Toba Samosir' : 'Toba Regency', # https://www.wikidata.org/wiki/Q5911
    'Tojo Una-Una' : 'Tojo Una Una',
    'Toli-Toli': 'Tolitoli',
    'Muko Muko': 'Mukomuko', # https://www.wikidata.org/wiki/Q8033
    'Kepulauan Seribu' : 'Thousand Islands', # https://translate.google.com/?sl=id&tl=en&text=Kepulauan%20Seribu&op=translate
    'Penajam Paser Utara' : 'Penajam North Paser',
    'Pasangkayu (Mamuju Utara)': 'Pasangkayu', # https://en.wikipedia.org/wiki/Pasangkayu_Regency
    'Labuhanbatu' : 'Labuhan Batu', # https://www.wikidata.org/wiki/Q5814
    'Batu Bara': 'Batubara', # https://www.wikidata.org/wiki/Q5797
    'Limapuluh Kota': 'Lima Puluh Kota', # https://www.wikidata.org/wiki/Q6032
    'Batang Hari': 'Batanghari', # https://www.wikidata.org/wiki/Q7370

    # TODO: verify the candidates below before enabling them
    # 'Kepulauan Sangihe' : 'Sangihe',
    # 'Kepulauan Sitaro' : 'Kepulauan Siau Tagulandang Biaro',
}
def translate_sheet_place_to_wikidata(place: str) -> str:
  """Turn a place name into the spelling to look up among the Wikidata labels.

  Steps, in order: apply a manual override from `spellings` if one exists,
  move a trailing direction word to the front in English, then translate the
  remaining known Indonesian words.
  """
  respelled = spellings.get(place, place)
  return indonesian_to_english(indonesian_direction_to_english(respelled))

# Quick check of the full pipeline: direction + word translation, word
# translation only, and a manual spelling override.
[
 translate_sheet_place_to_wikidata('Kepulauan Tengah'),
 translate_sheet_place_to_wikidata('Kepulauan Tenga'),
 translate_sheet_place_to_wikidata('Bau-Bau'),
] 
 

['Central Islands', 'Islands Tenga', 'Baubau']

### Sheet

In [148]:
# Authenticate the Colab user and open the Google Sheets client.
# NOTE(review): oauth2client is deprecated and recent gspread releases appear
# to expect google-auth credentials; confirm this still authenticates after
# `pip install --upgrade gspread`.
from google.colab import auth
auth.authenticate_user()
import gspread
from oauth2client.client import GoogleCredentials
gc = gspread.authorize(GoogleCredentials.get_application_default())

sheet_url = 'https://docs.google.com/spreadsheets/d/1FJJXiGuOb5nXrjJeV3QcHNhTo38YdcsTIFl29mWDIqI/edit#gid=2006070746'
worksheet = gc.open_by_url(sheet_url).worksheet('Kode Kota')
rows = worksheet.get_all_values()

# Convert to a DataFrame: the first sheet row is skipped, the second row
# supplies the column names and the data starts on the third row.
# (pandas is already imported as pd in the setup cell.)
sheet_df = pd.DataFrame.from_records(rows[2:], columns=rows[1])

# Element-wise map over the one column that is needed; unlike a row-wise
# apply(axis=1) this also yields a valid (empty) column for an empty sheet.
sheet_df['Kota_translated'] = sheet_df['Kota'].map(translate_sheet_place_to_wikidata)

# 'Kab.' : 'Regency', 
# 'Kota': 'City', 
sheet_cities = sheet_df[sheet_df.KabKota == 'Kota']
sheet_regencies = sheet_df[sheet_df.KabKota == 'Kab.']

In [149]:
# Match rows in sheet with rows in wiki
# Match city rows in the sheet against the Wikidata city labels, twice:
# first on the name exactly as written in the sheet, then on the translated
# name. Wiki columns from the second pass carry the '_y' suffix.
cities_by_sheet_name = sheet_cities.merge(
    wiki_cities, how='left', left_on=['Kota'], right_on=['placeLabel'])
cities = cities_by_sheet_name.merge(
    wiki_cities, how='left', left_on=['Kota_translated'], right_on=['placeLabel'],
    suffixes = [ None, '_y'])
# A row is missing only when neither pass found a Wikidata label.
unmatched_in_both = cities[['placeLabel', 'placeLabel_y']].isnull().all(axis=1)
missing = cities[unmatched_in_both]
missing

Unnamed: 0,ID Provinsi,Provinsi,ID Kota,KabKota,Kota,Kota_translated,place,subregion1Label,placeLabel,place_y,subregion1Label_y,placeLabel_y


In [150]:
# Match regency rows in the sheet against the Wikidata regency labels, twice:
# first on the name exactly as written in the sheet, then on the translated
# name. Wiki columns from the second pass carry the '_y' suffix.
regencies_by_sheet_name = sheet_regencies.merge(
    wiki_regencies, how='left', left_on=['Kota'], right_on=['placeLabel'])
regencies = regencies_by_sheet_name.merge(
    wiki_regencies, how='left', left_on=['Kota_translated'], right_on=['placeLabel'],
    suffixes = [ None, '_y'])
# A row is missing only when neither pass found a Wikidata label.
unmatched_in_both = regencies[['placeLabel', 'placeLabel_y']].isnull().all(axis=1)
missing = regencies[unmatched_in_both]
missing

Unnamed: 0,ID Provinsi,Provinsi,ID Kota,KabKota,Kota,Kota_translated,place,subregion1Label,placeLabel,place_y,subregion1Label_y,placeLabel_y
252,21,Maluku Utara,255,Kab.,Pulau Morotai,Island Morotai,,,,,,
295,24,Papua,322,Kab.,Kepulauan Yapen,Islands Yapen,,,,,,
361,28,Sulawesi Selatan,384,Kab.,Kepulauan Selayar,Islands Selayar,,,,,,
405,31,Sulawesi Utara,420,Kab.,Kepulauan Sangihe,Islands Sangihe,,,,,,
406,31,Sulawesi Utara,421,Kab.,Kepulauan Sitaro,Islands Sitaro,,,,,,
407,31,Sulawesi Utara,422,Kab.,Kepulauan Talaud,Islands Talaud,,,,,,


In [None]:
# NOTE(review): `wiki_df` and the columns 'Kota_standardised',
# 'place_standardised' and 'regiontype' are not created anywhere in the visible
# notebook, so this cell presumably relies on state from an earlier revision.
# Confirm where they come from (or rebuild them) before running on a fresh kernel.
# Left join: every wiki row is kept, and rows whose 'Kota' is null afterwards
# are wiki places for which no sheet row matched.
df = wiki_df.merge(sheet_df, how='left', right_on=['Kota_standardised', 'regiontype'], left_on=['place_standardised', 'regiontype']) 
missing = df[df['Kota'].isnull()]
missing
# len(missing)



In [None]:
# Matching rows with subregions: attach the metadata row whose subregion1_name
# equals the Wikidata parent-area label of each place.
df = df.merge(provs, how='inner', left_on='subregion1Label', right_on='subregion1_name')
# Drop incomplete rows *before* building the composite key. Once the key has
# been turned into text, a missing value reads 'nan' and isnull() no longer
# detects it, so filtering afterwards could never remove anything on 'key'.
df = df[df['Kota'].notnull() & df['key'].notnull()].copy()
# Composite key: "<subregion key>_<ID Kota>".
df['key'] = df['key'].astype(str) + '_' + df['ID Kota'].astype(str)
cols = ['key', 'country_code', 'country_name', 'subregion1_code', 'subregion1_name', 'ID Kota', 'placeLabel']

# Printed as raw CSV text so it can be copied out of the notebook.
print(df[cols].to_csv())
