<a href="https://colab.research.google.com/github/dhan16/colabs/blob/master/covid19opendata/WikiData.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Setup

In [78]:

import requests
import pandas as pd

ENDPOINT = "https://query.wikidata.org/sparql"

def wikidata_to_dataframe(json):
  results = json["results"]["bindings"]
  columns=[]
  for result in results:
    if len(result) > len(columns):
      columns=[val for val in result]
  rows = []
  for result in results:
    values = [result[c]["value"] if c in result else None for c in columns ]
    rows.append(values)
  return pd.DataFrame(rows, columns=columns)

def wiki_data(sparql):
  res = requests.get(ENDPOINT, params = {'format': 'json', 'query': sparql})
  # print(res.json())
  return wikidata_to_dataframe(res.json())

In [2]:
!pip install --upgrade gspread

Collecting gspread
  Downloading https://files.pythonhosted.org/packages/df/f0/e345e7159c89b898f183cc40ed9909619475492bb000652d709f395f096a/gspread-3.7.0-py3-none-any.whl
Installing collected packages: gspread
  Found existing installation: gspread 3.0.1
    Uninstalling gspread-3.0.1:
      Successfully uninstalled gspread-3.0.1
Successfully installed gspread-3.7.0


## Indonesia

### Wikidata

In [79]:
sparql = """
SELECT ?place ?subregion1Label ?placeLabel ?classLabel ?indonesia_admincode ?subregion1_endtime ?class_endtime
WHERE
{
  ?place wdt:P31/wdt:P279* wd:Q12479774. # P31=instance of, P279=subclass of
  ?place wdt:P131 ?subregion1. # P131=located in the administrative territorial entity
  OPTIONAL { ?place p:P131 [ps:P131 ?subregion1; pq:P582 ?subregion1_endtime ]. }
  ?place wdt:P2588 ?indonesia_admincode.
  ?place wdt:P31 ?class.
  OPTIONAL { ?place p:P31 [ps:P31 ?class; pq:P582 ?class_endtime ]. }
  SERVICE wikibase:label {
    bd:serviceParam wikibase:language "en" .
  }
}
"""
wiki_df = wiki_data(sparql)
wiki_df
# len(wiki_df)

Unnamed: 0,place,class_endtime,indonesia_admincode,subregion1Label,placeLabel,classLabel,subregion1_endtime
0,http://www.wikidata.org/entity/Q7256,,13.74,West Sumatra,Padang Panjang,human settlement,
1,http://www.wikidata.org/entity/Q15840,,71.09,North Sulawesi,Kepulauan Siau Tagulandang Biaro,island group,
2,http://www.wikidata.org/entity/Q16561,,81.07,Maluku,Kepulauan Aru,island group,
3,http://www.wikidata.org/entity/Q4803,,33.72,Central Java,Surakarta,big city,
4,http://www.wikidata.org/entity/Q5779,,11.71,Aceh,Banda Aceh,big city,
...,...,...,...,...,...,...,...
674,http://www.wikidata.org/entity/Q18511812,,16.13,South Sumatra,Musi Rawas Utara,regency of Indonesia,
675,http://www.wikidata.org/entity/Q19745487,,74.14,Southeast Sulawesi,Buton Tengah,regency of Indonesia,
676,http://www.wikidata.org/entity/Q19745612,,74.15,Southeast Sulawesi,Buton Selatan,regency of Indonesia,
677,http://www.wikidata.org/entity/Q19746428,,74.13,Southeast Sulawesi,Muna Barat,regency of Indonesia,


In [80]:
set(wiki_df['classLabel'])

def city_or_regency(classLabel: str) -> str:
  classLabel = classLabel.lower()
  if 'regency' in classLabel:
    return 'regency'
  elif 'city' in classLabel:
    return 'city'
  else:
    return 'Other'
wiki_df['city_or_regency'] = wiki_df.apply(lambda r: city_or_regency(r.classLabel), axis=1)

set(wiki_df['city_or_regency'])

wiki_df = wiki_df.drop(['classLabel'], axis=1)
wiki_df = wiki_df.drop_duplicates()

wiki_cities = wiki_df[wiki_df['city_or_regency'] == 'city']
wiki_regencies = wiki_df[wiki_df['city_or_regency'] == 'regency']
wiki_cities
# wiki_regencies

Unnamed: 0,place,class_endtime,indonesia_admincode,subregion1Label,placeLabel,subregion1_endtime,city_or_regency
3,http://www.wikidata.org/entity/Q4803,,33.72,Central Java,Surakarta,,city
4,http://www.wikidata.org/entity/Q5779,,11.71,Aceh,Banda Aceh,,city
5,http://www.wikidata.org/entity/Q5781,,11.74,Aceh,Langsa,,city
6,http://www.wikidata.org/entity/Q5784,,11.73,Aceh,Lhokseumawe,,city
7,http://www.wikidata.org/entity/Q5954,,12.75,North Sumatra,Binjai,,city
...,...,...,...,...,...,...,...
630,http://www.wikidata.org/entity/Q10111,,31.75,Jakarta,East Jakarta,,city
631,http://www.wikidata.org/entity/Q10109,,31.71,Jakarta,Central Jakarta,,city
632,http://www.wikidata.org/entity/Q10114,,31.74,Jakarta,South Jakarta,,city
633,http://www.wikidata.org/entity/Q10113,,31.72,Jakarta,North Jakarta,,city


In [81]:
wiki_cities = wiki_cities[wiki_cities.apply(lambda r : r.subregion1_endtime is None and r.class_endtime is None, axis=1)]

In [82]:
duplicate = wiki_cities[wiki_cities.duplicated('placeLabel', keep=False)] 
len(duplicate)
duplicate

Unnamed: 0,place,class_endtime,indonesia_admincode,subregion1Label,placeLabel,subregion1_endtime,city_or_regency
80,http://www.wikidata.org/entity/Q15847,,71.71,North Sulawesi,Manado,,city
81,http://www.wikidata.org/entity/Q15847,,71.71,Q28725381,Manado,,city
558,http://www.wikidata.org/entity/Q10126,,36.73,Banten,Serang,,city
559,http://www.wikidata.org/entity/Q10126,,36.73,city of Indonesia,Serang,,city


In [3]:
# # indonesia level2 areas
# # city=Q3199141,Q4272761 
# # regency = Q3191695, Q11127777

# sparql_template = """
# SELECT ?place ?subregion1Label ?placeLabel ?class ?classLabel ?indonesia_admincode
# WHERE
# {
#   ?place wdt:P31 wd:Q3199141. # P31=instance of, P279=subclass of
#   ?place wdt:P131 ?subregion1. # P131=located in the administrative territorial entity
#   ?place wdt:P2588 ?indonesia_admincode.
#   SERVICE wikibase:label {
#     bd:serviceParam wikibase:language "en" .
#   }
# }
# """
# wiki_cities = wiki_data(sparql)



# sparql = """
# SELECT ?place ?subregion1Label ?placeLabel ?class ?classLabel ?indonesia_admincode
# WHERE
# {
#   ?place wdt:P31 wd:Q3199141. # P31=instance of, P279=subclass of
#   ?place wdt:P131 ?subregion1. # P131=located in the administrative territorial entity
#   ?place wdt:P2588 ?indonesia_admincode.
#   SERVICE wikibase:label {
#     bd:serviceParam wikibase:language "en" .
#   }
# }
# """
# wiki_cities = wiki_data(sparql)

# sparql = """
# SELECT ?place ?subregion1Label ?placeLabel ?class ?classLabel ?indonesia_admincode
# WHERE
# {
#   ?place wdt:P31 wd:Q4272761. # P31=instance of, P279=subclass of
#   ?place wdt:P131 ?subregion1. # P131=located in the administrative territorial entity
#   ?place wdt:P2588 ?indonesia_admincode.
#   SERVICE wikibase:label {
#     bd:serviceParam wikibase:language "en" .
#   }
# }
# """
# wiki_cities1 = wiki_data(sparql)

# sparql = """
# SELECT ?place ?subregion1Label ?placeLabel ?class ?classLabel ?indonesia_admincode
# WHERE
# {
#   ?place wdt:P31 wd:Q3191695. # P31=instance of, P279=subclass of
#   ?place wdt:P131 ?subregion1. # P131=located in the administrative territorial entity
#   ?place wdt:P2588 ?indonesia_admincode.
#   SERVICE wikibase:label {
#     bd:serviceParam wikibase:language "en" .
#   }
# }
# """
# wiki_regencies = wiki_data(sparql)
# sparql = """
# SELECT ?place ?subregion1Label ?placeLabel ?class ?classLabel ?indonesia_admincode
# WHERE
# {
#   ?place wdt:P31 wd:Q11127777. # P31=instance of, P279=subclass of
#   ?place wdt:P131 ?subregion1. # P131=located in the administrative territorial entity
#   ?place wdt:P2588 ?indonesia_admincode.
#   SERVICE wikibase:label {
#     bd:serviceParam wikibase:language "en" .
#   }
# }
# """
# wiki_regencies1 = wiki_data(sparql)

# wiki_cities = pd.concat([wiki_cities, wiki_cities1])
# wiki_regencies = pd.concat([wiki_regencies, wiki_regencies1])
# wiki_cities
# # d = wiki_cities[wiki_cities.placeLabel]
# # d
# wiki_regencies



Unnamed: 0,place,indonesia_admincode,subregion1Label,placeLabel
0,http://www.wikidata.org/entity/Q11495,51.08,Bali,Buleleng
1,http://www.wikidata.org/entity/Q11496,51.04,Bali,Gianyar
2,http://www.wikidata.org/entity/Q11499,51.01,Bali,Jembrana
3,http://www.wikidata.org/entity/Q11501,51.07,Bali,Karangasem
4,http://www.wikidata.org/entity/Q11503,51.05,Bali,Klungkung
...,...,...,...,...
428,http://www.wikidata.org/entity/Q676084,82.08,North Maluku,Pulau Taliabu
429,http://www.wikidata.org/entity/Q3182322,17.04,Bengkulu,Kaur
430,http://www.wikidata.org/entity/Q4069046,53.21,East Nusa Tenggara,Malaka
431,http://www.wikidata.org/entity/Q4201768,81.01,Maluku,Maluku Tengah


### OpenCovid

In [24]:
meta = pd.read_csv('https://raw.githubusercontent.com/GoogleCloudPlatform/covid-19-open-data/main/src/data/metadata.csv')
meta = meta.query('(country_code == "ID")')
# meta

In [25]:
#  wiki_cities = wiki_cities.merge(meta, how='inner', left_on=['subregion1Label'], right_on=['subregion1_name']) 
#  wiki_regencies = wiki_regencies.merge(meta, how='inner', left_on=['subregion1Label'], right_on=['subregion1_name']) 
 

In [26]:
# c = wiki_regencies[wiki_regencies.placeLabel == 'Tuban']
# c

Unnamed: 0,place,indonesia_admincode,subregion1Label,placeLabel,city_or_regency,key,country_code,country_name,subregion1_code,subregion1_name,subregion2_code,subregion2_name,locality_code,locality_name,match_string,aggregate_report_offset
427,http://www.wikidata.org/entity/Q11141,35.23,East Java,Tuban,regency,ID_JI,ID,Indonesia,JI,East Java,,,,,Jawa Timur,


In [27]:
duplicate = wiki_cities[wiki_cities.duplicated('placeLabel', keep=False)] 
len(duplicate)


12

### Translations

In [8]:
def indonesian_direction_to_english(place: str):
  to_english = {
      'Pusat': 'Central',
      'Tengah' : 'Central',
      'Utara' : 'North',
      'Selatan' : 'South',
      'Timur' : 'East',
      'Barat' : 'West',
  }
  bits = place.split()
  if len(bits) > 1:
    for indo_dir, eng_dir in to_english.items():
      # in indonesian the dir is at the end 
      if bits[-1] == indo_dir:
        bits.pop()
        bits.insert(0, eng_dir)
        break
  place = ' '.join(bits)
  return place

def indonesian_islands_to_english(place: str):
  to_english = {
      'Kepulauan' : 'Islands',
      'Pulau' : 'Island',
  }
  bits = place.split()
  bits = [to_english.get(b, b) for b in bits]
  # in english, islands is at the end
  if len(bits) > 1:
    for eng in to_english.values():
      # in indonesian the dir is at the end 
      if bits[0] == eng:
        bits.pop(0)
        bits.append(eng)
        break
  place = ' '.join(bits)
  return place

[
  indonesian_direction_to_english('whatever Tengah'),
  indonesian_islands_to_english('Kepulauan Tengah')
]

['Central whatever', 'Tengah Islands']

In [9]:
spellings = {
    # cities
    'Bau-Bau' : 'Baubau',
    'Sungaipenuh' : 'Sungai Penuh',
    'Palangka Raya' : 'Palangkaraya',
    'Tidore Kepulauan' : 'Tidore', # https://www.wikidata.org/wiki/Q19153
    'Lubuk Linggau' : 'Lubuklinggau', # https://www.wikidata.org/wiki/Q8129
    'Pematang Siantar': 'Pematangsiantar', # https://www.wikidata.org/wiki/Q5979
    'Tanjung Balai' : 'Tanjungbalai', # https://www.wikidata.org/wiki/Q5987
    # regencies
    'Kupang' : 'Kupang Regency', # https://www.wikidata.org/wiki/Q14141
    'Toba Samosir' : 'Toba Regency', # https://www.wikidata.org/wiki/Q5911
    'Tojo Una-Una' : 'Tojo Una Una',
    'Toli-Toli': 'Tolitoli',
    'Muko Muko': 'Mukomuko', # https://www.wikidata.org/wiki/Q8033
    'Kepulauan Seribu' : 'Thousand Islands', # https://translate.google.com/?sl=id&tl=en&text=Kepulauan%20Seribu&op=translate
    'Penajam Paser Utara' : 'Penajam North Paser',
    'Pasangkayu (Mamuju Utara)': 'Pasangkayu', # https://en.wikipedia.org/wiki/Pasangkayu_Regency
    'Labuhanbatu' : 'Labuhan Batu', # https://www.wikidata.org/wiki/Q5814
    'Batu Bara': 'Batubara', # https://www.wikidata.org/wiki/Q5797
    'Limapuluh Kota': 'Lima Puluh Kota', # https://www.wikidata.org/wiki/Q6032
    'Batang Hari': 'Batanghari', # https://www.wikidata.org/wiki/Q7370
    'Kepulauan Sangihe': 'Sangihe', # https://www.wikidata.org/wiki/Q15839
    'Kepulauan Sitaro': 'Kepulauan Siau Tagulandang Biaro', # https://www.wikidata.org/wiki/Q15840
}
def translate_sheet_place_to_wikidata(place: str) -> str:
  if place in spellings:
    return spellings[place]
  place = indonesian_direction_to_english(place)
  place = indonesian_islands_to_english(place)
  return place

[
 translate_sheet_place_to_wikidata('Kepulauan Tengah'),
 translate_sheet_place_to_wikidata('Kepulauan Tenga'),
 translate_sheet_place_to_wikidata('Jakarta Barat'),
] 
 

['Central Islands', 'Tenga Islands', 'West Jakarta']

### Sheet

In [10]:
from google.colab import auth
auth.authenticate_user()
import gspread
from oauth2client.client import GoogleCredentials
gc = gspread.authorize(GoogleCredentials.get_application_default())

sheet_url = 'https://docs.google.com/spreadsheets/d/1FJJXiGuOb5nXrjJeV3QcHNhTo38YdcsTIFl29mWDIqI/edit#gid=2006070746'
worksheet = gc.open_by_url(sheet_url).worksheet('Kode Kota')
rows = worksheet.get_all_values()

sheet_df = pd.DataFrame.from_records(rows[2:], columns=rows[1])
sheet_cities = sheet_df[sheet_df.KabKota == 'Kota']
sheet_regencies = sheet_df[sheet_df.KabKota == 'Kab.']
# sheet_cities
# sheet_regencies

In [11]:
# Match rows in sheet with rows in wiki
def firstNotNull(l):
  for item in l:
    if not pd.isnull(item):
      return item
  return l[0]

def match_sheet_to_wiki(sdf, wdf):
  df = sdf.merge(wdf, how='left', left_on=['Kota'], right_on=['placeLabel'], validate="1:1")
  def kota_translated(r):
    if not pd.isnull(r.placeLabel):
      return None
    else:
      return translate_sheet_place_to_wikidata(r.Kota)
  df['Kota_translated'] = df.apply(kota_translated, axis=1)
  df = df.merge(wdf, how='left', left_on=['Kota_translated'], right_on=['placeLabel'], suffixes = [ None, '_y'], validate="1:1") 
  df['subregion2'] = df.apply(lambda r: firstNotNull([r.placeLabel, r.placeLabel_y]), axis=1)
  df['subregion1'] = df.apply(lambda r: firstNotNull([r.subregion1Label, r.subregion1Label_y]), axis=1)
  # now join to meta
  df = df.merge(meta, how='left', left_on=['subregion1'], right_on=['subregion1_name']) 
  return df

cities = match_sheet_to_wiki(sheet_cities, wiki_cities)
# missing = cities[cities['subregion2'].isnull()]
missing = cities[cities['subregion1_code'].isnull()]
missing

MergeError: ignored

In [None]:
regencies = match_sheet_to_wiki(sheet_regencies, wiki_regencies)
missing = regencies[regencies['subregion2'].isnull()]
missing

In [None]:
df = wiki_df.merge(sheet_df, how='left', right_on=['Kota_standardised', 'regiontype'], left_on=['place_standardised', 'regiontype']) 
missing = df[df['Kota'].isnull()]
missing
# len(missing)



In [None]:
# Matching rows with subregions
df = df.merge(provs, how='inner', left_on='subregion1Label', right_on='subregion1_name')
df['key'] = df.apply(lambda r: str(r.key) + '_' + str(r["ID Kota"]), axis=1)
df = df[~df['Kota'].isnull() & ~df['key'].isnull()]
cols = ['key', 'country_code', 'country_name', 'subregion1_code', 'subregion1_name', 'ID Kota', 'placeLabel']

print(df[cols].to_csv())
