<a href="https://colab.research.google.com/github/dhan16/colabs/blob/master/covid19opendata/Indonesia.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Setup

In [None]:
import requests
import pandas as pd

# URL of the SPARQL endpoint that wiki_data() sends its queries to.
ENDPOINT = "https://query.wikidata.org/sparql"

def wikidata_to_dataframe(json):
  """Convert a parsed SPARQL JSON response into a DataFrame of raw values.

  Args:
    json: The decoded response body. It must contain ``results.bindings``, a
      list of dicts mapping a variable name to a dict with a ``"value"``
      entry. An optional ``head.vars`` list of variable names is honoured
      when present.

  Returns:
    A DataFrame with one row per binding and one column per variable. A
    variable that is not bound in a given row is stored as a missing value.
  """
  results = json["results"]["bindings"]
  # Start from the declared variables (when the response lists them) so that a
  # variable which is never bound still gets a column, then add every key seen
  # in any binding. Using only the keys of the longest binding would drop
  # variables that are bound solely in other rows.
  columns = list(dict.fromkeys(json.get("head", {}).get("vars", [])))
  seen = set(columns)
  for result in results:
    for name in result:
      if name not in seen:
        seen.add(name)
        columns.append(name)
  rows = [
      [result[name]["value"] if name in result else None for name in columns]
      for result in results
  ]
  return pd.DataFrame(rows, columns=columns)

def wiki_data(sparql, timeout=120):
  """Run a SPARQL query against ENDPOINT and return the result as a DataFrame.

  Args:
    sparql: The SPARQL query text, sent as the ``query`` request parameter.
    timeout: Seconds to wait for the server before giving up; forwarded to
      ``requests.get``.

  Returns:
    The DataFrame built by ``wikidata_to_dataframe`` from the JSON response.

  Raises:
    requests.HTTPError: If the endpoint answers with an error status.
    requests.Timeout: If the server does not answer within ``timeout``.
  """
  res = requests.get(
      ENDPOINT,
      params={'format': 'json', 'query': sparql},
      timeout=timeout,
  )
  # Surface the HTTP status directly; without this an error page would only
  # show up as a confusing JSON decoding failure on the next line.
  res.raise_for_status()
  return wikidata_to_dataframe(res.json())

In [None]:
!pip install --upgrade gspread

### Wikidata

In [None]:
# Query for places that are instances (P31) of Q12479774 or of any of its
# subclasses (P279*), together with their parent administrative entity (P131),
# their P2588 code (bound as ?indonesia_admincode) and their direct class.
# The two OPTIONAL clauses pick up a P582 qualifier on the P131 / P31
# statements when one exists.
# NOTE(review): P582 is presumably "end time", matching the *_endtime variable
# names — confirm.
# NOTE(review): Q12479774 is not explained in this file; presumably the class
# of Indonesian second-level divisions — confirm.
sparql = """
SELECT ?place ?subregion1Label ?placeLabel ?classLabel ?indonesia_admincode ?subregion1_endtime ?class_endtime
WHERE
{
  ?place wdt:P31/wdt:P279* wd:Q12479774. # P31=instance of, P279=subclass of
  ?place wdt:P131 ?subregion1. # P131=located in the administrative territorial entity
  OPTIONAL { ?place p:P131 [ps:P131 ?subregion1; pq:P582 ?subregion1_endtime ]. }
  ?place wdt:P2588 ?indonesia_admincode.
  ?place wdt:P31 ?class.
  OPTIONAL { ?place p:P31 [ps:P31 ?class; pq:P582 ?class_endtime ]. }
  SERVICE wikibase:label {
    bd:serviceParam wikibase:language "en" .
  }
}
"""
# Run the query once and keep the unfiltered result for the cells below.
wiki_raw = wiki_data(sparql)
wiki_raw
len(wiki_raw)

In [None]:
def city_or_regency(classLabel: str) -> str:
  """Classify a class label as 'regency', 'city' or 'Other'.

  The check is a case-insensitive substring test. 'regency' is tried before
  'city', so a label containing both words is reported as 'regency'.
  """
  lowered = classLabel.lower()
  for keyword in ('regency', 'city'):
    if keyword in lowered:
      return keyword
  return 'Other'

# Keep only rows that carry no end-time value for either the parent region or
# the class. isna() treats None and NaN alike, so the filter does not depend
# on how pandas happens to store the missing values, and it also works on an
# empty frame. The .copy() gives an independent frame, so adding a column
# below does not write into a slice of wiki_raw.
wiki_df = wiki_raw[wiki_raw['subregion1_endtime'].isna() & wiki_raw['class_endtime'].isna()].copy()
# Reduce the class label to 'regency' / 'city' / 'Other'.
wiki_df['city_or_regency'] = wiki_df['classLabel'].map(city_or_regency)
# Once classLabel is gone, rows that differed only in their class label
# collapse into exact duplicates and are dropped.
wiki_df = wiki_df.drop(columns=['classLabel']).drop_duplicates()

wiki_df
len(wiki_df)

In [None]:
# Discard rows whose subregion label is the literal string 'Q28725381'.
# NOTE(review): this looks like an entity id returned in place of a missing
# English label — confirm.
wiki_df = wiki_df.loc[wiki_df['subregion1Label'] != 'Q28725381']
# Show every row whose (placeLabel, city_or_regency) pair occurs more than
# once; keep=False marks all members of a duplicate group, not just the extras.
duplicate = wiki_df.loc[wiki_df.duplicated(subset=['placeLabel', 'city_or_regency'], keep=False)]
duplicate


### Andrafarm Sheet

In [None]:
# Authenticate the Colab user first; the credentials fetched below rely on it.
from google.colab import auth
auth.authenticate_user()
import gspread
from oauth2client.client import GoogleCredentials
# Authorise the gspread client with the application-default credentials.
gc = gspread.authorize(GoogleCredentials.get_application_default())

sheet_url = 'https://docs.google.com/spreadsheets/d/1FJJXiGuOb5nXrjJeV3QcHNhTo38YdcsTIFl29mWDIqI/edit#gid=2006070746'
# Read every cell of the 'Kode Kota' worksheet as a list of rows.
worksheet = gc.open_by_url(sheet_url).worksheet('Kode Kota')
rows = worksheet.get_all_values()
# rows[1] supplies the column names and the data starts at rows[2]; rows[0]
# is not used.
sheet_raw = pd.DataFrame.from_records(rows[2:], columns=rows[1])

### Match Andrafarm to Wikidata

In [None]:
def indonesian_direction_to_english(place: str):
  """Move a trailing Indonesian direction word to the front, in English.

  A multi-word name whose last word is one of the known direction words gets
  that word replaced by its English form and moved to the start, e.g.
  'Jakarta Barat' -> 'West Jakarta'. Single-word names and names without a
  trailing direction are returned with their words re-joined by single spaces.
  """
  directions = {
      'Pusat': 'Central',
      'Tengah' : 'Central',
      'Utara' : 'North',
      'Selatan' : 'South',
      'Timur' : 'East',
      'Barat' : 'West',
  }
  words = place.split()
  # Only names of two or more words are reordered, so a lone direction word is
  # left as it is.
  english = directions.get(words[-1]) if len(words) > 1 else None
  if english is not None:
    words = [english] + words[:-1]
  return ' '.join(words)

def indonesian_islands_to_english(place: str):
  """Translate 'Kepulauan' / 'Pulau' and move a leading one to the end.

  Every occurrence of the two words is translated to 'Islands' / 'Island'.
  If the name has more than one word and now starts with one of those English
  words, that word is moved to the end, e.g. 'Kepulauan Tengah' ->
  'Tengah Islands'.
  """
  to_english = {
      'Kepulauan' : 'Islands',
      'Pulau' : 'Island',
  }
  words = [to_english.get(word, word) for word in place.split()]
  # Indonesian puts the island word first; English puts it last.
  if len(words) > 1 and words[0] in to_english.values():
    words.append(words.pop(0))
  return ' '.join(words)

# Ad-hoc check of the two helpers; evaluates to a list of their outputs.
[
  indonesian_direction_to_english('whatever Tengah'),
  indonesian_islands_to_english('Kepulauan Tengah')
]

In [None]:
# Hand-maintained overrides from a sheet place name to the name used when
# matching against the Wikidata frame. translate_sheet_place_to_wikidata()
# consults this table first and only falls back to the rule-based word
# translations when a name is not listed here.
spellings = {
    # cities
    'Bau-Bau' : 'Baubau',
    'Sungaipenuh' : 'Sungai Penuh',
    'Palangka Raya' : 'Palangkaraya',
    'Tidore Kepulauan' : 'Tidore', # https://www.wikidata.org/wiki/Q19153
    'Lubuk Linggau' : 'Lubuklinggau', # https://www.wikidata.org/wiki/Q8129
    'Pematang Siantar': 'Pematangsiantar', # https://www.wikidata.org/wiki/Q5979
    'Tanjung Balai' : 'Tanjungbalai', # https://www.wikidata.org/wiki/Q5987
    'Yogyakarta': 'Yogyakarta City',
    # regencies
    'Kupang' : 'Kupang Regency', # https://www.wikidata.org/wiki/Q14141
    'Toba Samosir' : 'Toba Regency', # https://www.wikidata.org/wiki/Q5911
    'Tojo Una-Una' : 'Tojo Una Una',
    'Toli-Toli': 'Tolitoli',
    'Muko Muko': 'Mukomuko', # https://www.wikidata.org/wiki/Q8033
    'Kepulauan Seribu' : 'Thousand Islands', # https://translate.google.com/?sl=id&tl=en&text=Kepulauan%20Seribu&op=translate
    'Penajam Paser Utara' : 'Penajam North Paser',
    'Pasangkayu (Mamuju Utara)': 'Pasangkayu', # https://en.wikipedia.org/wiki/Pasangkayu_Regency
    'Labuhanbatu' : 'Labuhan Batu', # https://www.wikidata.org/wiki/Q5814
    'Batu Bara': 'Batubara', # https://www.wikidata.org/wiki/Q5797
    'Limapuluh Kota': 'Lima Puluh Kota', # https://www.wikidata.org/wiki/Q6032
    'Batang Hari': 'Batanghari', # https://www.wikidata.org/wiki/Q7370
    'Kepulauan Sangihe': 'Sangihe', # https://www.wikidata.org/wiki/Q15839
    'Kepulauan Sitaro': 'Kepulauan Siau Tagulandang Biaro', # https://www.wikidata.org/wiki/Q15840
}
def translate_sheet_place_to_wikidata(place: str) -> str:
  """Return the name to try against Wikidata for a sheet place name.

  A name listed in ``spellings`` is returned exactly as mapped there. Any
  other name goes through the direction translation and then the island
  translation.
  """
  override = spellings.get(place)
  if override is not None:
    return override
  return indonesian_islands_to_english(indonesian_direction_to_english(place))

# Ad-hoc check of the combined translation on three sample names; evaluates to
# a list of the translated names.
[
 translate_sheet_place_to_wikidata('Kepulauan Tengah'),
 translate_sheet_place_to_wikidata('Kepulauan Tenga'),
 translate_sheet_place_to_wikidata('Jakarta Barat'),
]

In [None]:
# Match rows in sheet with rows in wiki
def city_or_regency(KabKota: str) -> str:
  """Map the sheet's KabKota marker to 'regency' or 'city'.

  'Kab.' becomes 'regency' and 'Kota' becomes 'city'; any other value is
  returned unchanged.
  """
  english = {'Kab.': 'regency', 'Kota': 'city'}
  return english.get(KabKota, KabKota)

# Drop the rows marked 'zTam'. The .copy() gives an independent frame, so the
# column assignments below do not write into a slice of sheet_raw.
sheet_df = sheet_raw[sheet_raw.KabKota != 'zTam'].copy()
# Series.map applies the function per value; unlike a row-wise apply it also
# behaves on an empty frame.
sheet_df['KabKota_eng'] = sheet_df['KabKota'].map(city_or_regency)
[set(sheet_df.KabKota_eng), len(sheet_df) ]


sheet_df['Kota_translated'] = sheet_df['Kota'].map(translate_sheet_place_to_wikidata)

# First pass: join on the sheet's own spelling (Kota). validate="1:1" makes the
# merge raise if either side has duplicate join keys.
df = sheet_df.merge(wiki_df, how='left', left_on=['KabKota_eng', 'Kota'], right_on=['city_or_regency', 'placeLabel'], validate="1:1")
# Second pass: rows which didn't match on Kota are cut back to the sheet
# columns and joined again on Kota_translated.
df1 = df[df['placeLabel'].notna()]
df2 = df.loc[df['placeLabel'].isna(), list(sheet_df.columns)]
df2 = df2.merge(wiki_df, how='left', left_on=['KabKota_eng', 'Kota_translated'], right_on=['city_or_regency', 'placeLabel'], validate="1:1")
# ignore_index avoids the duplicate index labels that stacking the two passes
# would otherwise produce.
df = pd.concat([df1, df2], ignore_index=True)
df.head()

### Covid19OpenData Metadata

In [None]:
# Load the metadata table and keep only the rows whose country_code is "ID".
meta = pd.read_csv('https://raw.githubusercontent.com/GoogleCloudPlatform/covid-19-open-data/main/src/data/metadata.csv')
meta = meta[meta['country_code'] == 'ID']
len(meta)
# Rows that have a subregion1 code but no subregion2 code.
meta1 = meta.loc[meta['subregion1_code'].notnull() & meta['subregion2_code'].isnull()]
meta1
len(meta1)

### Match to Covid19OpenData

In [None]:
def translate_wiki_subregion_to_metadata(place: str) -> str:
  """Return the name to match against subregion1_name for a Wikidata label.

  Only labels listed in the local table are rewritten; every other value is
  passed through unchanged.
  """
  translations = {
      'Special Region of Yogyakarta': 'Yogyakarta'
  }
  if place in translations:
    return translations[place]
  return place

# Translate each Wikidata subregion label, then left-join the subregion1 rows
# of the metadata on that translated name.
df = df.assign(
    subregion1_translated=df.apply(
        lambda row: translate_wiki_subregion_to_metadata(row.subregion1Label),
        axis=1,
    )
)
df = df.merge(meta1, how='left', left_on='subregion1_translated', right_on='subregion1_name')

# Rows for which the join found no metadata subregion.
missing = df[df['subregion1_name'].isna()]
missing


In [None]:
# Fill in the remaining metadata.csv columns. assign() evaluates its keyword
# arguments in order, so 'key' can read the 'subregion2_code' computed just
# before it.
df = df.assign(
    country_code='ID',
    country_name='Indonesia',
    # The admin code with its dots removed.
    subregion2_code=lambda frame: frame.apply(
        lambda row: row.indonesia_admincode.replace('.', ''), axis=1),
    key=lambda frame: frame.apply(
        lambda row: 'ID_' + row.subregion1_code + '_' + row.subregion2_code, axis=1),
    locality_code=None,
    locality_name=None,
    # The sheet name is recorded only when it differs from the Wikidata label.
    match_string=lambda frame: frame.apply(
        lambda row: row.Kota if row.Kota != row.placeLabel else None, axis=1),
    aggregate_report_offset=None,
)
df.head()

In [None]:
# Print the subregion2_code -> 'ID Kota' mapping, ordered by key, one
# '"code": id,' line per entry.
df = df.sort_values(by='key')
dictionary = dict(zip(df['subregion2_code'], df['ID Kota']))
for key, value in dictionary.items():
    print(f'  "{key}": {value},')

In [None]:
# Emit the new rows together with the existing subregion1 rows as header-less
# CSV, sorted by key. 'placeLabel' is written in the seventh column position.
cols = ['key', 'country_code', 'country_name', 'subregion1_code', 'subregion1_name', 'subregion2_code', 'placeLabel', 'locality_code', 'locality_name', 'match_string', 'aggregate_report_offset']
df1 = df.loc[:, cols]
df = pd.concat([df1, meta1]).sort_values(by='key')
print(df.to_csv(columns=cols, index=False, header=False))