<a href="https://colab.research.google.com/github/dhan16/colabs/blob/master/covid19opendata/WikiData.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## WikiData

In [None]:
# Sparql functions
import requests
import pandas as pd

ENDPOINT = "https://query.wikidata.org/sparql"
# Wikimedia asks clients to send a descriptive User-Agent; generic library
# defaults may be throttled or rejected by the query service.
USER_AGENT = "dhan16-colabs-covid19opendata/1.0 (https://github.com/dhan16/colabs)"


def wiki_data(sparql, timeout=60):
  """Run a SPARQL query against the Wikidata endpoint and return the parsed JSON.

  Args:
    sparql: SPARQL query text, sent as the `query` URL parameter.
    timeout: Seconds to wait for the server before giving up, so a stalled
      request cannot hang the notebook indefinitely.

  Returns:
    The decoded JSON response body.

  Raises:
    requests.HTTPError: if the endpoint answers with a 4xx/5xx status
      (e.g. a malformed query or rate limiting).
    requests.Timeout: if no response arrives within `timeout` seconds.
  """
  res = requests.get(
      ENDPOINT,
      params={'format': 'json', 'query': sparql},
      headers={'User-Agent': USER_AGENT},
      timeout=timeout,
  )
  # Surface HTTP errors directly instead of failing later while trying to
  # decode an HTML/plain-text error page as JSON.
  res.raise_for_status()
  return res.json()


def wikidata_to_dataframe(json):
  """Flatten a SPARQL JSON result document into a DataFrame of raw values.

  Args:
    json: Parsed response containing `results.bindings` (a list of per-row
      dicts mapping variable name -> {"value": ...}) and, optionally,
      `head.vars` (the list of selected variable names).

  Returns:
    A DataFrame with one column per variable and one row per binding, each
    cell holding the binding's "value". A variable missing from a given
    binding is left as a missing value in that row. An empty result set
    yields an empty DataFrame rather than raising.
  """
  bindings = json["results"]["bindings"]
  # Prefer the declared variable list: a binding omits variables that are
  # unbound for that row, so the first row's keys are not a reliable header.
  cols = list(json.get("head", {}).get("vars", []))
  # Pick up any variable not declared in the head (or all of them when there
  # is no head section), in first-seen order.
  for binding in bindings:
    for var in binding:
      if var not in cols:
        cols.append(var)
  # Look values up by column name so every row is aligned with `cols`.
  rows = [
      [binding[var]["value"] if var in binding else None for var in cols]
      for binding in bindings
  ]
  return pd.DataFrame(rows, columns=cols)


In [None]:
# Query Wikidata for every place that is an instance of (a subclass of)
# wd:Q12479774, together with the direct subclass of Q12479774 it is an
# instance of. P31 is Wikidata's "instance of" and P279 its "subclass of".
# NOTE(review): Q12479774 is presumably the parent class of the Indonesian
# regency/city classes mapped in the next cell — verify on wikidata.org.
# The label service fills in the ?placeLabel / ?classLabel columns in English.
sparql = """
SELECT ?place ?placeLabel ?class ?classLabel
WHERE
{
  ?place wdt:P31/wdt:P279* wd:Q12479774.
  ?place wdt:P31 ?class.
  ?class wdt:P279 wd:Q12479774.
  SERVICE wikibase:label {
    bd:serviceParam wikibase:language "en" .
  }
}
"""
# Fetch the raw JSON, then flatten it into one row per result binding.
res = wiki_data(sparql)
wiki_raw = wikidata_to_dataframe(res)
wiki_raw
# Replace the line above with `res` to inspect the raw JSON response instead.

In [None]:
# Work on a copy so wiki_raw stays as fetched and this cell can be re-run.
wiki_df = wiki_raw.copy()

# Wikidata class label (see wiki_raw.classLabel.unique()) -> coarse region
# type used as a join key against the sheet. Deliberately not named
# kabkota_to_regiontype: the sheet cell defines a dict of that name with
# different keys, and sharing it made this cell fail when re-run afterwards.
wiki_class_to_regiontype = {
    'regency of Indonesia': 'Regency',
    'administrative regency of Indonesia': 'Regency',
    'city of Indonesia': 'City',
    'administrative city of Indonesia': 'City',
}
# Vectorised lookup on the single column needed (no per-row lambda).
wiki_df['regiontype'] = wiki_df['classLabel'].map(wiki_class_to_regiontype)

# Series.map leaves unknown labels as NaN; fail loudly and name them so a new
# or unlabelled Wikidata class is easy to diagnose.
unmapped_classes = wiki_df.loc[wiki_df['regiontype'].isna(), 'classLabel'].unique()
if len(unmapped_classes):
  raise KeyError(f"classLabel values with no regiontype mapping: {list(unmapped_classes)}")
wiki_df

## Sheet

In [None]:
# Install/upgrade gspread, the Google Sheets client imported in the next cell.
# NOTE(review): the version is unpinned, so the installed release can change between runs.
!pip install --upgrade gspread

In [None]:
# Authenticate the current Colab user, then build a gspread client from the
# application-default credentials.
# NOTE(review): oauth2client is deprecated and newer gspread releases are built
# around google-auth credentials — confirm this still works after the
# `--upgrade` install above.
from google.colab import auth
auth.authenticate_user()
import gspread
from oauth2client.client import GoogleCredentials
gc = gspread.authorize(GoogleCredentials.get_application_default())

# Read every cell of the 'Kode Kota' worksheet as a list of rows.
sheet_url = 'https://docs.google.com/spreadsheets/d/1FJJXiGuOb5nXrjJeV3QcHNhTo38YdcsTIFl29mWDIqI/edit#gid=2006070746'
worksheet = gc.open_by_url(sheet_url).worksheet('Kode Kota')
rows = worksheet.get_all_values()

# Convert to a DataFrame (nothing is displayed by this cell).
# rows[1] supplies the column names and rows[2:] the data; rows[0] is skipped —
# presumably a title/banner row, verify against the sheet.
import pandas as pd
sheet_raw = pd.DataFrame.from_records(rows[2:], columns=rows[1])


In [None]:
# Work on a copy so sheet_raw stays as loaded and this cell can be re-run.
sheet_df = sheet_raw.copy()

# Sheet 'KabKota' code -> coarse region type used as a join key against
# Wikidata. 'zTam' is passed through unchanged.
kabkota_to_regiontype = {
    'Kab.': 'Regency',
    'Kota': 'City',
    'zTam': 'zTam',
}
# Vectorised lookup on the single column needed (no per-row lambda).
sheet_df['regiontype'] = sheet_df['KabKota'].map(kabkota_to_regiontype)

# Series.map leaves unknown codes as NaN; fail loudly and name them so an
# unexpected sheet value is easy to diagnose.
unmapped_kabkota = sheet_df.loc[sheet_df['regiontype'].isna(), 'KabKota'].unique()
if len(unmapped_kabkota):
  raise KeyError(f"KabKota values with no regiontype mapping: {list(unmapped_kabkota)}")
sheet_df


## Sheet vs WikiData

In [None]:
def indonesian_direction_to_english(place: str):
  """Move a trailing Indonesian direction word to the front, in English.

  Indonesian puts the direction last ('jakarta pusat'); English puts it first
  ('central jakarta'). Only the final word is examined, and a single-word name
  is never treated as a direction.

  Args:
    place: Place name, expected to be lower-cased already.

  Returns:
    The name with its words joined by single spaces, reordered and translated
    when the last word is a known direction.
  """
  english_for = {
      'pusat': 'central',
      'tengah': 'central',
      'utara': 'north',
      'selatan': 'south',
      'timur': 'east',
      'barat': 'west',
  }
  words = place.split()
  if len(words) > 1 and words[-1] in english_for:
    # Drop the trailing direction and lead with its English equivalent.
    words = [english_for[words[-1]]] + words[:-1]
  return ' '.join(words)

def indonesian_to_english(place: str):
  """Move a leading Indonesian 'island(s)' word to the end, in English.

  Indonesian puts the word first ('kepulauan seribu'); English puts it last
  ('seribu islands'). Only the first word is examined, and a single-word name
  is left alone.

  Args:
    place: Place name, expected to be lower-cased already.

  Returns:
    The name with its words joined by single spaces, reordered and translated
    when the first word is a known island term.
  """
  english_for = {
      'kepulauan': 'islands',
      'pulau': 'island',
  }
  words = place.split()
  if len(words) > 1 and words[0] in english_for:
    # Drop the leading island term and finish with its English equivalent.
    words = words[1:] + [english_for[words[0]]]
  return ' '.join(words)

# Sanity checks for the two translation helpers. Asserting (rather than relying
# on the cell's last expression) verifies every case, not only the final one.
assert indonesian_direction_to_english('tengah whatever') == 'tengah whatever'
assert indonesian_direction_to_english('whatever tengah') == 'central whatever'
assert indonesian_to_english('kepulauan tengah') == 'tengah islands'

In [None]:
# Manual spelling overrides, looked up by exact (case-sensitive) name in
# standard_place_spelling before any other normalisation. That function is
# applied to both the Wikidata labels and the sheet names.
# NOTE(review): keys look like sheet spellings and values like Wikidata English
# labels — confirm against both sources.
spellings = {
    'Toba Samosir' : 'Toba Regency',
    'Penajam Paser Utara' : 'Penajam North Paser',
    'Kupang' : 'Kupang Regency',
    'Kepulauan Seribu' : 'Thousand Islands',
    'Pangkajene Kepulauan' : 'Pangkajene Islands',
    'Pasangkayu (Mamuju Utara)': 'Pasangkayu', # see https://en.wikipedia.org/wiki/Pasangkayu_Regency
    # Hyphenated names whose target spelling drops or replaces the hyphen.
    'Bau-Bau' : 'Baubau',
    'Tojo Una-Una' : 'Tojo Una Una',
    'Toli-Toli': 'Tolitoli',
    # TODO: the two mappings below have not been verified yet.
    'Kepulauan Sangihe' : 'Sangihe',
    'Kepulauan Sitaro' : 'Kepulauan Siau Tagulandang Biaro',
}
def standard_place_spelling(place: str):
  """Reduce a place name to a comparable key.

  Steps, in order: apply the manual `spellings` override (exact match on the
  name as given), lower-case, move a trailing direction word to the front in
  English, move a leading island word to the end in English, and finally
  remove all spaces.

  Args:
    place: Place name as it appears in the source data.

  Returns:
    The lower-case, space-free key for `place`.
  """
  overridden = spellings.get(place, place).lower()
  translated = indonesian_to_english(indonesian_direction_to_english(overridden))
  return translated.replace(' ', '')


# Add the normalised join key to both frames. Series.map transforms just the
# one column needed instead of building a row object per record, and — unlike
# DataFrame.apply(axis=1) — it still returns a Series when the frame is empty.
wiki_df['place_standardised'] = wiki_df['placeLabel'].map(standard_place_spelling)
sheet_df['Kota_standardised'] = sheet_df['Kota'].map(standard_place_spelling)


In [None]:
# Wikidata places that have no counterpart in the sheet.
# The left join keeps every wiki_df row; sheet columns such as 'Kota' are NaN
# wherever the (standardised name, regiontype) pair found no partner.
df = wiki_df.merge(
    sheet_df,
    how='left',
    left_on=['place_standardised', 'regiontype'],
    right_on=['Kota_standardised', 'regiontype'],
)
missing = df.loc[df['Kota'].isna()]
missing
# len(missing) gives the number of unmatched Wikidata places.

In [None]:
# Sheet rows that have no counterpart in Wikidata.
# The left join keeps every sheet_df row; Wikidata columns such as
# 'placeLabel' are NaN wherever the (standardised name, regiontype) pair found
# no partner.
df = sheet_df.merge(
    wiki_df,
    how='left',
    left_on=['Kota_standardised', 'regiontype'],
    right_on=['place_standardised', 'regiontype'],
)
# To leave out the sheet's 'zTam' rows, additionally require
# ~df['KabKota'].isin(['zTam']) in the mask below.
missing = df.loc[df['placeLabel'].isna()]
missing
# len(missing) gives the number of unmatched sheet rows.