<a href="https://colab.research.google.com/github/dhan16/colabs/blob/master/covid19opendata/WikiData.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## WikiData

In [19]:
# Sparql functions
import requests
import pandas as pd

ENDPOINT = "https://query.wikidata.org/sparql"


def wiki_data(sparql):
  res = requests.get(ENDPOINT, params = {'format': 'json', 'query': sparql})
  return res.json()


def wikidata_to_dataframe(json):
  results = json["results"]["bindings"]
  # column names we draw from the first result
  cols = [ val for val in results[0] ]
  rows = []
  for result in results:
      values = [ result[val]["value"] for val in result ]
      rows.append(values)
  return pd.DataFrame(rows, columns=cols)


In [71]:
sparql = """
SELECT ?place ?placeLabel ?class ?classLabel
WHERE
{
  ?place wdt:P31/wdt:P279* wd:Q12479774.
  ?place wdt:P31 ?class.
  ?class wdt:P279 wd:Q12479774.
  SERVICE wikibase:label {
    bd:serviceParam wikibase:language "en" .
  }
}
"""
res = wiki_data(sparql)
wiki_raw = wikidata_to_dataframe(res)
wiki_raw
# res

Unnamed: 0,class,place,placeLabel,classLabel
0,http://www.wikidata.org/entity/Q3191695,http://www.wikidata.org/entity/Q12488339,Banggai Laut,regency of Indonesia
1,http://www.wikidata.org/entity/Q3191695,http://www.wikidata.org/entity/Q11214749,Sidenreng Rappang,regency of Indonesia
2,http://www.wikidata.org/entity/Q3191695,http://www.wikidata.org/entity/Q4201892,Katingan,regency of Indonesia
3,http://www.wikidata.org/entity/Q3191695,http://www.wikidata.org/entity/Q4201768,Maluku Tengah,regency of Indonesia
4,http://www.wikidata.org/entity/Q3191695,http://www.wikidata.org/entity/Q11491,Bangli,regency of Indonesia
...,...,...,...,...
512,http://www.wikidata.org/entity/Q3199141,http://www.wikidata.org/entity/Q7253,Padang,city of Indonesia
513,http://www.wikidata.org/entity/Q3199141,http://www.wikidata.org/entity/Q7248,Bukittinggi,city of Indonesia
514,http://www.wikidata.org/entity/Q3199141,http://www.wikidata.org/entity/Q5989,Tebing Tinggi,city of Indonesia
515,http://www.wikidata.org/entity/Q3199141,http://www.wikidata.org/entity/Q5987,Tanjungbalai,city of Indonesia


In [73]:
wiki_df = wiki_raw.copy()

# wiki_raw.classLabel.unique()
kabkota_to_regiontype = {
    'regency of Indonesia' : 'Regency',
    'administrative regency of Indonesia' : 'Regency',
    'city of Indonesia': 'City',
    'administrative city of Indonesia': 'City',
}
wiki_df['regiontype'] = wiki_df.apply(lambda r: kabkota_to_regiontype[r.classLabel], axis=1)
wiki_df

Unnamed: 0,class,place,placeLabel,classLabel,regiontype
0,http://www.wikidata.org/entity/Q3191695,http://www.wikidata.org/entity/Q12488339,Banggai Laut,regency of Indonesia,Regency
1,http://www.wikidata.org/entity/Q3191695,http://www.wikidata.org/entity/Q11214749,Sidenreng Rappang,regency of Indonesia,Regency
2,http://www.wikidata.org/entity/Q3191695,http://www.wikidata.org/entity/Q4201892,Katingan,regency of Indonesia,Regency
3,http://www.wikidata.org/entity/Q3191695,http://www.wikidata.org/entity/Q4201768,Maluku Tengah,regency of Indonesia,Regency
4,http://www.wikidata.org/entity/Q3191695,http://www.wikidata.org/entity/Q11491,Bangli,regency of Indonesia,Regency
...,...,...,...,...,...
512,http://www.wikidata.org/entity/Q3199141,http://www.wikidata.org/entity/Q7253,Padang,city of Indonesia,City
513,http://www.wikidata.org/entity/Q3199141,http://www.wikidata.org/entity/Q7248,Bukittinggi,city of Indonesia,City
514,http://www.wikidata.org/entity/Q3199141,http://www.wikidata.org/entity/Q5989,Tebing Tinggi,city of Indonesia,City
515,http://www.wikidata.org/entity/Q3199141,http://www.wikidata.org/entity/Q5987,Tanjungbalai,city of Indonesia,City


## Sheet

In [None]:
!pip install --upgrade gspread

In [74]:
from google.colab import auth
auth.authenticate_user()
import gspread
from oauth2client.client import GoogleCredentials
gc = gspread.authorize(GoogleCredentials.get_application_default())

sheet_url = 'https://docs.google.com/spreadsheets/d/1FJJXiGuOb5nXrjJeV3QcHNhTo38YdcsTIFl29mWDIqI/edit#gid=2006070746'
worksheet = gc.open_by_url(sheet_url).worksheet('Kode Kota')
rows = worksheet.get_all_values()

# Convert to a DataFrame and render.
import pandas as pd
sheet_raw = pd.DataFrame.from_records(rows[2:], columns=rows[1])


In [75]:
sheet_df = sheet_raw.copy()

kabkota_to_regiontype = {
    'Kab.' : 'Regency', 
    'Kota': 'City', 
    'zTam' : 'zTam'
}
sheet_df['regiontype'] = sheet_df.apply(lambda r: kabkota_to_regiontype[r.KabKota], axis=1)
sheet_df


Unnamed: 0,ID Provinsi,Provinsi,ID Kota,KabKota,Kota,regiontype
0,1,Aceh (NAD),258,Kab.,Aceh Barat,Regency
1,1,Aceh (NAD),259,Kab.,Aceh Barat Daya,Regency
2,1,Aceh (NAD),260,Kab.,Aceh Besar,Regency
3,1,Aceh (NAD),261,Kab.,Aceh Jaya,Regency
4,1,Aceh (NAD),262,Kab.,Aceh Selatan,Regency
...,...,...,...,...,...,...
557,34,Sumatera Utara,492,Kab.,Tapanuli Selatan,Regency
558,34,Sumatera Utara,493,Kab.,Tapanuli Tengah,Regency
559,34,Sumatera Utara,494,Kab.,Tapanuli Utara,Regency
560,34,Sumatera Utara,495,Kota,Tebing Tinggi,City


## Sheet vs WikiData

In [118]:
def indonesian_direction_to_english(place: str):
  # place is in lower
  to_english = {
      'pusat': 'central',
      'tengah' : 'central',
      'utara' : 'north',
      'selatan' : 'south',
      'timur' : 'east',
      'barat' : 'west',
  }
  bits = place.split()
  if len(bits) > 1:
    for indo_dir, eng_dir in to_english.items():
      # in indonesian the dir is at the end. island/islands 
      if bits[-1] == indo_dir:
        bits.pop()
        bits.insert(0, eng_dir)
        break
  place = ' '.join(bits)
  return place

def indonesian_to_english(place: str):
  # place is in lower
  to_english = {
      'kepulauan' : 'islands',
      'pulau' : 'island',
  }
  bits = place.split()
  if len(bits) > 1:
    for indo, eng in to_english.items():
      # in indonesian the island is at the beginning while in english its at the end
      if bits[0] == indo:
        bits.pop(0)
        bits.append(eng)
        break
  place = ' '.join(bits)
  return place

# indonesian_direction_to_english('tengah whatever')
indonesian_direction_to_english('whatever tengah')
indonesian_to_english('kepulauan tengah')

'tengah islands'

In [119]:
spellings = {
    # 'Labuhanbatu': 'Labuhan Batu',
    # 'Batu Bara' : 'Batubara',
    # 'Pematang Siantar' : 'Pematangsiantar',
    # 'Tapanuli Tengah' : 'Central Tapanuli',
    'Toba Samosir' : 'Toba Regency',
}
def standard_place_spelling(place: str):
   # other spellings if still required
  place = spellings.get(place, place)
  # all lowercase
  place = place.lower()
  # apply translations
  place = indonesian_direction_to_english(place)
  place = indonesian_to_english(place)
  # remove spaces
  place = place.replace(' ', '')
  return place


wiki_df['place_standardised'] = wiki_df.apply(lambda r: standard_place_spelling(r.placeLabel), axis=1)
sheet_df['Kota_standardised'] = sheet_df.apply(lambda r: standard_place_spelling(r.Kota), axis=1)


In [122]:
# Find rows in wiki_df with no matches in sheet_df
df = wiki_df.merge(sheet_df, how='left', right_on=['Kota_standardised', 'regiontype'], left_on=['place_standardised', 'regiontype']) 
missing = df[df['Kota'].isnull()]
missing
# len(missing)

Unnamed: 0,class,place,placeLabel,classLabel,regiontype,place_standardised,ID Provinsi,Provinsi,ID Kota,KabKota,Kota,Kota_standardised
68,http://www.wikidata.org/entity/Q3191695,http://www.wikidata.org/entity/Q15840,Kepulauan Siau Tagulandang Biaro,regency of Indonesia,Regency,siautagulandangbiaroislands,,,,,,
69,http://www.wikidata.org/entity/Q3191695,http://www.wikidata.org/entity/Q15839,Sangihe,regency of Indonesia,Regency,sangihe,,,,,,
75,http://www.wikidata.org/entity/Q3191695,http://www.wikidata.org/entity/Q15824,Tolitoli,regency of Indonesia,Regency,tolitoli,,,,,,
76,http://www.wikidata.org/entity/Q3191695,http://www.wikidata.org/entity/Q15823,Tojo Una Una,regency of Indonesia,Regency,tojounauna,,,,,,
96,http://www.wikidata.org/entity/Q3191695,http://www.wikidata.org/entity/Q15350,Pasangkayu,regency of Indonesia,Regency,pasangkayu,,,,,,
104,http://www.wikidata.org/entity/Q3191695,http://www.wikidata.org/entity/Q105124463,Q105124463,regency of Indonesia,Regency,q105124463,,,,,,
109,http://www.wikidata.org/entity/Q3191695,http://www.wikidata.org/entity/Q19727021,Bungo Tebo,regency of Indonesia,Regency,bungotebo,,,,,,
117,http://www.wikidata.org/entity/Q3191695,http://www.wikidata.org/entity/Q12488356,Sarolangun Bangko,regency of Indonesia,Regency,sarolangunbangko,,,,,,
122,http://www.wikidata.org/entity/Q3191695,http://www.wikidata.org/entity/Q12488342,Covalima,regency of Indonesia,Regency,covalima,,,,,,
223,http://www.wikidata.org/entity/Q3191695,http://www.wikidata.org/entity/Q14613,Pangkajene Islands,regency of Indonesia,Regency,pangkajeneislands,,,,,,


In [123]:
# Find rows in sheet_df with no matches in wiki_df
df = sheet_df.merge(wiki_df, how='left', left_on=['Kota_standardised', 'regiontype'], right_on=['place_standardised', 'regiontype']) 
missing = df[df['placeLabel'].isnull() & ~df['KabKota'].isin(['zTam'])]
missing
# len(missing)

Unnamed: 0,ID Provinsi,Provinsi,ID Kota,KabKota,Kota,regiontype,Kota_standardised,class,place,placeLabel,classLabel,place_standardised
42,3,Banten,21,Kota,Serang,City,serang,,,,,
72,6,DKI Jakarta,45,Kab.,Kepulauan Seribu,Regency,seribuislands,,,,,
248,15,Kalimantan Timur,213,Kab.,Penajam Paser Utara,Regency,northpenajampaser,,,,,
329,23,Nusa Tenggara Timur (NTT),295,Kab.,Kupang,Regency,kupang,,,,,
410,27,Sulawesi Barat,366,Kab.,Pasangkayu (Mamuju Utara),Regency,pasangkayu(mamujuutara),,,,,
431,28,Sulawesi Selatan,381,Kab.,Pangkajene Kepulauan,Regency,pangkajenekepulauan,,,,,
451,29,Sulawesi Tengah,401,Kab.,Tojo Una-Una,Regency,tojouna-una,,,,,
452,29,Sulawesi Tengah,402,Kab.,Toli-Toli,Regency,toli-toli,,,,,
456,30,Sulawesi Tenggara,403,Kota,Bau-Bau,City,bau-bau,,,,,
477,31,Sulawesi Utara,420,Kab.,Kepulauan Sangihe,Regency,sangiheislands,,,,,
