<a href="https://colab.research.google.com/github/dhan16/colabs/blob/master/covid19opendata/WikiData.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## SPARQL

In [1]:
# Sparql functions
import requests
import pandas as pd

ENDPOINT = "https://query.wikidata.org/sparql"


def wiki_data(sparql, timeout=120):
  """Run a SPARQL query against ENDPOINT and return the decoded JSON response.

  Args:
    sparql: SPARQL query text; sent as the `query` URL parameter together
      with `format=json`.
    timeout: Seconds to wait for the server before giving up. Without a
      timeout, requests would wait indefinitely on a stalled connection.

  Returns:
    The response body parsed from JSON.

  Raises:
    requests.HTTPError: if the endpoint answers with a 4xx/5xx status.
    requests.Timeout: if the server does not respond within `timeout` seconds.
  """
  res = requests.get(
      ENDPOINT,
      params={'format': 'json', 'query': sparql},
      timeout=timeout,
  )
  # Surface HTTP failures directly instead of letting res.json() choke on a
  # non-JSON error body.
  res.raise_for_status()
  return res.json()


def wikidata_to_dataframe(json):
  """Flatten a SPARQL JSON result into a DataFrame with one row per binding.

  Args:
    json: Decoded SPARQL JSON response. Must contain
      json["results"]["bindings"], a list of dicts that map a variable name
      to a dict holding that variable's "value".

  Returns:
    A pandas DataFrame of the "value" entries. Columns are the variable names
    in order of first appearance across all bindings; a variable that a given
    binding leaves out is None for that row. If there are no bindings, an
    empty frame is returned whose columns are json["head"]["vars"] when that
    key is present (no columns otherwise).
  """
  results = json["results"]["bindings"]
  if not results:
    # No rows matched: there is no first binding to read column names from,
    # so fall back to the variable list declared in the response header.
    return pd.DataFrame(columns=json.get("head", {}).get("vars", []))

  # A binding may leave out variables, so gather the names over every row
  # (dict.fromkeys de-duplicates while keeping first-seen order).
  cols = list(dict.fromkeys(var for result in results for var in result))

  # Look each value up by variable name so it always lands in its own column,
  # regardless of the key order or completeness of the individual binding.
  rows = [
      [result[var]["value"] if var in result else None for var in cols]
      for result in results
  ]
  return pd.DataFrame(rows, columns=cols)


In [4]:
# SPARQL query: select every ?place linked to wd:Q12479774 through one wdt:P31
# step followed by zero or more wdt:P279 steps (in Wikidata, P31 is
# "instance of" and P279 is "subclass of"), and ask the label service for
# English labels.
# NOTE(review): ?class / ?classLabel are listed in the SELECT, but the only
# triple that would bind ?class is commented out inside the query, so the
# result carries just `place` and `placeLabel` (see the rendered output).
sparql = """
SELECT ?place ?placeLabel ?class ?classLabel
WHERE
{
  ?place wdt:P31/wdt:P279* wd:Q12479774.
  # ?place wdt:P31 ?class
  SERVICE wikibase:label {
    bd:serviceParam wikibase:language "en" .
  }
}
"""
# Run the query, then flatten the JSON bindings into a DataFrame.
res = wiki_data(sparql)
wiki_df = wikidata_to_dataframe(res)
# Bare last expression so the notebook renders the frame.
wiki_df

Unnamed: 0,place,placeLabel
0,http://www.wikidata.org/entity/Q1795,Asahan
1,http://www.wikidata.org/entity/Q5662,Aceh Besar
2,http://www.wikidata.org/entity/Q5666,Gayo Lues
3,http://www.wikidata.org/entity/Q5667,Aceh Jaya
4,http://www.wikidata.org/entity/Q5672,Aceh Singkil
...,...,...
512,http://www.wikidata.org/entity/Q14634,Makassar
513,http://www.wikidata.org/entity/Q14635,Palopo
514,http://www.wikidata.org/entity/Q14636,Parepare
515,http://www.wikidata.org/entity/Q15378,Baubau


## Sheet

In [5]:
!pip install --upgrade gspread

Collecting gspread
  Downloading https://files.pythonhosted.org/packages/9c/ba/bc8de4f5077bd34bc873bdd67a89cb29c4f181abba8a836d2c6a0a142365/gspread-3.6.0-py3-none-any.whl
Installing collected packages: gspread
  Found existing installation: gspread 3.0.1
    Uninstalling gspread-3.0.1:
      Successfully uninstalled gspread-3.0.1
Successfully installed gspread-3.6.0


In [6]:
# Authenticate the Colab user first; the application-default credentials
# obtained afterwards are what gspread is authorized with.
from google.colab import auth
auth.authenticate_user()
import gspread
from oauth2client.client import GoogleCredentials
gc = gspread.authorize(GoogleCredentials.get_application_default())

# Open the spreadsheet by URL and fetch all values of its 'Kode Kota' worksheet.
sheet_url = 'https://docs.google.com/spreadsheets/d/1FJJXiGuOb5nXrjJeV3QcHNhTo38YdcsTIFl29mWDIqI/edit#gid=2006070746'
worksheet = gc.open_by_url(sheet_url).worksheet('Kode Kota')
rows = worksheet.get_all_values()

# Convert to a DataFrame and render.
# rows[1] supplies the column names and rows[2:] the records, so rows[0] is
# skipped (presumably a title/banner row in the sheet -- TODO confirm).
import pandas as pd
sheet_df = pd.DataFrame.from_records(rows[2:], columns=rows[1])
sheet_df

Unnamed: 0,ID Provinsi,Provinsi,ID Kota,KabKota,Kota
0,1,Aceh (NAD),258,Kab.,Aceh Barat
1,1,Aceh (NAD),259,Kab.,Aceh Barat Daya
2,1,Aceh (NAD),260,Kab.,Aceh Besar
3,1,Aceh (NAD),261,Kab.,Aceh Jaya
4,1,Aceh (NAD),262,Kab.,Aceh Selatan
...,...,...,...,...,...
557,34,Sumatera Utara,492,Kab.,Tapanuli Selatan
558,34,Sumatera Utara,493,Kab.,Tapanuli Tengah
559,34,Sumatera Utara,494,Kab.,Tapanuli Utara
560,34,Sumatera Utara,495,Kota,Tebing Tinggi


## Analyze

In [7]:
# Left-join the sheet onto the Wikidata results by name (sheet 'Kota' against
# Wikidata 'placeLabel'); sheet rows with no Wikidata match come out of the
# join with a null placeLabel.
df = pd.merge(sheet_df, wiki_df, how='left', left_on='Kota', right_on='placeLabel')
# Keep only those unmatched rows.
# (To also leave out the rows whose KabKota is 'zTam', additionally AND the
# mask with ~df['KabKota'].isin(['zTam']).)
missing = df.loc[df['placeLabel'].isna()]
missing

Unnamed: 0,ID Provinsi,Provinsi,ID Kota,KabKota,Kota,place,placeLabel
23,1,Aceh (NAD),1011,zTam,Luar Provinsi Aceh,,
24,1,Aceh (NAD),1013,zTam,Luar Negeri (Aceh),,
25,1,Aceh (NAD),1012,zTam,Belum Diverifikasi (Aceh),,
26,2,Bali,1021,zTam,WNA (Bali),,
27,2,Bali,1022,zTam,Luar Provinsi Bali,,
...,...,...,...,...,...,...,...
585,34,Sumatera Utara,472,Kab.,Labuhanbatu,,
599,34,Sumatera Utara,486,Kota,Pematang Siantar,,
604,34,Sumatera Utara,491,Kota,Tanjung Balai,,
606,34,Sumatera Utara,493,Kab.,Tapanuli Tengah,,
