# Scraping Wilayah Kerja Statistik 2020

In [2]:
import requests
import json
import pandas as pd
import numpy as np
import time
from io import StringIO

## Import Data Provinsi

In [36]:
prov = pd.read_csv('prov.csv')
prov.head()

Unnamed: 0,kode,nama
0,11,ACEH
1,12,SUMATERA UTARA
2,13,SUMATERA BARAT
3,14,RIAU
4,15,JAMBI


## Ambil Data Semua BPS Kab/Kota Masing-Masing Provinsi

In [4]:
# base dataframe
df = pd.DataFrame({
    "kode_prov": [],
    "kode_bps": [],
    "nama_bps": [],
    "nama_dagri": []
})

for k in prov['kode']:
    url = 'https://sig.bps.go.id/rest-bridging/getwilayah?level=kabupaten&parent={}'.format(k)
    r = requests.get(url)
    data = r.json()

    df_temp = pd.DataFrame(data)
    df_temp['kode_prov'] = k

    df = pd.concat([df, df_temp], ignore_index=True)

df["kode_prov"] = df["kode_prov"].astype(int)
df.head()


Unnamed: 0,kode_prov,kode_bps,nama_bps,nama_dagri,kode_dagri
0,11,1101,SIMEULUE,KAB. SIMEULUE,11.09
1,11,1102,ACEH SINGKIL,KAB. ACEH SINGKIL,11.1
2,11,1103,ACEH SELATAN,KAB. ACEH SELATAN,11.01
3,11,1104,ACEH TENGGARA,KAB. ACEH TENGGARA,11.02
4,11,1105,ACEH TIMUR,KAB. ACEH TIMUR,11.03


## Export Data BPS Kab/Kota

In [5]:
df.to_csv('kabupaten.csv', index=False)
df.to_json('kabupaten.json', orient='records')

# Scraping Data Latitude Longitude Kantor BPS Daerah

In [13]:
from selenium import webdriver
driver = webdriver.Edge()

# input data
kabkota = pd.read_csv('kabupaten.csv')
kabkota.head()

Unnamed: 0,kode_prov,kode_bps,nama_bps,nama_dagri,kode_dagri
0,11,1101,SIMEULUE,KAB. SIMEULUE,11.09
1,11,1102,ACEH SINGKIL,KAB. ACEH SINGKIL,11.1
2,11,1103,ACEH SELATAN,KAB. ACEH SELATAN,11.01
3,11,1104,ACEH TENGGARA,KAB. ACEH TENGGARA,11.02
4,11,1105,ACEH TIMUR,KAB. ACEH TIMUR,11.03


## Split Data Ke 5 Bagian

In [9]:
# split data into 5 chunks
kabkota_split = np.array_split(kabkota, 5)
kabkota_split

  return bound(*args, **kwds)


[    kode_prov  kode_bps              nama_bps                 nama_dagri  \
 0          11      1101              SIMEULUE              KAB. SIMEULUE   
 1          11      1102          ACEH SINGKIL          KAB. ACEH SINGKIL   
 2          11      1103          ACEH SELATAN          KAB. ACEH SELATAN   
 3          11      1104         ACEH TENGGARA         KAB. ACEH TENGGARA   
 4          11      1105            ACEH TIMUR            KAB. ACEH TIMUR   
 ..        ...       ...                   ...                        ...   
 92         15      1506  TANJUNG JABUNG TIMUR  KAB. TANJUNG JABUNG TIMUR   
 93         15      1507  TANJUNG JABUNG BARAT  KAB. TANJUNG JABUNG BARAT   
 94         15      1508                  TEBO                  KAB. TEBO   
 95         15      1509                 BUNGO                 KAB. BUNGO   
 96         15      1571                 JAMBI                 KOTA JAMBI   
 
     kode_dagri  
 0        11.09  
 1        11.10  
 2        11.01  
 3

## Membuat Variabel untuk Menyimpan Data

In [17]:
latlong = pd.DataFrame({
    "kode_bps": [],
    "latitude": [],
    "longitude": []
})
new_data_list = []
latlong

Unnamed: 0,kode_bps,latitude,longitude


## Looping untuk Ambil Data Google Maps

In [18]:
def iterateScrape(chunk):
    for i, row in chunk.iterrows():
        url = 'https://www.google.com/maps/search/Badan Pusat Statistik {}, {}'.format(row['nama_bps'], row['nama_dagri'])
        driver.get(url)
        time.sleep(3.5)
        latlong = driver.current_url.split("@")[1].split("/")[0].split(",")
        new_data = {
            "kode_bps": row['kode_bps'],
            "latitude": latlong[0],
            "longitude": latlong[1]
        }

        print("process {} of {}".format(i, len(chunk)))
        new_data_list.append(new_data)

## Ambil Semua Data

In [14]:
# iterateScrape(kabkota_split[0])
i = 1
for chunk in kabkota_split:
    print(f"chunk {i} of {len(kabkota_split)}")
    iterateScrape(chunk)
    i+=1
latlong = pd.concat([latlong, pd.DataFrame(new_data_list)], ignore_index=True)


chunk 1 of 5
process 0 of 97
process 1 of 97
process 2 of 97
process 3 of 97
process 4 of 97
process 5 of 97
process 6 of 97
process 7 of 97
process 8 of 97
process 9 of 97
process 10 of 97
process 11 of 97
process 12 of 97
process 13 of 97
process 14 of 97
process 15 of 97
process 16 of 97
process 17 of 97
process 18 of 97
process 19 of 97
process 20 of 97
process 21 of 97
process 22 of 97
process 23 of 97
process 24 of 97
process 25 of 97
process 26 of 97
process 27 of 97
process 28 of 97
process 29 of 97
process 30 of 97
process 31 of 97
process 32 of 97
process 33 of 97
process 34 of 97
process 35 of 97
process 36 of 97
process 37 of 97
process 38 of 97
process 39 of 97
process 40 of 97
process 41 of 97
process 42 of 97
process 43 of 97
process 44 of 97
process 45 of 97
process 46 of 97
process 47 of 97
process 48 of 97
process 49 of 97
process 50 of 97
process 51 of 97
process 52 of 97
process 53 of 97
process 54 of 97
process 55 of 97
process 56 of 97
process 57 of 97
process 58 

In [17]:
latlong_bck = latlong
latlong_bck

Unnamed: 0,kode_bps,latitude,longitude
0,1101.0,2.4517114,96.3774566
1,1102.0,2.2732113,97.813226
2,1103.0,3.260106,97.189643
3,1104.0,3.4732392,97.8151078
4,1105.0,4.9114668,97.8056172
...,...,...,...
480,9109.0,-6.3249684,106.6666453
481,9110.0,-1.3655687,132.2940383
482,9111.0,-0.8624453,134.0626113
483,9112.0,-1.6697561,132.6797821


In [23]:
latlong['kode_bps'] = latlong['kode_bps'].astype(int)
latlong

Unnamed: 0,kode_bps,latitude,longitude
0,1101,2.4517114,96.3774566
1,1102,2.2732113,97.813226
2,1103,3.260106,97.189643
3,1104,3.4732392,97.8151078
4,1105,4.9114668,97.8056172
...,...,...,...
480,9109,-6.3249684,106.6666453
481,9110,-1.3655687,132.2940383
482,9111,-0.8624453,134.0626113
483,9112,-1.6697561,132.6797821


In [25]:
temp = pd.merge(kabkota, latlong, on='kode_bps', how='inner')
# kabkota_split[0]
# get info variable
# latlong
# kabkota_split[0]

In [26]:
temp

Unnamed: 0,kode_prov,kode_bps,nama_bps,nama_dagri,kode_dagri,latitude,longitude
0,11,1101,SIMEULUE,KAB. SIMEULUE,11.09,2.4517114,96.3774566
1,11,1102,ACEH SINGKIL,KAB. ACEH SINGKIL,11.10,2.2732113,97.813226
2,11,1103,ACEH SELATAN,KAB. ACEH SELATAN,11.01,3.260106,97.189643
3,11,1104,ACEH TENGGARA,KAB. ACEH TENGGARA,11.02,3.4732392,97.8151078
4,11,1105,ACEH TIMUR,KAB. ACEH TIMUR,11.03,4.9114668,97.8056172
...,...,...,...,...,...,...,...
480,91,9109,TAMBRAUW,KAB. TAMBRAUW,92.09,-6.3249684,106.6666453
481,91,9110,MAYBRAT,KAB. MAYBRAT,92.10,-1.3655687,132.2940383
482,91,9111,MANOKWARI SELATAN,KAB. MANOKWARI SELATAN,92.11,-0.8624453,134.0626113
483,91,9112,PEGUNUNGAN ARFAK,KAB. PEGUNUNGAN ARFAK,92.12,-1.6697561,132.6797821


In [27]:
temp.to_csv('kabupaten_latlong.csv', index=False)

In [9]:
new_latlong = pd.read_csv('kabupaten_latlong.csv')
new_latlong.tail()

Unnamed: 0,kode_prov,kode_bps,nama_bps,nama_dagri,kode_dagri,latitude,longitude
480,91,9109,TAMBRAUW,KAB. TAMBRAUW,92.09,-6.324968,106.666645
481,91,9110,MAYBRAT,KAB. MAYBRAT,92.1,-1.365569,132.294038
482,91,9111,MANOKWARI SELATAN,KAB. MANOKWARI SELATAN,92.11,-0.862445,134.062611
483,91,9112,PEGUNUNGAN ARFAK,KAB. PEGUNUNGAN ARFAK,92.12,-1.669756,132.679782
484,91,9171,SORONG,KOTA SORONG,92.71,-0.880355,131.288533


In [16]:
kabkota_papua = kabkota[kabkota['kode_prov'] == 94]
kabkota_papua.head()

Unnamed: 0,kode_prov,kode_bps,nama_bps,nama_dagri,kode_dagri
485,94,9401,MERAUKE,KAB. MERAUKE,91.01
486,94,9402,JAYAWIJAYA,KAB. JAYAWIJAYA,91.02
487,94,9403,JAYAPURA,KAB. JAYAPURA,91.03
488,94,9404,NABIRE,KAB. NABIRE,91.04
489,94,9408,KEPULAUAN YAPEN,KAB. KEPULAUAN YAPEN,91.05


In [19]:
iterateScrape(kabkota_papua)

process 485 of 29
process 486 of 29
process 487 of 29
process 488 of 29
process 489 of 29
process 490 of 29
process 491 of 29
process 492 of 29
process 493 of 29
process 494 of 29
process 495 of 29
process 496 of 29
process 497 of 29
process 498 of 29
process 499 of 29
process 500 of 29
process 501 of 29
process 502 of 29
process 503 of 29
process 504 of 29
process 505 of 29
process 506 of 29
process 507 of 29
process 508 of 29
process 509 of 29
process 510 of 29
process 511 of 29
process 512 of 29
process 513 of 29


In [23]:
latlong = pd.concat([latlong, pd.DataFrame(new_data_list)], ignore_index=True)

In [25]:
kabkota_papua_latlong = pd.merge(kabkota_papua, latlong, on='kode_bps', how='inner')
kabkota_papua_latlong.head()

Unnamed: 0,kode_prov,kode_bps,nama_bps,nama_dagri,kode_dagri,latitude,longitude
0,94,9401,MERAUKE,KAB. MERAUKE,91.01,-8.5116578,140.4091634
1,94,9402,JAYAWIJAYA,KAB. JAYAWIJAYA,91.02,-4.0992374,138.9456582
2,94,9403,JAYAPURA,KAB. JAYAPURA,91.03,-2.5740161,140.5960441
3,94,9404,NABIRE,KAB. NABIRE,91.04,-3.3653831,135.5014038
4,94,9408,KEPULAUAN YAPEN,KAB. KEPULAUAN YAPEN,91.05,-1.8724708,136.2375008


In [28]:
new_latlong = pd.concat([new_latlong, kabkota_papua_latlong], ignore_index=True)
new_latlong

Unnamed: 0,kode_prov,kode_bps,nama_bps,nama_dagri,kode_dagri,latitude,longitude
0,11,1101,SIMEULUE,KAB. SIMEULUE,11.09,2.451711,96.377457
1,11,1102,ACEH SINGKIL,KAB. ACEH SINGKIL,11.10,2.273211,97.813226
2,11,1103,ACEH SELATAN,KAB. ACEH SELATAN,11.01,3.260106,97.189643
3,11,1104,ACEH TENGGARA,KAB. ACEH TENGGARA,11.02,3.473239,97.815108
4,11,1105,ACEH TIMUR,KAB. ACEH TIMUR,11.03,4.911467,97.805617
...,...,...,...,...,...,...,...
509,94,9433,PUNCAK,KAB. PUNCAK,91.25,-3.8415752,137.8070662
510,94,9434,DOGIYAI,KAB. DOGIYAI,91.26,-3.5505705,135.0805137
511,94,9435,INTAN JAYA,KAB. INTAN JAYA,91.27,-3.4243611,137.1363895
512,94,9436,DEIYAI,KAB. DEIYAI,91.28,-4.1298477,136.408159


In [37]:
prov = prov.rename(columns={'nama': 'nama_provinsi', 'kode': 'kode_prov'})
prov.head()

Unnamed: 0,kode_prov,nama_provinsi
0,11,ACEH
1,12,SUMATERA UTARA
2,13,SUMATERA BARAT
3,14,RIAU
4,15,JAMBI


In [38]:
new_latlong = pd.merge(new_latlong, prov, on='kode_prov', how='inner')
new_latlong

Unnamed: 0,kode_prov,kode_bps,nama_bps,nama_dagri,kode_dagri,latitude,longitude,nama_provinsi
0,11,1101,SIMEULUE,KAB. SIMEULUE,11.09,2.451711,96.377457,ACEH
1,11,1102,ACEH SINGKIL,KAB. ACEH SINGKIL,11.10,2.273211,97.813226,ACEH
2,11,1103,ACEH SELATAN,KAB. ACEH SELATAN,11.01,3.260106,97.189643,ACEH
3,11,1104,ACEH TENGGARA,KAB. ACEH TENGGARA,11.02,3.473239,97.815108,ACEH
4,11,1105,ACEH TIMUR,KAB. ACEH TIMUR,11.03,4.911467,97.805617,ACEH
...,...,...,...,...,...,...,...,...
509,94,9433,PUNCAK,KAB. PUNCAK,91.25,-3.8415752,137.8070662,PAPUA
510,94,9434,DOGIYAI,KAB. DOGIYAI,91.26,-3.5505705,135.0805137,PAPUA
511,94,9435,INTAN JAYA,KAB. INTAN JAYA,91.27,-3.4243611,137.1363895,PAPUA
512,94,9436,DEIYAI,KAB. DEIYAI,91.28,-4.1298477,136.408159,PAPUA


## Export Result

In [40]:
new_latlong.to_csv('kabupaten_latlong.csv', index=False)

In [41]:
driver.close()