In [1]:
import os
import requests
import json
import pandas as pd
from tqdm import tqdm
from dotenv import load_dotenv, find_dotenv
# Locate a .env file and load its entries into the process environment so the
# os.getenv() lookups in the configuration cell can read the endpoints and API key.
load_dotenv(find_dotenv())

True

In [2]:
# Turn off pandas' SettingWithCopyWarning for the whole kernel session.
# NOTE(review): no cell visible here performs chained assignment — confirm this is still needed.
pd.options.mode.chained_assignment = None  # default='warn'

In [3]:
# Root folder for the files this notebook writes (raw JSON pages and the CSV)
directory = "data"
# Connection settings are read from environment variables; each is None when unset.
# Base URL of the web API
base_url = os.environ.get('BASE_URL_WEBAPI_BPS')
# Base URL used to build the KBLI detail links
kbli_url = os.environ.get('BASE_URL_KBLI_BPS')
# API key appended to every request URL
api_key = os.environ.get('WEBAPI_BPS_API_KEY')

In [4]:
# Fetch KBLI 2020 data
def get_data_kbli(kbli, page, perpage, level, timeout=30):
  """
  Fetch one page of results from the web API and return the decoded JSON.

  The request URL is built from the module-level ``base_url`` and ``api_key``.

  Args:
    kbli: Path segment placed right after ``base_url`` (the notebook passes 'kbli2020').
    page: Page number inserted into the URL.
    perpage: Page size inserted into the URL.
    level: Level segment inserted into the URL (the notebook passes 'kelompok').
    timeout: Seconds handed to ``requests.get`` so a stalled connection cannot
      hang the notebook forever.

  Returns:
    The JSON body of the response, decoded with ``response.json()``.

  Raises:
    requests.HTTPError: If the server answers with a 4xx/5xx status.
    requests.RequestException: For connection problems or timeouts.
  """
  endpoint = f"{base_url}/{kbli}/page/{page}/perpage/{perpage}/level/{level}"
  # Log the endpoint with the key masked: printing the real key would store the
  # secret in the notebook's saved output.
  print(f"{endpoint}/key/***")
  response = requests.get(f"{endpoint}/key/{api_key}", timeout=timeout)
  # Fail loudly on HTTP errors instead of trying to decode an error page as data.
  response.raise_for_status()
  return response.json()

In [5]:
def save_json_to_directory(data, page, base_dir=None):
  """
  Write ``data`` as indented JSON to ``<base_dir>/raw/data_<page>.json``.

  Args:
    data: JSON-serialisable object to store.
    page: Value used in the file name (``data_<page>.json``).
    base_dir: Folder that contains the ``raw`` sub-folder. Defaults to the
      module-level ``directory`` when omitted.
  """
  if base_dir is None:
    base_dir = directory
  raw_dir = os.path.join(base_dir, "raw")
  # Create the folder the file is actually written to (and its parents);
  # creating only the parent left 'raw' missing on a fresh run.
  os.makedirs(raw_dir, exist_ok=True)
  file_path = os.path.join(raw_dir, f"data_{page}.json")
  with open(file_path, 'w', encoding='utf-8') as json_file:
    json.dump(data, json_file, indent=4)

In [6]:
# Download pages 1-4 of the 'kelompok' level (500 records per page) and store
# each raw response as its own JSON file.
for page in tqdm(range(1, 5)):
  data = get_data_kbli('kbli2020', page, 500, 'kelompok')
  save_json_to_directory(data, page)

  0%|          | 0/4 [00:00<?, ?it/s]

https://webapi.bps.go.id/v1/api/list/model/kbli2020/page/1/perpage/500/level/kelompok/key/***


 25%|██▌       | 1/4 [00:01<00:03,  1.14s/it]

https://webapi.bps.go.id/v1/api/list/model/kbli2020/page/2/perpage/500/level/kelompok/key/***


 50%|█████     | 2/4 [00:02<00:02,  1.33s/it]

https://webapi.bps.go.id/v1/api/list/model/kbli2020/page/3/perpage/500/level/kelompok/key/***


 75%|███████▌  | 3/4 [00:03<00:01,  1.31s/it]

https://webapi.bps.go.id/v1/api/list/model/kbli2020/page/4/perpage/500/level/kelompok/key/***


100%|██████████| 4/4 [00:04<00:00,  1.18s/it]


In [4]:
# read json files
def read_json_files(base_dir=None):
  """
  Collect the ``_source`` records from every JSON file in ``<base_dir>/raw``.

  Each file is expected to hold an object whose ``'data'`` entry is a sequence
  with the list of hits at index 1; every hit contributes its ``'_source'`` value.

  Args:
    base_dir: Folder that contains the ``raw`` sub-folder. Defaults to the
      module-level ``directory`` when omitted.

  Returns:
    A flat list with the ``_source`` value of every hit, files taken in sorted
    file-name order.
  """
  if base_dir is None:
    base_dir = directory
  raw_dir = os.path.join(base_dir, "raw")
  # os.listdir returns entries in arbitrary order; sort so the resulting rows
  # come out in the same order on every run and every machine.
  json_files = sorted(name for name in os.listdir(raw_dir) if name.endswith('.json'))
  data_list = []
  for file_name in json_files:
    with open(os.path.join(raw_dir, file_name), 'r', encoding='utf-8') as json_file:
      payload = json.load(json_file)
    # The hits are stored in the second element of the 'data' sequence
    for hit in payload['data'][1]:
      data_list.append(hit['_source'])
  return data_list

def extract_json_list(json_list):
    """
    Flatten raw KBLI records into a DataFrame with one row per record.

    Besides the record's own fields, each entry of its ``"sebelumnya"`` list is
    spread into ``<level>_kode`` / ``<level>_judul`` / ``<level>_url`` columns,
    where the level is chosen by the length of the entry's code
    (4: subgolongan, 3: golongan, 2: golongan_pokok, 1: kategori).
    URLs are built from the module-level ``kbli_url``.

    Args:
        json_list: Iterable of dicts providing at least ``"id"`` (underscore
            separated, the third part is the KBLI code) and ``"judul"``.

    Returns:
        pandas.DataFrame built from the flattened records; level columns a
        record does not provide are left empty (NaN).
    """
    # Length of an ancestor's code -> column prefix for that hierarchy level
    level_by_code_length = {
        4: "subgolongan",
        3: "golongan",
        2: "golongan_pokok",
        1: "kategori",
    }

    list_data = []
    for item in tqdm(json_list):
        data = {
            'id': item.get("id"),
            'jenis': item.get("jenis"),
            'sumber': item.get("source"),
            'kode_kbli': item.get("id").split("_")[2],
            # Strip only the leading "[code] " prefix: split once so a later "] "
            # inside the title is kept, and fall back to the whole title when
            # the prefix is absent.
            'judul': item.get("judul").split("] ", 1)[-1],
            'last_update': item.get("last_update"),
            'url_kbli': item.get("url"),
            'tags': item.get("tags"),
        }

        # A record without ancestors (missing key or None) just gets no level columns
        for sebelumnya in item.get("sebelumnya") or []:
            kode = sebelumnya['kode']
            level = level_by_code_length.get(len(kode))
            if level is None:
                continue
            data[f'{level}_kode'] = kode
            data[f'{level}_judul'] = sebelumnya["judul"]
            data[f'{level}_url'] = f"{kbli_url}/{kode}"

        list_data.append(data)

    return pd.DataFrame(list_data)

In [5]:
# Gather the records from the saved raw JSON pages, flatten them into one
# DataFrame, and preview the first rows.
list_data = read_json_files()
final_df = extract_json_list(list_data)
final_df.head()

100%|██████████| 1789/1789 [00:00<00:00, 135804.57it/s]


Unnamed: 0,id,jenis,sumber,kode_kbli,judul,last_update,url_kbli,tags,subgolongan_kode,subgolongan_judul,subgolongan_url,golongan_kode,golongan_judul,golongan_url,golongan_pokok_kode,golongan_pokok_judul,golongan_pokok_url,kategori_kode,kategori_judul,kategori_url
0,kbli_2020_01111,kbli2020,Metadata Management System (MMS),1111,Pertanian Jagung,2023-12-21,https://klasifikasi.web.bps.go.id/app/view/kbl...,"[pertanian, agriculture]",111,"Pertanian serealia (bukan padi), aneka kacang ...",https://klasifikasi.web.bps.go.id/app/view/kbl...,11,Pertanian Tanaman Semusim,https://klasifikasi.web.bps.go.id/app/view/kbl...,1,"Pertanian Tanaman, Peternakan, Perburuan dan K...",https://klasifikasi.web.bps.go.id/app/view/kbl...,A,"Pertanian, Kehutanan dan Perikanan",https://klasifikasi.web.bps.go.id/app/view/kbl...
1,kbli_2020_01112,kbli2020,Metadata Management System (MMS),1112,Pertanian Gandum,2023-12-21,https://klasifikasi.web.bps.go.id/app/view/kbl...,"[pertanian, agriculture]",111,"Pertanian serealia (bukan padi), aneka kacang ...",https://klasifikasi.web.bps.go.id/app/view/kbl...,11,Pertanian Tanaman Semusim,https://klasifikasi.web.bps.go.id/app/view/kbl...,1,"Pertanian Tanaman, Peternakan, Perburuan dan K...",https://klasifikasi.web.bps.go.id/app/view/kbl...,A,"Pertanian, Kehutanan dan Perikanan",https://klasifikasi.web.bps.go.id/app/view/kbl...
2,kbli_2020_01113,kbli2020,Metadata Management System (MMS),1113,Pertanian Kedelai,2023-12-21,https://klasifikasi.web.bps.go.id/app/view/kbl...,"[pertanian, agriculture]",111,"Pertanian serealia (bukan padi), aneka kacang ...",https://klasifikasi.web.bps.go.id/app/view/kbl...,11,Pertanian Tanaman Semusim,https://klasifikasi.web.bps.go.id/app/view/kbl...,1,"Pertanian Tanaman, Peternakan, Perburuan dan K...",https://klasifikasi.web.bps.go.id/app/view/kbl...,A,"Pertanian, Kehutanan dan Perikanan",https://klasifikasi.web.bps.go.id/app/view/kbl...
3,kbli_2020_01114,kbli2020,Metadata Management System (MMS),1114,Pertanian Kacang Tanah,2023-12-21,https://klasifikasi.web.bps.go.id/app/view/kbl...,"[pertanian, agriculture]",111,"Pertanian serealia (bukan padi), aneka kacang ...",https://klasifikasi.web.bps.go.id/app/view/kbl...,11,Pertanian Tanaman Semusim,https://klasifikasi.web.bps.go.id/app/view/kbl...,1,"Pertanian Tanaman, Peternakan, Perburuan dan K...",https://klasifikasi.web.bps.go.id/app/view/kbl...,A,"Pertanian, Kehutanan dan Perikanan",https://klasifikasi.web.bps.go.id/app/view/kbl...
4,kbli_2020_01115,kbli2020,Metadata Management System (MMS),1115,Pertanian Kacang Hijau,2023-12-21,https://klasifikasi.web.bps.go.id/app/view/kbl...,"[pertanian, agriculture]",111,"Pertanian serealia (bukan padi), aneka kacang ...",https://klasifikasi.web.bps.go.id/app/view/kbl...,11,Pertanian Tanaman Semusim,https://klasifikasi.web.bps.go.id/app/view/kbl...,1,"Pertanian Tanaman, Peternakan, Perburuan dan K...",https://klasifikasi.web.bps.go.id/app/view/kbl...,A,"Pertanian, Kehutanan dan Perikanan",https://klasifikasi.web.bps.go.id/app/view/kbl...


In [8]:
 # Save the flattened table to CSV
# The file is written into the same 'raw' folder as the JSON pages, without the
# DataFrame index column.
kbli_file = os.path.join(f'{directory}/raw', 'kbli2020.csv')
final_df.to_csv(kbli_file, index=False)
print("Data saved to kbli2020.csv")

Data saved to kbli2020.csv
