In [1]:
import os
import requests
import pandas as pd
from tqdm import tqdm
from bs4 import BeautifulSoup
from dotenv import load_dotenv, find_dotenv
# load .env
load_dotenv(find_dotenv())

In [34]:
# Non active InsecureRequestWarning
requests.packages.urllib3.disable_warnings(requests.packages.urllib3.exceptions.InsecureRequestWarning)
# Turn off SettingWithCopyWarning
pd.options.mode.chained_assignment = None  # default='warn'

In [3]:
# Create range of Province and Pages
provinces = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 99]  # Daftar provinsi
pages = list(range(1, 3))  # Make sure big value is 3940

In [4]:
# set base url
base_url = os.getenv('BASE_URL_KEMPERIN')
# Save and load csv files to scrapped
directory = "data"

In [5]:
# extract each html page
def extract_html_page(province, page):
    data = {'id': [], 'name': [], 'address': [], 'phone': [], 'kbli': []}
    url = f"{base_url}?what=&prov={province}&hal={page}"
    
    try:
        response = requests.get(url, verify=False)
        response.raise_for_status()
    except requests.HTTPError as e:
        print(f"HTTP error occurred: {e}")
        return data
    except requests.RequestException as e:
        print(f"An error occurred: {e}")
        return data

    soup = BeautifulSoup(response.text, 'html.parser')
    div_tb = soup.find('div', {'class':'col-md-12 col-lg-12 col-xs-12 col-sm-12'})
    if not div_tb:
        print(f"No tag div in page {url}")
        return data

    table = soup.find('table', {'id': 'newspaper-a'})
    if not table:
        print(f"No table in page {url}")
        return data

    for row in table.find_all('tr'):
        cells = row.find_all('td')
        if len(cells) < 3:
            continue

        establishment, address, phone = extract_establishment_info(cells[1])
        kbli = cells[2].text.strip()
        index_number = cells[0].text.strip().replace('.', '')

        data['id'].append(int(index_number))
        data['name'].append(establishment)
        data['address'].append(address)
        data['phone'].append(phone)
        data['kbli'].append(kbli)

    return data

# extract each establishment
def extract_establishment_info(cell):
    establishment = cell.find('b').text.strip()
    full_address = cell.find_all(text=True, recursive=False)
    address = full_address[0].strip() if full_address else ''
    phone = full_address[1].strip() if len(full_address) > 1 else ''
    return establishment, address, phone

# Save data to CSV file
def save_to_csv(data, province, page):
    df = pd.DataFrame(data)
    path_file = os.path.join(f'{directory}/raw', f'province_{province}_page_{page}.csv')
    df.to_csv(path_file, index=False)
    print(f"Data page 1 to {page} for province {province} saved to {path_file}")

# Loop through each province and page and save data to CSV files
def generate(provinces, pages):
    for province in tqdm(provinces):
        for idx, page in enumerate(pages, start=1):
            data = extract_html_page(province, page)

            # Save data for every 100 pages
            if idx % 100 == 0:
                save_to_csv(data, province, page)
                
            # Save if not saved yet
            if data['id']:
                save_to_csv(data, province, page)
    
    print("Scraping finished!")

In [6]:
generate(provinces, pages)

  full_address = cell.find_all(text=True, recursive=False)


Data page 1 to 1 for province 1 saved to data/raw\province_1_page_1.csv


2it [00:00,  2.55it/s]
  3%|▎         | 1/39 [00:00<00:29,  1.27it/s]

Data page 1 to 2 for province 1 saved to data/raw\province_1_page_2.csv




Data page 1 to 1 for province 2 saved to data/raw\province_2_page_1.csv


2it [00:00,  2.53it/s]
  5%|▌         | 2/39 [00:01<00:29,  1.25it/s]

Data page 1 to 2 for province 2 saved to data/raw\province_2_page_2.csv




Data page 1 to 1 for province 3 saved to data/raw\province_3_page_1.csv


2it [00:00,  2.30it/s]
  8%|▊         | 3/39 [00:02<00:29,  1.20it/s]

Data page 1 to 2 for province 3 saved to data/raw\province_3_page_2.csv




Data page 1 to 1 for province 4 saved to data/raw\province_4_page_1.csv


2it [00:01,  1.90it/s]
 10%|█         | 4/39 [00:03<00:32,  1.08it/s]

Data page 1 to 2 for province 4 saved to data/raw\province_4_page_2.csv




Data page 1 to 1 for province 5 saved to data/raw\province_5_page_1.csv


2it [00:03,  1.92s/it]
 13%|█▎        | 5/39 [00:07<01:07,  1.97s/it]

Data page 1 to 2 for province 5 saved to data/raw\province_5_page_2.csv




Data page 1 to 1 for province 6 saved to data/raw\province_6_page_1.csv


2it [00:03,  1.92s/it]
 15%|█▌        | 6/39 [00:11<01:26,  2.61s/it]

Data page 1 to 2 for province 6 saved to data/raw\province_6_page_2.csv




Data page 1 to 1 for province 7 saved to data/raw\province_7_page_1.csv


2it [00:00,  3.00it/s]
 18%|█▊        | 7/39 [00:11<01:03,  1.98s/it]

Data page 1 to 2 for province 7 saved to data/raw\province_7_page_2.csv




Data page 1 to 1 for province 8 saved to data/raw\province_8_page_1.csv


2it [00:01,  1.74it/s]
 21%|██        | 8/39 [00:13<00:53,  1.71s/it]

Data page 1 to 2 for province 8 saved to data/raw\province_8_page_2.csv




Data page 1 to 1 for province 9 saved to data/raw\province_9_page_1.csv


2it [00:00,  2.12it/s]
 23%|██▎       | 9/39 [00:14<00:44,  1.48s/it]

Data page 1 to 2 for province 9 saved to data/raw\province_9_page_2.csv




Data page 1 to 1 for province 10 saved to data/raw\province_10_page_1.csv


2it [00:00,  2.48it/s]
 26%|██▌       | 10/39 [00:14<00:36,  1.27s/it]

Data page 1 to 2 for province 10 saved to data/raw\province_10_page_2.csv




Data page 1 to 1 for province 11 saved to data/raw\province_11_page_1.csv


2it [00:00,  2.41it/s]
 28%|██▊       | 11/39 [00:15<00:31,  1.14s/it]

Data page 1 to 2 for province 11 saved to data/raw\province_11_page_2.csv




Data page 1 to 1 for province 12 saved to data/raw\province_12_page_1.csv


2it [00:01,  1.84it/s]
 31%|███       | 12/39 [00:16<00:30,  1.12s/it]

Data page 1 to 2 for province 12 saved to data/raw\province_12_page_2.csv




Data page 1 to 1 for province 13 saved to data/raw\province_13_page_1.csv


2it [00:00,  2.43it/s]
 33%|███▎      | 13/39 [00:17<00:26,  1.03s/it]

Data page 1 to 2 for province 13 saved to data/raw\province_13_page_2.csv




Data page 1 to 1 for province 14 saved to data/raw\province_14_page_1.csv


2it [00:01,  1.99it/s]
 36%|███▌      | 14/39 [00:18<00:25,  1.03s/it]

Data page 1 to 2 for province 14 saved to data/raw\province_14_page_2.csv




Data page 1 to 1 for province 15 saved to data/raw\province_15_page_1.csv


2it [00:01,  1.94it/s]
 38%|███▊      | 15/39 [00:19<00:24,  1.03s/it]

Data page 1 to 2 for province 15 saved to data/raw\province_15_page_2.csv




Data page 1 to 1 for province 16 saved to data/raw\province_16_page_1.csv


2it [00:01,  1.76it/s]
 41%|████      | 16/39 [00:20<00:24,  1.07s/it]

Data page 1 to 2 for province 16 saved to data/raw\province_16_page_2.csv




Data page 1 to 1 for province 17 saved to data/raw\province_17_page_1.csv


2it [00:00,  2.84it/s]
 44%|████▎     | 17/39 [00:21<00:21,  1.04it/s]

Data page 1 to 2 for province 17 saved to data/raw\province_17_page_2.csv




Data page 1 to 1 for province 18 saved to data/raw\province_18_page_1.csv


2it [00:01,  1.13it/s]
 46%|████▌     | 18/39 [00:23<00:25,  1.20s/it]

Data page 1 to 2 for province 18 saved to data/raw\province_18_page_2.csv




Data page 1 to 1 for province 19 saved to data/raw\province_19_page_1.csv


2it [00:01,  1.75it/s]
 49%|████▊     | 19/39 [00:24<00:23,  1.19s/it]

Data page 1 to 2 for province 19 saved to data/raw\province_19_page_2.csv




Data page 1 to 1 for province 20 saved to data/raw\province_20_page_1.csv


2it [00:01,  1.72it/s]
 51%|█████▏    | 20/39 [00:25<00:22,  1.18s/it]

Data page 1 to 2 for province 20 saved to data/raw\province_20_page_2.csv




Data page 1 to 1 for province 21 saved to data/raw\province_21_page_1.csv


2it [00:00,  2.91it/s]
 54%|█████▍    | 21/39 [00:26<00:18,  1.04s/it]

Data page 1 to 2 for province 21 saved to data/raw\province_21_page_2.csv




Data page 1 to 1 for province 22 saved to data/raw\province_22_page_1.csv


2it [00:00,  2.82it/s]
 56%|█████▋    | 22/39 [00:26<00:15,  1.06it/s]

Data page 1 to 2 for province 22 saved to data/raw\province_22_page_2.csv




Data page 1 to 1 for province 23 saved to data/raw\province_23_page_1.csv


2it [00:00,  2.41it/s]
 59%|█████▉    | 23/39 [00:27<00:14,  1.10it/s]

Data page 1 to 2 for province 23 saved to data/raw\province_23_page_2.csv




Data page 1 to 1 for province 24 saved to data/raw\province_24_page_1.csv


2it [00:07,  3.74s/it]
 62%|██████▏   | 24/39 [00:35<00:43,  2.88s/it]

Data page 1 to 2 for province 24 saved to data/raw\province_24_page_2.csv




Data page 1 to 1 for province 25 saved to data/raw\province_25_page_1.csv


2it [00:00,  2.70it/s]
 64%|██████▍   | 25/39 [00:36<00:31,  2.24s/it]

Data page 1 to 2 for province 25 saved to data/raw\province_25_page_2.csv




Data page 1 to 1 for province 26 saved to data/raw\province_26_page_1.csv


2it [00:00,  2.66it/s]
 67%|██████▋   | 26/39 [00:36<00:23,  1.79s/it]

Data page 1 to 2 for province 26 saved to data/raw\province_26_page_2.csv




Data page 1 to 1 for province 27 saved to data/raw\province_27_page_1.csv


2it [00:00,  2.19it/s]
 69%|██████▉   | 27/39 [00:37<00:18,  1.53s/it]

Data page 1 to 2 for province 27 saved to data/raw\province_27_page_2.csv




Data page 1 to 1 for province 28 saved to data/raw\province_28_page_1.csv


2it [00:00,  3.00it/s]
 72%|███████▏  | 28/39 [00:38<00:13,  1.27s/it]

Data page 1 to 2 for province 28 saved to data/raw\province_28_page_2.csv




Data page 1 to 1 for province 29 saved to data/raw\province_29_page_1.csv


2it [00:00,  2.87it/s]
 74%|███████▍  | 29/39 [00:39<00:11,  1.10s/it]

Data page 1 to 2 for province 29 saved to data/raw\province_29_page_2.csv




Data page 1 to 1 for province 30 saved to data/raw\province_30_page_1.csv


2it [00:00,  2.44it/s]
 77%|███████▋  | 30/39 [00:39<00:09,  1.02s/it]

Data page 1 to 2 for province 30 saved to data/raw\province_30_page_2.csv




Data page 1 to 1 for province 31 saved to data/raw\province_31_page_1.csv


2it [00:00,  2.79it/s]
 79%|███████▉  | 31/39 [00:40<00:07,  1.08it/s]

Data page 1 to 2 for province 31 saved to data/raw\province_31_page_2.csv




Data page 1 to 1 for province 32 saved to data/raw\province_32_page_1.csv


2it [00:02,  1.01s/it]
 82%|████████▏ | 32/39 [00:42<00:08,  1.26s/it]

Data page 1 to 2 for province 32 saved to data/raw\province_32_page_2.csv




Data page 1 to 1 for province 33 saved to data/raw\province_33_page_1.csv


2it [00:00,  2.73it/s]
 85%|████████▍ | 33/39 [00:43<00:06,  1.10s/it]

Data page 1 to 2 for province 33 saved to data/raw\province_33_page_2.csv




Data page 1 to 1 for province 34 saved to data/raw\province_34_page_1.csv


2it [00:00,  2.21it/s]
 87%|████████▋ | 34/39 [00:44<00:05,  1.04s/it]

Data page 1 to 2 for province 34 saved to data/raw\province_34_page_2.csv




Data page 1 to 1 for province 35 saved to data/raw\province_35_page_1.csv


2it [00:00,  2.46it/s]
 90%|████████▉ | 35/39 [00:45<00:03,  1.02it/s]

Data page 1 to 1 for province 36 saved to data/raw\province_36_page_1.csv


2it [00:00,  2.76it/s]
2it [00:00,  3.04it/s] [00:45<00:02,  1.11it/s]
 95%|█████████▍| 37/39 [00:46<00:01,  1.21it/s]

Data page 1 to 1 for province 38 saved to data/raw\province_38_page_1.csv


2it [00:00,  2.79it/s]
 97%|█████████▋| 38/39 [00:47<00:00,  1.25it/s]

Data page 1 to 2 for province 38 saved to data/raw\province_38_page_2.csv




Data page 1 to 1 for province 99 saved to data/raw\province_99_page_1.csv


2it [00:00,  2.82it/s]
100%|██████████| 39/39 [00:47<00:00,  1.23s/it]

Scraping finished!



