In [1]:
import os
import pandas as pd
import re
from dotenv import load_dotenv, find_dotenv
# Locate a .env file with find_dotenv() and load its variables into the
# process environment.
# NOTE(review): none of the visible cells reads an environment variable, so
# this call may be unnecessary here -- confirm before removing.
load_dotenv(find_dotenv())

True

In [2]:
# Turn off SettingWithCopyWarning.
# NOTE(review): this is a process-wide pandas option, so writes to DataFrame
# slices anywhere in this notebook will no longer be reported.
pd.options.mode.chained_assignment = None  # default='warn'

# Base data folder: inputs are read from f"{directory}/raw" and results are
# written to f"{directory}/processed".
directory = "data"

In [3]:
# Load the raw province and regency tables. dtype=str makes pandas read every
# column as text, so the `kode` values stay strings (the province code is
# later taken from the first two characters of the regency code).
provinsi_df = pd.read_csv(f"{directory}/raw/provinces.csv", dtype=str)
kabupaten_df = pd.read_csv(f"{directory}/raw/regencies.csv", dtype=str)

In [4]:
# Preview the first rows of the raw province table.
provinsi_df.head()

Unnamed: 0,kode,nama
0,11,ACEH
1,12,SUMATERA UTARA
2,13,SUMATERA BARAT
3,14,RIAU
4,15,JAMBI


In [5]:
# Preview the first rows of the raw regency table.
kabupaten_df.head()

Unnamed: 0,kode,nama
0,1101,SIMEULUE
1,1102,ACEH SINGKIL
2,1103,ACEH SELATAN
3,1104,ACEH TENGGARA
4,1105,ACEH TIMUR


In [6]:
def get_provinsi(df):
  """Return the province table with the abbreviation 'KEP.' spelled out.

  Every literal occurrence of 'KEP.' in the `nama` column is replaced with
  'KEPULAUAN' (plain-text match, not a regular expression).

  The input frame is left untouched; a new DataFrame is returned, so
  re-running the cell always starts from the same raw data.

  Args:
    df: DataFrame with a string column `nama`.

  Returns:
    A new DataFrame with the same columns and index as `df`.
  """
  # assign() builds a new frame instead of writing into the caller's object.
  return df.assign(nama=df['nama'].str.replace('KEP.', 'KEPULAUAN', regex=False))

def normalize_kabupaten_nama(kabupaten_nama):
    """Collapse letter-spaced names such as 'S I A K' into 'SIAK'.

    Whitespace is removed only when it sits between two single uppercase
    letters that each stand alone as a word. A spaced-out name followed by a
    normal word therefore keeps its separating space:
    'S I A K HULU' -> 'SIAK HULU' (not 'SIAKHULU'), and
    'KOTA B A T A M' -> 'KOTA BATAM'.

    Args:
        kabupaten_nama: The regency name as a string.

    Returns:
        The normalized name with leading/trailing whitespace stripped.
    """
    # Lookbehind: a one-letter uppercase word ends right before the gap.
    # Lookahead: another one-letter uppercase word starts right after it.
    normalized_kabupaten_nama = re.sub(
        r'(?<=\b[A-Z])\s+(?=[A-Z]\b)', '', kabupaten_nama
    )
    return normalized_kabupaten_nama.strip()

def get_kabupaten(df):
  """Return the regency table with every `nama` value normalized.

  Each name is passed through `normalize_kabupaten_nama`. The input frame is
  not modified; a new DataFrame is returned so the raw table stays available
  to later cells and the cell can be re-run safely.

  Args:
    df: DataFrame with a string column `nama`.

  Returns:
    A new DataFrame with the same columns and index as `df`.
  """
  # assign() builds a new frame instead of writing into the caller's object.
  return df.assign(nama=df['nama'].apply(normalize_kabupaten_nama))

def get_relasi_kabupaten_provinsi(df):
  """Build the regency -> province relation table.

  The province code is the first two characters of the regency code.

  Columns are selected and renamed by name rather than by position, so the
  result does not depend on the column order or on how many other columns
  `df` carries, and the input frame is not modified.

  Args:
    df: DataFrame with a `kode` column holding the regency code.

  Returns:
    A new DataFrame with columns `kabupaten_kode` and `provinsi_kode`,
    indexed like `df`.
  """
  relasi = df[['kode']].rename(columns={'kode': 'kabupaten_kode'})
  return relasi.assign(
      provinsi_kode=lambda d: d['kabupaten_kode'].astype(str).str.slice(0, 2)
  )

def get_relasi_perusahaan_kabupaten(df, siinas_path='../establishments/data/processed/siinas_data.csv'):
  """Build the company -> regency relation by matching regency names.

  Company rows are read from `siinas_path`. The words 'Kabupaten' / 'Kota'
  are removed from their `kabupaten_nama` values, which are then upper-cased
  and inner-joined against the normalized `nama` column of `df`. Companies
  whose name has no exact match are dropped by the inner join.

  NOTE(review): the 'Kabupaten'/'Kota' removal runs before upper-casing and
  is case-sensitive, so an already upper-case prefix would not be removed --
  confirm this matches the spelling used in the SIINAS file.

  Args:
    df: Regency DataFrame with columns `kode` and `nama`. Not modified.
    siinas_path: CSV file with at least the columns `id` and
      `kabupaten_nama`. Defaults to the location used by this project.

  Returns:
    A new DataFrame with columns `id_perusahaan` and `kabupaten_kode`.
  """
  siinas_df = pd.read_csv(siinas_path)

  # Keep only id + name, with the name cleaned for matching.
  perusahaan = siinas_df[['id', 'kabupaten_nama']].assign(
      kabupaten_nama=lambda d: (
          d['kabupaten_nama']
          .str.replace(r'\b(Kabupaten|Kota)\b\s*', '', regex=True)
          .str.upper()
      )
  )

  # Normalize regency names on a new frame; assign() avoids writing into a
  # slice, which only went unreported because the warning is disabled.
  kabupaten = df[['kode', 'nama']].assign(
      nama=lambda d: d['nama'].apply(normalize_kabupaten_nama)
  )

  merged_df = pd.merge(perusahaan, kabupaten, left_on='kabupaten_nama', right_on='nama')

  # Keep only the two keys of the relation, under their final names.
  return merged_df[['id', 'kode']].rename(
      columns={'id': 'id_perusahaan', 'kode': 'kabupaten_kode'}
  )

def save_to_csv(df, filename):
  """Write `df` to ``{directory}/processed/{filename}`` without the index.

  The output folder is created when it does not exist yet, so the function
  also works on a fresh checkout where only the raw data is present.

  Args:
    df: DataFrame to write.
    filename: File name (including extension) inside the processed folder.
  """
  output_dir = f"{directory}/processed"
  # Same guarantee copy_csv gives for its destination folder.
  os.makedirs(output_dir, exist_ok=True)
  df.to_csv(f"{output_dir}/{filename}", index=False)
  print(f"{filename} has been saved to processed")

In [7]:
# Clean the two base tables and write them to the processed folder.
save_to_csv(get_provinsi(provinsi_df), 'provinsi.csv')
save_to_csv(get_kabupaten(kabupaten_df), 'kabupaten.csv')
# Each relation table is built from its own copy of kabupaten_df, so the
# helper cannot alter the shared frame.
relasi_kabupaten_provinsi = kabupaten_df.copy()
save_to_csv(get_relasi_kabupaten_provinsi(relasi_kabupaten_provinsi), 'relasi_kabupaten_provinsi.csv')
relasi_perusahaan_kabupaten = kabupaten_df.copy()
save_to_csv(get_relasi_perusahaan_kabupaten(relasi_perusahaan_kabupaten), 'relasi_perusahaan_kabupaten.csv')

provinsi.csv has been saved to processed
kabupaten.csv has been saved to processed
relasi_kabupaten_provinsi.csv has been saved to processed
relasi_perusahaan_kabupaten.csv has been saved to processed


Karena tidak ada lagi data yang perlu diproses dan hasilnya sudah sesuai, kita tinggal menyalin berkas CSV hasil olahan ke dalam folder datasets.

In [8]:
import shutil

def copy_csv(source, destination):
  """Copy every ``.csv`` file directly inside `source` into `destination`.

  Only regular files whose name ends with '.csv' are copied; sub-directories
  are not traversed. Files already present in `destination` under the same
  name are overwritten.

  Args:
    source: Directory to read CSV files from.
    destination: Directory to copy into; created (with parents) if missing.

  Raises:
    FileExistsError: If `destination` exists but is not a directory.
  """
  # exist_ok=True replaces the separate exists() check: there is no window
  # between check and creation, and a destination that is an existing file
  # raises instead of being silently overwritten by each copy.
  os.makedirs(destination, exist_ok=True)

  for filename in os.listdir(source):
    if not filename.endswith('.csv'):
      continue
    full_filename = os.path.join(source, filename)
    # Skip directories that merely carry a .csv suffix.
    if os.path.isfile(full_filename):
      shutil.copy(full_filename, destination)

# Copy every processed CSV into ../../datasets/regions.
copy_csv(f"{directory}/processed", "../../datasets/regions")