# Introduction

**Nama:** Ghassani Nurbaningtyas

**Role:** Data Engineer

**Project Explanation:** In this project, the task was to process a tax-related regulation document, specifically Konsolidasi.pdf, by applying text chunking/splitting techniques. The document needed to be segmented into meaningful sections based on the following criteria: content, chapter (Bab), article (Pasal), paragraph (Ayat), whether the section contains an explanation or not, then export to CSV file.

# Import Library

In [263]:
import re
import pandas as pd
from PyPDF2 import PdfReader

# Data Loading

In [264]:
# Function for extract PDF to text
def extract_text_from_pdf(pdf_path):
    reader = PdfReader(pdf_path)
    text = ''
    for page in reader.pages:
        text += page.extract_text()
    return text

# Extract text from Konsolidasi.pdf
pdf_path = "./Konsolidasi.pdf" 
document_text = extract_text_from_pdf(pdf_path)

# Show Result Text
# print(document_text)

# Data Chunking 

In [265]:
# Function to chunk the text
def chunk_document(text):
    chunks = []
    bab = pasal = ayat = poin = None
    current_chunk = {"Teks": "", "Bab": None, "Pasal": None, "Ayat": None, "Poin": None, "Penjelasan": "Bukan"}

    for line in text.splitlines():
        line = line.strip()
        if not line:
            continue
        
        # Check if the line contains "Penjelasan" to switch to explanation mode
        if line.startswith("Penjelasan"):
            if current_chunk["Teks"]:
                chunks.append(current_chunk)
            current_chunk = {"Teks": "", "Bab": bab, "Pasal": pasal, "Ayat": ayat, "Poin": poin, "Penjelasan": "Ya"}
            
        # Check for "BAB" to set the Bab value
        elif line.startswith("BAB"):
            if current_chunk["Teks"]:
                chunks.append(current_chunk)
            bab_match = re.search(r"BAB\s+([IVXLC]+)", line)
            if bab_match:
                bab = bab_match.group(1)
            current_chunk = {"Teks": "", "Bab": bab, "Pasal": pasal, "Ayat": ayat, "Poin": poin, "Penjelasan": "Bukan"}
        
        # Check for "Pasal" at the start of the line to set the Pasal value
        elif line.startswith("Pasal"):
            if current_chunk["Teks"]:
                chunks.append(current_chunk)
            pasal_match = re.search(r"Pasal\s+(\d+)", line)
            if pasal_match:
                pasal = pasal_match.group(1)
            current_chunk = {"Teks": "", "Bab": bab, "Pasal": pasal, "Ayat": ayat, "Poin": poin, "Penjelasan": "Bukan"}

        # Check for "Ayat" (e.g., (1)) and set Ayat value
        elif re.match(r"\(\d+[a-z]?\)", line):
            if current_chunk["Teks"]:
                chunks.append(current_chunk)
            ayat_match = re.search(r"\((\d+[a-z]?)\)", line)
            if ayat_match:
                ayat = ayat_match.group(1)
                poin = None  # Reset poin
            current_chunk = {"Teks": "", "Bab": bab, "Pasal": pasal, "Ayat": ayat, "Poin": poin, "Penjelasan": "Bukan"}
            current_chunk["Teks"] += line + " "
        
        # Check for "Poin" and set Poin value (e.g., a. or Huruf a)
        elif re.match(r"^[a-z]\.|Huruf\s*[a-z]", line):
            if current_chunk["Teks"]:
                chunks.append(current_chunk)
            poin_match = re.search(r"([a-z])", line)
            if poin_match:
                poin = poin_match.group(1)  # Capture only the letter
            current_chunk = {"Teks": "", "Bab": bab, "Pasal": pasal, "Ayat": ayat, "Poin": poin, "Penjelasan": "Bukan"}
            current_chunk["Teks"] += line + " "

        # Otherwise, append the line to the current chunk's text
        else:
            current_chunk["Teks"] += line + " "

    # Append the last chunk
    if current_chunk["Teks"]:
        chunks.append(current_chunk)

    return chunks

# Apply chunking function to the extracted text
chunks = chunk_document(document_text)

# Check chunking result
print(chunks[5])

{'Teks': '(1) Yang menjadi Subjek Pajak adalah : ', 'Bab': 'II', 'Pasal': '2', 'Ayat': '1', 'Poin': None, 'Penjelasan': 'Bukan'}


# Data Saving

In [266]:
# Convert chunks to a pandas DataFrame
df = pd.DataFrame(chunks)

# Show Data Before Saving
df.head()

<IPython.core.display.Javascript object>

Unnamed: 0,Teks,Bab,Pasal,Ayat,Poin,Penjelasan
0,TaxBase 6.0 Document - Page : 1SUSUNAN DALAM S...,,,,,Bukan
1,KETENTUAN UMUM,I,,,,Bukan
2,Pajak Penghasilan dikenakan terhadap Subjek Pa...,I,1.0,,,Bukan
3,Undang-Undang ini mengatur pengenaan Pajak Pen...,I,1.0,,,Ya
4,SUBJEK PAJAK,II,1.0,,,Bukan


In [267]:
# Save the DataFrame to a CSV file
df.to_csv('konsolidasi_data.csv', index=False)

# Saving to MongoDB

In [268]:
# # Koneksi ke server MongoDB
# client = MongoClient('localhost', 27017)

# # Pilih database
# db = client['inatax']

# # Pilih koleksi
# collection = db['de']

# # Baca data dari file CSV
# csv_file_path = './konsolidasi_data.csv'
# data = pd.read_csv(csv_file_path)

# # Konversi DataFrame ke format JSON (dict)
# data_dict = data.to_dict(orient='records')

# # Simpan data ke dalam koleksi MongoDB
# collection.insert_many(data_dict)

# print("Data berhasil disimpan ke MongoDB")