In [42]:
import pdfplumber 
import pandas as pd
import csv
import re
from tqdm import tqdm 


In [64]:
def extract_text_from_pdf(pdf_path, start_page=0, end_page=None, max_pages=None):
    """Extract the text of a range of PDF pages and join it with newlines.

    Args:
        pdf_path: Location of the PDF, passed straight to ``pdfplumber.open``.
        start_page: Zero-based index of the first page to read.
        end_page: Zero-based, exclusive index at which to stop. ``None``
            (or any falsy value) means "through the last page". Values past
            the end of the document are clamped to the page count.
        max_pages: Maximum number of pages to read starting at ``start_page``.
            ``None`` (or any falsy value) means no limit.

    Returns:
        A single string with the text of each processed page, in page order,
        separated by newline characters. Pages that yield no text contribute
        an empty string.
    """
    page_texts = []
    with pdfplumber.open(pdf_path) as pdf:
        total_pages = len(pdf.pages)
        # The stop index is the tightest of: the caller's end_page, the
        # max_pages budget, and the real document length. Including
        # total_pages prevents indexing past the last page.
        end_page = min(
            end_page or total_pages,
            start_page + (max_pages or total_pages),
            total_pages,
        )
        # A start_page beyond the stop index means there is nothing to read;
        # report 0 instead of a negative count.
        pages_to_process = max(end_page - start_page, 0)

        print(f"Processing {pages_to_process} pages (from page {start_page + 1} to {end_page})")

        for page_num in tqdm(range(start_page, end_page), desc="Extracting pages"):
            page = pdf.pages[page_num]
            # extract_text() may return None for a page without a text layer
            # (version-dependent); normalise so the final join cannot fail.
            page_texts.append(page.extract_text() or '')

    return '\n'.join(page_texts)

In [65]:
def parse_bank_statement(text):
    """Parse extracted bank-statement text into a DataFrame of transactions.

    A line is treated as a transaction row when it starts with a
    ``dd/mm/yyyy`` date followed by whitespace and a document number of the
    form ``digits.digits``. From the document number onward, the line is split
    into columns on runs of two or more whitespace characters, read in order
    as: document number, debit, credit, balance, detail text. Any further
    columns are joined with single spaces into the detail text.

    Lines that do not match, or that have fewer than five such columns, are
    skipped.

    Args:
        text: Newline-separated statement text.

    Returns:
        pandas.DataFrame with one row per parsed transaction. All values are
        kept as the strings found in the text; an empty debit or credit
        field becomes ``'0'``.
    """
    columns = ['Ngày GD/ TNX Date', 'Số CT/ Doc No', 'Số tiền ghi nợ/ Debit', 'Số tiền ghi có/ Credit', 'Số dư/ Balance', 'Nội dung chi tiết/ Transactions in detail']
    rows = []

    for line in text.split('\n'):
        match = re.match(r'(\d{2}/\d{2}/\d{4})\s+(\d+\.\d+)', line)
        if not match:
            continue

        date, doc_no = match.groups()
        # Split from the start of the document number rather than from the
        # start of the line. The date and document number may be separated by
        # a single space, in which case splitting the whole line would fuse
        # them into one field and shift every later column by one.
        fields = re.split(r'\s{2,}', line[match.start(2):])
        if len(fields) < 5:
            continue

        # fields[0] is the document-number column, already captured above.
        debit = fields[1] or '0'
        credit = fields[2] or '0'
        balance = fields[3]
        details = ' '.join(fields[4:])
        rows.append([f"{date}", doc_no, debit, credit, balance, details])

    return pd.DataFrame(rows, columns=columns)

In [66]:
def process_pdf(pdf_path, output_csv, max_pages=None):
    """Convert a bank-statement PDF into a CSV file of transactions.

    Extracts text from the PDF (optionally limited to ``max_pages`` pages),
    parses it into transaction rows, writes them to ``output_csv`` without
    the index column, and prints a short summary.

    Args:
        pdf_path: Path of the PDF to read.
        output_csv: Destination path for the CSV file.
        max_pages: Forwarded to ``extract_text_from_pdf`` as its page limit.
    """
    transactions = parse_bank_statement(
        extract_text_from_pdf(pdf_path, max_pages=max_pages)
    )
    transactions.to_csv(output_csv, index=False)

    row_count = len(transactions)
    print(f"CSV file has been created: {output_csv}")
    print(f"Processed {row_count} transactions.")

In [None]:
# Input statement and output location, both relative to the notebook's
# working directory.
pdf_path = "from1-09to10-09.pdf"
output_csv = "output_bank_statement.csv"

# Convert the statement to CSV, reading at most the first 200 pages.
process_pdf(
    pdf_path,
    output_csv,
    max_pages=200,
)