In [None]:
import pdfplumber
import csv
import re
from datetime import datetime

# Compiled once at import time instead of on every line of every page.
_DATE_PREFIX = re.compile(r'\d{2}/\d{2}/\d{4}')
_COLUMN_GAP = re.compile(r'\s{2,}')


def _parse_statement_line(line):
    """Return the six CSV fields for one text line, or None if it is not a row."""
    # Only lines that begin with a dd/dd/dddd token are treated as table rows.
    if not _DATE_PREFIX.match(line):
        return None

    # Columns are recognised purely by runs of two or more whitespace chars.
    parts = _COLUMN_GAP.split(line)
    if len(parts) < 5:
        return None

    date, doc_no, debit, credit, balance = parts[:5]
    # Anything past the fifth column is folded into a single description.
    description = ' '.join(parts[5:])
    return [date, doc_no, debit or '0', credit or '0', balance, description]


def extract_table_data(pdf_path, output_csv, start_page=1, end_page=None, chunk_size=50):
    """Extract dated table rows from a PDF and write them to a CSV file.

    Each page's text is split into lines. A line is kept when it starts with
    a ``dd/dd/dddd`` token and splits into at least five columns on runs of
    two or more whitespace characters. The columns are mapped positionally
    to Date, Doc No, Debit, Credit, Balance and Description.

    Args:
        pdf_path: Path of the PDF to read (passed to ``pdfplumber.open``).
        output_csv: Path of the CSV file to create or overwrite.
        start_page: 1-based number of the first page to process.
        end_page: 1-based number of the last page to process (inclusive).
            ``None`` or 0 means "through the last page"; values beyond the
            end of the document are clamped to the page count.
        chunk_size: Number of pages covered by each progress message.

    Raises:
        ValueError: If ``start_page`` or ``chunk_size`` is less than 1.
    """
    # Validate before touching any file: a start_page below 1 would turn into
    # a negative index (silently reading pages from the end of the document),
    # and a chunk_size below 1 would never advance current_page.
    if start_page < 1:
        raise ValueError(f"start_page must be >= 1, got {start_page!r}")
    if chunk_size < 1:
        raise ValueError(f"chunk_size must be >= 1, got {chunk_size!r}")

    all_data = []
    current_page = start_page

    with pdfplumber.open(pdf_path) as pdf:
        total_pages = len(pdf.pages)
        end_page = min(end_page or total_pages, total_pages)

        while current_page <= end_page:
            chunk_end = min(current_page + chunk_size - 1, end_page)
            print(f"Processing pages {current_page} to {chunk_end}")

            # Page numbers are 1-based; pdf.pages is indexed from 0.
            for page_num in range(current_page - 1, chunk_end):
                # NOTE(review): extract_text() may yield None for a page with
                # no text layer on some pdfplumber versions - treat as empty.
                text = pdf.pages[page_num].extract_text() or ''

                for line in text.split('\n'):
                    row = _parse_statement_line(line)
                    if row is not None:
                        all_data.append(row)

            current_page = chunk_end + 1

    with open(output_csv, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(['Date', 'Doc No', 'Debit', 'Credit', 'Balance', 'Description'])
        writer.writerows(all_data)

    print(f"Data extracted and saved to {output_csv}")

# Example run: read pages 1 through 200 of the statement PDF (the function
# clamps the end to the document's page count) and write the rows to a CSV
# whose name carries the current local timestamp.
pdf_path = 'from1-09to10-09.pdf'
output_csv = f'account_statement_{datetime.now().strftime("%Y%m%d_%H%M%S")}.csv'
extract_table_data(
    pdf_path,
    output_csv,
    start_page=1,
    end_page=200,
    chunk_size=50,
)