In [35]:
import pytesseract

pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'  # adjust path if needed

In [None]:
poppler_path = r'C:\Users\hp\Release-24.08.0-0\poppler-24.08.0\Library\bin'

In [38]:
# Convert PDF to images and extract text using pytesseract
import pytesseract
from pdf2image import convert_from_path
import pandas as pd
import numpy as np

# Convert PDF to images
print("Converting PDF to images...")
images = convert_from_path('testingpdf1.pdf',poppler_path=poppler_path)
print(f"Converted {len(images)} page(s) to images")

# Extract text from each image
all_text = []
for i, image in enumerate(images):
    print(f"Processing page {i+1}...")
    text = pytesseract.image_to_string(image)
    all_text.append(text)
    
# Display the extracted text from the first page
print("\
Extracted text from first page (sample):")
print(all_text[0][:500] + "..." if len(all_text[0]) > 500 else all_text[0])

Converting PDF to images...
Converted 8 page(s) to images
Processing page 1...
Processing page 2...
Processing page 3...
Processing page 4...
Processing page 5...
Processing page 6...
Processing page 7...
Processing page 8...
Extracted text from first page (sample):
BANK NAME : PUNJAB AND SIND BANK 26-06-2023

BRANCH NAME : JALANDHAR SME 13:15:47
ADDRESS : 201 INDUSTRIAL AREA
JALANDHAR

CITY : JALANDHAR (PUNJAB)
PIN CODE : 144001
STATE : Punjab INDIA Page No: 1
IFSC Code : PSIBOQ00697
MICR Code : 144023019 Phone no:
0181-5018936
REPORT PRINTED BY : MANPRIT KAUR Sol id : 0697
Account No : 06971300000866 Nomination Registered
No
A/C Name : M/S. KAMAL BRICKS INDUSTRY Nominee Name
Address : 560 MOTA SINGH NAGAR

JALANDHAR
City : JALANDHAR (PPin Code : 144001
Te...


In [39]:
# Create a dataframe with the extracted text from each page
import pandas as pd

print("Creating DataFrame with extracted text...")
df = pd.DataFrame({"Page": list(range(1, len(all_text)+1)), "Text": all_text})

# Save the dataframe to an Excel file
output_file = 'extracted_pdf.xlsx'
df.to_excel(output_file, index=False)
print("DataFrame saved to Excel file: " + output_file)

# Display a sample of the dataframe
print(df.head())

Creating DataFrame with extracted text...
DataFrame saved to Excel file: extracted_pdf.xlsx
   Page                                               Text
0     1  BANK NAME : PUNJAB AND SIND BANK 26-06-2023\n\...
1     2  15,000.00\n26-Jul-2022 C\n2,000.00\n28-Jul-202...
2     3  BANK NAME\nBRANCH NAME\nADDRESS\n\nCITY\n\nPIN...
3     4  33,255.00\n28-Feb-2023\n34,947.00\n20-Mar -202...
4     5  BANK NAME PUNJAB AND SIND BANK 26-06-2023\nBRA...


In [41]:
# Let's try to parse the text into a more structured format
# First, let's examine the text more closely to identify patterns

# Function to process text and extract structured data
def process_bank_statement(text):
    lines = text.split('\n')

    data = []
    
    # Extract header information
    header_info = {}
    transaction_data = []
    transaction_started = False
    
    for line in lines:
        line = line.strip()
        if not line:
            continue
            
        # Check if we've reached the transaction section
        if 'Q1-APR-2022' in line or 'Open Date' in line:
            transaction_started = True
            continue
            
        if not transaction_started:
            # Process header information
            if ':' in line:
                key, value = line.split(':', 1)
                header_info[key.strip()] = value.strip()
            elif 'BANK NAME' in line:
                header_info['BANK NAME'] = line.replace('BANK NAME', '').strip()
            elif 'BRANCH NAME' in line:
                header_info['BRANCH NAME'] = line.replace('BRANCH NAME', '').strip()
            elif 'ADDRESS' in line:
                header_info['ADDRESS'] = line.replace('ADDRESS', '').strip()
            elif 'CITY' in line:
                header_info['CITY'] = line.replace('CITY', '').strip()
            elif 'PIN CODE' in line:
                header_info['PIN CODE'] = line.replace('PIN CODE', '').strip()
            elif 'STATE' in line:
                header_info['STATE'] = line.replace('STATE', '').strip()
            elif 'IFSC Code' in line:
                header_info['IFSC Code'] = line.replace('IFSC Code', '').strip()
            elif 'MICR Code' in line:
                header_info['MICR Code'] = line.replace('MICR Code', '').strip()
        else:
            # Process transaction data
            # Look for date patterns like "04-Apr-2022"
            if '-' in line and any(month in line for month in ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']):
                parts = line.split()
                if len(parts) >= 2:
                    date = parts[0]
                    trans_type = parts[1] if len(parts) > 1 else ''
                    amount = parts[2] if len(parts) > 2 else ''
                    transaction_data.append({
                        'Date': date,
                        'Type': trans_type,
                        'Amount': amount
                    })
    
    return header_info, transaction_data

# Process each page
all_header_info = []
all_transactions = []

for i, text in enumerate(all_text):
    header_info, transactions = process_bank_statement(text)
    
    # Add page number to header info
    header_info['Page'] = i + 1
    
    all_header_info.append(header_info)
    all_transactions.extend(transactions)

# Create DataFrames
header_df = pd.DataFrame(all_header_info)
transactions_df = pd.DataFrame(all_transactions)

# Display the results
print("Bank Statement Header Information:")
print(header_df.head())

print("\
Transaction Data:")
print(transactions_df.head(10))

# Save to Excel with multiple sheets
with pd.ExcelWriter('structured_bank_statement.xlsx') as writer:
    header_df.to_excel(writer, sheet_name='Header Info', index=False)
    transactions_df.to_excel(writer, sheet_name='Transactions', index=False)

print("\
Structured data saved to 'structured_bank_statement.xlsx'")

Bank Statement Header Information:
                         BANK NAME             BRANCH NAME  \
0  PUNJAB AND SIND BANK 26-06-2023  JALANDHAR SME 13:15:47   
1                              NaN                     NaN   
2                                                            
3                              NaN                     NaN   
4  PUNJAB AND SIND BANK 26-06-2023                     NaN   

               ADDRESS                CITY PIN CODE                    STATE  \
0  201 INDUSTRIAL AREA  JALANDHAR (PUNJAB)   144001  Punjab INDIA Page No: 1   
1                  NaN                 NaN      NaN                      NaN   
2                                                                              
3                  NaN                 NaN      NaN                      NaN   
4  201 INDUSTRIAL AREA  JALANDHAR (PUNJAB)   144001                      NaN   

     IFSC Code            MICR Code           REPORT PRINTED BY  \
0  PSIBOQ00697  144023019 Phone no:  MANPRIT