In [55]:
import os
import pandas as pd
import PyPDF2
import re

# Function to extract text from PDF
def extract_text_from_pdf(file_path):
    with open(file_path, 'rb') as file:
        pdf_reader = PyPDF2.PdfReader(file)
        text = []
        for page in pdf_reader.pages:
            text.append(page.extract_text())
        return '\n'.join(text)

# Folder path containing PDF files
folder_path = r'/Users/chiragsinghchaudhary/Downloads/drive-download-20230712T175447Z-001'

# List to store extracted data
data = []

# Iterate over PDF files in the folder
for file_name in os.listdir(folder_path):
    if file_name.endswith('.pdf'):
        file_path = os.path.join(folder_path, file_name)
        text = extract_text_from_pdf(file_path)

        # Extracting field values using regex patterns with error handling
        policy_number_match = re.search(r'Policy\s+No[.:]?\s*([^,\n]+)', text, re.IGNORECASE)
        policy_number = policy_number_match.group(1).strip() if policy_number_match else None

        proposer_name_match = re.search(r"(?:Insured's\s+Code/\s+Name|Name\s+(?:of\s+)?Insured/Proposer|Insured's\s+Name|Proposer\s+Name|Name\s+&\s+Communication\s+Address):\s*([^,\n]+)", text, re.IGNORECASE)
        proposer_name = proposer_name_match.group(1).strip() if proposer_name_match else None

        registration_number_match = re.search(r'(?:Registration\s+Mark\s+&\s+Place|Registration\s+Mark\s+&\s+No\.?|Registration\s+No[.,]|Registration\s+Number)\s*[:.-]?\s*([^,\n]+)', text, re.IGNORECASE)
        registration_number = registration_number_match.group(1).strip() if registration_number_match else None

        policy_IDV_match = re.search(r'(?:IDV\s+For\s+the\s+Vehicle|Total\s+Value(?:[-₹])?|Total\s+IDV|Total\s+Value\s+\(Rs[.]\)|For\s+Vehicle\s+\(Rs[.]\))\s*[:.-]?\s*([^,\n]+)', text, re.IGNORECASE)
        policy_IDV = policy_IDV_match.group(1).strip() if policy_IDV_match else None

        total_premium_match = re.search(r'(?:FINAL\s+PREMIUM|Total[:]|Premium\s+Paid[(]Total\s+Invoice\s+Value[)]\s+Rs[.]|Total\s+\(Rounded\s+to\s+nearest\s+rupee\)|Premium|TOTAL\s+ADD[‐-]ON[‐-]COVERS\s+PREMIUM[(]D[)]|TOTAL\s+AMOUNT\s+COLLECTED)\s*[:.-]?\s*([^,\n]+)', text, re.IGNORECASE)
        total_premium = total_premium_match.group(1).strip() if total_premium_match else None

        contact_number_match = re.search(r'(?:Contact\s+No[:]|Phone\s+[#][:]|Telephone[(]Mob[)]\s+[:]|Contact\s+[:]|Contact\s+Number|Mobile\s+[:])\s*([^,\n]+)', text, re.IGNORECASE)
        contact_number = contact_number_match.group(1).strip() if contact_number_match else None

        policy_start_date_match = re.search(r'(?:Period\s+of\s+Insurance\s+[:]|Valid\s+from\s+\(dd-mmm-yyyy\)\s+Or\s+Period\s+of\s+Insurance\s+[:])\s*([^,\n]+)', text, re.IGNORECASE)
        policy_start_date = policy_start_date_match.group(1).strip() if policy_start_date_match else None

        data.append({
            'Filename': file_name,
            'Policy Number': policy_number,
            'Proposer Name': proposer_name,
            'Registration Number': registration_number,
            'Policy IDV': policy_IDV,
            'Total Premium': total_premium,
            'Contact Number': contact_number,
            'Policy Start Date': policy_start_date
        })

# Create dataframe from extracted data
df = pd.DataFrame(data)

# Convert dataframe to structured JSON
json_data = df.to_json(orient='records', indent=4)

# Print JSON data
print(json_data)


[
    {
        "Filename":"Shriram_General_Insurance.pdf",
        "Policy Number":":10019\/31\/23\/013086 Geographical Area INDIA",
        "Proposer Name":"105799780\/ Mr. Naman Kanuga",
        "Registration Number":"Engine No",
        "Policy IDV":"IDV For",
        "Total Premium":"9",
        "Contact Number":"18002585970",
        "Policy Start Date":"From 00:00 on 31-05-2022 00:00 To 30-05-2023 Midnight Cvr Note No. - Issue date"
    },
    {
        "Filename":"Edelweiss.pdf",
        "Policy Number":"is",
        "Proposer Name":"Mr. Akshay Nandkumar Borhade Insured's GST No. :  NA",
        "Registration Number":"27AAECE2328J1ZO HSN\/SAC Code : 997134Description of Good or Service : General",
        "Policy IDV":"Accessories Accessories \/LPG Kit",
        "Total Premium":"?",
        "Contact Number":"8655309919 Carriage Goods other than sample orNA",
        "Policy Start Date":null
    },
    {
        "Filename":"FutureGenerali.pdf",
        "Policy Number":": V976481