In [1]:
!pip install pypdf



In [2]:
from pypdf import PdfReader

def discover_pdf_fields(pdf_path):
    """
    Print the name and current value of every text form field in a PDF.

    The lookup uses ``PdfReader.get_form_text_fields()``, so only fields
    that pypdf classifies as text fields are listed.

    :param pdf_path: Path of the PDF file to inspect.
    :return: None; the report is written to stdout.
    """
    text_fields = PdfReader(pdf_path).get_form_text_fields()

    if not text_fields:
        print("No form fields found.")
        return

    print("Discovered Fields:")
    for field_name, field_value in text_fields.items():
        # The mapped value is the field's current content, not its default
        # (/DV), so it is labelled "Value".
        print(f"Field Name: {field_name}, Value: {field_value}")

discover_pdf_fields("f1040.pdf")


Discovered Fields:
Field Name: f1_01[0], Default Value: None
Field Name: f1_02[0], Default Value: None
Field Name: f1_03[0], Default Value: None
Field Name: f1_04[0], Default Value: None
Field Name: f1_05[0], Default Value: None
Field Name: f1_06[0], Default Value: None
Field Name: f1_07[0], Default Value: None
Field Name: f1_08[0], Default Value: None
Field Name: f1_09[0], Default Value: None
Field Name: f1_10[0], Default Value: None
Field Name: f1_11[0], Default Value: None
Field Name: f1_12[0], Default Value: None
Field Name: f1_13[0], Default Value: None
Field Name: f1_14[0], Default Value: None
Field Name: f1_15[0], Default Value: None
Field Name: f1_16[0], Default Value: None
Field Name: f1_17[0], Default Value: None
Field Name: f1_18[0], Default Value: None
Field Name: f1_19[0], Default Value: None
Field Name: f1_20[0], Default Value: None
Field Name: f1_21[0], Default Value: None
Field Name: f1_22[0], Default Value: None
Field Name: f1_23[0], Default Value: None
Field Name: f1_

In [10]:
!pip install pikepdf

Collecting pikepdf
  Downloading pikepdf-9.5.2-cp312-cp312-win_amd64.whl.metadata (8.3 kB)
Downloading pikepdf-9.5.2-cp312-cp312-win_amd64.whl (3.5 MB)
   ---------------------------------------- 0.0/3.5 MB ? eta -:--:--
   ---------------------------------------- 3.5/3.5 MB 34.1 MB/s eta 0:00:00
Installing collected packages: pikepdf
Successfully installed pikepdf-9.5.2


In [None]:
import io
import pypdf
import pikepdf

def extract_pypdf_fields(pdf_path):
    """
    Print type, value and flags of every form field reported by pypdf.

    Any exception raised while reading the PDF is caught and reported on
    stdout instead of being propagated.

    :param pdf_path: Path of the PDF file to inspect.
    :return: None; the report is written to stdout.
    """
    print("\n--- PyPDF Field Extraction ---")
    try:
        reader = pypdf.PdfReader(pdf_path)

        # Query the form tree once and reuse the result for both the
        # emptiness check and the iteration.
        fields = reader.get_fields()

        # Form Fields (more comprehensive)
        if fields:
            print("\nDetailed Form Fields:")
            for name, field in fields.items():
                print(f"  Field Name: {name}")
                print(f"    Type: {field.get('/FT', 'Unknown')}")
                print(f"    Value: {field.get('/V', 'No Value')}")
                print(f"    Flags: {field.get('/Ff', 'No Flags')}")
        else:
            print("No form fields found using PyPDF")

    except Exception as e:
        print(f"PyPDF Extraction Error: {e}")

def extract_pikepdf_fields(pdf_path):
    """
    Print name, type, value and flags of every widget annotation in a PDF.

    Each page's /Annots array is scanned with pikepdf and every annotation
    whose /Subtype is /Widget is reported. Any exception is caught and
    reported on stdout instead of being propagated.

    :param pdf_path: Path of the PDF file to inspect.
    :return: None; the report is written to stdout.
    """
    print("\n--- PikePDF Field Extraction ---")
    try:
        # The context manager closes the document even when parsing fails,
        # so the file handle is not left open after the cell finishes.
        with pikepdf.Pdf.open(pdf_path) as pdf:
            # Iterate through all form fields
            for page in pdf.pages:
                if '/Annots' in page:
                    for annot in page['/Annots']:
                        if '/Subtype' in annot and annot['/Subtype'] == '/Widget':
                            field_details = {
                                'Name': annot.get('/T', 'Unnamed'),
                                'Type': annot.get('/FT', 'Unknown'),
                                'Value': annot.get('/V', 'No Value'),
                                'Flags': annot.get('/Ff', 'No Flags')
                            }
                            print("Field Details:")
                            for key, value in field_details.items():
                                print(f"  {key}: {value}")

    except Exception as e:
        print(f"PikePDF Extraction Error: {e}")

def analyze_pdf_fields(pdf_path):
    """
    Report the form fields of a PDF using pypdf and then pikepdf.

    :param pdf_path: Path of the PDF file to inspect.
    :return: None; both reports are written to stdout.
    """
    print(f"Analyzing PDF: {pdf_path}")

    # Extract fields using multiple methods
    extract_pypdf_fields(pdf_path)

    # No ImportError handler is needed here: pikepdf is imported when the
    # cell is loaded, and extract_pikepdf_fields() catches and prints every
    # Exception itself, so such a handler could never run.
    extract_pikepdf_fields(pdf_path)

# Example usage
def main():
    """Run analyze_pdf_fields() on f1040.pdf."""
    # Replace with your PDF path
    analyze_pdf_fields("f1040.pdf")


if __name__ == "__main__":
    main()

In [None]:
import io
import pypdf
import pikepdf

def extract_field_labels_pypdf(pdf_path):
    """
    Print a descriptive label, type and value for every field pypdf reports.

    The label is the first non-empty entry among /TU, /TM, /Alt and /T, in
    that order, or "Unnamed Field" when none is usable. Any exception is
    caught and reported on stdout instead of being propagated.

    :param pdf_path: Path of the PDF file to inspect.
    :return: None; the report is written to stdout.
    """
    print("\n--- PyPDF Field Label Extraction ---")
    try:
        reader = pypdf.PdfReader(pdf_path)

        def extract_field_label(field):
            """
            Return the most descriptive label available on ``field``.
            """
            # Descriptive entries come first. /T (the internal partial name)
            # is only the fallback; checking it first would make the label
            # identical to the internal name on every named field.
            label_keys = ['/TU', '/TM', '/Alt', '/T']

            for key in label_keys:
                if key in field:
                    label = field[key]
                    # Remove any non-printable characters and strip
                    label = ''.join(char for char in label if char.isprintable()).strip()
                    if label:
                        return label
            return "Unnamed Field"

        # Query the form tree once and reuse the result.
        fields = reader.get_fields()

        if fields:
            print("\nDetailed Form Field Labels:")
            for name, field in fields.items():
                # Try to get a more descriptive label
                label = extract_field_label(field)
                print(f"  Internal Name: {name}")
                print(f"  Extracted Label: {label}")
                print(f"    Type: {field.get('/FT', 'Unknown')}")
                print(f"    Value: {field.get('/V', 'No Value')}")
                print("---")
        else:
            print("No form fields found using PyPDF")

    except Exception as e:
        print(f"PyPDF Label Extraction Error: {e}")

def extract_field_labels_pikepdf(pdf_path):
    """
    Print a descriptive label, type and value for every widget annotation.

    Each page's /Annots array is scanned with pikepdf. The label is the
    first non-empty entry among /TU, /TM, /Alt and /T, in that order, or
    "Unnamed Field" when none is usable. Any exception is caught and
    reported on stdout instead of being propagated.

    :param pdf_path: Path of the PDF file to inspect.
    :return: None; the report is written to stdout.
    """
    print("\n--- PikePDF Field Label Extraction ---")
    try:
        def get_field_label(annot):
            """
            Return the most descriptive label available on ``annot``.
            """
            # Descriptive entries come first. /T (the internal partial name)
            # is only the fallback; checking it first would make the label
            # identical to the internal name on every named widget.
            label_keys = ['/TU', '/TM', '/Alt', '/T']

            for key in label_keys:
                if key in annot:
                    label = str(annot[key])
                    # Clean and filter the label
                    label = ''.join(char for char in label if char.isprintable()).strip()
                    if label:
                        return label
            return "Unnamed Field"

        found_fields = False
        # The context manager closes the document even when parsing fails,
        # so the file handle is not left open after the cell finishes.
        with pikepdf.Pdf.open(pdf_path) as pdf:
            for page in pdf.pages:
                if '/Annots' in page:
                    for annot in page['/Annots']:
                        if '/Subtype' in annot and annot['/Subtype'] == '/Widget':
                            found_fields = True
                            label = get_field_label(annot)
                            field_details = {
                                'Extracted Label': label,
                                'Type': annot.get('/FT', 'Unknown'),
                                'Value': annot.get('/V', 'No Value')
                            }
                            print("Field Details:")
                            for key, value in field_details.items():
                                print(f"  {key}: {value}")
                            print("---")

        if not found_fields:
            print("No form fields found using PikePDF")

    except Exception as e:
        print(f"PikePDF Label Extraction Error: {e}")

def analyze_pdf_field_labels(pdf_path):
    """
    Report form field labels of a PDF using pypdf and then pikepdf.

    :param pdf_path: Path of the PDF file to inspect.
    :return: None; both reports are written to stdout.
    """
    print(f"Analyzing PDF: {pdf_path}")

    # Extract field labels using multiple methods
    extract_field_labels_pypdf(pdf_path)

    # No ImportError handler is needed here: pikepdf is imported when the
    # cell is loaded, and extract_field_labels_pikepdf() catches and prints
    # every Exception itself, so such a handler could never run.
    extract_field_labels_pikepdf(pdf_path)

# Example usage
def main():
    """Run analyze_pdf_field_labels() on f1040.pdf."""
    # Replace with your PDF path
    analyze_pdf_field_labels("f1040.pdf")


if __name__ == "__main__":
    main()

In [20]:
!pip install pikepdf pdfplumber

Collecting pdfplumber
  Downloading pdfplumber-0.11.5-py3-none-any.whl.metadata (42 kB)
Collecting pdfminer.six==20231228 (from pdfplumber)
  Downloading pdfminer.six-20231228-py3-none-any.whl.metadata (4.2 kB)
Collecting pypdfium2>=4.18.0 (from pdfplumber)
  Downloading pypdfium2-4.30.1-py3-none-win_amd64.whl.metadata (48 kB)
Downloading pdfplumber-0.11.5-py3-none-any.whl (59 kB)
Downloading pdfminer.six-20231228-py3-none-any.whl (5.6 MB)
   ---------------------------------------- 0.0/5.6 MB ? eta -:--:--
   ----- ---------------------------------- 0.8/5.6 MB 4.2 MB/s eta 0:00:02
   ------------- -------------------------- 1.8/5.6 MB 4.6 MB/s eta 0:00:01
   ------------------ --------------------- 2.6/5.6 MB 4.6 MB/s eta 0:00:01
   -------------------------- ------------- 3.7/5.6 MB 4.6 MB/s eta 0:00:01
   ------------------------------- -------- 4.5/5.6 MB 4.6 MB/s eta 0:00:01
   ---------------------------------------  5.5/5.6 MB 4.5 MB/s eta 0:00:01
   ----------------------------

In [21]:
import pikepdf
import pdfplumber

def extract_detailed_form_info(pdf_path):
    """
    Print the first page's text and details of every widget annotation.

    pdfplumber extracts the text of the first page; pikepdf is then used to
    list name, type, rectangle, default appearance and maximum length of
    each /Widget annotation on every page. Any exception is caught and
    reported on stdout instead of being propagated.

    :param pdf_path: Path of the PDF file to inspect.
    :return: None; the report is written to stdout.
    """
    try:
        # First, try using pikepdf for structural analysis. The context
        # manager closes the document even when a later step fails.
        with pikepdf.Pdf.open(pdf_path) as pdf:

            print("\n--- Detailed PDF Form Field Analysis ---")

            # Use pdfplumber for additional text extraction
            with pdfplumber.open(pdf_path) as plumber_pdf:
                first_page = plumber_pdf.pages[0]

                # Extract visible text near form fields
                page_text = first_page.extract_text()
                print("\nPage Text Context:")
                print(page_text)

            # Detailed pikepdf annotation parsing
            for page_num, page in enumerate(pdf.pages, 1):
                if '/Annots' in page:
                    print(f"\n--- Page {page_num} Annotations ---")
                    for annot in page['/Annots']:
                        if '/Subtype' in annot and annot['/Subtype'] == '/Widget':
                            print("\nForm Field Details:")

                            # Extract all available information
                            field_info = {
                                'Internal Name': annot.get('/T', 'No Internal Name'),
                                'Type': annot.get('/FT', 'Unknown Type'),
                                'Rectangle': annot.get('/Rect', 'No Rect'),
                                'Font': annot.get('/DA', 'No Font Info'),
                                'Max Length': annot.get('/MaxLen', 'No Max Length')
                            }

                            # Print field details
                            for key, value in field_info.items():
                                print(f"  {key}: {value}")

    except Exception as e:
        print(f"Error extracting PDF form details: {e}")

def analyze_form_field_context(pdf_path):
    """
    Print every word on the first page together with its x0/top position.

    Any exception is caught and reported on stdout instead of being
    propagated.

    :param pdf_path: Path of the PDF file to inspect.
    :return: None; the listing is written to stdout.
    """
    try:
        with pdfplumber.open(pdf_path) as document:
            # Words of page one, as returned by pdfplumber.
            page_words = document.pages[0].extract_words()

            print("\n--- Form Field Context Analysis ---")
            # No filtering yet: every word is listed, not only those that
            # lie near a form field.
            for word in page_words:
                print(f"Word: {word['text']}, Position: {word['x0']}, {word['top']}")

    except Exception as e:
        print(f"Error analyzing form field context: {e}")

def main():
    """Run both form analyses on f1040.pdf, details first, then context."""
    # Replace with your PDF path
    target = "f1040.pdf"

    # Detailed form information, then the word/position listing.
    for analysis in (extract_detailed_form_info, analyze_form_field_context):
        analysis(target)


if __name__ == "__main__":
    main()


--- Detailed PDF Form Field Analysis ---

Page Text Context:
mroF 1040 Department of the Treasury—Internal Revenue Service 2024
U.S. Individual Income Tax Return
OMB No. 1545-0074 IRS Use Only—Do not write or staple in this space.
For the year Jan. 1–Dec. 31, 2024, or other tax year beginning , 2024, ending , 20 See separate instructions.
Your first name and middle initial Last name Your social security number
If joint return, spouse’s first name and middle initial Last name Spouse’s social security number
Home address (number and street). If you have a P.O. box, see instructions. Apt. no. Presidential Election Campaign
Check here if you, or your
spouse if filing jointly, want $3
City, town, or post office. If you have a foreign address, also complete spaces below. State ZIP code
to go to this fund. Checking a
box below will not change
Foreign country name Foreign province/state/county Foreign postal code your tax or refund.
You Spouse
Filing Status Single Head of household (HOH)
Marr

In [27]:
!pip install PyPDF2

Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
Installing collected packages: PyPDF2
Successfully installed PyPDF2-3.0.1


In [52]:
from PyPDF2 import PdfReader
from collections import OrderedDict

def get_fields(obj, tree=None, retval=None, fileobj=None):
    """
    Collect interactive form fields through a reader object's helpers.

    The *tree* and *retval* parameters are for recursive use.

    :param obj: Reader-like object providing ``trailer``, ``_check_kids``
        and ``_build_field``.
    :param tree: Node to inspect; replaced by the catalog's /AcroForm entry
        when *retval* is not supplied.
    :param retval: Mapping that ``obj._build_field`` fills in; a new
        ``OrderedDict`` is created when omitted.
    :param fileobj: Passed through unchanged to ``obj._check_kids`` and
        ``obj._build_field`` (a report target in PyPDF2).
    :return: *retval*, or ``None`` when a top-level call finds no /AcroForm
        entry in the document catalog.
    """
    fieldAttributes = {
        '/FT': 'Field Type',
        '/Parent': 'Parent',
        '/T': 'Field Name',
        '/TU': 'Alternate Field Name',
        '/TM': 'Mapping Name',
        '/Ff': 'Field Flags',
        '/V': 'Value',
        '/DV': 'Default Value',
    }

    if retval is None:
        # Top-level call: start a fresh result and locate the AcroForm tree.
        retval = OrderedDict()
        catalog = obj.trailer["/Root"]
        if "/AcroForm" not in catalog:
            return None
        tree = catalog["/AcroForm"]

    if tree is None:
        return retval

    obj._check_kids(tree, retval, fileobj)

    # A node carrying any field attribute is itself a field.
    if any(attr in tree for attr in fieldAttributes):
        obj._build_field(tree, retval, fileobj, fieldAttributes)

    if "/Fields" in tree:
        for reference in tree["/Fields"]:
            obj._build_field(reference.get_object(), retval, fileobj, fieldAttributes)

    return retval

def get_form_fields(infile):
    """
    Return the /V entry of every form field in a PDF file.

    :param infile: Path of the PDF file to read.
    :return: An ``OrderedDict`` mapping each field name to its /V entry, or
        to ``''`` when the field has none. The mapping is empty when the
        document has no /AcroForm entry.
    """
    with open(infile, 'rb') as file:
        # Use a separate name instead of rebinding the ``infile`` parameter.
        reader = PdfReader(file)
        # get_fields() returns None for a PDF without an /AcroForm entry;
        # treat that as "no fields" rather than failing on None.items().
        fields = get_fields(reader) or OrderedDict()
        return OrderedDict((k, v.get('/V', '')) for k, v in fields.items())

# Entry point: pretty-print the field-name -> value mapping read from
# 'f1040_filled.pdf'.
if __name__ == '__main__':
    from pprint import pprint
    pdf_file_name = 'f1040_filled.pdf'
    pprint(get_form_fields(pdf_file_name))

OrderedDict([('f1_01[0]', ''),
             ('f1_02[0]', ''),
             ('f1_03[0]', ''),
             ('f1_04[0]', ''),
             ('f1_05[0]', ''),
             ('f1_06[0]', ''),
             ('f1_07[0]', ''),
             ('f1_08[0]', ''),
             ('f1_09[0]', ''),
             ('f1_10[0]', ''),
             ('f1_11[0]', ''),
             ('f1_12[0]', ''),
             ('f1_13[0]', ''),
             ('f1_14[0]', ''),
             ('f1_15[0]', ''),
             ('f1_16[0]', ''),
             ('f1_17[0]', ''),
             ('Address_ReadOrder[0]', ''),
             ('c1_1[0]', '/Off'),
             ('c1_2[0]', '/Off'),
             ('c1_3[0]', '/Off'),
             ('c1_3[1]', '/Off'),
             ('c1_3[2]', '/Off'),
             ('FilingStatus_ReadOrder[0]', ''),
             ('f1_18[0]', ''),
             ('c1_4[0]', '/Off'),
             ('f1_19[0]', ''),
             ('c1_5[0]', '/Off'),
             ('c1_5[1]', '/Off'),
             ('c1_6[0]', '/Off'),
             (

In [34]:
# Print the fields in a formatted way
print("PDF Form Fields:")
# NOTE(review): `form_data` and `pdf` are not defined in any cell visible in
# this part of the notebook; this cell presumably relies on an earlier cell
# having created them. Confirm it still runs after Restart Kernel -> Run All.
print("-" * 50)
# Header row: each column is left-aligned and padded to 30 characters.
print(f"{'Field Name':<30} | {'Alternate Label':<30} | {'Value':<30}")
print("-" * 50)
for entry in form_data:
    # Each entry is unpacked into exactly three items.
    field_name, alternate_label, value = entry
    # str() lets non-string items be padded with the same format spec.
    print(f"{str(field_name):<30} | {str(alternate_label):<30} | {str(value):<30}")

# Close the PDF
pdf.close()

PDF Form Fields:
--------------------------------------------------
Field Name                     | Alternate Label                | Value                         
--------------------------------------------------
topmostSubform[0].Page1[0].f1_01[0] | None                           | None                          
topmostSubform[0].Page1[0].f1_02[0] | None                           | None                          
topmostSubform[0].Page1[0].f1_03[0] | None                           | None                          
topmostSubform[0].Page1[0].f1_04[0] | None                           | None                          
topmostSubform[0].Page1[0].f1_05[0] | None                           | None                          
topmostSubform[0].Page1[0].f1_06[0] | None                           | None                          
topmostSubform[0].Page1[0].f1_07[0] | None                           | None                          
topmostSubform[0].Page1[0].f1_08[0] | None                           |

In [74]:
from PyPDF2 import PdfReader, PdfWriter
from collections import OrderedDict

def get_fields(obj, tree=None, retval=None, fileobj=None):
    """
    Collect interactive form fields through a reader object's helpers.

    :param obj: Reader-like object providing ``trailer``, ``_check_kids``
        and ``_build_field``.
    :param tree: Node to inspect; replaced by the catalog's /AcroForm entry
        when *retval* is not supplied.
    :param retval: Mapping filled in by ``obj._build_field``; created when
        omitted.
    :param fileobj: Passed through unchanged to the reader helpers.
    :return: *retval*, or ``None`` when a top-level call finds no /AcroForm
        entry in the document catalog.
    """
    fieldAttributes = {'/FT': 'Field Type', '/Parent': 'Parent', '/T': 'Field Name',
                       '/TU': 'Alternate Field Name', '/TM': 'Mapping Name',
                       '/Ff': 'Field Flags', '/V': 'Value', '/DV': 'Default Value'}

    if retval is None:
        # Top-level call: the search starts at the document's AcroForm.
        retval = OrderedDict()
        root = obj.trailer["/Root"]
        if "/AcroForm" not in root:
            return None
        tree = root["/AcroForm"]

    if tree is not None:
        obj._check_kids(tree, retval, fileobj)

        # The first field attribute present marks the node as a field.
        marker = next((attr for attr in fieldAttributes if attr in tree), None)
        if marker is not None:
            obj._build_field(tree, retval, fileobj, fieldAttributes)

        if "/Fields" in tree:
            for item in tree["/Fields"]:
                resolved = item.get_object()
                obj._build_field(resolved, retval, fileobj, fieldAttributes)

    return retval

def get_form_fields(infile):
    """
    Return the /V entry of every form field in a PDF file.

    :param infile: Path of the PDF file to read.
    :return: An ``OrderedDict`` mapping each field name to its /V entry, or
        to ``''`` when the field has none. The mapping is empty when the
        document has no /AcroForm entry.
    """
    with open(infile, 'rb') as file:
        # Use a separate name instead of rebinding the ``infile`` parameter.
        reader = PdfReader(file)
        # get_fields() returns None for a PDF without an /AcroForm entry;
        # treat that as "no fields" rather than failing on None.items().
        fields = get_fields(reader) or OrderedDict()
        return OrderedDict((k, v.get('/V', '')) for k, v in fields.items())

def fill_pdf_fields(input_pdf, output_pdf):
    """
    Copy a PDF and write each form field's current value back into the copy.

    The field values are read from *input_pdf* itself, every page is added
    to a new writer, and the values are then applied to the widgets of every
    page that has annotations before the result is saved.

    :param input_pdf: Path of the PDF file to read.
    :param output_pdf: Path the resulting PDF is written to.
    :return: None.
    """
    with open(input_pdf, 'rb') as infile:
        reader = PdfReader(infile)
        writer = PdfWriter()

        # Read the values through the reader that is already open instead of
        # opening the file a second time. get_fields() returns None when the
        # document has no /AcroForm entry; treat that as "nothing to fill".
        fields = get_fields(reader) or {}
        values = {name: field.get('/V', '') for name, field in fields.items()}

        # Copy all pages
        for page in reader.pages:
            writer.add_page(page)

        # Fill the fields on every page, not only the first one, with a
        # single pass per page rather than one pass per field.
        # NOTE(review): values are written as text strings, including
        # check-box states such as '/Off' - confirm that is acceptable.
        if values:
            for page in writer.pages:
                # Pages without annotations have no widgets to update.
                if "/Annots" in page:
                    writer.update_page_form_field_values(page, values)

        # Save the filled PDF
        with open(output_pdf, 'wb') as outfile:
            writer.write(outfile)

# Example usage: read the field values of 'f1040_filled.pdf' and write the
# resulting document to 'f1040_output.pdf'.
input_pdf = 'f1040_filled.pdf'
output_pdf = 'f1040_output.pdf'
fill_pdf_fields(input_pdf, output_pdf)


In [76]:
from PyPDF2 import PdfReader
from collections import OrderedDict

def get_fields(obj, tree=None, retval=None, fileobj=None):
    """
    Collect interactive form fields through a reader object's helpers.

    :param obj: Reader-like object providing ``trailer``, ``_check_kids``
        and ``_build_field``.
    :param tree: Node to inspect; replaced by the catalog's /AcroForm entry
        when *retval* is not supplied.
    :param retval: Mapping filled in by ``obj._build_field``; created when
        omitted.
    :param fileobj: Passed through unchanged to the reader helpers.
    :return: *retval*, or ``None`` when a top-level call finds no /AcroForm
        entry in the document catalog.
    """
    fieldAttributes = {
        '/FT': 'Field Type', '/Parent': 'Parent',
        '/T': 'Field Name', '/TU': 'Alternate Field Name',
        '/TM': 'Mapping Name', '/Ff': 'Field Flags',
        '/V': 'Value', '/DV': 'Default Value',
    }

    def build(node):
        # Shared call shape for the node itself and for its /Fields entries.
        obj._build_field(node, retval, fileobj, fieldAttributes)

    if retval is None:
        retval = OrderedDict()
        catalog = obj.trailer["/Root"]
        if "/AcroForm" not in catalog:
            return None
        tree = catalog["/AcroForm"]

    if tree is None:
        return retval

    obj._check_kids(tree, retval, fileobj)

    # Build the node once if it carries at least one field attribute.
    if any(key in tree for key in fieldAttributes):
        build(tree)

    if "/Fields" in tree:
        for ref in tree["/Fields"]:
            build(ref.get_object())

    return retval

def get_form_fields(infile):
    """
    Print and return the /V entry of every form field in a PDF file.

    :param infile: Path of the PDF file to read.
    :return: A dict mapping each field name to its /V entry, or to ``''``
        when the field has none. The dict is empty when the document has no
        /AcroForm entry.
    """
    with open(infile, 'rb') as file:
        # Use a separate name instead of rebinding the ``infile`` parameter.
        reader = PdfReader(file)
        # get_fields() returns None for a PDF without an /AcroForm entry;
        # treat that as "no fields" rather than failing on None.items().
        fields = get_fields(reader) or {}

        # Let's print the raw field names and values
        raw_fields = {k: v.get('/V', '') for k, v in fields.items()}

        print("Extracted Fields and Values:")
        for key, value in raw_fields.items():
            print(f"Field Name: {key}, Value: {value}")

        return raw_fields

# Entry point: get_form_fields() prints each field as it is read, and the
# returned mapping is then pretty-printed as well.
if __name__ == '__main__':
    from pprint import pprint
    pdf_file_name = 'f1040_filled.pdf'
    pprint(get_form_fields(pdf_file_name))


Extracted Fields and Values:
Field Name: f1_01[0], Value: 
Field Name: f1_02[0], Value: 
Field Name: f1_03[0], Value: 
Field Name: f1_04[0], Value: 
Field Name: f1_05[0], Value: 
Field Name: f1_06[0], Value: 
Field Name: f1_07[0], Value: 
Field Name: f1_08[0], Value: 
Field Name: f1_09[0], Value: 
Field Name: f1_10[0], Value: 
Field Name: f1_11[0], Value: 
Field Name: f1_12[0], Value: 
Field Name: f1_13[0], Value: 
Field Name: f1_14[0], Value: 
Field Name: f1_15[0], Value: 
Field Name: f1_16[0], Value: 
Field Name: f1_17[0], Value: 
Field Name: Address_ReadOrder[0], Value: 
Field Name: c1_1[0], Value: /Off
Field Name: c1_2[0], Value: /Off
Field Name: c1_3[0], Value: /Off
Field Name: c1_3[1], Value: /Off
Field Name: c1_3[2], Value: /Off
Field Name: FilingStatus_ReadOrder[0], Value: 
Field Name: f1_18[0], Value: 
Field Name: c1_4[0], Value: /Off
Field Name: f1_19[0], Value: 
Field Name: c1_5[0], Value: /Off
Field Name: c1_5[1], Value: /Off
Field Name: c1_6[0], Value: /Off
Field Name: c1_