Skip to content

Commit

Permalink
data field and fdf parsing tests, extracts more field data
Browse files Browse the repository at this point in the history
  • Loading branch information
bengolder committed Mar 13, 2016
1 parent 01ddaec commit 24fec0c
Show file tree
Hide file tree
Showing 3 changed files with 61 additions and 33 deletions.
48 changes: 42 additions & 6 deletions src/pdftk_wrapper.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,13 @@
import os
import re
import subprocess
from tempfile import mkstemp


class PdftkError(Exception):
    '''Exception type exported by this module for pdftk-related failures.

    NOTE(review): the raise sites are not visible in this view; presumably
    PDFTKWrapper raises this when a pdftk invocation fails -- confirm
    against run_command.
    '''


class PDFTKWrapper:

def __init__(self, encoding='latin-1', tmp_path=None):
Expand Down Expand Up @@ -43,7 +46,7 @@ def _write_tmp_file(self, file_obj=None, bytestring=None):
self._tmp_files.append(tmp_fp)
return tmp_fp

def _clean_up_tmp_files(self):
def clean_up_tmp_files(self):
if not self._tmp_files:
return
for i in range(len(self._tmp_files)):
Expand Down Expand Up @@ -82,13 +85,46 @@ def get_data_fields(self, fp):
return contents

def parse_fdf_fields(self, fdf_str):
    '''Yield ``(unescaped_name, datum)`` for every field found in FDF text.

    A field is recognised as a ``/V <value>`` line immediately followed by
    a ``/T (<name>)`` line. The pattern can be explored at
    https://regex101.com/r/iL6hW3/5

    Args:
        fdf_str: FDF document text to scan.

    Yields:
        A tuple of the field name with its backslash escapes resolved,
        followed by a dict with these keys:

        * ``name`` -- the field name exactly as written in the FDF
        * ``escaped_name`` -- the name with backslash escapes resolved
          (same value as the first tuple item)
        * ``name_span`` -- ``(start, end)`` of the name within ``fdf_str``
        * ``value_template`` -- the raw text following ``/V``
        * ``value_template_span`` -- ``(start, end)`` of that text
    '''
    field_pattern = re.compile(r'\/V\ (?P<value>.*)\n\/T\ \((?P<name>.*)\)')
    # A backslash protects the character after it, so each escape is
    # replaced by that character: `\(` -> `(` and `\\` -> `\`. Deleting
    # every backslash instead would also delete escaped literal
    # backslashes, and the name would then fail to match the one taken
    # from the data field dump. A dangling trailing backslash is dropped.
    escape_pattern = re.compile(r'\\(.?)')
    for match in field_pattern.finditer(fdf_str):
        raw_name = match.group('name')
        datum = {
            'name': raw_name,
            'escaped_name': escape_pattern.sub(r'\1', raw_name),
            'name_span': match.span('name'),
            'value_template': match.group('value'),
            'value_template_span': match.span('value')
        }
        yield (datum['escaped_name'], datum)

def parse_data_fields(self, data_str):
    '''Yield ``(field_name, datum)`` for each record in a pdftk field dump.

    Parses the text produced by `pdftk dump_data_fields_utf8`. Records are
    delimited by lines consisting solely of `---`; every other non-blank
    line is read as `Key: value`.

    Args:
        data_str: the field dump text.

    Yields:
        A tuple of the record's `FieldName` value followed by a dict that
        maps each key to its stripped value. `FieldStateOption` may repeat
        within a record, so it is always collected into a list.

    Raises:
        ValueError: if a non-blank, non-delimiter line has no `:`.
        KeyError: if a record contains no `FieldName` line.
    '''
    field_opts_key = 'FieldStateOption'
    field_name_key = 'FieldName'
    datum = {}
    for line in data_str.split('\n'):
        stripped = line.strip()
        if not stripped:
            continue
        # Only a line that is exactly `---` ends a record, so values that
        # merely contain `---` are left intact.
        if stripped == '---':
            if datum:
                yield (datum[field_name_key], datum)
                datum = {}
            continue
        # Split on the first colon only: values may contain colons
        # themselves (times, URLs, ...).
        prop_name, separator, value = line.partition(':')
        if not separator:
            raise ValueError(
                'Expected a "Key: value" line, got {!r}'.format(line))
        value = value.strip()
        if prop_name == field_opts_key:
            datum.setdefault(field_opts_key, []).append(value)
        else:
            datum[prop_name] = value
    if datum:
        yield (datum[field_name_key], datum)

def run_command(self, args):
if args[0] != 'pdftk':
Expand Down
19 changes: 16 additions & 3 deletions tests/integration/test_pdftk.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,6 @@
from unittest import TestCase

import subprocess
import io
import os
import codecs
from src.pdftk_wrapper import (
PDFTKWrapper, PdftkError
)
Expand All @@ -18,6 +15,8 @@ def setUp(self):
self.sample_form_path = 'data/sample_pdfs/sample_form.pdf'
self.fdf_str_sample = '%FDF-1.2\n%âãÏÓ\n1 0 obj \n<<\n/FDF \n<<\n/Fields [\n<<\n/V /Yes\n/T (Language 2 Check Box)\n>> \n<<\n/V ()\n/T (Address 2 Text Box)\n>> \n<<\n/V /Off\n/T (Language 3 Check Box)\n>> \n<<\n/V ()\n/T (City Text Box)\n>> \n<<\n/V /Off\n/T (Language 1 Check Box)\n>> \n<<\n/V /Off\n/T (Driving License Check Box)\n>> \n<<\n/V ()\n/T (Given Name Text Box)\n>> \n<<\n/V /Off\n/T (Language 5 Check Box)\n>> \n<<\n/V ()\n/T (House nr Text Box)\n>> \n<<\n/V (150)\n/T (Height Formatted Field)\n>> \n<<\n/V ()\n/T (Family Name Text Box)\n>> \n<<\n/V ()\n/T (Address 1 Text Box)\n>> \n<<\n/V /Off\n/T (Language 4 Check Box)\n>> \n<<\n/V ()\n/T (Postcode Text Box)\n>>]\n>>\n>>\nendobj \ntrailer\n\n<<\n/Root 1 0 R\n>>\n%%EOF\n'
self.data_fields_str_sample = '---\nFieldType: Text\nFieldName: Given Name Text Box\nFieldNameAlt: First name\nFieldFlags: 0\nFieldValue: \nFieldJustification: Left\nFieldMaxLength: 40\n---\nFieldType: Text\nFieldName: Family Name Text Box\nFieldNameAlt: Last name\nFieldFlags: 0\nFieldValue: \nFieldJustification: Left\nFieldMaxLength: 40\n---\nFieldType: Text\nFieldName: Address 1 Text Box\nFieldFlags: 0\nFieldValue: \nFieldJustification: Left\nFieldMaxLength: 40\n---\nFieldType: Text\nFieldName: House nr Text Box\nFieldNameAlt: House and floor\nFieldFlags: 0\nFieldValue: \nFieldJustification: Left\nFieldMaxLength: 20\n---\nFieldType: Text\nFieldName: Address 2 Text Box\nFieldFlags: 0\nFieldValue: \nFieldJustification: Left\nFieldMaxLength: 40\n---\nFieldType: Text\nFieldName: Postcode Text Box\nFieldFlags: 0\nFieldValue: \nFieldJustification: Left\nFieldMaxLength: 20\n---\nFieldType: Text\nFieldName: City Text Box\nFieldFlags: 0\nFieldValue: \nFieldJustification: Left\nFieldMaxLength: 40\n---\nFieldType: Text\nFieldName: Height Formatted Field\nFieldNameAlt: Value from 40 to 250 cm\nFieldFlags: 0\nFieldValue: 150\nFieldValueDefault: 150\nFieldJustification: Left\nFieldMaxLength: 20\n---\nFieldType: Button\nFieldName: Driving License Check Box\nFieldNameAlt: Car driving license\nFieldFlags: 0\nFieldValue: Off\nFieldJustification: Left\nFieldStateOption: Off\nFieldStateOption: Yes\n---\nFieldType: Button\nFieldName: Language 1 Check Box\nFieldFlags: 0\nFieldValue: Off\nFieldJustification: Left\nFieldStateOption: Off\nFieldStateOption: Yes\n---\nFieldType: Button\nFieldName: Language 2 Check Box\nFieldFlags: 0\nFieldValue: Yes\nFieldJustification: Left\nFieldStateOption: Off\nFieldStateOption: Yes\n---\nFieldType: Button\nFieldName: Language 3 Check Box\nFieldFlags: 0\nFieldValue: Off\nFieldJustification: Left\nFieldStateOption: Off\nFieldStateOption: Yes\n---\nFieldType: Button\nFieldName: Language 4 Check Box\nFieldFlags: 0\nFieldValue: Off\nFieldJustification: 
Left\nFieldStateOption: Off\nFieldStateOption: Yes\n---\nFieldType: Button\nFieldName: Language 5 Check Box\nFieldFlags: 0\nFieldValue: Off\nFieldJustification: Left\nFieldStateOption: Off\nFieldStateOption: Yes\n'
self.parsed_fdf_fields = [('Language 2 Check Box', {'name': 'Language 2 Check Box', 'escaped_name': 'Language 2 Check Box', 'value_template': '/Yes', 'name_span': (61, 81), 'value_template_span': (52, 56)}), ('Address 2 Text Box', {'name': 'Address 2 Text Box', 'escaped_name': 'Address 2 Text Box', 'value_template': '()', 'name_span': (100, 118), 'value_template_span': (93, 95)}), ('Language 3 Check Box', {'name': 'Language 3 Check Box', 'escaped_name': 'Language 3 Check Box', 'value_template': '/Off', 'name_span': (139, 159), 'value_template_span': (130, 134)}), ('City Text Box', {'name': 'City Text Box', 'escaped_name': 'City Text Box', 'value_template': '()', 'name_span': (178, 191), 'value_template_span': (171, 173)}), ('Language 1 Check Box', {'name': 'Language 1 Check Box', 'escaped_name': 'Language 1 Check Box', 'value_template': '/Off', 'name_span': (212, 232), 'value_template_span': (203, 207)}), ('Driving License Check Box', {'name': 'Driving License Check Box', 'escaped_name': 'Driving License Check Box', 'value_template': '/Off', 'name_span': (253, 278), 'value_template_span': (244, 248)}), ('Given Name Text Box', {'name': 'Given Name Text Box', 'escaped_name': 'Given Name Text Box', 'value_template': '()', 'name_span': (297, 316), 'value_template_span': (290, 292)}), ('Language 5 Check Box', {'name': 'Language 5 Check Box', 'escaped_name': 'Language 5 Check Box', 'value_template': '/Off', 'name_span': (337, 357), 'value_template_span': (328, 332)}), ('House nr Text Box', {'name': 'House nr Text Box', 'escaped_name': 'House nr Text Box', 'value_template': '()', 'name_span': (376, 393), 'value_template_span': (369, 371)}), ('Height Formatted Field', {'name': 'Height Formatted Field', 'escaped_name': 'Height Formatted Field', 'value_template': '(150)', 'name_span': (415, 437), 'value_template_span': (405, 410)}), ('Family Name Text Box', {'name': 'Family Name Text Box', 'escaped_name': 'Family Name Text Box', 'value_template': '()', 'name_span': (456, 
476), 'value_template_span': (449, 451)}), ('Address 1 Text Box', {'name': 'Address 1 Text Box', 'escaped_name': 'Address 1 Text Box', 'value_template': '()', 'name_span': (495, 513), 'value_template_span': (488, 490)}), ('Language 4 Check Box', {'name': 'Language 4 Check Box', 'escaped_name': 'Language 4 Check Box', 'value_template': '/Off', 'name_span': (534, 554), 'value_template_span': (525, 529)}), ('Postcode Text Box', {'name': 'Postcode Text Box', 'escaped_name': 'Postcode Text Box', 'value_template': '()', 'name_span': (573, 590), 'value_template_span': (566, 568)})]
self.parsed_data_fields = [('Given Name Text Box', {'FieldName': 'Given Name Text Box', 'FieldValue': '', 'FieldType': 'Text', 'FieldJustification': 'Left', 'FieldNameAlt': 'First name', 'FieldFlags': '0', 'FieldMaxLength': '40'}), ('Family Name Text Box', {'FieldName': 'Family Name Text Box', 'FieldValue': '', 'FieldType': 'Text', 'FieldJustification': 'Left', 'FieldNameAlt': 'Last name', 'FieldFlags': '0', 'FieldMaxLength': '40'}), ('Address 1 Text Box', {'FieldName': 'Address 1 Text Box', 'FieldJustification': 'Left', 'FieldType': 'Text', 'FieldFlags': '0', 'FieldValue': '', 'FieldMaxLength': '40'}), ('House nr Text Box', {'FieldName': 'House nr Text Box', 'FieldValue': '', 'FieldType': 'Text', 'FieldJustification': 'Left', 'FieldNameAlt': 'House and floor', 'FieldFlags': '0', 'FieldMaxLength': '20'}), ('Address 2 Text Box', {'FieldName': 'Address 2 Text Box', 'FieldJustification': 'Left', 'FieldType': 'Text', 'FieldFlags': '0', 'FieldValue': '', 'FieldMaxLength': '40'}), ('Postcode Text Box', {'FieldName': 'Postcode Text Box', 'FieldJustification': 'Left', 'FieldType': 'Text', 'FieldFlags': '0', 'FieldValue': '', 'FieldMaxLength': '20'}), ('City Text Box', {'FieldName': 'City Text Box', 'FieldJustification': 'Left', 'FieldType': 'Text', 'FieldFlags': '0', 'FieldValue': '', 'FieldMaxLength': '40'}), ('Height Formatted Field', {'FieldName': 'Height Formatted Field', 'FieldValue': '150', 'FieldType': 'Text', 'FieldJustification': 'Left', 'FieldNameAlt': 'Value from 40 to 250 cm', 'FieldValueDefault': '150', 'FieldFlags': '0', 'FieldMaxLength': '20'}), ('Driving License Check Box', {'FieldName': 'Driving License Check Box', 'FieldValue': 'Off', 'FieldStateOption': ['Off', 'Yes'], 'FieldType': 'Button', 'FieldJustification': 'Left', 'FieldNameAlt': 'Car driving license', 'FieldFlags': '0'}), ('Language 1 Check Box', {'FieldName': 'Language 1 Check Box', 'FieldJustification': 'Left', 'FieldStateOption': ['Off', 'Yes'], 'FieldType': 'Button', 'FieldFlags': '0', 
'FieldValue': 'Off'}), ('Language 2 Check Box', {'FieldName': 'Language 2 Check Box', 'FieldJustification': 'Left', 'FieldStateOption': ['Off', 'Yes'], 'FieldType': 'Button', 'FieldFlags': '0', 'FieldValue': 'Yes'}), ('Language 3 Check Box', {'FieldName': 'Language 3 Check Box', 'FieldJustification': 'Left', 'FieldStateOption': ['Off', 'Yes'], 'FieldType': 'Button', 'FieldFlags': '0', 'FieldValue': 'Off'}), ('Language 4 Check Box', {'FieldName': 'Language 4 Check Box', 'FieldJustification': 'Left', 'FieldStateOption': ['Off', 'Yes'], 'FieldType': 'Button', 'FieldFlags': '0', 'FieldValue': 'Off'}), ('Language 5 Check Box', {'FieldName': 'Language 5 Check Box', 'FieldJustification': 'Left', 'FieldStateOption': ['Off', 'Yes'], 'FieldType': 'Button', 'FieldFlags': '0', 'FieldValue': 'Off'})]

def test_pdftk_errors(self):
pdftk = PDFTKWrapper()
Expand All @@ -37,11 +36,25 @@ def test_get_fdf(self):
pdftk = PDFTKWrapper()
results = pdftk.get_fdf(self.sample_form_path)
self.assertEqual(results, self.fdf_str_sample)
pdftk.clean_up_tmp_files()

def test_get_data_fields(self):
    '''get_data_fields on the sample form returns the expected dump text.'''
    pdftk = PDFTKWrapper()
    # Registered up front so the wrapper's temp files are removed even
    # when get_data_fields raises or the comparison below fails.
    self.addCleanup(pdftk.clean_up_tmp_files)
    results = pdftk.get_data_fields(self.sample_form_path)
    self.assertEqual(results, self.data_fields_str_sample)

def test_parse_fdf(self):
    '''parse_fdf_fields yields the expected pairs for the sample FDF.'''
    wrapper = PDFTKWrapper()
    parsed_fields = wrapper.parse_fdf_fields(self.fdf_str_sample)
    actual = list(parsed_fields)
    self.assertListEqual(actual, self.parsed_fdf_fields)

def test_parse_data_fields(self):
    '''parse_data_fields yields the expected pairs for the sample dump.'''
    wrapper = PDFTKWrapper()
    parsed_fields = wrapper.parse_data_fields(self.data_fields_str_sample)
    actual = list(parsed_fields)
    self.assertListEqual(actual, self.parsed_data_fields)




Expand Down
27 changes: 3 additions & 24 deletions tests/unit/test_pdftk.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
from unittest import TestCase
from unittest.mock import Mock, patch

from src.pdftk_wrapper import PDFTKWrapper, PdftkError


class TestPDFTK(TestCase):

def test_init(self):
Expand Down Expand Up @@ -33,27 +33,6 @@ def test_get_fdf(self):
contents_getter.assert_called_once_with(
'tmp_path.fdf', decode=True)

def test_parse_fdf_fields(self):
pdftk = PDFTKWrapper()
fdf_sample = """
Something
"""
fields = list(pdftk.parse_fdf_fields(fdf_sample))

def test_parse_xfdf_fields(self):
pdftk = PDFTKWrapper()
xfdf_sample = """
Something
"""
fields = list(pdftk.parse_xfdf_fields(xfdf_sample))

def test_parse_data_fields(self):
pdftk = PDFTKWrapper()
data_fields_sample = """
Something
"""
fields = list(pdftk.parse_data_fields(data_fields_sample))

@patch('src.pdftk_wrapper.subprocess')
def test_run_command(self, subprocess):
pdftk = PDFTKWrapper()
Expand Down Expand Up @@ -121,13 +100,13 @@ def test_clean_up_tmp_files(self, remove):
pdftk = PDFTKWrapper()
paths = [c for c in 'hello']
pdftk._tmp_files = paths
pdftk._clean_up_tmp_files()
pdftk.clean_up_tmp_files()
for p in paths:
remove.assert_any_call(p)
self.assertListEqual(pdftk._tmp_files, [])
# test with no files
remove.reset_mock()
pdftk._clean_up_tmp_files()
pdftk.clean_up_tmp_files()
remove.assert_not_called()

def test_coerce_to_file_path(self):
Expand Down

0 comments on commit 24fec0c

Please sign in to comment.