add extract cli command

followthemoney · Jun 21, 2023 · da33e5f
1 parent 98a843f
commit da33e5f
Show file tree

Hide file tree

Showing 2 changed files with 14 additions and 8 deletions.
diff --git a/src/pagestream/__init__.py b/src/pagestream/__init__.py
@@ -86,17 +86,15 @@ def get_embedded_documents(self):
 
                 yield pdf
 
-    def extract_to(self, output_path):
+    def extract_to(self, path: Path):
         """Output split files into path"""
-        if not output_path.exists():
-            output_path.mkdir(parents=True)
+        path.mkdir(parents=True, exist_ok=True)
 
         for pdf in self.get_embedded_documents():
             meta = pdf.open_metadata()
             title = meta['dc:title']
 
-            path = output_path.joinpath(title).with_suffix('.pdf')
-            info(f'Extracting {title} to {path}')
-            pdf.save(path)
+            info(f'Extracting {title}')
+            pdf.save(path.joinpath(title).with_suffix('.pdf'))
 
 
diff --git a/src/pagestream/cli.py b/src/pagestream/cli.py
@@ -2,6 +2,7 @@
 
 import sys
 import click
+from pathlib import Path
 import logging
 from __init__ import PDFPageStream
 
@@ -16,8 +17,15 @@ def contents(filename):
     pagestream = PDFPageStream(filename)
     for pdf in pagestream.get_embedded_documents():
         with pdf.open_metadata() as meta:
-            print(meta['dc:title'])
-        print("- " + str(len(pdf.pages)) + " pages")
+            logging.info(meta['dc:title'])
+        logging.info("- " + str(len(pdf.pages)) + " pages")
+
+@cli.command("extract", help="Extract documents in pagestream to folder")
+@click.argument('filename', type=click.Path(exists=True))
+@click.argument('path', type=click.Path())
+def extract(filename, path):
+    pagestream = PDFPageStream(filename)
+    pagestream.extract_to(Path(path))
 
 if __name__ == "__main__":
     cli()