Skip to content

Commit

Permalink
add extract cli command
Browse files Browse the repository at this point in the history
  • Loading branch information
monneyboi committed Jun 21, 2023
1 parent 98a843f commit da33e5f
Show file tree
Hide file tree
Showing 2 changed files with 14 additions and 8 deletions.
10 changes: 4 additions & 6 deletions src/pagestream/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,17 +86,15 @@ def get_embedded_documents(self):

yield pdf

def extract_to(self, path: Path):
    """Write every embedded document into ``path`` as its own PDF file.

    The output folder (including any missing parents) is created first.
    Each document yielded by ``get_embedded_documents()`` is then saved
    under a file name derived from its ``dc:title`` metadata entry.

    Args:
        path: Folder that receives the extracted files.
    """
    path.mkdir(parents=True, exist_ok=True)

    for pdf in self.get_embedded_documents():
        meta = pdf.open_metadata()
        title = meta['dc:title']
        info(f'Extracting {title}')

        # A path separator inside a title would redirect the file into
        # another (possibly non-existent) folder, so neutralise it.
        filename = title.replace('/', '_').replace('\\', '_')
        # Append the extension instead of calling with_suffix(): that
        # method replaces everything after the last dot, so titles such as
        # "Invoice 1.1" and "Invoice 1.2" would both become "Invoice 1.pdf".
        if not filename.lower().endswith('.pdf'):
            filename += '.pdf'
        pdf.save(path / filename)


12 changes: 10 additions & 2 deletions src/pagestream/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

import sys
import click
from pathlib import Path
import logging
from __init__ import PDFPageStream

Expand All @@ -16,8 +17,15 @@ def contents(filename):
pagestream = PDFPageStream(filename)
for pdf in pagestream.get_embedded_documents():
with pdf.open_metadata() as meta:
print(meta['dc:title'])
print("- " + str(len(pdf.pages)) + " pages")
logging.info(meta['dc:title'])
logging.info("- " + str(len(pdf.pages)) + " pages")

@cli.command("extract", help="Extract documents in pagestream to folder")
@click.argument('filename', type=click.Path(exists=True, dir_okay=False))
@click.argument('path', type=click.Path(file_okay=False))
def extract(filename, path):
    """Split the pagestream ``filename`` into separate PDFs under ``path``.

    Args:
        filename: Existing pagestream file to read.
        path: Output folder, handed to ``PDFPageStream.extract_to``.
    """
    # The argument types reject a directory as input and an existing regular
    # file as output, so the user gets a usage error up front instead of a
    # traceback once extraction starts.
    pagestream = PDFPageStream(filename)
    pagestream.extract_to(Path(path))

# Run the click command group only when this file is executed as a script.
if __name__ == "__main__":
    cli()

0 comments on commit da33e5f

Please sign in to comment.