
Merge f96e7f7 into 05b7bcd
erikarvstedt committed May 21, 2018
2 parents 05b7bcd + f96e7f7 commit c83410d
Showing 7 changed files with 174 additions and 119 deletions.
29 changes: 15 additions & 14 deletions docs/utilities.rst
@@ -49,17 +49,18 @@ The Consumer
------------

The consumer script runs in an infinite loop, constantly looking at a directory
for PDF files to parse and index. The process is pretty straightforward:
for documents to parse and index. The process is pretty straightforward:

1. Look in ``CONSUMPTION_DIR`` for a PDF. If one is found, go to #2. If not,
wait 10 seconds and try again.
2. Parse the PDF with Tesseract
1. Look in ``CONSUMPTION_DIR`` for a document. If one is found, go to #2.
If not, wait 10 seconds and try again. On Linux, new documents are detected
instantly via inotify, so there's no waiting involved.
2. Parse the document with Tesseract
3. Create a new record in the database with the OCR'd text
4. Attempt to automatically assign document attributes by doing some guesswork.
Read up on the :ref:`guesswork documentation<guesswork>` for more
information about this process.
5. Encrypt the PDF and store it in the ``media`` directory under
``documents/pdf``.
5. Encrypt the document and store it in the ``media`` directory under
``documents/originals``.
6. Go to #1.
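
For orientation, here is a minimal sketch of the loop these steps describe. The paths and the ``consume()`` helper are illustrative stand-ins; the real logic lives in ``src/documents/consumer.py``, which is reworked further down in this commit::

    import os
    import time

    CONSUMPTION_DIR = "/path/to/consume"  # normally taken from PAPERLESS_CONSUMPTION_DIR

    def consume(path):
        # Stand-in for steps 2-5: parse, store, encrypt, then delete the file.
        print("would consume", path)

    while True:                                    # step 6: start over
        for name in os.listdir(CONSUMPTION_DIR):   # step 1: look for documents
            path = os.path.join(CONSUMPTION_DIR, name)
            if os.path.isfile(path):
                consume(path)
        time.sleep(10)  # polling fallback; on Linux inotify removes this wait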


@@ -74,8 +75,8 @@ The consumer is started via the ``manage.py`` script:
$ /path/to/paperless/src/manage.py document_consumer
This starts the service that will run in a loop, consuming PDF files as they
appear in ``CONSUMPTION_DIR``.
This starts the service that will consume documents as they appear in
``CONSUMPTION_DIR``.

Note that this command runs continuously, so exiting it will mean your consumer
disappears. If you want to run this full-time (which is kind of the point)
@@ -97,8 +98,8 @@ The Exporter
------------

Tired of fiddling with Paperless, or just want to do something stupid and are
afraid of accidentally damaging your files? You can export all of your PDFs
into neatly named, dated, and unencrypted.
afraid of accidentally damaging your files? You can export all of your
documents into neatly named, dated, and unencrypted files.


.. _utilities-exporter-howto:
@@ -112,10 +113,10 @@ This too is done via the ``manage.py`` script:
$ /path/to/paperless/src/manage.py document_exporter /path/to/somewhere/
This will dump all of your unencrypted PDFs into ``/path/to/somewhere`` for you
to do with as you please. The files are accompanied with a special file,
``manifest.json`` which can be used to
:ref:`import the files <utilities-importer>` at a later date if you wish.
This will dump all of your unencrypted documents into ``/path/to/somewhere``
for you to do with as you please. The files are accompanied with a special
file, ``manifest.json`` which can be used to :ref:`import the files
<utilities-importer>` at a later date if you wish.
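
As a hypothetical follow-up, the manifest can be inspected with a few lines of Python. Its exact schema is not shown in this excerpt, so the snippet only reports the structure it finds::

    import json

    with open("/path/to/somewhere/manifest.json") as f:
        manifest = json.load(f)

    # Report what the exporter wrote without assuming a particular schema.
    print(type(manifest).__name__, len(manifest))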


.. _utilities-exporter-howto-docker:
2 changes: 2 additions & 0 deletions paperless.conf.example
@@ -165,6 +165,8 @@ PAPERLESS_PASSPHRASE="secret"
#PAPERLESS_CONVERT_DENSITY=300


# (This setting is ignored on Linux where inotify is used instead of a
# polling loop.)
# The number of seconds that Paperless will wait between checking
# PAPERLESS_CONSUMPTION_DIR. If you tend to write documents to this directory
# rarely, you may want to use a higher value than the default (10).
1 change: 1 addition & 0 deletions requirements.txt
@@ -20,6 +20,7 @@ flake8==3.5.0
fuzzywuzzy==0.15.0
gunicorn==19.8.1
idna==2.6
inotify_simple==1.1.7; sys_platform == 'linux'
langdetect==1.0.7
mccabe==0.6.1
more-itertools==4.1.0
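The new ``inotify_simple`` dependency (installed only on Linux) is what allows the consumer to react to new files immediately instead of polling. A standalone sketch of such a watch, with an illustrative path and flag set that is not necessarily what Paperless itself uses::

    from inotify_simple import INotify, flags

    inotify = INotify()
    # Fire when a file finishes writing or is moved into the directory.
    inotify.add_watch("/path/to/consume", flags.CLOSE_WRITE | flags.MOVED_TO)

    while True:
        for event in inotify.read():  # blocks until at least one event arrives
            print("new file:", event.name)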
164 changes: 83 additions & 81 deletions src/documents/consumer.py
@@ -3,8 +3,10 @@
import logging
import os
import re
import time
import uuid

from operator import itemgetter
from django.conf import settings
from django.utils import timezone
from paperless.db import GnuPG
@@ -32,21 +34,21 @@ class Consumer:
5. Delete the document and image(s)
"""

# Files are considered ready for consumption if they have been unmodified
# for this duration
FILES_MIN_UNMODIFIED_DURATION = 0.5

def __init__(self, consume=settings.CONSUMPTION_DIR,
scratch=settings.SCRATCH_DIR):

self.logger = logging.getLogger(__name__)
self.logging_group = None

self.stats = {}
self._ignore = []
self.consume = consume
self.scratch = scratch

try:
os.makedirs(self.scratch)
except FileExistsError:
pass
os.makedirs(self.scratch, exist_ok=True)

if not self.consume:
raise ConsumerError(
@@ -73,83 +75,99 @@ def log(self, level, message):
"group": self.logging_group
})

def run(self):

for doc in os.listdir(self.consume):

doc = os.path.join(self.consume, doc)
def consume_new_files(self):
"""
Find non-ignored files in consumption dir and consume them if they have
been unmodified for FILES_MIN_UNMODIFIED_DURATION.
"""
ignored_files = []
files = []
for entry in os.scandir(self.consume):
if entry.is_file():
file = (entry.path, entry.stat().st_mtime)
if file in self._ignore:
ignored_files.append(file)
else:
files.append(file)

if not os.path.isfile(doc):
continue
if not files:
return

if not re.match(FileInfo.REGEXES["title"], doc):
continue
# Set _ignore to only include files that still exist.
# This keeps it from growing indefinitely.
self._ignore[:] = ignored_files

if doc in self._ignore:
continue
files_old_to_new = sorted(files, key=itemgetter(1))

if not self._is_ready(doc):
continue
time.sleep(self.FILES_MIN_UNMODIFIED_DURATION)

if self._is_duplicate(doc):
self.log(
"info",
"Skipping {} as it appears to be a duplicate".format(doc)
)
self._ignore.append(doc)
continue
for file, mtime in files_old_to_new:
if mtime == os.path.getmtime(file):
# File has not been modified and can be consumed
if not self.try_consume_file(file):
self._ignore.append((file, mtime))

parser_class = self._get_parser_class(doc)
if not parser_class:
self.log(
"error", "No parsers could be found for {}".format(doc))
self._ignore.append(doc)
continue
def try_consume_file(self, file):
"Return True if file was consumed"

self.logging_group = uuid.uuid4()
if not re.match(FileInfo.REGEXES["title"], file):
return False

self.log("info", "Consuming {}".format(doc))
doc = file

document_consumption_started.send(
sender=self.__class__,
filename=doc,
logging_group=self.logging_group
if self._is_duplicate(doc):
self.log(
"info",
"Skipping {} as it appears to be a duplicate".format(doc)
)
return False

parsed_document = parser_class(doc)

try:
thumbnail = parsed_document.get_thumbnail()
date = parsed_document.get_date()
document = self._store(
parsed_document.get_text(),
doc,
thumbnail,
date
)
except ParseError as e:
parser_class = self._get_parser_class(doc)
if not parser_class:
self.log(
"error", "No parsers could be found for {}".format(doc))
return False

self._ignore.append(doc)
self.log("error", "PARSE FAILURE for {}: {}".format(doc, e))
parsed_document.cleanup()
self.logging_group = uuid.uuid4()

continue
self.log("info", "Consuming {}".format(doc))

else:
document_consumption_started.send(
sender=self.__class__,
filename=doc,
logging_group=self.logging_group
)

parsed_document.cleanup()
self._cleanup_doc(doc)
parsed_document = parser_class(doc)

self.log(
"info",
"Document {} consumption finished".format(document)
)
try:
thumbnail = parsed_document.get_thumbnail()
date = parsed_document.get_date()
document = self._store(
parsed_document.get_text(),
doc,
thumbnail,
date
)
except ParseError as e:
self.log("error", "PARSE FAILURE for {}: {}".format(doc, e))
parsed_document.cleanup()
return False
else:
parsed_document.cleanup()
self._cleanup_doc(doc)

self.log(
"info",
"Document {} consumption finished".format(document)
)

document_consumption_finished.send(
sender=self.__class__,
document=document,
logging_group=self.logging_group
)
document_consumption_finished.send(
sender=self.__class__,
document=document,
logging_group=self.logging_group
)
return True

def _get_parser_class(self, doc):
"""
@@ -224,22 +242,6 @@ def _cleanup_doc(self, doc):
self.log("debug", "Deleting document {}".format(doc))
os.unlink(doc)

def _is_ready(self, doc):
"""
Detect whether ``doc`` is ready to consume or if it's still being
written to by the uploader.
"""

t = os.stat(doc).st_mtime

if self.stats.get(doc) == t:
del(self.stats[doc])
return True

self.stats[doc] = t

return False

@staticmethod
def _is_duplicate(doc):
with open(doc, "rb") as f:
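To show how the new entry point fits together, here is a hypothetical driver loop; the actual ``document_consumer`` management command is not part of the excerpt above, so treat this as an assumption about how ``consume_new_files()`` would be called::

    import time

    consumer = Consumer()  # assumes Django settings are already configured

    while True:
        # consume_new_files() scans once, waits FILES_MIN_UNMODIFIED_DURATION,
        # then consumes every file whose mtime stayed stable during that window.
        consumer.consume_new_files()
        time.sleep(10)  # on Linux this wait can be replaced by an inotify read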
13 changes: 9 additions & 4 deletions src/documents/mail.py
@@ -20,7 +20,7 @@ class MailFetcherError(Exception):
pass


class InvalidMessageError(Exception):
class InvalidMessageError(MailFetcherError):
pass


@@ -75,6 +75,9 @@ def __init__(self, data, group=None):
continue

dispositions = content_disposition.strip().split(";")
if len(dispositions) < 2:
continue

if not dispositions[0].lower() == "attachment" and \
"filename" not in dispositions[1].lower():
continue
@@ -159,8 +162,10 @@ def __init__(self, consume=settings.CONSUMPTION_DIR):
self._inbox = os.getenv("PAPERLESS_CONSUME_MAIL_INBOX", "INBOX")

self._enabled = bool(self._host)
if self._enabled and Message.SECRET is None:
raise MailFetcherError("No PAPERLESS_EMAIL_SECRET defined")

self.last_checked = datetime.datetime.now()
self.last_checked = time.time()
self.consume = consume

def pull(self):
@@ -187,7 +192,7 @@ def pull(self):
f.write(message.attachment.data)
os.utime(file_name, times=(t, t))

self.last_checked = datetime.datetime.now()
self.last_checked = time.time()

def _get_messages(self):

@@ -205,7 +210,7 @@ def _get_messages(self):
self._connection.close()
self._connection.logout()

except Exception as e:
except MailFetcherError as e:
self.log("error", str(e))

return r
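For context on the new ``len(dispositions) < 2`` guard in the attachment handling above: a ``Content-Disposition`` header without parameters splits into a single element, so indexing ``dispositions[1]`` could raise an ``IndexError``. A quick illustration::

    header = "inline"                                 # no filename parameter
    dispositions = header.strip().split(";")
    print(dispositions)                               # ['inline']

    header = 'attachment; filename="scan.pdf"'
    dispositions = header.strip().split(";")
    print(dispositions[0], dispositions[1])           # attachment  filename="scan.pdf"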
