
Merge f96e7f7 into 05b7bcd
erikarvstedt committed May 21, 2018
2 parents 05b7bcd + f96e7f7 commit c83410d
Showing 7 changed files with 174 additions and 119 deletions.
29 changes: 15 additions & 14 deletions docs/utilities.rst
@@ -49,17 +49,18 @@ The Consumer
------------

The consumer script runs in an infinite loop, constantly looking at a directory
for PDF files to parse and index. The process is pretty straightforward:
for documents to parse and index. The process is pretty straightforward:

1. Look in ``CONSUMPTION_DIR`` for a PDF. If one is found, go to #2. If not,
wait 10 seconds and try again.
2. Parse the PDF with Tesseract
1. Look in ``CONSUMPTION_DIR`` for a document. If one is found, go to #2.
If not, wait 10 seconds and try again. On Linux, new documents are detected
instantly via inotify, so there's no waiting involved.
2. Parse the document with Tesseract
3. Create a new record in the database with the OCR'd text
4. Attempt to automatically assign document attributes by doing some guesswork.
Read up on the :ref:`guesswork documentation<guesswork>` for more
information about this process.
5. Encrypt the PDF and store it in the ``media`` directory under
``documents/pdf``.
5. Encrypt the document and store it in the ``media`` directory under
``documents/originals``.
6. Go to #1.
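
For orientation, here is a minimal sketch of the loop these steps describe. The paths and the ``consume()`` helper are illustrative stand-ins; the real logic lives in ``src/documents/consumer.py``, which is reworked further down in this commit::

    import os
    import time

    CONSUMPTION_DIR = "/path/to/consume"  # normally taken from PAPERLESS_CONSUMPTION_DIR

    def consume(path):
        # Stand-in for steps 2-5: parse, store, encrypt, then delete the file.
        print("would consume", path)

    while True:                                    # step 6: start over
        for name in os.listdir(CONSUMPTION_DIR):   # step 1: look for documents
            path = os.path.join(CONSUMPTION_DIR, name)
            if os.path.isfile(path):
                consume(path)
        time.sleep(10)  # polling fallback; on Linux inotify removes this wait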


@@ -74,8 +75,8 @@ The consumer is started via the ``manage.py`` script:
$ /path/to/paperless/src/manage.py document_consumer
This starts the service that will run in a loop, consuming PDF files as they
appear in ``CONSUMPTION_DIR``.
This starts the service that will consume documents as they appear in
``CONSUMPTION_DIR``.

Note that this command runs continuously, so exiting it will mean your consumer
disappears. If you want to run this full-time (which is kind of the point)
@@ -97,8 +98,8 @@ The Exporter
------------

Tired of fiddling with Paperless, or just want to do something stupid and are
afraid of accidentally damaging your files? You can export all of your PDFs
into neatly named, dated, and unencrypted.
afraid of accidentally damaging your files? You can export all of your
documents into neatly named, dated, and unencrypted files.


.. _utilities-exporter-howto:
@@ -112,10 +113,10 @@ This too is done via the ``manage.py`` script:
$ /path/to/paperless/src/manage.py document_exporter /path/to/somewhere/
This will dump all of your unencrypted PDFs into ``/path/to/somewhere`` for you
to do with as you please. The files are accompanied with a special file,
``manifest.json`` which can be used to
:ref:`import the files <utilities-importer>` at a later date if you wish.
This will dump all of your unencrypted documents into ``/path/to/somewhere``
for you to do with as you please. The files are accompanied with a special
file, ``manifest.json`` which can be used to :ref:`import the files
<utilities-importer>` at a later date if you wish.
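
As a hypothetical follow-up, the manifest can be inspected with a few lines of Python. Its exact schema is not shown in this excerpt, so the snippet only reports the structure it finds::

    import json

    with open("/path/to/somewhere/manifest.json") as f:
        manifest = json.load(f)

    # Report what the exporter wrote without assuming a particular schema.
    print(type(manifest).__name__, len(manifest))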


.. _utilities-exporter-howto-docker:
2 changes: 2 additions & 0 deletions paperless.conf.example
@@ -165,6 +165,8 @@ PAPERLESS_PASSPHRASE="secret"
#PAPERLESS_CONVERT_DENSITY=300


# (This setting is ignored on Linux where inotify is used instead of a
# polling loop.)
# The number of seconds that Paperless will wait between checking
# PAPERLESS_CONSUMPTION_DIR. If you tend to write documents to this directory
# rarely, you may want to use a higher value than the default (10).
1 change: 1 addition & 0 deletions requirements.txt
@@ -20,6 +20,7 @@ flake8==3.5.0
fuzzywuzzy==0.15.0
gunicorn==19.8.1
idna==2.6
inotify_simple==1.1.7; sys_platform == 'linux'
langdetect==1.0.7
mccabe==0.6.1
more-itertools==4.1.0
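The new ``inotify_simple`` dependency (installed only on Linux) is what allows the consumer to react to new files immediately instead of polling. A standalone sketch of such a watch, with an illustrative path and flag set that is not necessarily what Paperless itself uses::

    from inotify_simple import INotify, flags

    inotify = INotify()
    # Fire when a file finishes writing or is moved into the directory.
    inotify.add_watch("/path/to/consume", flags.CLOSE_WRITE | flags.MOVED_TO)

    while True:
        for event in inotify.read():  # blocks until at least one event arrives
            print("new file:", event.name)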
164 changes: 83 additions & 81 deletions src/documents/consumer.py
@@ -3,8 +3,10 @@
import logging
import os
import re
import time
import uuid

from operator import itemgetter
from django.conf import settings
from django.utils import timezone
from paperless.db import GnuPG
@@ -32,21 +34,21 @@ class Consumer:
5. Delete the document and image(s)
"""

# Files are considered ready for consumption if they have been unmodified
# for this duration
FILES_MIN_UNMODIFIED_DURATION = 0.5

def __init__(self, consume=settings.CONSUMPTION_DIR,
scratch=settings.SCRATCH_DIR):

self.logger = logging.getLogger(__name__)
self.logging_group = None

self.stats = {}
self._ignore = []
self.consume = consume
self.scratch = scratch

try:
os.makedirs(self.scratch)
except FileExistsError:
pass
os.makedirs(self.scratch, exist_ok=True)

if not self.consume:
raise ConsumerError(
@@ -73,83 +75,99 @@ def log(self, level, message):
"group": self.logging_group
})

def run(self):

for doc in os.listdir(self.consume):

doc = os.path.join(self.consume, doc)
def consume_new_files(self):
"""
Find non-ignored files in consumption dir and consume them if they have
been unmodified for FILES_MIN_UNMODIFIED_DURATION.
"""
ignored_files = []
files = []
for entry in os.scandir(self.consume):
if entry.is_file():
file = (entry.path, entry.stat().st_mtime)
if file in self._ignore:
ignored_files.append(file)
else:
files.append(file)

if not os.path.isfile(doc):
continue
if not files:
return

if not re.match(FileInfo.REGEXES["title"], doc):
continue
# Set _ignore to only include files that still exist.
# This keeps it from growing indefinitely.
self._ignore[:] = ignored_files

if doc in self._ignore:
continue
files_old_to_new = sorted(files, key=itemgetter(1))

if not self._is_ready(doc):
continue
time.sleep(self.FILES_MIN_UNMODIFIED_DURATION)

if self._is_duplicate(doc):
self.log(
"info",
"Skipping {} as it appears to be a duplicate".format(doc)
)
self._ignore.append(doc)
continue
for file, mtime in files_old_to_new:
if mtime == os.path.getmtime(file):
# File has not been modified and can be consumed
if not self.try_consume_file(file):
self._ignore.append((file, mtime))

parser_class = self._get_parser_class(doc)
if not parser_class:
self.log(
"error", "No parsers could be found for {}".format(doc))
self._ignore.append(doc)
continue
def try_consume_file(self, file):
"Return True if file was consumed"

self.logging_group = uuid.uuid4()
if not re.match(FileInfo.REGEXES["title"], file):
return False

self.log("info", "Consuming {}".format(doc))
doc = file

document_consumption_started.send(
sender=self.__class__,
filename=doc,
logging_group=self.logging_group
if self._is_duplicate(doc):
self.log(
"info",
"Skipping {} as it appears to be a duplicate".format(doc)
)
return False

parsed_document = parser_class(doc)

try:
thumbnail = parsed_document.get_thumbnail()
date = parsed_document.get_date()
document = self._store(
parsed_document.get_text(),
doc,
thumbnail,
date
)
except ParseError as e:
parser_class = self._get_parser_class(doc)
if not parser_class:
self.log(
"error", "No parsers could be found for {}".format(doc))
return False

self._ignore.append(doc)
self.log("error", "PARSE FAILURE for {}: {}".format(doc, e))
parsed_document.cleanup()
self.logging_group = uuid.uuid4()

continue
self.log("info", "Consuming {}".format(doc))

else:
document_consumption_started.send(
sender=self.__class__,
filename=doc,
logging_group=self.logging_group
)

parsed_document.cleanup()
self._cleanup_doc(doc)
parsed_document = parser_class(doc)

self.log(
"info",
"Document {} consumption finished".format(document)
)
try:
thumbnail = parsed_document.get_thumbnail()
date = parsed_document.get_date()
document = self._store(
parsed_document.get_text(),
doc,
thumbnail,
date
)
except ParseError as e:
self.log("error", "PARSE FAILURE for {}: {}".format(doc, e))
parsed_document.cleanup()
return False
else:
parsed_document.cleanup()
self._cleanup_doc(doc)

self.log(
"info",
"Document {} consumption finished".format(document)
)

document_consumption_finished.send(
sender=self.__class__,
document=document,
logging_group=self.logging_group
)
document_consumption_finished.send(
sender=self.__class__,
document=document,
logging_group=self.logging_group
)
return True

def _get_parser_class(self, doc):
"""
@@ -224,22 +242,6 @@ def _cleanup_doc(self, doc):
self.log("debug", "Deleting document {}".format(doc))
os.unlink(doc)

def _is_ready(self, doc):
"""
Detect whether ``doc`` is ready to consume or if it's still being
written to by the uploader.
"""

t = os.stat(doc).st_mtime

if self.stats.get(doc) == t:
del(self.stats[doc])
return True

self.stats[doc] = t

return False

@staticmethod
def _is_duplicate(doc):
with open(doc, "rb") as f:
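To show how the new entry point fits together, here is a hypothetical driver loop; the actual ``document_consumer`` management command is not part of the excerpt above, so treat this as an assumption about how ``consume_new_files()`` would be called::

    import time

    consumer = Consumer()  # assumes Django settings are already configured

    while True:
        # consume_new_files() scans once, waits FILES_MIN_UNMODIFIED_DURATION,
        # then consumes every file whose mtime stayed stable during that window.
        consumer.consume_new_files()
        time.sleep(10)  # on Linux this wait can be replaced by an inotify read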
13 changes: 9 additions & 4 deletions src/documents/mail.py
@@ -20,7 +20,7 @@ class MailFetcherError(Exception):
pass


class InvalidMessageError(Exception):
class InvalidMessageError(MailFetcherError):
pass


@@ -75,6 +75,9 @@ def __init__(self, data, group=None):
continue

dispositions = content_disposition.strip().split(";")
if len(dispositions) < 2:
continue

if not dispositions[0].lower() == "attachment" and \
"filename" not in dispositions[1].lower():
continue
@@ -159,8 +162,10 @@ def __init__(self, consume=settings.CONSUMPTION_DIR):
self._inbox = os.getenv("PAPERLESS_CONSUME_MAIL_INBOX", "INBOX")

self._enabled = bool(self._host)
if self._enabled and Message.SECRET is None:
raise MailFetcherError("No PAPERLESS_EMAIL_SECRET defined")

self.last_checked = datetime.datetime.now()
self.last_checked = time.time()
self.consume = consume

def pull(self):
@@ -187,7 +192,7 @@ def pull(self):
f.write(message.attachment.data)
os.utime(file_name, times=(t, t))

self.last_checked = datetime.datetime.now()
self.last_checked = time.time()

def _get_messages(self):

@@ -205,7 +210,7 @@ def _get_messages(self):
self._connection.close()
self._connection.logout()

except Exception as e:
except MailFetcherError as e:
self.log("error", str(e))

return r
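For context on the new ``len(dispositions) < 2`` guard in the attachment handling above: a ``Content-Disposition`` header without parameters splits into a single element, so indexing ``dispositions[1]`` could raise an ``IndexError``. A quick illustration::

    header = "inline"                                 # no filename parameter
    dispositions = header.strip().split(";")
    print(dispositions)                               # ['inline']

    header = 'attachment; filename="scan.pdf"'
    dispositions = header.strip().split(";")
    print(dispositions[0], dispositions[1])           # attachment  filename="scan.pdf"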
