Commit

Conflict
palewire committed Apr 25, 2017
2 parents 70c893f + e2b1753 commit f529339
Showing 8 changed files with 129 additions and 73 deletions.
5 changes: 2 additions & 3 deletions .travis.yml
@@ -15,9 +15,8 @@ env:
- AWS_ACCESS_KEY_ID="MOCK_ACCESS_KEY_ID"
- AWS_SECRET_ACCESS_KEY="MOCK_SECRET_ACCESS_KEY"
matrix:
- DJANGO_VERSION=1.8.17
- DJANGO_VERSION=1.9.12
- DJANGO_VERSION=1.10.6
- DJANGO_VERSION=1.8.18
- DJANGO_VERSION=1.10.7
- DJANGO_VERSION=1.11

install:
6 changes: 5 additions & 1 deletion Makefile
@@ -1,4 +1,8 @@
.PHONY: ship
.PHONY: test ship

test:
flake8 bakery
python setup.py test

ship:
python setup.py sdist bdist_wheel
2 changes: 2 additions & 0 deletions bakery/management/commands/__init__.py
@@ -45,6 +45,8 @@ def get_all_objects_in_bucket(
Little utility method that handles pagination and returns
all objects in given bucket.
"""
logger.debug("Retrieving bucket object list")

if not s3_client:
s3_client, s3_resource = get_s3_client()

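The docstring above notes that get_all_objects_in_bucket pages through the bucket listing, since S3 returns at most 1,000 keys per request. As a rough illustration only — not the helper's actual body, which isn't shown in this hunk — here is one way such pagination is commonly written with boto3's paginator:

    import boto3

    def list_all_objects(bucket_name, s3_client=None):
        """Return a dict mapping every key in the bucket to its metadata."""
        s3_client = s3_client or boto3.client("s3")
        obj_dict = {}
        # list_objects_v2 returns at most 1,000 keys per call, so page through
        paginator = s3_client.get_paginator("list_objects_v2")
        for page in paginator.paginate(Bucket=bucket_name):
            for obj in page.get("Contents", []):
                obj_dict[obj["Key"]] = obj
        return obj_dict
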
10 changes: 5 additions & 5 deletions bakery/management/commands/build.py
@@ -58,11 +58,11 @@ def add_arguments(self, parser):
help="Skip collecting the media files when building."
)
parser.add_argument(
"--no-pooling",
"--pooling",
action="store_true",
dest="no_pooling",
dest="pooling",
default=False,
help=("Run builds one by one rather than pooling them to run concurrently.")
help=("Pool builds to run concurrently rather than running them one by one.")
)

def handle(self, *args, **options):
@@ -115,7 +115,7 @@ def set_options(self, *args, **options):
raise CommandError(self.views_unconfig_msg)
self.view_list = settings.BAKERY_VIEWS

self.no_pooling = options.get('no_pooling')
self.pooling = options.get('pooling')

def init_build_dir(self):
"""
@@ -219,7 +219,7 @@ def copytree_and_gzip(self, source_dir, target_dir):
build_list.append((source_path, target_path))

# Build em all
if getattr(self, 'no_pooling', False):
if not getattr(self, 'pooling', False):
[self.copyfile_and_gzip(*u) for u in build_list]
else:
cpu_count = multiprocessing.cpu_count()
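With this hunk the build command copies and gzips files one by one by default and offers an opt-in --pooling flag for concurrent copying. A minimal usage sketch, mirroring how the test suite drives management commands; the keyword form follows dest="pooling" in the diff above:

    from django.core.management import call_command

    # Default behavior: copy the built files serially
    call_command("build")

    # Opt in to pooling the copy step across all available CPUs
    call_command("build", pooling=True)

The command-line equivalent of the second call is python manage.py build --pooling.
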
153 changes: 100 additions & 53 deletions bakery/management/commands/publish.py
@@ -3,6 +3,7 @@
import hashlib
import logging
import mimetypes
import multiprocessing
from django.conf import settings
from multiprocessing.pool import ThreadPool
from bakery import DEFAULT_GZIP_CONTENT_TYPES
@@ -86,30 +87,53 @@ def handle(self, *args, **options):
self.set_options(options)

# Initialize the boto connection
logger.debug("Connecting to s3")
if self.verbosity > 2:
self.stdout.write("Connecting to s3")
self.s3_client, self.s3_resource = get_s3_client()

# Grab our bucket
logger.debug("Retriving bucket {}".format(self.aws_bucket_name))
if self.verbosity > 2:
self.stdout.write("Retriving bucket {}".format(self.aws_bucket_name))
self.bucket = self.s3_resource.Bucket(self.aws_bucket_name)

# Get a list of all keys in our s3 bucket
self.s3_obj_dict = self.get_all_objects_in_bucket(
self.aws_bucket_name,
self.s3_client
)
# Get a list of all keys in our s3 bucket ...
# ... unless this is a case where we're blindly pushing everything
if self.force_publish and self.no_delete:
self.blind_upload = True
logger.debug("Skipping object retrieval. We won't need to because we're blinding uploading everything.")
self.s3_obj_dict = {}
else:
self.blind_upload = False
logger.debug("Retrieving objects now published in bucket")
if self.verbosity > 2:
self.stdout.write("Retrieving objects now published in bucket")
self.s3_obj_dict = self.get_all_objects_in_bucket(
self.aws_bucket_name,
self.s3_client
)

# Get a list of all the local files in our build directory
logger.debug("Retrieving files built locally")
if self.verbosity > 2:
self.stdout.write("Retrieving files built locally")
self.local_file_list = self.get_local_file_list()

# Sync the two
# Sync local files with s3 bucket
logger.debug("Syncing local files with bucket")
if self.verbosity > 2:
self.stdout.write("Syncing local files with bucket")
self.sync_with_s3()

# Delete anything that's left in our keys dict
if not self.dry_run and not self.no_delete:
self.deleted_file_list = list(self.s3_obj_dict.keys())
self.deleted_files = len(self.deleted_file_list)
if self.deleted_files:
logger.debug("Deleting %s keys" % self.deleted_files)
if self.verbosity > 0:
logger.debug("deleting %s keys" % self.deleted_files)
self.stdout.write("Deleting %s keys" % self.deleted_files)
self.batch_delete_s3_objects(
self.deleted_file_list,
self.aws_bucket_name
@@ -125,23 +149,19 @@ def handle(self, *args, **options):

# We're finished, print the final output
elapsed_time = time.time() - self.start_time
msg = "Publish completed, %d uploaded and %d deleted files in %.2f seconds" % (
self.uploaded_files,
self.deleted_files,
elapsed_time
)
logger.info(msg)
if self.verbosity > 0:
msg = "publish completed, %d uploaded and %d deleted files in %.2f seconds" % (
self.uploaded_files,
self.deleted_files,
elapsed_time
)
self.stdout.write(msg)
logger.info(msg)

if self.verbosity > 2:
for f in self.uploaded_file_list:
logger.info("updated file: %s" % f)
for f in self.deleted_file_list:
logger.info("deleted file: %s" % f)

if self.dry_run:
logger.info("publish executed with the --dry-run option. No content was changed on S3.")
logger.info("Publish executed with the --dry-run option. No content was changed on S3.")
if self.verbosity > 0:
self.stdout.write("Publish executed with the --dry-run option. No content was changed on S3.")

def set_options(self, options):
"""
@@ -202,7 +222,7 @@ def set_options(self, options):
if options.get('dry_run'):
self.dry_run = True
if self.verbosity > 0:
logger.info("executing with the --dry-run option set.")
logger.info("Executing with the --dry-run option set.")
else:
self.dry_run = False

@@ -233,40 +253,66 @@ def sync_with_s3(self):
of keys in the S3 bucket.
"""
# Create a list to put all the files we're going to update
update_list = []

for file_key in self.local_file_list:
# store a reference to the absolute path, if we have to open it
abs_file_path = os.path.join(self.build_dir, file_key)

# check if the file exists
if file_key in self.s3_obj_dict:
s3_etag = self.s3_obj_dict[file_key].get('ETag').strip('"')
local_md5 = hashlib.md5(
open(abs_file_path, "rb").read()
).hexdigest()

# don't upload if the md5 sums are the same
if s3_etag == local_md5 and not self.force_publish:
pass
elif self.force_publish:
update_list.append((file_key, abs_file_path))
else:
update_list.append((file_key, abs_file_path))

# remove the file from the dict, we don't need it anymore
del self.s3_obj_dict[file_key]

# if the file doesn't exist, create it
else:
update_list.append((file_key, abs_file_path))
self.update_list = []

# Upload all these files
# Figure out which files need to be updated and upload all these files
logger.debug("Comparing {} local files with bucket".format(len(self.local_file_list)))
if self.no_pooling:
[self.upload_to_s3(*u) for u in update_list]
[self.compare_local_file(f) for f in self.local_file_list]
else:
cpu_count = multiprocessing.cpu_count()
logger.debug("Pooling local file comparison on {} CPUs".format(cpu_count))
pool = ThreadPool(processes=cpu_count)
pool.map(self.compare_local_file, self.local_file_list)

logger.debug("Uploading {} new or updated files to bucket".format(len(self.update_list)))
if self.no_pooling:
[self.upload_to_s3(*u) for u in self.update_list]
else:
logger.debug("Pooling s3 uploads on {} CPUs".format(cpu_count))
pool = ThreadPool(processes=cpu_count)
pool.map(self.pooled_upload_to_s3, self.update_list)

def compare_local_file(self, file_key):
"""
Compares a local version of a file with what's already published.
If an update is needed, the file's key is added to self.update_list.
"""
# Where is the file?
file_path = os.path.join(self.build_dir, file_key)

# If we're in force_publish mode just add it
if self.force_publish:
self.update_list.append((file_key, file_path))
# And quit now
return

# Does it exist in our s3 object list?
if file_key in self.s3_obj_dict:

# If it does, open up the local file and convert it to a hexdigest
local_data = open(file_path, "rb").read()
local_md5 = hashlib.md5(local_data).hexdigest()

# Now let's compare it to the hexdigest of what's on s3
s3_md5 = self.s3_obj_dict[file_key].get('ETag').strip('"')

# If their md5 hexdigests match, do nothing
if s3_md5 == local_md5:
pass
# If they don't match, we want to add it
else:
logger.debug("{} has changed".format(file_key))
self.update_list.append((file_key, file_path))

# Remove the file from the s3 dict, we don't need it anymore
del self.s3_obj_dict[file_key]

# If the file doesn't exist, queue it for creation
else:
pool = ThreadPool(processes=10)
pool.map(self.pooled_upload_to_s3, update_list)
logger.debug("{} has been added".format(file_key))
self.update_list.append((file_key, file_path))

def pooled_upload_to_s3(self, payload):
"""
@@ -302,8 +348,9 @@ def upload_to_s3(self, key, filename):

# access and write the contents from the file
if not self.dry_run:
logger.debug("Uploading %s" % filename)
if self.verbosity > 0:
logger.debug("uploading %s" % filename)
self.stdout.write("Uploading %s" % filename)
s3_obj = self.s3_resource.Object(self.aws_bucket_name, key)
s3_obj.upload_file(filename, ExtraArgs=extra_args)
self.uploaded_files += 1
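The new compare_local_file method above decides whether to re-upload a file by comparing the MD5 hexdigest of the local copy with the object's S3 ETag. Here is a standalone sketch of that check; it assumes single-part uploads, where the ETag happens to equal the MD5 of the object body, so treat it as an illustration rather than the command's exact logic:

    import hashlib

    def needs_upload(local_path, s3_obj_meta, force_publish=False):
        """Return True if the local file should be (re)uploaded to S3."""
        if force_publish or s3_obj_meta is None:
            return True
        with open(local_path, "rb") as f:
            local_md5 = hashlib.md5(f.read()).hexdigest()
        # S3 wraps the ETag value in literal double quotes
        s3_md5 = s3_obj_meta.get("ETag", "").strip('"')
        return local_md5 != s3_md5
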
14 changes: 7 additions & 7 deletions bakery/tests/__init__.py
@@ -268,8 +268,6 @@ def test_redirect_view(self):
"detail/badurl.html"
)
self.assertTrue(os.path.exists(build_path))
# with mock_s3():
# MockRedirectView().post_publish(settings.AWS_BUCKET_NAME)

def test_404_view(self):
v = views.Buildable404View()
@@ -358,7 +356,7 @@ def test_publish_cmd(self):
with mock_s3():
self._create_bucket()
call_command("build")
call_command("publish", no_pooling=True, verbosity=3)
call_command("publish", verbosity=3)
local_file_list = []
for (dirpath, dirnames, filenames) in os.walk(
settings.BUILD_DIR):
@@ -375,13 +373,15 @@ def test_publish_cmd(self):
self.assertIn(obj.get('Key'), local_file_list)
call_command("unbuild")
os.makedirs(settings.BUILD_DIR)
call_command("publish", no_pooling=True, verbosity=3)
call_command("publish", verbosity=3)

call_command("publish", no_delete=True, force_publish=True)

def test_unpublish_cmd(self):
with mock_s3():
self._create_bucket()
call_command("build")
call_command("unpublish", no_pooling=True, verbosity=3)
call_command("unpublish", verbosity=3)
self.assertFalse(self._get_bucket_objects())

# def test_tasks(self):
@@ -418,7 +418,7 @@ def test_cache_control(self):
}):
self._create_bucket()
call_command("build")
call_command("publish", no_pooling=True, verbosity=3)
call_command("publish", verbosity=3)

for obj in self._get_bucket_objects():
s3_obj = s3.Object(
@@ -443,7 +443,7 @@ def test_batch_unpublish(self):
obj = s3.Object(settings.AWS_BUCKET_NAME, key)
obj.put('This is test object %s' % i)
keys.append(key)
call_command("unpublish", no_pooling=True, verbosity=3)
call_command("unpublish", verbosity=3)
self.assertFalse(self._get_bucket_objects())

def test_get_s3_client_honors_settings_over_environ(self):
7 changes: 7 additions & 0 deletions docs/changelog.rst
@@ -1,6 +1,13 @@
Changelog
=========

0.10.0
------

* Pooling of file comparisons between published and local files is now the default, for faster performance
* New option to opt in to pooled building of local files, for faster performance
* When the ``--force`` and ``--no-delete`` options are both passed to the publish command, the S3 object list is not retrieved, for faster performance

0.9.3
-----

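The third bullet in the changelog hunk above describes a fast path: when both --force and --no-delete are passed to publish, the command never lists the bucket's existing objects and simply uploads everything. A minimal invocation sketch, using the same keyword arguments the test suite passes to call_command:

    from django.core.management import call_command

    # Skip the bucket listing entirely and push every local file as-is;
    # nothing is deleted from the bucket because no_delete is set.
    call_command("publish", force_publish=True, no_delete=True)
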
5 changes: 1 addition & 4 deletions setup.py
@@ -51,9 +51,6 @@ def run(self):
},
],
BUILD_DIR = tempfile.mkdtemp(),
BAKERY_BUILD_ON_S3 = True,
BAKERY_AWS_BUILD_BUCKET = 'build-bucket',
BAKERY_AWS_PUBLISH_BUCKET = 'publish-bucket',
STATIC_ROOT = os.path.abspath(
os.path.join(
os.path.dirname(__file__),
@@ -91,7 +88,7 @@ def run(self):

setup(
name='django-bakery',
version='0.10.0-rc.5',
version='0.10.1',
description='A set of helpers for baking your Django site out as flat files',
author='The Los Angeles Times Data Desk',
author_email='datadesk@latimes.com',
