XML compression #27022

Merged · 42 commits · Apr 21, 2020

Changes shown are from 16 of the 42 commits.

Commits (42):
adbdc3a  XML compression (satyaakam, Apr 1, 2020)
1851c6d  Changes made as per the suggestions in PR 27022 (satyaakam, Apr 3, 2020)
6ff7e25  Resolving conflits (satyaakam, Apr 3, 2020)
55fd72e  merged master into compress-xml (satyaakam, Apr 3, 2020)
787604b  Merge branch 'master' of https://github.com/dimagi/commcare-hq into c… (satyaakam, Apr 10, 2020)
c843658  Changes to the compress xml as review (satyaakam, Apr 10, 2020)
ed006c5  Merge branch 'master' of https://github.com/dimagi/commcare-hq into c… (satyaakam, Apr 10, 2020)
071b011  Fixing fucntion call (satyaakam, Apr 10, 2020)
5511232  Missing steps to make Bower work as normal user (satyaakam, Apr 14, 2020)
233f255  Merge branch 'master' of https://github.com/dimagi/commcare-hq (satyaakam, Apr 15, 2020)
8f63335  Updated Tests and custom exceptions added (satyaakam, Apr 15, 2020)
1ea2d13  Merge branch 'master' into compress-xml (satyaakam, Apr 15, 2020)
7a43156  Fixed Lint issues (satyaakam, Apr 15, 2020)
b8a190b  simplify validate_args (snopoke, Apr 16, 2020)
acb621f  nits (snopoke, Apr 16, 2020)
7756fad  add missing type_code kwargs (snopoke, Apr 16, 2020)
2f569c3  remove unused imports (snopoke, Apr 16, 2020)
027d73c  Merge branch 'master' into compress-xml (snopoke, Apr 16, 2020)
e0d7c55  handle exception (snopoke, Apr 16, 2020)
95d2da0  more type_code args + test fixes (snopoke, Apr 16, 2020)
52c51c9  make meta.content_length = uncompressed size (snopoke, Apr 16, 2020)
1e0a50d  uncompressed content_length when copying a blob (snopoke, Apr 16, 2020)
557264f  test with compression (snopoke, Apr 16, 2020)
1853cbc  store compressed length and always return a BlobStream instance (snopoke, Apr 17, 2020)
b13480f  simplify `blobmeta.open()` (snopoke, Apr 17, 2020)
5f6e847  reusue util to get sizes (snopoke, Apr 17, 2020)
a4a90a8  minor simplication (snopoke, Apr 17, 2020)
f4bc531  prefer using meta for lookup if it's available (snopoke, Apr 17, 2020)
77ed3e5  better test (snopoke, Apr 17, 2020)
820f81d  Update corehq/blobs/interface.py (snopoke, Apr 17, 2020)
70a4090  merging upstream changes (satyaakam, Apr 20, 2020)
48c9e0e  Merging upstream changes (satyaakam, Apr 20, 2020)
aa7348f  fix argument order (snopoke, Apr 17, 2020)
6d2f0c0  remove GzipCompressReadStream.Buffer.content_length (snopoke, Apr 20, 2020)
6f6466f  Merge branch 'master' into compress-xml (snopoke, Apr 20, 2020)
468bdb1  GzipCompressReadStream -> GzipStream (millerdev, Apr 20, 2020)
d83f529  GzipStreamAttrAccessBeforeRead -> GzipStreamError (millerdev, Apr 20, 2020)
4e6677f  Move IO buffer out of GzipStream class (millerdev, Apr 20, 2020)
3b3c00f  Fix bugs in GzipStream (millerdev, Apr 20, 2020)
470f2ca  Merge pull request #1 from dimagi/dm/compress-xml (snopoke, Apr 21, 2020)
7c5d4bf  save to file before extracting in test (snopoke, Apr 21, 2020)
05cadf1  fallback to partial read if seek not supported (snopoke, Apr 21, 2020)
Files changed:
3 changes: 2 additions & 1 deletion DEV_SETUP.md
@@ -169,7 +169,8 @@ you'll need to install `bower` and run `bower install`. Follow these steps to in
         $ sudo npm -g install bower

 3. Run bower with:

+        $ sudo chown -R $USER:$GROUP ~/.npm
+        $ sudo chown -R $USER:$GROUP ~/.config
         $ bower install
2 changes: 1 addition & 1 deletion corehq/apps/case_importer/tracking/filestorage.py
@@ -50,7 +50,7 @@ def write_file(self, f, filename, domain):
     def get_tempfile_ref_for_contents(self, identifier):
         filename = self.get_filename(identifier)
         suffix = file_extention_from_filename(filename)
-        content = get_blob_db().get(key=identifier).read()
+        content = get_blob_db().get(key=identifier, type_code=CODES.data_import).read()
         return make_temp_file(content, suffix)

     @memoized
6 changes: 3 additions & 3 deletions corehq/apps/domain/tests/test_delete_domain.py
@@ -99,7 +99,7 @@
 from corehq.apps.users.models import DomainRequest, SQLInvitation
 from corehq.apps.zapier.consts import EventTypes
 from corehq.apps.zapier.models import ZapierSubscription
-from corehq.blobs import NotFound, get_blob_db
+from corehq.blobs import NotFound, get_blob_db, CODES
 from corehq.form_processor.backends.sql.dbaccessors import (
     CaseAccessorSQL,
     FormAccessorSQL,
@@ -630,9 +630,9 @@ def test_export_delete(self):
         self.domain.delete()

         with self.assertRaises(NotFound):
-            blobdb.get(key=data_files[0].blob_id)
+            blobdb.get(key=data_files[0].blob_id, type_code=CODES.data_file)

-        with blobdb.get(key=data_files[1].blob_id) as f:
+        with blobdb.get(key=data_files[1].blob_id, type_code=CODES.data_file) as f:
             self.assertEqual(f.read(), (self.domain2.name + " csv").encode('utf-8'))

         self._assert_export_counts(self.domain.name, 0)
2 changes: 1 addition & 1 deletion corehq/apps/export/models/new.py
@@ -2770,7 +2770,7 @@ def save_blob(cls, file_obj, domain, filename, description, content_type, delete
     def get_blob(self):
         db = get_blob_db()
         try:
-            blob = db.get(key=self._meta.key)
+            blob = db.get(key=self._meta.key, type_code=CODES.data_file)
         except (KeyError, NotFound) as err:
             raise NotFound(str(err))
         return blob
2 changes: 1 addition & 1 deletion corehq/apps/hqadmin/service_checks.py
@@ -153,7 +153,7 @@ def check_blobdb():
         parent_id="check_blobdb",
         type_code=CODES.tempfile,
     )
-    with db.get(key=meta.key) as fh:
+    with db.get(key=meta.key, type_code=CODES.tempfile) as fh:
         res = fh.read()
     db.delete(key=meta.key)
     if res == contents:
2 changes: 1 addition & 1 deletion corehq/apps/ota/models.py
@@ -62,7 +62,7 @@ def get_restore_as_string(self):
             blob.close()

     def _get_restore_xml(self):
-        return get_blob_db().get(key=self.restore_blob_id)
+        return get_blob_db().get(key=self.restore_blob_id, type_code=CODES.demo_user_restore)

     def delete(self):
         """
4 changes: 2 additions & 2 deletions corehq/apps/reports/views.py
@@ -132,7 +132,7 @@
     DEID_EXPORT_PERMISSION,
     FORM_EXPORT_PERMISSION,
 )
-from corehq.blobs import NotFound, get_blob_db, models
+from corehq.blobs import CODES, NotFound, get_blob_db, models
 from corehq.form_processor.exceptions import CaseNotFound
 from corehq.form_processor.interfaces.dbaccessors import (
     CaseAccessors,
@@ -2063,7 +2063,7 @@ def export_report(request, domain, export_hash, format):
     report_class = meta.properties["report_class"]

     try:
-        report_file = db.get(export_hash)
+        report_file = db.get(export_hash, type_code=CODES.tempfile)
     except NotFound:
         return report_not_found
     with report_file:
5 changes: 5 additions & 0 deletions corehq/blobs/exceptions.py
@@ -18,3 +18,8 @@ class InvalidContext(Error):

 class NotFound(Error):
     """Raised when an attachment cannot be found"""
+
+
+class GzipStreamAttrAccessBeforeRead(Exception):
+    """Raised when an attribute (eg: content_length) of the
+    Gzip Stream is accessed before the stream is read completely"""
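
For context on this new exception: a stream that compresses its source on the fly cannot know the compressed size until the caller has read it to the end, so `content_length` must refuse early access. Below is a minimal sketch of that contract. It is an illustration only, not the PR's `GzipCompressReadStream`; the class name, chunk size, and buffering strategy here are assumptions:

    # Illustrative sketch only -- not the PR's implementation.
    import gzip
    import io

    CHUNK_SIZE = 4096  # assumed chunk size


    class GzipStreamAttrAccessBeforeRead(Exception):
        """Local stand-in for corehq.blobs.exceptions.GzipStreamAttrAccessBeforeRead."""


    class GzipReadStreamSketch:
        """Compress `fileobj` lazily as the stream is read."""

        def __init__(self, fileobj):
            self._fileobj = fileobj
            self._sink = io.BytesIO()  # gzip writes compressed bytes here
            self._gzip = gzip.GzipFile(fileobj=self._sink, mode="wb")
            self._buffer = bytearray()
            self._content_length = 0   # compressed bytes handed out so far
            self._eof = False

        @property
        def content_length(self):
            if not self._eof:
                # The compressed size is unknowable before the gzip trailer
                # is written, i.e. before the source is fully read.
                raise GzipStreamAttrAccessBeforeRead(
                    "stream must be fully read first")
            return self._content_length

        def _fill(self):
            chunk = self._fileobj.read(CHUNK_SIZE)
            if chunk:
                self._gzip.write(chunk)
            else:
                self._gzip.close()  # flushes the gzip trailer
                self._eof = True
            self._buffer.extend(self._sink.getvalue())
            self._sink.seek(0)
            self._sink.truncate()

        def read(self, size=-1):
            while not self._eof and (size < 0 or len(self._buffer) < size):
                self._fill()
            if size < 0:
                size = len(self._buffer)
            data = bytes(self._buffer[:size])
            del self._buffer[:size]
            self._content_length += len(data)
            return data

The PR's tests in corehq/blobs/tests/test_util.py below exercise exactly this behavior: access before or mid-read raises, and after a full read the attribute equals the number of compressed bytes returned.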
5 changes: 2 additions & 3 deletions corehq/blobs/export.py
@@ -1,6 +1,6 @@
 import os

-from . import get_blob_db, NotFound
+from . import get_blob_db, NotFound, CODES
 from .migrate import PROCESSING_COMPLETE_MESSAGE
 from .models import BlobMeta
 from .zipdb import get_export_filename, ZipBlobDB
@@ -23,10 +23,9 @@ def __exit__(self, exc_type, exc_val, exc_tb):
         print(PROCESSING_COMPLETE_MESSAGE.format(self.not_found, self.total_blobs))

     def process_object(self, meta):
-        from_db = get_blob_db()
         self.total_blobs += 1
         try:
-            content = from_db.get(key=meta.key)
+            content = meta.open()
         except NotFound:
             self.not_found += 1
         else:
10 changes: 8 additions & 2 deletions corehq/blobs/fsdb.py
@@ -2,12 +2,13 @@
 """
 import os
 from collections import namedtuple
+from gzip import GzipFile
 from hashlib import md5
 from os.path import commonprefix, exists, isabs, isdir, dirname, join, realpath, sep

 from corehq.blobs.exceptions import BadName, NotFound
 from corehq.blobs.interface import AbstractBlobDB
-from corehq.blobs.util import check_safe_key
+from corehq.blobs.util import check_safe_key, GzipCompressReadStream
 from corehq.util.metrics import metrics_counter

 CHUNK_SIZE = 4096
@@ -28,6 +29,8 @@ def put(self, content, **blob_meta_args):
         dirpath = dirname(path)
         if not isdir(dirpath):
             os.makedirs(dirpath)
+        if meta.compressed:
+            content = GzipCompressReadStream(content)
         length = 0
         digest = md5()
         with open(path, "wb") as fh:
@@ -42,11 +45,14 @@
         self.metadb.put(meta)
         return meta

-    def get(self, key):
+    def get(self, key=None, type_code=None, meta=None):
+        key = self._validate_get_args(key, type_code, meta)
         path = self.get_path(key)
         if not exists(path):
             metrics_counter('commcare.blobdb.notfound')
             raise NotFound(key)
+        if meta and meta.compressed:
+            return GzipFile(path)
         return open(path, "rb")

     def size(self, key):
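
Read together, `put()` and `get()` above give the filesystem backend a transparent round trip for compressed blobs: gzipped bytes on disk, plain bytes for callers. A sketch of the intended behavior, with illustrative arguments (the exact `put()` keyword set and the domain/parent_id values are assumptions):

    import io

    from corehq.blobs import CODES, get_blob_db

    db = get_blob_db()  # assumed to be the filesystem backend here
    meta = db.put(
        io.BytesIO(b"<data/>"),
        domain="demo",               # illustrative metadata values
        parent_id="form-1",
        type_code=CODES.form_xml,    # form XML is marked compressed
    )
    assert meta.compressed

    # The file on disk holds gzipped bytes, but reading through the
    # meta object decompresses transparently via GzipFile.
    with meta.open() as fh:
        assert fh.read() == b"<data/>"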
26 changes: 24 additions & 2 deletions corehq/blobs/interface.py
@@ -1,5 +1,6 @@
 from abc import ABCMeta, abstractmethod

+from . import CODES
 from .metadata import MetaDB

 NOT_SET = object()
@@ -52,15 +53,36 @@ def put(self, content, **blob_meta_args):
         raise NotImplementedError

     @abstractmethod
-    def get(self, key):
-        """Get a blob
+    def get(self, key=None, type_code=None, meta=None):
+        """Get a blob.

+        :param key: Blob key.
+        :param type_code: Blob type code.
+        :param meta: BlobMeta instance.
+
+        key and type_code are required if meta is not provided. If meta
+        is provided, then key and type_code should be None. For type_code
+        form_xml, meta is required.

         :returns: A file-like object in binary read mode. The returned
         object should be closed when finished reading.
         """
         raise NotImplementedError

+    @staticmethod
+    def _validate_get_args(key, type_code, meta):
+        if key is not None or type_code is not None:
+            if meta is not None:
+                raise ValueError("'key' and 'meta' are mutually exclusive")
+            if type_code == CODES.form_xml:
+                raise ValueError("form XML must be loaded with 'meta' argument")
+            if key is None or type_code is None:
+                raise ValueError("'key' must be specified with 'type_code'")
+            return key
+        if meta is None:
+            raise ValueError("'key' and 'type_code' or 'meta' is required")
+        return meta.key
+
     @abstractmethod
     def exists(self, key):
         """Check if blob exists
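
Concretely, the new signature supports two mutually exclusive call styles, which `_validate_get_args` enforces. A hedged usage sketch follows; the key and parent_id values are placeholders:

    from corehq.blobs import CODES, get_blob_db

    db = get_blob_db()

    # Style 1: key plus type_code, for any type except form XML.
    with db.get(key="some-blob-key", type_code=CODES.tempfile) as fh:
        data = fh.read()

    # Style 2: meta only. Required for form XML, because the metadata
    # carries the `compressed` flag needed to decompress on read.
    meta = db.metadb.get(parent_id="some-parent-id", key="some-blob-key")
    with db.get(meta=meta) as fh:
        xml = fh.read()

    # Rejected combinations (each raises ValueError):
    #   db.get(key="k")                             # type_code missing
    #   db.get(key="k", meta=meta)                  # mutually exclusive
    #   db.get(key="k", type_code=CODES.form_xml)   # form XML needs meta

Requiring `meta` for form XML ensures the `compressed` flag travels with every read, so callers can never receive raw gzip bytes by accident.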
1 change: 1 addition & 0 deletions corehq/blobs/metadata.py
@@ -31,6 +31,7 @@ def new(self, **blob_meta_args):
                 "keyword arguments are incompatible with `meta` argument")
             return blob_meta_args["meta"]
         timeout = blob_meta_args.pop("timeout", None)
+        blob_meta_args['compressed'] = blob_meta_args.get('type_code') == CODES.form_xml
         meta = BlobMeta(**blob_meta_args)
         if not meta.domain:
             raise TypeError("domain is required")
2 changes: 1 addition & 1 deletion corehq/blobs/migrate.py
@@ -260,7 +260,7 @@ def migrate(self, doc):
         meta = doc["_obj_not_json"]
         self.total_blobs += 1
         try:
-            content = self.db.old_db.get(key=meta.key)
+            content = meta.open(db=self.db.old_db)
         except NotFound:
             if not self.db.new_db.exists(key=meta.key):
                 self.save_backup(doc)
20 changes: 20 additions & 0 deletions corehq/blobs/migrations/0011_blobmeta_compressed.py
@@ -0,0 +1,20 @@
+# -*- coding: utf-8 -*-
+from django.db import migrations, models
+
+from corehq.sql_db.migrations import partitioned
+
+
+@partitioned
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ('blobs', '0010_auto_20191023_0938'),
+    ]
+
+    operations = [
+        migrations.AddField(
+            model_name='blobmeta',
+            name='compressed',
+            field=models.NullBooleanField(),
+        ),
+    ]
3 changes: 2 additions & 1 deletion corehq/blobs/mixin.py
@@ -188,7 +188,8 @@ def fetch_attachment(self, name, stream=False):
                     return super(BlobMixin, self) \
                         .fetch_attachment(name, stream=stream)
                 raise NotFound(name)
-            blob = db.get(key=key)
+            meta = db.metadb.get(parent_id=self._id, key=key)
+            blob = meta.open()
         except NotFound:
             raise ResourceNotFound(
                 "{model} {model_id} attachment: {name!r}".format(
13 changes: 10 additions & 3 deletions corehq/blobs/models.py
@@ -3,6 +3,7 @@

 from django.db.models import (
     BigIntegerField,
+    NullBooleanField,
     CharField,
     DateTimeField,
     IntegerField,
@@ -59,6 +60,7 @@ class BlobMeta(PartitionedModel, Model):
     properties = NullJsonField(default=dict)
     created_on = DateTimeField(default=datetime.utcnow)
     expires_on = DateTimeField(default=None, null=True)
+    compressed = NullBooleanField()

     class Meta:
         unique_together = [
@@ -91,13 +93,18 @@ def is_image(self):
         """Use content type to check if blob is an image"""
         return (self.content_type or "").startswith("image/")

-    def open(self):
+    def open(self, db=None):
         """Get a file-like object containing blob content

         The returned object should be closed when it is no longer needed.
         """
-        from . import get_blob_db
-        return get_blob_db().get(key=self.key)
+        from . import get_blob_db, CODES
+        if self.type_code == CODES.form_xml:
+            kwargs = {'meta': self}
+        else:
+            kwargs = {'key': self.key, 'type_code': self.type_code}
+        db = db or get_blob_db()
+        return db.get(**kwargs)

     def blob_exists(self):
         from . import get_blob_db
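
With this change `BlobMeta.open()` becomes the preferred entry point for reading any blob, and the new `db` argument lets callers such as the blob migration above read from a specific backend. A brief sketch, where `meta` is an existing BlobMeta row and `old_db` is a placeholder backend:

    # Default backend:
    with meta.open() as fh:
        content = fh.read()

    # Explicit backend, e.g. reading from the source DB during a
    # blob migration (old_db is a placeholder for that backend):
    with meta.open(db=old_db) as fh:
        content = fh.read()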
20 changes: 15 additions & 5 deletions corehq/blobs/s3db.py
@@ -2,10 +2,11 @@
 import weakref
 from contextlib import contextmanager
 from io import RawIOBase, UnsupportedOperation
+from gzip import GzipFile

 from corehq.blobs.exceptions import NotFound
 from corehq.blobs.interface import AbstractBlobDB
-from corehq.blobs.util import check_safe_key
+from corehq.blobs.util import check_safe_key, GzipCompressReadStream
 from corehq.util.metrics import metrics_counter, metrics_histogram_timer
 from dimagi.utils.logging import notify_exception
@@ -73,17 +74,23 @@ def put(self, content, **blob_meta_args):
             s3_bucket.copy(source, meta.key)
         else:
             content.seek(0)
-            meta.content_length = get_file_size(content)
-            self.metadb.put(meta)
+            if meta.compressed:
+                content = GzipCompressReadStream(content)
             with self.report_timing('put', meta.key):
                 s3_bucket.upload_fileobj(content, meta.key)
+            meta.content_length = get_file_size(content)
+            self.metadb.put(meta)
         return meta

-    def get(self, key):
+    def get(self, key=None, type_code=None, meta=None):
+        key = self._validate_get_args(key, type_code, meta)
         check_safe_key(key)
         with maybe_not_found(throw=NotFound(key)), self.report_timing('get', key):
             resp = self._s3_bucket().Object(key).get()
-            return BlobStream(resp["Body"], self, key)
+            blobstream = BlobStream(resp["Body"], self, key)
+            if meta and meta.compressed:
+                return GzipFile(blobstream)
+            return blobstream

     def size(self, key):
         check_safe_key(key)
@@ -189,6 +196,9 @@ def is_not_found(err, not_found_codes=["NoSuchKey", "NoSuchBucket", "404"]):


 def get_file_size(fileobj):
+    if isinstance(fileobj, GzipCompressReadStream):
+        return fileobj.content_length
+
     # botocore.response.StreamingBody has a '_content_length' attribute
     length = getattr(fileobj, "_content_length", None)
     if length is not None:
42 changes: 42 additions & 0 deletions corehq/blobs/tests/test_util.py
@@ -1,6 +1,9 @@
+import gzip
+import tempfile
 from unittest import TestCase

 import corehq.blobs.util as mod
+from corehq.blobs.exceptions import GzipStreamAttrAccessBeforeRead


 class TestRandomUrlId(TestCase):
@@ -16,3 +19,42 @@ def test_random_id_length(self):

     def test_random_id_randomness(self):
         self.assertEqual(len(set(self.ids)), self.sample_size, self.ids)
+
+
+class TestGzipCompressReadStream(TestCase):
+
+    def _is_gzip_compressed(self, file_):
+        with gzip.open(file_, 'r') as f:
+            try:
+                f.read(1)
+                return True
+            except OSError:
+                return False
+
+    def test_compression(self):
+        with tempfile.NamedTemporaryFile() as f:
+            f.write(b"x")
+            compress_stream = mod.GzipCompressReadStream(f)
+            with tempfile.NamedTemporaryFile() as compressed_f:
+                compressed_f.write(compress_stream.read())
+                self.assertTrue(self._is_gzip_compressed(compressed_f))
+
+    def test_content_length_access(self):
+        with tempfile.NamedTemporaryFile() as f:
+            f.seek(10)
+            f.write(b"x")
+            compress_stream = mod.GzipCompressReadStream(f)
+
+            # Try to read content_length without reading the stream
+            with self.assertRaises(GzipStreamAttrAccessBeforeRead):
+                compress_stream.content_length  # noqa
+
+            # Try to read content_length after partially reading the stream
+            content_length = len(compress_stream.read(5))
+            with self.assertRaises(GzipStreamAttrAccessBeforeRead):
+                compress_stream.content_length  # noqa
+
+            # Read content_length after completely reading the stream and check
+            # that it's correct
+            content_length += len(compress_stream.read())
+            self.assertEqual(compress_stream.content_length, content_length)