Completes two performance tweaks for bulk files, making them about 50% faster.

Profiling FTW.
mlissner committed Sep 25, 2014
1 parent 64f7b77 commit a0e4326d98e9f501ec3e69955d6b5650471686e8
Showing with 32 additions and 13 deletions.
  1. +29 −4 alert/api/management/commands/cl_make_bulk_data.py
  2. +3 −9 alert/lib/string_utils.py

alert/api/management/commands/cl_make_bulk_data.py

@@ -17,7 +17,7 @@ def mkdir_p(path):
     """Makes a directory path, but doesn't crash if the path already exists."""
     try:
         os.makedirs(path)
-    except OSError as exc:  # Python >2.5
+    except OSError as exc:
         if exc.errno == errno.EEXIST and os.path.isdir(path):
             pass
         else:
@@ -51,7 +51,7 @@ def swap_archives(self, obj_type_str):
         mkdir_p(os.path.join(settings.DUMP_DIR, '%s' % obj_type_str))
         for f in os.listdir('/tmp/bulk/%s' % obj_type_str):
             shutil.move('/tmp/bulk/%s/%s' % (obj_type_str, f),
-                        os.path.join(settings.DUMP_DIR, '%ss' % obj_type_str))
+                        os.path.join(settings.DUMP_DIR, '%s' % obj_type_str, f))

     def make_archive(self, obj_type_str, obj_type, api_resource_obj):
         """Generate compressed archives containing the contents of an object
@@ -65,6 +65,29 @@ def make_archive(self, obj_type_str, obj_type, api_resource_obj):
         We do this by making a dict of open file handles and adding each item
         to the correct two files: The all.tar.gz file and the {court}.tar.gz
         file.
+
+        This function takes longer to run than almost any other in the
+        codebase and has been the subject of some profiling. The top results
+        are as follows:
+
+           ncalls  tottime  percall  cumtime  percall filename:lineno(function)
+           138072    5.007    0.000    6.138    0.000 {method 'sub' of '_sre.SRE_Pattern' objects}
+             6001    4.452    0.001    4.608    0.001 {method 'execute' of 'psycopg2._psycopg.cursor' objects}
+            24900    3.623    0.000    3.623    0.000 {built-in method compress}
+    2807031/69163    2.923    0.000    8.216    0.000 copy.py:145(deepcopy)
+          2427852    0.952    0.000    1.130    0.000 encoder.py:37(replace)
+
+        Conclusions:
+
+        1. sub is from string_utils.py, where we nuke bad chars. Could remove
+           this code by sanitizing all future input to the system and fixing
+           any current issues. Other than that, it's already optimized.
+        2. Next up is DB waiting. Queries could be optimized to make this
+           better.
+        3. Next is compression, which we've already turned down as much as
+           possible (compresslevel=1 for most bulk files, 3 for all.tar.gz).
+        4. Encoding and copying bring up the rear. Not much to do there, and
+           gains are limited. Could install a faster json encoder, but Python
+           2.7's json implementation is already written in C. Not sure how to
+           remove the gazillion copies that are happening.
         """
         courts = Court.objects.all()
         self.stdout.write(' - Creating %s bulk %s files '
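
For context on the table in that docstring: output in that shape comes from the standard library's cProfile and pstats modules, sorted by tottime. The snippet below is a minimal, self-contained sketch of that workflow, not the actual profiling run; the sanitize_many workload and BAD_CHARS pattern are invented stand-ins for the kind of per-row regex scrubbing the command does.

import cProfile
import pstats
import re

# Invented stand-in workload; the real command serializes and scrubs court data.
BAD_CHARS = re.compile(u'[\x00-\x08\x0b\x0c\x0e-\x1f]')

def sanitize_many(rows):
    return [BAD_CHARS.sub(u'', row) for row in rows]

rows = [u'Opinion text %d\x00 with a stray NUL' % i for i in range(100000)]

profiler = cProfile.Profile()
profiler.enable()
sanitize_many(rows)
profiler.disable()

# Sort by total time spent in each function, as in the docstring table.
pstats.Stats(profiler).sort_stats('tottime').print_stats(5)
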
@@ -77,11 +100,13 @@ def make_archive(self, obj_type_str, obj_type, api_resource_obj):
         for court in courts:
             tar_files[court.pk] = tarfile.open(
                 '/tmp/bulk/%s/%s.tar.gz' % (obj_type_str, court.pk),
-                mode='w:gz'
+                mode='w:gz',
+                compresslevel=1,
             )
         tar_files['all'] = tarfile.open(
             '/tmp/bulk/%s/all.tar.gz' % obj_type_str,
-            mode='w:gz'
+            mode='w:gz',
+            compresslevel=3,
         )

         # Make the archives
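
The compresslevel tweak above is a straight speed-for-size trade: gzip level 1 writes much faster but produces larger files, which is acceptable for the per-court archives while all.tar.gz keeps level 3. Below is a minimal sketch, not CourtListener's code, showing how tarfile.open's compresslevel argument is used and how the levels compare; the member names and payload are made up.

import io
import os
import tarfile
import tempfile
import time

def build_archive(path, payloads, compresslevel):
    # Write each payload into the .tar.gz at the requested gzip level.
    with tarfile.open(path, mode='w:gz', compresslevel=compresslevel) as tar:
        for i, data in enumerate(payloads):
            info = tarfile.TarInfo(name='item_%d.json' % i)
            info.size = len(data)
            tar.addfile(info, io.BytesIO(data))

sample = b'The court held that the motion to dismiss is denied. ' * 1000
payloads = [sample for _ in range(100)]
for level in (1, 3, 9):
    out = os.path.join(tempfile.gettempdir(), 'bulk_level_%d.tar.gz' % level)
    start = time.time()
    build_archive(out, payloads, compresslevel=level)
    print('level %d: %.2fs, %d bytes' % (level, time.time() - start, os.path.getsize(out)))
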

alert/lib/string_utils.py

@@ -43,18 +43,12 @@ def filter_invalid_XML_chars(input):
     Char ::= #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF]
     This strips out everything else.
     See: http://stackoverflow.com/a/25920392/64911
     """
-    def isValidXMLChar(char):
-        codepoint = ord(char)
-        return (  # Ordered by presumed frequency.
-            0x20 <= codepoint <= 0xD7FF or
-            codepoint in (0x9, 0xA, 0xD) or
-            0xE000 <= codepoint <= 0xFFFD or
-            0x10000 <= codepoint <= 0x10FFFF
-        )
     if isinstance(input, basestring):
         # Only do str, unicode, etc.
-        return filter(isValidXMLChar, input)
+        return re.sub(u'[^\u0020-\uD7FF\u0009\u000A\u000D\uE000-\uFFFD\u10000-\u10FFFF]+', '', input)
     else:
         return input
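
The string_utils.py change replaces a Python-level per-character predicate with a single pass through the C regex engine, which is where most of the speedup comes from. Below is a minimal, self-contained comparison in the same spirit; it is not the project's code, the function names are invented, and the pattern covers only the BMP ranges to keep the example simple (the real function also has to handle characters above U+FFFF).

import re
import timeit

INVALID_XML_RE = re.compile(u'[^\u0020-\uD7FF\u0009\u000A\u000D\uE000-\uFFFD]+')

def strip_with_loop(text):
    # Character-by-character check, in the spirit of the old implementation.
    def is_valid(codepoint):
        return (0x20 <= codepoint <= 0xD7FF or
                codepoint in (0x9, 0xA, 0xD) or
                0xE000 <= codepoint <= 0xFFFD)
    return u''.join(c for c in text if is_valid(ord(c)))

def strip_with_regex(text):
    # Single substitution pass, in the spirit of the new implementation.
    return INVALID_XML_RE.sub(u'', text)

sample = u'Some opinion text with a stray control character\x0b in it. ' * 200
assert strip_with_loop(sample) == strip_with_regex(sample)
for fn in (strip_with_loop, strip_with_regex):
    print('%s: %.4fs' % (fn.__name__, timeit.timeit(lambda: fn(sample), number=200)))
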
