
Completes two performance tweaks for bulk files, making them about 50% faster.

Profiling FTW.
mlissner committed Sep 25, 2014
1 parent 64f7b77 commit a0e4326d98e9f501ec3e69955d6b5650471686e8
Showing with 32 additions and 13 deletions.
  1. +29 −4 alert/api/management/commands/cl_make_bulk_data.py
  2. +3 −9 alert/lib/string_utils.py
alert/api/management/commands/cl_make_bulk_data.py
@@ -17,7 +17,7 @@ def mkdir_p(path):
    """Makes a directory path, but doesn't crash if the path already exists."""
    try:
        os.makedirs(path)
-    except OSError as exc:  # Python >2.5
+    except OSError as exc:
        if exc.errno == errno.EEXIST and os.path.isdir(path):
            pass
        else:
@@ -51,7 +51,7 @@ def swap_archives(self, obj_type_str):
        mkdir_p(os.path.join(settings.DUMP_DIR, '%s' % obj_type_str))
        for f in os.listdir('/tmp/bulk/%s' % obj_type_str):
            shutil.move('/tmp/bulk/%s/%s' % (obj_type_str, f),
-                       os.path.join(settings.DUMP_DIR, '%ss' % obj_type_str))
+                       os.path.join(settings.DUMP_DIR, '%s' % obj_type_str, f))

    def make_archive(self, obj_type_str, obj_type, api_resource_obj):
        """Generate compressed archives containing the contents of an object
@@ -65,6 +65,29 @@ def make_archive(self, obj_type_str, obj_type, api_resource_obj):
        We do this by making a dict of open file handles and adding each item
        to the correct two files: The all.tar.gz file and the {court}.tar.gz
        file.
+
+        This function takes longer to run than almost any in the codebase and
+        has been the subject of some profiling. The top results are as follows:
+
+            ncalls         tottime  percall  cumtime  percall  filename:lineno(function)
+            138072         5.007    0.000    6.138    0.000    {method 'sub' of '_sre.SRE_Pattern' objects}
+            6001           4.452    0.001    4.608    0.001    {method 'execute' of 'psycopg2._psycopg.cursor' objects}
+            24900          3.623    0.000    3.623    0.000    {built-in method compress}
+            2807031/69163  2.923    0.000    8.216    0.000    copy.py:145(deepcopy)
+            2427852        0.952    0.000    1.130    0.000    encoder.py:37(replace)
+
+        Conclusions:
+         1. sub is from string_utils.py, where we nuke bad chars. Could remove
+            this code by sanitizing all future input to the system and fixing
+            any current issues. Other than that, it's already optimized.
+         2. Next up is DB waiting. Queries could be optimized to make this
+            better.
+         3. Next is compression, which we've turned down as much as possible
+            already (compresslevel=1 for most bulk files, =3 for all.tar.gz).
+         4. Encoding and copying bring up the rear. Not much to do there, and
+            gains are limited. Could install a faster json decoder, but Python
+            2.7's json implementation is already written in C. Not sure how to
+            remove the gazillion copies that are happening.
        """
        courts = Court.objects.all()
        self.stdout.write(' - Creating %s bulk %s files '
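
The docstring above describes the core pattern: keep one open tar handle per court plus an "all" handle in a dict, and write each serialized item into exactly two archives. Below is a minimal, self-contained sketch of that pattern, not the project's actual code; the court names, payloads, and file paths are invented for illustration.

    import io
    import json
    import tarfile
    import time

    # Hypothetical stand-ins for the real queryset and serializer output.
    items = [
        {'id': 1, 'court': 'scotus', 'text': 'First opinion.'},
        {'id': 2, 'court': 'ca9', 'text': 'Second opinion.'},
    ]

    # One open handle per court, plus one for everything, so each item is
    # written twice without reopening any archive inside the loop.
    tar_files = {
        court: tarfile.open('/tmp/%s.tar.gz' % court, mode='w:gz', compresslevel=1)
        for court in ('scotus', 'ca9')
    }
    tar_files['all'] = tarfile.open('/tmp/all.tar.gz', mode='w:gz', compresslevel=3)

    for item in items:
        data = json.dumps(item).encode('utf-8')
        info = tarfile.TarInfo(name='%s.json' % item['id'])
        info.size = len(data)
        info.mtime = int(time.time())
        # The same bytes go to the item's court archive and to all.tar.gz.
        tar_files[item['court']].addfile(info, io.BytesIO(data))
        tar_files['all'].addfile(info, io.BytesIO(data))

    for f in tar_files.values():
        f.close()
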
@@ -77,11 +100,13 @@ def make_archive(self, obj_type_str, obj_type, api_resource_obj):
        for court in courts:
            tar_files[court.pk] = tarfile.open(
                '/tmp/bulk/%s/%s.tar.gz' % (obj_type_str, court.pk),
-                mode='w:gz'
+                mode='w:gz',
+                compresslevel=1,
            )
        tar_files['all'] = tarfile.open(
            '/tmp/bulk/%s/all.tar.gz' % obj_type_str,
-            mode='w:gz'
+            mode='w:gz',
+            compresslevel=3,
        )

        # Make the archives
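
Conclusion 3 in the docstring is what this hunk implements: gzip effort is dialed down to 1 for the per-court archives and 3 for all.tar.gz because compression showed up near the top of the profile. The speed/size trade-off is easy to measure with a throwaway sketch like the following (the payload and levels here are arbitrary):

    import time
    import zlib

    # Fake, highly repetitive JSON-ish payload; real documents compress differently.
    payload = (b'{"court": "scotus", "text": "' + b'lorem ipsum ' * 5000 + b'"}') * 20

    for level in (1, 3, 9):
        start = time.time()
        compressed = zlib.compress(payload, level)
        elapsed = time.time() - start
        print('level=%d  size=%d bytes  time=%.4fs' % (level, len(compressed), elapsed))

Lower levels generally cut CPU time sharply while growing the output only modestly, which is the right trade for bulk files that get regenerated wholesale.
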
alert/lib/string_utils.py
@@ -43,18 +43,12 @@ def filter_invalid_XML_chars(input):
    Char ::= #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF]
    This strips out everything else.
+
+    See: http://stackoverflow.com/a/25920392/64911
    """
-    def isValidXMLChar(char):
-        codepoint = ord(char)
-        return (  # Ordered by presumed frequency.
-            0x20 <= codepoint <= 0xD7FF or
-            codepoint in (0x9, 0xA, 0xD) or
-            0xE000 <= codepoint <= 0xFFFD or
-            0x10000 <= codepoint <= 0x10FFFF
-        )

    if isinstance(input, basestring):
        # Only do str, unicode, etc.
-        return filter(isValidXMLChar, input)
+        return re.sub(u'[^\u0020-\uD7FF\u0009\u000A\u000D\uE000-\uFFFD\u10000-\u10FFFF]+', '', input)
    else:
        return input
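
Swapping the per-character filter() for a single regex pass is one of the commit's two performance tweaks; conclusion 1 in the docstring notes the resulting sub call is still the job's single biggest cost. Two details are worth illustrating in a sketch (the function and constant names below are made up, and it assumes Python 3 or a wide-Unicode Python 2 build): the compiled pattern can be hoisted to module level so the hot path skips re's pattern-cache lookup, and code points above U+FFFF cannot be expressed with 4-digit \u escapes, so the astral range from the Char production needs the 8-digit \U form.

    import re

    # Whitelist of the XML 1.0 Char production; everything outside it is stripped.
    # The last range must use 8-digit \U escapes because \u only covers the BMP.
    _INVALID_XML_CHARS = re.compile(
        u'[^\u0020-\uD7FF\u0009\u000A\u000D\uE000-\uFFFD'
        u'\U00010000-\U0010FFFF]+'
    )

    def filter_invalid_xml_chars(text):
        """Strip characters that may not appear in a well-formed XML document."""
        return _INVALID_XML_CHARS.sub(u'', text)

    print(filter_invalid_xml_chars(u'fine\x00 text\x0b here'))  # fine text here
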
