Skip to content

Commit

Permalink
coala: Add file caching
Browse files Browse the repository at this point in the history
In this commit, coalib.misc.Caching is introduced, which contains a
class FileCache providing a caching mechanism that determines which
files are new or changed, so that coala can be run only on those files
to improve coala's run-time speed.
  • Loading branch information
adtac committed May 25, 2016
1 parent 4d9f856 commit ee667e1
Show file tree
Hide file tree
Showing 8 changed files with 562 additions and 0 deletions.
162 changes: 162 additions & 0 deletions coalib/misc/Caching.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,162 @@
import calendar
import time
import hashlib
import os

from coalib.misc.CachingUtilities import (
pickle_load, pickle_dump, time_consistent, update_time_db,
delete_cache_files)


class FileCache:
    """
    A lazy, pickle-backed cache of per-file "last run" timestamps, keyed
    by an md5 of the project directory. Used to detect which files are
    new or changed since the previous run.

    Example/Tutorial:
    >>> from pyprint.NullPrinter import NullPrinter
    >>> from coalib.output.printers.LogPrinter import LogPrinter
    >>> import copy, time
    >>> log_printer = LogPrinter(NullPrinter())
    To initialize the cache create an instance for the project:
    >>> cache = FileCache(log_printer, "test", flush_cache=True)
    Now we can track new files by running:
    >>> cache.track_new_files(["a.c", "b.c"])
    Since all cache operations are lazy (for performance), we need to
    explicitly write the cache to disk for persistence in future uses:
    (Note: The cache will automatically figure out the write location)
    >>> cache.write()
    Let's go into the future:
    >>> time.sleep(1)
    Let's create a new instance to simulate a separate run:
    >>> cache = FileCache(log_printer, "test", flush_cache=False)
    >>> old_data = copy.deepcopy(cache.data)
    We can mark a file as changed by doing:
    >>> cache.add_to_changed_files({"a.c"})
    Again write to disk after calculating the new cache times for each file:
    >>> cache.write()
    >>> new_data = cache.data
    Since we marked 'a.c' as a changed file:
    >>> old_data["a.c"] == new_data["a.c"]
    True
    Since 'b.c' was untouched after the second run, its time was updated
    to the latest value:
    >>> old_data["b.c"] < new_data["b.c"]
    True
    """

    def __init__(self, log_printer, project_dir, flush_cache=False):
        """
        Initialize FileCache.

        :param log_printer: A LogPrinter object to use for logging.
        :param project_dir: The root directory of the project to be used
                            as a key identifier.
        :param flush_cache: Flush the cache and rebuild it.
        """
        self.log_printer = log_printer
        self.project_dir = project_dir
        # The md5 of the project path is the key under which this project's
        # cache file and last-run time are stored.
        self.md5sum = hashlib.md5(self.project_dir.encode("utf-8")).hexdigest()
        self.current_time = calendar.timegm(time.gmtime())
        # A last-run time in the future means the system clock moved
        # backwards; the cached timestamps can no longer be trusted.
        if not flush_cache and not time_consistent(log_printer, self.md5sum):
            log_printer.warn("It seems like you went back in time - your "
                             "system time is behind the last recorded run "
                             "time on this project. The cache will "
                             "be flushed and rebuilt.")
            flush_cache = True
        if not flush_cache:
            self.data = pickle_load(log_printer, self.md5sum, {})
        else:
            self.data = {}
            delete_cache_files(log_printer, [self.md5sum])
            log_printer.info("The file cache was successfully flushed.")
        self.changed_files = set()

    def __enter__(self):
        return self

    def write(self):
        """
        Update the last run time on the project for each file
        to the current time.

        Files that were marked changed via ``add_to_changed_files``
        deliberately keep their old timestamp (see the class doctest), so
        they are detected again on the next run.
        """
        for file_name in self.data:
            if file_name not in self.changed_files:
                self.data[file_name] = self.current_time
        pickle_dump(self.log_printer, self.md5sum, self.data)
        update_time_db(self.log_printer, self.md5sum, self.current_time)
        self.changed_files = set()

    def __exit__(self, exc_type, exc_value, exc_traceback):
        """
        Update the last run time on the project for each file
        to the current time by writing the cache to disk.
        """
        # Note: the cache is written even when an exception occurred.
        self.write()

    def add_to_changed_files(self, changed_files):
        """
        Keep track of changed files in ``changed_files`` for future use in
        ``write``.

        :param changed_files: A set of files that had changed since the last
                              run time.
        """
        self.changed_files.update(changed_files)

    def track_new_files(self, new_files):
        """
        Start tracking new files given in ``new_files`` by adding them to the
        database.

        :param new_files: The list of new files that need to be tracked.
                          These files are initialized with their last
                          modified tag as -1.
        """
        for new_file in new_files:
            self.data[new_file] = -1

    def get_changed_files(self, files):
        """
        Extract the list of files that had changed (or are new) with respect to
        the cache data available.

        :param files: The list of collected files.
        :return:      The list of files that had changed since the last cache.
        """
        changed_files = []

        if not self.data:
            # The first run on this project. So all files are new
            # and must be returned irrespective of whether caching is turned
            # on.
            new_files = files
        else:
            new_files = []
            for file_path in files:
                if file_path in self.data and self.data[file_path] > -1:
                    # A modification time newer than the recorded last run
                    # means the file changed in between.
                    if int(os.path.getmtime(file_path)) > self.data[file_path]:
                        changed_files.append(file_path)
                else:
                    new_files.append(file_path)

        self.track_new_files(new_files)
        self.add_to_changed_files(changed_files)

        return changed_files + new_files
194 changes: 194 additions & 0 deletions coalib/misc/CachingUtilities.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,194 @@
import calendar
import hashlib
import os
import pickle
import time

from coalib.output.Tagging import get_user_data_dir


def get_cache_data_path(log_printer, filename):
    """
    Get the full path of ``filename`` present in the user's cache directory.

    :param log_printer: A LogPrinter object to use for logging.
    :param filename:    The file whose path needs to be expanded.
    :return:            Full path of the file, assuming it's present in the
                        user's config directory.
    """
    cache_dir = get_user_data_dir(log_printer, action="caching")
    return os.path.join(cache_dir, filename)


def delete_cache_files(log_printer, files):
    """
    Delete the cache files after displaying a warning saying the cache
    is corrupted and will be removed.

    :param log_printer: A LogPrinter object to use for logging.
    :param files:       The list of files to be deleted.
    :return:            True if all the given files were successfully deleted.
                        False otherwise.
    """
    failed_files = []
    cache_dir = None
    for file_name in files:
        file_path = get_cache_data_path(log_printer, file_name)
        cache_dir = os.path.dirname(file_path)
        try:
            os.remove(file_path)
        except OSError:
            failed_files.append(file_name)

    if failed_files:
        log_printer.warn("There was a problem deleting the following "
                         "files: " + ", ".join(failed_files) +
                         ". Please delete "
                         "them manually from '" + cache_dir + "'")
        return False

    return True


def pickle_load(log_printer, filename, fallback=None):
    """
    Get the data stored in ``filename`` present in the user
    config directory. Example usage:

    >>> from pyprint.NullPrinter import NullPrinter
    >>> from coalib.output.printers.LogPrinter import LogPrinter
    >>> log_printer = LogPrinter(NullPrinter())
    >>> test_data = {"answer": 42}
    >>> pickle_dump(log_printer, "test_file", test_data)
    >>> pickle_load(log_printer, "test_file")
    {'answer': 42}
    >>> pickle_load(log_printer, "nonexistant_file")
    >>> pickle_load(log_printer, "nonexistant_file", fallback=42)
    42

    :param log_printer: A LogPrinter object to use for logging.
    :param filename:    The name of the file present in the user config
                        directory.
    :param fallback:    Return value to fallback to in case the file doesn't
                        exist.
    :return:            Data that is present in the file, if the file exists.
                        Otherwise the ``fallback`` value is returned.
    """
    file_path = get_cache_data_path(log_printer, filename)
    if not os.path.isfile(file_path):
        return fallback
    with open(file_path, "rb") as f:
        try:
            return pickle.load(f)
        except (pickle.UnpicklingError, EOFError):
            log_printer.warn("The caching database is corrupted and will "
                             "be removed. Each project will be re-cached "
                             "automatically in the next run time.")
            # Pass the cache file's plain name: delete_cache_files expands
            # it to a full path itself. Passing the already expanded path
            # (as before) only worked because os.path.join discards earlier
            # components when given an absolute path.
            delete_cache_files(log_printer, files=[filename])
            return fallback


def pickle_dump(log_printer, filename, data):
    """
    Write ``data`` into the file ``filename`` present in the user
    config directory.

    :param log_printer: A LogPrinter object to use for logging.
    :param filename:    The name of the file present in the user config
                        directory.
    :param data:        Data to be serialized and written to the file using
                        pickle.
    """
    file_path = get_cache_data_path(log_printer, filename)
    with open(file_path, "wb") as f:
        pickle.dump(data, f)


def time_consistent(log_printer, project_hash):
    """
    Verify if the system time is consistent with the last time coala was
    run. That is, verify that the last run time is in the past. Otherwise,
    the system time was changed and we need to flush the cache and rebuild.

    :param log_printer:  A LogPrinter object to use for logging.
    :param project_hash: A MD5 hash of the project directory to be used
                         as the key.
    :return:             Returns True if the time is consistent and as
                         expected; False otherwise.
    """
    recorded_times = pickle_load(log_printer, "time_db", {})
    try:
        last_run_time = recorded_times[project_hash]
    except KeyError:
        # This is the first time coala is run on this project, so the cache
        # will be new automatically.
        return True
    return last_run_time <= calendar.timegm(time.gmtime())


def update_time_db(log_printer, project_hash, current_time=None):
    """
    Update the last run time on the project.

    :param log_printer:  A LogPrinter object to use for logging.
    :param project_hash: A MD5 hash of the project directory to be used
                         as the key.
    :param current_time: Current time in epoch format. Not giving this
                         argument would imply using the current system time.
    """
    if current_time is None:
        # ``if not current_time`` would also discard a legitimate epoch
        # timestamp of 0, so test explicitly for None.
        current_time = calendar.timegm(time.gmtime())
    time_db = pickle_load(log_printer, "time_db", {})
    time_db[project_hash] = current_time
    pickle_dump(log_printer, "time_db", time_db)


def get_settings_hash(sections):
    """
    Compute and return a unique hash for the settings.

    :param sections: A dict containing the settings for each section.
    :return:         A MD5 hash that is unique to the settings used.
    """
    settings = [str(sections[section]) for section in sections]
    return hashlib.md5(str(settings).encode("utf-8")).hexdigest()


def settings_changed(log_printer, settings_hash):
    """
    Determine if the settings have changed since the last run with caching.

    :param log_printer:   A LogPrinter object to use for logging.
    :param settings_hash: A MD5 hash that is unique to the settings used.
    :return:              Return True if the settings hash has changed
                          Return False otherwise.
    """
    project_hash = hashlib.md5(os.getcwd().encode("utf-8")).hexdigest()

    stored_hashes = pickle_load(log_printer, "settings_hash_db", {})
    if project_hash not in stored_hashes:
        # This is the first time coala is run on this project, so the cache
        # will be flushed automatically.
        return False

    changed = stored_hashes[project_hash] != settings_hash
    if changed:
        log_printer.warn("Since the configuration settings have been "
                         "changed since the last run, the "
                         "cache will be flushed and rebuilt.")

    return changed


def update_settings_db(log_printer, settings_hash):
    """
    Update the config file last modification date.

    :param log_printer:   A LogPrinter object to use for logging.
    :param settings_hash: A MD5 hash that is unique to the settings used.
    """
    project_hash = hashlib.md5(os.getcwd().encode("utf-8")).hexdigest()
    stored_hashes = pickle_load(log_printer, "settings_hash_db", {})
    stored_hashes[project_hash] = settings_hash
    pickle_dump(log_printer, "settings_hash_db", stored_hashes)
Loading

0 comments on commit ee667e1

Please sign in to comment.