-
Notifications
You must be signed in to change notification settings - Fork 1.3k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
This commit introduces coalib.misc.Caching, which contains a FileCache class providing a caching mechanism that determines which files are new or changed, so that coala runs only on those files and overall run time improves.
- Loading branch information
Showing
8 changed files
with
562 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,162 @@ | ||
import calendar | ||
import time | ||
import hashlib | ||
import os | ||
|
||
from coalib.misc.CachingUtilities import ( | ||
pickle_load, pickle_dump, time_consistent, update_time_db, | ||
delete_cache_files) | ||
|
||
|
||
class FileCache:
    """
    A persistent per-project cache that records, for each tracked file,
    the time of the last coala run, so callers can collect only the new
    and changed files.

    ``data`` maps each tracked file path to the last run time in epoch
    seconds (UTC), or ``-1`` for files that are tracked but have never
    completed a run yet.

    Example/Tutorial:

    >>> from pyprint.NullPrinter import NullPrinter
    >>> from coalib.output.printers.LogPrinter import LogPrinter
    >>> import copy, time
    >>> log_printer = LogPrinter(NullPrinter())

    To initialize the cache create an instance for the project:

    >>> cache = FileCache(log_printer, "test", flush_cache=True)

    Now we can track new files by running:

    >>> cache.track_new_files(["a.c", "b.c"])

    Since all cache operations are lazy (for performance), we need to
    explicitly write the cache to disk for persistence in future uses:
    (Note: The cache will automatically figure out the write location)

    >>> cache.write()

    Let's go into the future:

    >>> time.sleep(1)

    Let's create a new instance to simulate a separate run:

    >>> cache = FileCache(log_printer, "test", flush_cache=False)
    >>> old_data = copy.deepcopy(cache.data)

    We can mark a file as changed by doing:

    >>> cache.add_to_changed_files({"a.c"})

    Again write to disk after calculating the new cache times for each file:

    >>> cache.write()
    >>> new_data = cache.data

    Since we marked 'a.c' as a changed file:

    >>> old_data["a.c"] == new_data["a.c"]
    True

    Since 'b.c' was untouched after the second run, its time was updated
    to the latest value:

    >>> old_data["b.c"] < new_data["b.c"]
    True
    """

    def __init__(self, log_printer, project_dir, flush_cache=False):
        """
        Initialize FileCache.

        :param log_printer: A LogPrinter object to use for logging.
        :param project_dir: The root directory of the project to be used
                            as a key identifier.
        :param flush_cache: Flush the cache and rebuild it.
        """
        self.log_printer = log_printer
        self.project_dir = project_dir
        # The md5 of the project directory is the cache-file name, so each
        # project gets its own independent cache on disk.
        self.md5sum = hashlib.md5(self.project_dir.encode("utf-8")).hexdigest()
        # UTC epoch seconds; compared against the value stored in time_db.
        self.current_time = calendar.timegm(time.gmtime())
        # If the recorded last run time is in the future, the system clock
        # moved backwards; cached timestamps can no longer be trusted.
        if not flush_cache and not time_consistent(log_printer, self.md5sum):
            log_printer.warn("It seems like you went back in time - your "
                             "system time is behind the last recorded run "
                             "time on this project. The cache will "
                             "be flushed and rebuilt.")
            flush_cache = True
        if not flush_cache:
            self.data = pickle_load(log_printer, self.md5sum, {})
        else:
            self.data = {}
            delete_cache_files(log_printer, [self.md5sum])
            log_printer.info("The file cache was successfully flushed.")
        self.changed_files = set()

    def __enter__(self):
        # Context-manager support; pairs with __exit__, which persists the
        # cache via write().
        return self

    def write(self):
        """
        Update the last run time on the project for each file
        to the current time.

        Files previously reported via ``add_to_changed_files`` keep their
        old timestamp so they are picked up again on the next run.
        """
        for file_name in self.data:
            if file_name not in self.changed_files:
                self.data[file_name] = self.current_time
        pickle_dump(self.log_printer, self.md5sum, self.data)
        update_time_db(self.log_printer, self.md5sum, self.current_time)
        self.changed_files = set()

    def __exit__(self, type, value, traceback):
        """
        Update the last run time on the project for each file
        to the current time.
        """
        # Note: exceptions are not suppressed (implicitly returns None).
        self.write()

    def add_to_changed_files(self, changed_files):
        """
        Keep track of changed files in ``changed_files`` for future use in
        ``write``.

        :param changed_files: A set of files that had changed since the last
                              run time.
        """
        self.changed_files.update(changed_files)

    def track_new_files(self, new_files):
        """
        Start tracking new files given in ``new_files`` by adding them to the
        database.

        :param new_files: The list of new files that need to be tracked.
                          These files are initialized with their last
                          modified tag as -1.
        """
        for new_file in new_files:
            self.data[new_file] = -1

    def get_changed_files(self, files):
        """
        Extract the list of files that had changed (or are new) with respect to
        the cache data available.

        :param files: The list of collected files.
        :return:      The list of files that had changed since the last cache.
        """
        changed_files = []

        if self.data == {}:
            # The first run on this project. So all files are new
            # and must be returned irrespective of whether caching is turned on.
            new_files = files
        else:
            new_files = []
            for file_path in files:
                # A stored time > -1 means the file completed at least one
                # run; compare its mtime against that recorded run time.
                if file_path in self.data and self.data[file_path] > -1:
                    # NOTE(review): strict '>' on whole seconds means a
                    # modification within the same second as the last run
                    # is not detected - confirm this is acceptable.
                    if int(os.path.getmtime(file_path)) > self.data[file_path]:
                        changed_files.append(file_path)
                else:
                    new_files.append(file_path)

        self.track_new_files(new_files)
        self.add_to_changed_files(changed_files)

        return changed_files + new_files
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,194 @@ | ||
import calendar | ||
import hashlib | ||
import os | ||
import pickle | ||
import time | ||
|
||
from coalib.output.Tagging import get_user_data_dir | ||
|
||
|
||
def get_cache_data_path(log_printer, filename):
    """
    Expand ``filename`` to its full path inside the user's caching
    directory.

    :param log_printer: A LogPrinter object to use for logging.
    :param filename:    The file whose path needs to be expanded.
    :return:            Full path of the file, assuming it's present in the
                        user's config directory.
    """
    cache_dir = get_user_data_dir(log_printer, action="caching")
    return os.path.join(cache_dir, filename)
|
||
|
||
def delete_cache_files(log_printer, files):
    """
    Remove the given cache files, warning about any that could not be
    deleted so the user can remove them manually.

    :param log_printer: A LogPrinter object to use for logging.
    :param files:       The list of files to be deleted.
    :return:            True if all the given files were successfully
                        deleted. False otherwise.
    """
    failed = []
    for name in files:
        full_path = get_cache_data_path(log_printer, name)
        cache_dir = os.path.dirname(full_path)
        try:
            os.remove(full_path)
        except OSError:
            failed.append(name)

    if failed:
        log_printer.warn("There was a problem deleting the following "
                         "files: " + ", ".join(failed) + ". Please delete "
                         "them manually from '" + cache_dir + "'")
        return False

    return True
|
||
|
||
def pickle_load(log_printer, filename, fallback=None):
    """
    Get the data stored in ``filename`` present in the user
    config directory. Example usage:

    >>> from pyprint.NullPrinter import NullPrinter
    >>> from coalib.output.printers.LogPrinter import LogPrinter
    >>> log_printer = LogPrinter(NullPrinter())
    >>> test_data = {"answer": 42}
    >>> pickle_dump(log_printer, "test_file", test_data)
    >>> pickle_load(log_printer, "test_file")
    {'answer': 42}
    >>> pickle_load(log_printer, "nonexistant_file")
    >>> pickle_load(log_printer, "nonexistant_file", fallback=42)
    42

    :param log_printer: A LogPrinter object to use for logging.
    :param filename:    The name of the file present in the user config
                        directory.
    :param fallback:    Return value to fall back to in case the file
                        doesn't exist.
    :return:            Data that is present in the file, if the file
                        exists. Otherwise the ``fallback`` value is
                        returned.
    """
    filename = get_cache_data_path(log_printer, filename)
    if not os.path.isfile(filename):
        return fallback
    with open(filename, "rb") as f:
        try:
            return pickle.load(f)
        except (pickle.UnpicklingError, EOFError):
            # A truncated or corrupted pickle cannot be trusted; drop the
            # cache file so it gets rebuilt cleanly on the next run.
            log_printer.warn("The caching database is corrupted and will "
                             "be removed. Each project will be re-cached "
                             "automatically in the next run time.")
            # NOTE(review): ``filename`` is already the full path here, and
            # delete_cache_files joins it onto the cache dir again; this only
            # works because os.path.join keeps an absolute second component -
            # confirm whether the bare name should be passed instead.
            delete_cache_files(log_printer, files=[filename])
            return fallback
|
||
|
||
def pickle_dump(log_printer, filename, data):
    """
    Serialize ``data`` with pickle into the file ``filename`` inside the
    user config directory.

    :param log_printer: A LogPrinter object to use for logging.
    :param filename:    The name of the file present in the user config
                        directory.
    :param data:        Data to be serialized and written to the file using
                        pickle.
    """
    file_path = get_cache_data_path(log_printer, filename)
    with open(file_path, "wb") as handle:
        pickle.dump(data, handle)
|
||
|
||
def time_consistent(log_printer, project_hash):
    """
    Verify if time is consistent with the last time coala was run. That is,
    verify that the last recorded run time lies in the past; otherwise the
    system time was changed and the cache needs to be flushed and rebuilt.

    :param log_printer:  A LogPrinter object to use for logging.
    :param project_hash: A MD5 hash of the project directory to be used
                         as the key.
    :return:             Returns True if the time is consistent and as
                         expected; False otherwise.
    """
    recorded_times = pickle_load(log_printer, "time_db", {})
    try:
        last_run = recorded_times[project_hash]
    except KeyError:
        # First coala run on this project - nothing recorded yet, so the
        # cache will be new automatically.
        return True
    return last_run <= calendar.timegm(time.gmtime())
|
||
|
||
def update_time_db(log_printer, project_hash, current_time=None):
    """
    Update the last run time on the project.

    :param log_printer:  A LogPrinter object to use for logging.
    :param project_hash: A MD5 hash of the project directory to be used
                         as the key.
    :param current_time: Current time in epoch format. Not giving this
                         argument would imply using the current system time.
    """
    # Use an identity check against None: ``not current_time`` would also
    # match the valid epoch timestamp 0 and silently replace it with the
    # current system time.
    if current_time is None:
        current_time = calendar.timegm(time.gmtime())
    time_db = pickle_load(log_printer, "time_db", {})
    time_db[project_hash] = current_time
    pickle_dump(log_printer, "time_db", time_db)
|
||
|
||
def get_settings_hash(sections):
    """
    Compute and return a unique hash for the settings.

    :param sections: A dict containing the settings for each section.
    :return:         A MD5 hash that is unique to the settings used.
    """
    # The string form of each section's settings, in dict iteration order,
    # is what gets fingerprinted.
    section_reprs = [str(sections[name]) for name in sections]
    return hashlib.md5(str(section_reprs).encode("utf-8")).hexdigest()
|
||
|
||
def settings_changed(log_printer, settings_hash):
    """
    Determine if the settings have changed since the last run with caching.

    :param log_printer:   A LogPrinter object to use for logging.
    :param settings_hash: A MD5 hash that is unique to the settings used.
    :return:              True if the settings hash has changed,
                          False otherwise.
    """
    # The current working directory identifies the project in the database.
    project_hash = hashlib.md5(os.getcwd().encode("utf-8")).hexdigest()

    settings_hash_db = pickle_load(log_printer, "settings_hash_db", {})
    if project_hash not in settings_hash_db:
        # This is the first time coala is run on this project, so the cache
        # will be flushed automatically.
        return False

    changed = settings_hash_db[project_hash] != settings_hash
    if changed:
        log_printer.warn("Since the configuration settings have been "
                         "changed since the last run, the "
                         "cache will be flushed and rebuilt.")

    return changed
|
||
|
||
def update_settings_db(log_printer, settings_hash):
    """
    Record ``settings_hash`` as the latest settings state for the current
    project.

    :param log_printer:   A LogPrinter object to use for logging.
    :param settings_hash: A MD5 hash that is unique to the settings used.
    """
    # The current working directory identifies the project in the database.
    project_key = hashlib.md5(os.getcwd().encode("utf-8")).hexdigest()

    hash_db = pickle_load(log_printer, "settings_hash_db", {})
    hash_db[project_key] = settings_hash
    pickle_dump(log_printer, "settings_hash_db", hash_db)
Oops, something went wrong.