From 3dcf75c1bb805e0c6cd580b2320f220d09f8a48a Mon Sep 17 00:00:00 2001
From: Batuhan Taskaya
Date: Sun, 24 May 2020 15:19:42 +0300
Subject: [PATCH] Clear inactive cache files

Cache files that weren't accessed in the last 30 days are now garbage
collected automatically. The collection runs when `save_module` is called
and is guarded by a lock file so that it happens at most once per day.
---
 parso/_compatibility.py | 31 ++++++++++++++++++++---
 parso/cache.py          | 56 ++++++++++++++++++++++++++++++++++++++++-
 parso/file_io.py        |  8 ++++++
 test/test_cache.py      | 55 +++++++++++++++++++++++++++++-----------
 4 files changed, 132 insertions(+), 18 deletions(-)

diff --git a/parso/_compatibility.py b/parso/_compatibility.py
index a7f8ca8e..43e1d86a 100644
--- a/parso/_compatibility.py
+++ b/parso/_compatibility.py
@@ -2,6 +2,7 @@
 To ensure compatibility from Python ``2.7`` - ``3.3``, a module has been
 created. Clearly there is huge need to use conforming syntax.
 """
+import os
 import sys
 import platform
@@ -44,11 +45,11 @@ def u(string):
 
 
 try:
-    # Python 2.7
+    # Python 3.3+
     FileNotFoundError = FileNotFoundError
 except NameError:
-    # Python 3.3+
-    FileNotFoundError = IOError
+    # Python 2.7 (EnvironmentError covers both IOError and OSError)
+    FileNotFoundError = EnvironmentError
 
 
 def utf8_repr(func):
@@ -67,3 +68,27 @@ def wrapper(self):
         return func
     else:
         return wrapper
+
+if sys.version_info < (3, 5):
+    """
+    A super-minimal shim around listdir that behaves like
+    scandir for the information we need.
+    """
+    class _DirEntry:
+
+        def __init__(self, name, basepath):
+            self.name = name
+            self.basepath = basepath
+
+        @property
+        def path(self):
+            return os.path.join(self.basepath, self.name)
+
+        def stat(self):
+            # os.lstat does not follow symlinks
+            return os.lstat(os.path.join(self.basepath, self.name))
+
+    def scandir(dir):
+        return [_DirEntry(name, dir) for name in os.listdir(dir)]
+else:
+    from os import scandir
diff --git a/parso/cache.py b/parso/cache.py
index 841d7665..04e9ce7f 100644
--- a/parso/cache.py
+++ b/parso/cache.py
@@ -13,7 +13,8 @@
 except:
     import pickle
 
-from parso._compatibility import FileNotFoundError
+from parso._compatibility import FileNotFoundError, scandir
+from parso.file_io import FileIO
 
 LOG = logging.getLogger(__name__)
 
@@ -21,6 +22,13 @@
 """
 Cached files should survive at least a few minutes.
 """
+
+_CACHED_FILE_MAXIMUM_SURVIVAL = 60 * 60 * 24 * 30
+"""
+Maximum time (in seconds) that a cached file may go without being
+accessed before it is garbage collected.
+"""
+
 _CACHED_SIZE_TRIGGER = 600
 """
 This setting limits the amount of cached files. It's basically a way to start
@@ -82,6 +90,20 @@ def _get_default_cache_path():
 ``$XDG_CACHE_HOME/parso`` is used instead of the default one.
 """
 
+_CACHE_CLEAR_THRESHOLD = 60 * 60 * 24
+
+def _get_cache_clear_lock(cache_path=None):
+    """
+    Returns the file IO object pointing to the cache-clearing lock file.
+
+    The lock prevents continuous cache clearing: garbage collection runs
+    at most once per _CACHE_CLEAR_THRESHOLD (one day by default).
+    """
+    cache_path = cache_path or _get_default_cache_path()
+
+    return FileIO(os.path.join(cache_path, "{}-cache-lock".format(_VERSION_TAG)))
+
+
 parser_cache = {}
 
@@ -173,6 +195,7 @@ def save_module(hashed_grammar, file_io, module, lines, pickling=True, cache_pat
     _set_cache_item(hashed_grammar, path, item)
     if pickling and path is not None:
         _save_to_file_system(hashed_grammar, path, item, cache_path=cache_path)
+        _remove_cache_and_update_lock(cache_path=cache_path)
 
 
 def _save_to_file_system(hashed_grammar, path, item, cache_path=None):
@@ -187,6 +210,37 @@ def clear_cache(cache_path=None):
     parser_cache.clear()
 
 
+def clear_inactive_cache(
+    cache_path=None,
+    inactivity_threshold=_CACHED_FILE_MAXIMUM_SURVIVAL,
+):
+    cache_path = cache_path or _get_default_cache_path()
+    if not os.path.exists(cache_path):
+        return False
+    for version_path in os.listdir(cache_path):
+        version_path = os.path.join(cache_path, version_path)
+        if not os.path.isdir(version_path):
+            continue
+        for file in scandir(version_path):
+            if (
+                file.stat().st_atime + inactivity_threshold
+                <= time.time()
+            ):
+                os.remove(file.path)
+
+    return True
+
+
+def _remove_cache_and_update_lock(cache_path=None):
+    lock = _get_cache_clear_lock(cache_path=cache_path)
+    clear_lock_time = lock.get_last_modified()
+    if (
+        clear_lock_time is None  # first time
+        or clear_lock_time + _CACHE_CLEAR_THRESHOLD <= time.time()
+    ):
+        if clear_inactive_cache(cache_path=cache_path):
+            lock._touch()
+
 
 def _get_hashed_path(hashed_grammar, path, cache_path=None):
     directory = _get_cache_directory_path(cache_path=cache_path)
diff --git a/parso/file_io.py b/parso/file_io.py
index 94fe08e6..2218dd70 100644
--- a/parso/file_io.py
+++ b/parso/file_io.py
@@ -1,4 +1,5 @@
 import os
+from parso._compatibility import FileNotFoundError
 
 
 class FileIO(object):
@@ -22,6 +23,13 @@ def get_last_modified(self):
         # Might raise FileNotFoundError, OSError for Python 2
         return None
 
+    def _touch(self):
+        try:
+            os.utime(self.path, None)
+        except FileNotFoundError:
+            with open(self.path, 'a'):
+                pass
+
     def __repr__(self):
         return '%s(%s)' % (self.__class__.__name__, self.path)
 
diff --git a/test/test_cache.py b/test/test_cache.py
index ebf1303e..94d3b3a5 100644
--- a/test/test_cache.py
+++ b/test/test_cache.py
@@ -2,13 +2,18 @@
 Test all things related to the ``jedi.cache`` module.
 """
 
-from os import unlink
+import os
+import os.path
 
 import pytest
 import time
 
-from parso.cache import _NodeCacheItem, save_module, load_module, \
-    _get_hashed_path, parser_cache, _load_from_file_system, _save_to_file_system
+from parso.cache import (_CACHED_FILE_MAXIMUM_SURVIVAL, _VERSION_TAG,
+                         _get_cache_clear_lock, _get_hashed_path,
+                         _load_from_file_system, _NodeCacheItem,
+                         _remove_cache_and_update_lock, _save_to_file_system,
+                         clear_inactive_cache, load_module, parser_cache,
+                         save_module)
 from parso import load_grammar
 from parso import cache
 from parso import file_io
@@ -16,15 +21,13 @@
 
 
 @pytest.fixture()
-def isolated_jedi_cache(monkeypatch, tmpdir):
-    """
-    Set `jedi.settings.cache_directory` to a temporary directory during test.
-
-    Same as `clean_jedi_cache`, but create the temporary directory for
-    each test case (scope='function').
-    """
-    monkeypatch.setattr(cache, '_default_cache_path', str(tmpdir))
-
+def isolated_parso_cache(monkeypatch, tmpdir):
+    """Set `parso.cache._default_cache_path` to a temporary directory
+    during the test."""
""" + cache_path = str(os.path.join(str(tmpdir), "__parso_cache")) + monkeypatch.setattr(cache, '_default_cache_path', cache_path) + monkeypatch.setattr(cache, '_get_default_cache_path', lambda *args, **kwargs: cache_path) + return cache_path def test_modulepickling_change_cache_dir(tmpdir): """ @@ -57,7 +60,7 @@ def load_stored_item(hashed_grammar, path, item, cache_path): return item -@pytest.mark.usefixtures("isolated_jedi_cache") +@pytest.mark.usefixtures("isolated_parso_cache") def test_modulepickling_simulate_deleted_cache(tmpdir): """ Tests loading from a cache file after it is deleted. @@ -84,7 +87,7 @@ def test_modulepickling_simulate_deleted_cache(tmpdir): save_module(grammar._hashed, io, module, lines=[]) assert load_module(grammar._hashed, io) == module - unlink(_get_hashed_path(grammar._hashed, path)) + os.unlink(_get_hashed_path(grammar._hashed, path)) parser_cache.clear() cached2 = load_module(grammar._hashed, io) @@ -139,3 +142,27 @@ def test_cache_last_used_update(diff_cache, use_file_io): node_cache_item = next(iter(parser_cache.values()))[p] assert now < node_cache_item.last_used < time.time() + +def test_inactive_cache(tmpdir, isolated_parso_cache): + parser_cache.clear() + test_subjects = "abcdef" + for path in test_subjects: + parse('somecode', cache=True, path=os.path.join(str(tmpdir), path)) + raw_cache_path = os.path.join(isolated_parso_cache, _VERSION_TAG) + assert os.path.exists(raw_cache_path) + paths = os.listdir(raw_cache_path) + a_while_ago = time.time() - _CACHED_FILE_MAXIMUM_SURVIVAL + old_paths = set() + for path in paths[:len(test_subjects) // 2]: # make certain number of paths old + os.utime(os.path.join(raw_cache_path, path), (a_while_ago, a_while_ago)) + old_paths.add(path) + # nothing should be cleared while the lock is on + assert os.path.exists(_get_cache_clear_lock().path) + _remove_cache_and_update_lock() # it shouldn't clear anything + assert len(os.listdir(raw_cache_path)) == len(test_subjects) + assert old_paths.issubset(os.listdir(raw_cache_path)) + + os.utime(_get_cache_clear_lock().path, (a_while_ago, a_while_ago)) + _remove_cache_and_update_lock() + assert len(os.listdir(raw_cache_path)) == len(test_subjects) // 2 + assert not old_paths.intersection(os.listdir(raw_cache_path))