Switch branches/tags
Nothing to show
Find file Copy path
Fetching contributors…
Cannot retrieve contributors at this time
410 lines (351 sloc) 15.6 KB
#!/usr/bin/env python
# -*- Mode: Python; tab-width: 8; py-indent-offset: 8; indent-tabs-mode: nil -*-
# polipo_trimcache version 0.2
# Written by J.P. Larocque, <>, OpenPGP 0xF61D2E61
# This software is in the public domain.
# Polipo is a small, caching HTTP proxy with nifty features like
# client and server IPv6 support, pipelining, and the caching of
# partial objects. See:
# Polipo doesn't have any built-in facility for keeping its cache
# directory under a given fixed size. This script implements that: it
# trims your cache directory to a target size by removing the files with
# oldest access times until the cache size is smaller than or equal to
# the target size.
# This script requires Python 2.2 or greater.
# Some notes about polipo_trimcache's operation:
# * Send Polipo SIGUSR1 prior to and SIGUSR2 after running this
# script, as if you were using Polipo's own cache-purging
# facility. This is explained in the manual:
# * If you run Polipo non-root (as you should), you should also run
# this script as the same user. It is conceivable that, given a
# carefully-crafted cache directory, polipo_trimcache could mangle
# with things outside the cache, either because of bugs in the
# script or in Python. Restricting this script to execution by
# the same or another isolated user will protect you from problems
# a cracker could cause if Polipo were cracked. (No current
# security issues with any of these pieces of software are known
# by the author at the time of writing.)
# * Hidden files and files other than regular files and directories
# are skipped. This means symlinks are not honored, with the
# exception of the root cache directory. This shouldn't matter
# for normal use.
# * On systems that report it, polipo_trimcache uses st_blocks
# from stat(). If st_blocks is 0 for a given file/directory, it
# uses st_size instead.
# * polipo_trimcache removes directories when it removes the last
# file from them (except for the root cache directory).
# Bugs: (both of which are caused by the last point above)
# * Empty directories aren't removed if they didn't contain any
# files to begin with.
# * Dry-run mode never pretends to remove directories, since it
# never removes files, enabling the condition for directories to
# be removed.
# The latest version of this program can be found at the URL:
# This program has been signed with my public key. Its fingerprint is:
# 5612 10A8 4986 2D85 A995 252B 4C02 5E02 F61D 2E61
# The detached signature can be found at the URL:
# Changelog:
# version 0.2 (2004/Aug/10)
# Cleaned up for initial release.
# version 0.1 (2004/Jun/27)
# Initially written.
help = \
"""Usage: polipo_trimcache [-fnpvVDh?] POLIPO_CACHE_DIR TARGET_SIZE
-f, --force
Force questionable behavior. Currently this means
continuing even when the TARGET_SIZE is less than 2KB.
-n, --no-op, --dry-run
Don't make any changes to the cache contents. Print
the files that would be deleted instead.
-p, --precise-expiry
Determines last-access time of cache files by opening
them and reading their X-Polipo-Access header. The
default behavior is to use the last-modification time
of the file, which should be faster. Analogous to the
Polipo configuration option "preciseExpiry".
-v, --verbose
Verbose operation. Prints every stage of the operation,
and reports hidden or special files.
-V, --version
Print version number, and exit.
-D, --debug
Debugging mode. Report each directory on cache
transversal, and each unlink/rmdir operation.
-h, -?, --help
Print usage information.
POLIPO_CACHE_DIR specifies the root of the Polipo cache. TARGET_SIZE
specifies the size polipo_trimcache trims down to, expressed in bytes.
(You may specify megabytes or gigabytes by appending "M" or "G" to the
size argument. The quantifiers [KMGTPEZY] are supported (future-
polipo_trimcache /var/local/cache/polipo 256M
Keeps said cache directory under 256 megabytes.
version = "0.2"
last_modified = "2004/August/10"
import os, os.path, sys, re, stat, time, getopt, math, rfc822
me = os.path.basename (sys.argv[0])
# These are defaults, not hard-coded settings.
force_mode = 0
no_op_mode = 0
precise_expiry_mode = 0
verbose_mode = 0
debug_mode = 0
def main (args):
global force_mode, no_op_mode, precise_expiry_mode, verbose_mode, debug_mode
options, extra = getopt.getopt (args, "fnpvVDh?", ("force", "no-op", "noop", "dry-run", "precise-expiry",
"verbose", "help", "version", "debug"))
except getopt.GetoptError, msg:
usage ()
return 1
for option, argument in options:
if option in ("-f", "--force"):
force_mode = 1
elif option in ("-n", "--no-op", "--noop", "--dry-run"):
no_op_mode = 1
elif option in ("-p", "--precise-expiry"):
precise_expiry_mode = 1
elif option in ("-v", "--verbose"):
verbose_mode = 1
elif option in ("-h", "-?", "--help"):
sys.stdout.write (help)
return 0
elif option in ("-V", "--version"):
print "%s version %s (%s)" % (me, version, last_modified)
return 0
elif option in ("-D", "--debug"):
debug_mode = 1
if len (extra) != 2:
usage ()
return 1
cache_dir = extra[0]
target_size = hr2num (extra[1])
verbose_msg ("Transversing cache...")
cobjs, cdirs = transverse_cache (cache_dir)
verbose_msg ("Sorting and counting...")
cobjs.sort (lambda o1, o2: cmp (o2.last_access, o1.last_access))
total_size = 0L
for cfile in cobjs + cdirs:
total_size += cfile.size
verbose_msg ("Cache is currently %sB (%d files, %d dirs)." % (num2hr (total_size), len (cobjs), len (cdirs)))
verbose_msg ("Trimming...")
files_removed = dirs_removed = 0
latest_trimmed = None
while cobjs and (total_size > target_size):
# Delete the oldest-used cache object, and possibly its parent directory.
victim = cobjs.pop ()
if no_op_mode:
print victim.path
debug_msg ("Removing file: %s" % victim.path)
os.unlink (victim.path)
total_size -= victim.size
files_removed += 1
latest_trimmed = victim.last_access
# Now delete parent and grandparent directories, if empty.
victim = victim.parent
while victim:
victim.children -= 1
# We add the check for files with listdir in case there's any files we skipped over (hidden, special).
if (victim.children == 0) and (not os.listdir (victim.path)):
if no_op_mode:
debug_msg ("Removing directory [NOOP]: %s" % victim.path)
debug_msg ("Removing directory: %s" % victim.path)
os.rmdir (victim.path)
total_size -= victim.size
cdirs.remove (victim)
dirs_removed += 1
# Parent directory is now the next candidate for removal.
victim = victim.parent
verbose_msg ("Cache trimmed (-%d files, -%d dirs) to %sB (%d files, %d dirs)." % \
(files_removed, dirs_removed, num2hr (total_size), len (cobjs), len (cdirs)))
if latest_trimmed is not None:
verbose_msg ("Latest file removed was %s old" % secs2thr (int (time.time ()) - latest_trimmed))
def warn (msg):
sys.stderr.write ("%s: %s\n" % (me, msg))
def usage ():
warn ("usage: %s [-fnpvVDh?] POLIPO_CACHE_DIR TARGET_SIZE" % me)
def verbose_msg (msg):
if verbose_mode: warn (msg)
def debug_msg (msg):
if debug_mode: warn (msg)
class CacheFile:
"""Superclass for any kind of (non-special) file found in the Polipo cache tree."""
class CacheDir (CacheFile):
"""Class for directories only."""
def __init__ (self, parent = None):
self.parent = parent
self.children = None
self.size = None
self.path = None
def __repr__ (self):
# Still waiting on that ternary operator. OH WAIT, those "decrease readability".
if self.children is None: children = "?"
else: children = str (self.children)
if self.size is None: size = "?"
else: size = str (self.size)
if self.path is None: path = "?"
else: path = "\"%s\"" % self.path
return "<CacheDir instance, path=%s, children=%s, size=%s, has_parent=%d>" % \
(path, children, size, self.parent != None)
class CacheObject (CacheFile):
"""Class for regular files (cached HTTP objects) only."""
def __init__ (self, parent = None):
self.parent = parent
self.last_access = None
self.size = None
self.path = None
def __repr__ (self):
if self.last_access is None: last_access = "?"
else: last_access = time.strftime ("[%Y/%m/%d %H:%M:%S]", time.localtime (self.last_access))
if self.size is None: size = "?"
else: size = str (self.size)
if self.path is None: path = "?"
else: path = "\"%s\"" % self.path
return "<CacheObject instance, path=%s, last_access=%s, size=%s, has_parent=%d>" % \
(path, last_access, size, self.parent != None)
def transverse_cache (dir, parent = None):
"""Recursively transverses a given directory. Returns a tuple of 0) a list of all found files as CacheObject instances, and 1) a list of all found directories as CacheDir instances (except for the given root directory). Note that the return values are flat lists and not some sort of tree structure. If parent is given, each file in the given directory will be assigned its value as the parent directory, and the children attribute of the parent will be assigned the number of children found in the given directory."""
debug_msg ("Entering directory: %s" % dir)
cobjs = []
cdirs = []
children = 0
for file in os.listdir (dir):
file_full = os.path.join (dir, file)
if file.startswith ("."):
verbose_msg ("Skipping hidden file: %s" % file_full)
file_stat = os.lstat (file_full)
if stat.S_ISDIR (file_stat.st_mode):
cfile = CacheDir (parent)
cdirs.append (cfile)
sub_cobjs, sub_cdirs = transverse_cache (file_full, cfile)
cobjs += sub_cobjs
cdirs += sub_cdirs
del (sub_cobjs, sub_cdirs)
elif stat.S_ISREG (file_stat.st_mode):
cfile = CacheObject (parent)
cobjs.append (cfile)
if precise_expiry_mode:
cfile.last_access = get_precise_access (file_full)
# If there is no precise access-time information, we use something "very old":
if cfile.last_access == None:
cfile.last_access = 0
cfile.last_access = file_stat.st_mtime
verbose_msg ("Skipping special file: %s" % file_full)
cfile.size = file_stat.st_blocks * 512
# Directories seem to have an st_blocks value of 0.
if cfile.size == 0:
cfile.size = file_stat.st_size
except AttributeError:
cfile.size = file_stat.st_size
cfile.path = file_full
children += 1
if parent:
parent.children = children
return cobjs, cdirs
def get_precise_access (file_name):
"""Open a Polipo cache file and return its last access time, determined from X-Polipo-Access."""
access = None
file = open (file_name)
# For HTTP header line:
file.readline ()
line_num = 1
while 1:
line_num += 1
line = file.readline ()
if line == "\r\n": break
header, value = line.split (": ", 1)
if header.lower () == "x-polipo-access":
access = int (rfc822.mktime_tz (rfc822.parsedate_tz (value)))
except ValueError:
whine ("Reading \"%s\":%d" % (file_name, line_num))
file.close ()
return access
def whine (msg):
"""Warns the given message and information on the current exception being handled."""
exc_class, exc_instance = sys.exc_info ()[:2]
exc_args = exc_instance.args
warn ("%s: %s: %s" % (msg, exc_class, exc_args))
hr2num_re = re.compile (r'^([0-9]+(?:\.[0-9]+)?) *([KMGTPEZY]?)$', re.I)
quants_dict = {
'K': 1024L ** 1,
'M': 1024L ** 2,
'G': 1024L ** 3,
'T': 1024L ** 4,
# It's uh, future-proof...
'P': 1024L ** 5,
'E': 1024L ** 6,
'Z': 1024L ** 7,
'Y': 1024L ** 8,
quants_tups = quants_dict.items ()
quants_tups.sort (lambda t1, t2: cmp (t1[1], t2[1]))
quants_tups_dec = quants_tups
quants_tups_dec.reverse ()
tquants_dict = {
'm': 60,
'h': 60 * 60,
'd': 60 * 60 * 24,
'w': 60 * 60 * 24 * 7,
tquants_tups = tquants_dict.items ()
tquants_tups.sort (lambda t1, t2: cmp (t1[1], t2[1]))
tquants_tups_dec = tquants_tups
tquants_tups_dec.reverse ()
def hr2num (hr):
"""Given a string with an integral or floating-pointing number and an optional quantifier, return a long of the expressed amount."""
match = hr2num_re.match (hr)
if match:
amount = float ( (1))
quant = (2)
if quant:
amount *= quants_dict[quant.upper ()]
return long (math.ceil (amount))
raise ValueError, "bad human-readable expression (\"NN.N[KMGTPEZY]\")"
def num2hr (num):
"""Given an integer, return a string with the same amount expressed with a quantifier."""
num = long (num)
for quant_abbr, quant_amount in quants_tups_dec:
if num >= quant_amount:
return "%.2f%s" % (float (num) / quant_amount, quant_abbr)
return str (num)
def secs2thr (seconds):
output = ""
for quant_abbr, quant_amount in tquants_tups_dec:
if seconds >= quant_amount:
q_amount = int (seconds) / quant_amount
seconds -= q_amount * quant_amount
output += "%d%s" % (q_amount, quant_abbr)
if seconds == math.floor (seconds):
output += "%ds" % seconds
output += "%.1fs" % seconds
return output
if __name__ == "__main__":
sys.exit (main (sys.argv[1:]))