In [187]:
import random
import csv
from typing import Optional
import cachetools
from math import inf
from functools import partial

# Fix the RNG so the simulated request sequence is reproducible.
random.seed(42)

# Both budgets are 20 MiB, expressed in bytes.
RAM_SIZE = 20 * 2**20   # max_size handed to the in-memory cache strategies
DISK_SIZE = 20 * 2**20  # capacity used when choosing the pre-selected page sets

In [188]:
# Map article name (spaces replaced by underscores) -> view count, read from pageviews.csv.
page_views = {}
# newline="" is what the csv module asks for, so quoted fields containing
# line breaks are parsed by the reader instead of being pre-translated.
with open("pageviews.csv", "r", newline="") as f:
    reader = csv.DictReader(f)
    for row in reader:
        page_views[row["article"].replace(" ", "_")] = int(row["views"])
# ".xxx" is a "not found" entry that causes conflicts, so drop it.
del page_views[".xxx"]

In [189]:
# Map article name -> size in bytes, read from page_size.csv.
page_sizes = {}
# newline="" is what the csv module asks for, so quoted fields containing
# line breaks are parsed by the reader instead of being pre-translated.
with open("page_size.csv", "r", newline="") as f:
    reader = csv.DictReader(f)
    for row in reader:
        page_sizes[row["Article"]] = int(row["Size"])  # Note: Size is uncompressed
print(f"Total = {sum(page_sizes.values())} bytes")

Total = 468900428 bytes


In [190]:
# Greedy fill of the disk budget, visiting pages from most to least viewed.
POPULAR_FILES = set()
total = 0
by_views_desc = sorted(page_views, key=page_views.get, reverse=True)
for file in by_views_desc:
    size = page_sizes[file]
    footprint = size // 10  # each page is charged a tenth of its recorded size
    if total + footprint > DISK_SIZE:
        continue  # does not fit; keep scanning, a later page may still fit
    total += footprint
    POPULAR_FILES.add(file)
print(len(POPULAR_FILES))

434


In [191]:
# Greedy fill of the disk budget, visiting pages from smallest to largest.
SMALLEST_FILES = set()
total = 0
by_size_asc = sorted(page_sizes.items(), key=lambda entry: entry[1])
for file, size in by_size_asc:
    footprint = size // 10  # each page is charged a tenth of its recorded size
    if total + footprint > DISK_SIZE:
        continue  # does not fit in the remaining budget
    total += footprint
    SMALLEST_FILES.add(file)
print(len(SMALLEST_FILES))

714


In [192]:
# Greedy fill of the disk budget, visiting pages from largest to smallest.
LARGEST_FILES = set()
total = 0
by_size_desc = sorted(page_sizes.items(), key=lambda entry: entry[1], reverse=True)
for file, size in by_size_desc:
    footprint = size // 10  # each page is charged a tenth of its recorded size
    if total + footprint > DISK_SIZE:
        continue  # does not fit; a smaller page later in the order may still fit
    total += footprint
    LARGEST_FILES.add(file)
print(len(LARGEST_FILES))

203


In [193]:
from ortools.algorithms import pywrapknapsack_solver
# Choose pages with a knapsack solver: value = views, weight = size // 10,
# capacity = DISK_SIZE. Item i corresponds to the i-th name of sorted(page_views).
knapsack_pages = sorted(page_views)
values = [page_views[page] for page in knapsack_pages]
weights = [[page_sizes[page] // 10 for page in knapsack_pages]]  # one weight dimension
capacity = [DISK_SIZE]

solver_kind = pywrapknapsack_solver.KnapsackSolver.KNAPSACK_MULTIDIMENSION_BRANCH_AND_BOUND_SOLVER
solver = pywrapknapsack_solver.KnapsackSolver(solver_kind, "PageKnapsack")

solver.Init(values, weights, capacity)
ans = solver.Solve()


In [194]:
# Collect the pages the solver kept; positions follow sorted(page_views),
# the same ordering used to build the solver inputs.
KNAPSACK_FILES = set(
    page
    for position, page in enumerate(sorted(page_views))
    if solver.BestSolutionContains(position)
)
len(KNAPSACK_FILES)

679

In [195]:
# Page set consulted first by CacheStrategy.get: any key found here is counted as a hit
# without touching the per-strategy cache. Currently the knapsack selection.
ZIP_FILE = KNAPSACK_FILES


class CacheStrategy:
    """Base class for a cache simulated in front of the ``ZIP_FILE`` page set.

    Every ``get`` is counted in ``total_requests``; requests answered by
    ``ZIP_FILE`` or by ``self.cache`` are also counted in ``total_hits``.
    Subclasses implement ``set`` to decide what happens to a missed key.
    """

    def __init__(self, max_size=0, page_sizes=page_sizes) -> None:
        """Create an empty cache with zeroed statistics.

        Args:
            max_size: Size budget stored on the instance for subclasses to enforce.
            page_sizes: Mapping from key to the size passed to ``set`` on a miss.
                The default is the module-level mapping as it exists when this
                class is defined.
        """
        self.max_size = max_size
        self.cache = {}
        self.size = 0
        self.total_hits = 0
        self.total_requests = 0
        self.page_sizes = page_sizes

    def get(self, key) -> Optional[int]:
        """Look up ``key``, update the hit statistics, and offer it to ``set`` on a miss.

        Returns:
            ``True`` when ``key`` is in ``ZIP_FILE``; otherwise the value stored in
            ``self.cache`` on a cache hit; otherwise ``None`` after calling ``set``.

        Raises:
            KeyError: on a miss, if ``self.page_sizes`` is a dict without ``key``.
        """
        self.total_requests += 1

        if key in ZIP_FILE:
            self.total_hits += 1
            return True

        if key in self.cache:
            self.total_hits += 1
            # Read through the container (not just a membership test) so a
            # policy-aware cache object gets to observe the access.
            return self.cache.get(key)

        self.set(key, self.page_sizes[key])
        return None

    def set(self, key, size) -> None:
        """Admit ``key`` with the given ``size``; must be provided by subclasses."""
        raise NotImplementedError

In [196]:
class NoStrategy(CacheStrategy):
    """Baseline strategy that never stores anything in the cache."""

    def set(self, key, value) -> None:
        """Discard the missed key; nothing is ever admitted."""
        return None


class CacheToolStrategy(CacheStrategy):
    """Size-bounded cache whose eviction order comes from a ``cachetools`` cache class.

    The wrapped container maps each key to its size. The limit enforced here is
    the running total ``self.size`` compared against ``self.max_size``.
    """

    def __init__(self, max_size=inf, cachetools_strategy=None) -> None:
        """Build the strategy around a cache container.

        Args:
            max_size: Largest allowed sum of stored sizes.
            cachetools_strategy: Callable accepting ``maxsize=`` and returning the
                container to use, e.g. ``cachetools.LRUCache``. Required.

        Raises:
            TypeError: if ``cachetools_strategy`` is not supplied.
        """
        if cachetools_strategy is None:
            # Fail with a readable message instead of "'NoneType' object is not callable".
            raise TypeError("cachetools_strategy is required, e.g. cachetools.LRUCache")
        super().__init__(max_size=max_size)
        self.cache = cachetools_strategy(maxsize=max_size)

    def set(self, key, size) -> None:
        """Store ``key`` with ``size``, evicting other entries until it fits.

        An entry larger than ``max_size`` is not stored and evicts nothing.
        Re-storing a key that is already present replaces its old size.
        """
        if key in self.cache:
            # Release the old footprint first so self.size does not drift upwards.
            self.size -= self.cache.pop(key)
        if size > self.max_size:
            # Can never fit; do not empty the cache on its behalf.
            return
        while self.size + size > self.max_size and len(self.cache):
            # popitem() leaves the choice of victim to the wrapped container.
            self.size -= self.cache.popitem()[1]
        if self.size + size <= self.max_size:
            self.cache[key] = size
            self.size += size


# TODO: Prefetching Cache

In [197]:
# Every cachetools-backed strategy shares the same RAM budget.
with_max_size = partial(CacheToolStrategy, max_size=RAM_SIZE)

# Display name -> strategy instance; insertion order is the reporting order.
strategies = {"No cache": NoStrategy(max_size=RAM_SIZE)}
strategies["LRU"] = with_max_size(cachetools_strategy=cachetools.LRUCache)
strategies["LFU"] = with_max_size(cachetools_strategy=cachetools.LFUCache)
# Disabled candidates:
# strategies["FIFO"] = with_max_size(cachetools_strategy=cachetools.FIFOCache)
# strategies["MRU"] = with_max_size(cachetools_strategy=cachetools.MRUCache)
# strategies["RR"] = with_max_size(cachetools_strategy=cachetools.RRCache)

In [198]:
# Replay a random request stream, weighted by page views, through every strategy.
# The strategy objects keep their counters and contents between runs, so
# re-create `strategies` before re-running this cell.
SEQUENCE_SIZE = 1_000_000
articles = list(page_views)
views = [page_views[title] for title in articles]
sequence = random.choices(articles, weights=views, k=SEQUENCE_SIZE)

for article in sequence:
    # Each strategy sees the same requests in the same order.
    for strategy in strategies.values():
        strategy.get(article)

# Report hits / requests for each strategy.
for name, obj in strategies.items():
    print(f"{name} hit rate: {obj.total_hits / obj.total_requests}")

No cache hit rate: 0.842694
LRU hit rate: 0.855867
LFU hit rate: 0.855719
