In [17]:
import random
import requests
import pandas as pd
import time

# --- Workload configuration (tune here) --------------------------------------
STATIONS_URL = "https://pages.cs.wisc.edu/~harter/cs544/data/wi-stations/stations.txt"
N_STATIONS = 10          # number of distinct stations that may be requested
N_REQUESTS = 100         # length of the simulated request sequence
HOT_WEIGHTS = [0.3, 0.2] # request probability of the two most popular stations
SEED = 544               # fixed seed => same workload on every Restart & Run All

# Download the list of station IDs (one per line). The timeout keeps the cell
# from hanging indefinitely if the server does not answer.
r = requests.get(STATIONS_URL, timeout=30)
r.raise_for_status()
# splitlines() copes with "\r\n" line endings; blank lines are dropped.
all_stations = [line.strip() for line in r.text.splitlines() if line.strip()]
assert len(all_stations) >= N_STATIONS, "station list is smaller than N_STATIONS"

# A dedicated generator makes the sample reproducible without touching the
# global `random` state.
rng = random.Random(SEED)
stations = rng.sample(all_stations, k=N_STATIONS)

# Skewed popularity: the first two sampled stations receive HOT_WEIGHTS, the
# remaining probability mass is split evenly over the other stations.
n_cold = N_STATIONS - len(HOT_WEIGHTS)
weights = HOT_WEIGHTS + [(1 - sum(HOT_WEIGHTS)) / n_cold] * n_cold
workload = rng.choices(stations, k=N_REQUESTS, weights=weights)

In [5]:
# Peek at the first ten requests of the generated workload (repeats are expected
# because stations are drawn with replacement).
workload[:10]

['USC00479190',
 'USC00473250',
 'US1WIWS0006',
 'US1WIWS0006',
 'USC00479190',
 'US1WIWS0006',
 'USC00478540',
 'US1WIRC0041',
 'US1WIWS0032',
 'USC00473250']

Implementing a FIFO cache:

In [26]:
cache = {}  # key=station, value=DataFrame for that station
CACHE_SIZE = 5
evict_order = []   # FIFO cache (first is oldest)

# stats: one entry per successfully served request, so both lists stay aligned
hits = [] # hit=True, miss=False
ms_latencies = []

def get_station(station):
    """Return the DataFrame for `station`, using a FIFO cache of CACHE_SIZE entries.

    On a hit the cached DataFrame is returned and the eviction order is left
    untouched (FIFO ignores recency). On a miss the station's .csv.gz file is
    downloaded with pandas; if the cache is full, the station that was inserted
    first is evicted before the new one is stored.

    Side effects: appends the hit/miss flag to `hits` and the elapsed time in
    milliseconds to `ms_latencies`.

    Raises: whatever `pd.read_csv` raises when the download or parse fails. In
    that case nothing is evicted and no statistics are recorded.
    """
    # perf_counter() is monotonic and high resolution, unlike time.time(),
    # which can jump when the system clock is adjusted.
    start = time.perf_counter()
    hit = station in cache
    if hit:
        df = cache[station]
    else:
        # Download BEFORE evicting: a failed download must not cost us a
        # cached station or leave `hits` and `ms_latencies` out of sync.
        df = pd.read_csv(f"https://pages.cs.wisc.edu/~harter/cs544/data/wi-stations/{station}.csv.gz",
                         names=["station", "date", "element", "value", "m", "q", "s", "obs"], low_memory=False)

        # should we evict?
        if len(cache) >= CACHE_SIZE:
            victim = evict_order.pop(0)
            cache.pop(victim)

        cache[station] = df
        evict_order.append(station)
    end = time.perf_counter()
    hits.append(hit)
    ms_latencies.append((end - start) * 1000)
    return df

# Replay every request of the workload through the FIFO cache; each call
# records one hit/miss flag in `hits` and one latency in `ms_latencies`.
for station in workload:
    df = get_station(station)

# Summary: fraction of hits (True counts as 1) and mean latency per request.
print(
    f"FIFO cache hit rate = {sum(hits) / len(hits)}",
    f"FIFO average latency = {sum(ms_latencies) / len(ms_latencies)} ms",
    sep="\n",
)

FIFO cache hit rate = 0.53
FIFO average latency = 99.55347061157227 ms


LRU cache (very similar to FIFO, except that a hit moves the station to the most-recently-used end of the eviction order):

In [27]:
cache = {}  # key=station, value=DataFrame for that station
CACHE_SIZE = 5
evict_order = []   # LRU cache (first is least recently used)

# stats: one entry per successfully served request, so both lists stay aligned
hits = [] # hit=True, miss=False
ms_latencies = []

def get_station(station):
    """Return the DataFrame for `station`, using an LRU cache of CACHE_SIZE entries.

    On a hit the cached DataFrame is returned and the station is moved to the
    end of `evict_order`, marking it as most recently used. On a miss the
    station's .csv.gz file is downloaded with pandas; if the cache is full, the
    least recently used station (front of `evict_order`) is evicted before the
    new one is stored.

    Side effects: appends the hit/miss flag to `hits` and the elapsed time in
    milliseconds to `ms_latencies`.

    Raises: whatever `pd.read_csv` raises when the download or parse fails. In
    that case nothing is evicted and no statistics are recorded.
    """
    # perf_counter() is monotonic and high resolution, unlike time.time(),
    # which can jump when the system clock is adjusted.
    start = time.perf_counter()
    hit = station in cache
    if hit:
        df = cache[station]
        # Refresh recency: this is the only difference from the FIFO policy.
        evict_order.remove(station)
        evict_order.append(station)
    else:
        # Download BEFORE evicting: a failed download must not cost us a
        # cached station or leave `hits` and `ms_latencies` out of sync.
        df = pd.read_csv(f"https://pages.cs.wisc.edu/~harter/cs544/data/wi-stations/{station}.csv.gz",
                         names=["station", "date", "element", "value", "m", "q", "s", "obs"], low_memory=False)

        # should we evict?
        if len(cache) >= CACHE_SIZE:
            victim = evict_order.pop(0)
            cache.pop(victim)

        cache[station] = df
        evict_order.append(station)
    end = time.perf_counter()
    hits.append(hit)
    ms_latencies.append((end - start) * 1000)
    return df

# Replay every request of the workload through the LRU cache; each call
# records one hit/miss flag in `hits` and one latency in `ms_latencies`.
for station in workload:
    df = get_station(station)

# Summary: fraction of hits (True counts as 1) and mean latency per request.
print(
    f"LRU cache hit rate = {sum(hits) / len(hits)}",
    f"LRU average latency = {sum(ms_latencies) / len(ms_latencies)} ms",
    sep="\n",
)

LRU cache hit rate = 0.61
LRU average latency = 71.95860624313354 ms
