In [1]:
import random
import requests
import pandas as pd

# Download the list of station IDs (one ID per line).
# The timeout keeps this cell from hanging forever if the server stops responding.
r = requests.get("https://pages.cs.wisc.edu/~harter/cs544/data/wi-stations/stations.txt", timeout=30)
r.raise_for_status()

# splitlines() also copes with "\r\n" line endings, which split("\n") would
# leave behind as a stray "\r" at the end of every station ID.
stations = r.text.strip().splitlines()

# Fixed seed so that Restart & Run All reproduces the same stations and workload.
random.seed(544)

# Keep 10 random stations, then build a skewed workload of 100 requests:
# the first sampled station gets 30% of the requests, the second 20%, and the
# remaining eight share the other 50% equally.
stations = random.sample(stations, k=10)
workload = random.choices(stations, k=100, weights=[0.3, 0.2] + [0.5/8]*8)

In [4]:
# Preview the first five requests of the generated workload.
workload[:5]

['US1WIWS0008', 'US1WIMC0007', 'US1WICB0011', 'US1WICB0011', 'US1WICB0011']

In [5]:
# Try the download for a single station before building the cache around it.
# Passing names= supplies the column labels, so pandas reads the file's first
# line as data rather than as a header.
station = "US1WIWS0008"
df = pd.read_csv(
    f"https://pages.cs.wisc.edu/~harter/cs544/data/wi-stations/{station}.csv.gz",
    names=["station", "date", "element", "value", "m", "q", "s", "obs"],
)

In [8]:
# Show the first three rows of the station DataFrame loaded above.
df.head(3)

Unnamed: 0,station,date,element,value,m,q,s,obs
0,US1WIWS0008,20081117,PRCP,191,,,N,1710
1,US1WIWS0008,20081117,SNOW,20,,,N,1710
2,US1WIWS0008,20081117,SNWD,20,,,N,1710


In [27]:
# FIFO policy
import time
from collections import deque

cache_size = 3
cache = {}    # key=station name, value=DataFrame for that station

# Stations in the order they entered the cache: oldest on the left, newest on
# the right.  A deque evicts from the left in O(1); list.pop(0) is O(N).
evict_order = deque()

# stats
hits = []   # True(hit)/False(miss)
ms_latencies = []

def get_station(station):
    """Return the DataFrame for `station`, going through a FIFO cache.

    On a hit the cached DataFrame is returned and the eviction order is left
    untouched.  On a miss the station's CSV is downloaded and cached; if that
    pushes the cache past `cache_size` entries, the station that was inserted
    earliest is evicted.

    Side effects: prints "hit, " or "miss, ", appends True/False to `hits`
    and appends the call's latency in milliseconds to `ms_latencies`.

    Raises whatever `pd.read_csv` raises if the download or parse fails; in
    that case the cache and the stats lists are left unchanged.
    """
    # perf_counter() is monotonic and high-resolution; time.time() is the
    # wall clock and can jump if the system clock is adjusted mid-measurement.
    start = time.perf_counter()
    hit = station in cache
    print("hit" if hit else "miss", end=", ")
    if hit:
        df = cache[station]
    else:
        df = pd.read_csv(f"https://pages.cs.wisc.edu/~harter/cs544/data/wi-stations/{station}.csv.gz",
                         names=["station", "date", "element", "value", "m", "q", "s", "obs"])
        cache[station] = df
        evict_order.append(station)

        # evict?
        if len(cache) > cache_size:
            victim_station = evict_order.popleft() # what has been in the queue the longest?
            del cache[victim_station]
    hits.append(hit)
    ms_latencies.append((time.perf_counter() - start) * 1000)

    return df

miss, hit, 

In [28]:
# Replay the whole workload through the FIFO cache; get_station records a
# hit/miss flag and a latency for every request.
# NOTE(review): the hit rate printed for this run (0.41176... = 42/102) is
# consistent with `hits` already holding two entries from earlier ad-hoc calls
# when this loop started -- re-run the cell that defines get_station right
# before this one so the cache and the stats start out empty.
for station in workload:
    df = get_station(station)
    #print(station, evict_order)

hit, miss, miss, hit, hit, hit, hit, miss, miss, hit, hit, hit, miss, miss, miss, hit, miss, miss, hit, miss, hit, hit, hit, miss, miss, miss, miss, hit, miss, hit, miss, miss, miss, miss, miss, hit, hit, miss, hit, miss, miss, hit, miss, miss, miss, miss, miss, hit, miss, miss, hit, miss, miss, miss, hit, hit, hit, hit, hit, miss, miss, miss, miss, miss, miss, miss, hit, miss, miss, miss, miss, miss, hit, miss, miss, miss, hit, miss, hit, hit, hit, hit, miss, hit, miss, hit, hit, miss, miss, hit, hit, hit, miss, miss, miss, miss, hit, hit, miss, miss, 

In [29]:
# Share of requests answered from the cache.  Every entry of `hits` is
# True (hit) or False (miss), so counting the True entries counts the hits.
print(hits.count(True) / len(hits))

# Mean per-request latency, in milliseconds.
print(sum(ms_latencies) / len(ms_latencies), "ms")

0.4117647058823529
92.27583688848159 ms


In [32]:
# LRU policy (same as FIFO, except when we have a hit)
import time
from collections import OrderedDict

cache_size = 3
cache = {}    # key=station name, value=DataFrame for that station

# Cached stations ordered from least recently used (front) to most recently
# used (back).  The OrderedDict is used as an ordered set (values are unused):
# move_to_end() and popitem(last=False) are O(1), whereas list.remove() and
# list.pop(0) are O(N).
evict_order = OrderedDict()

# stats
hits = []   # True(hit)/False(miss)
ms_latencies = []

def get_station_lru(station):
    """Return the DataFrame for `station`, going through an LRU cache.

    On a hit the cached DataFrame is returned and the station is marked as the
    most recently used.  On a miss the station's CSV is downloaded and cached;
    if that pushes the cache past `cache_size` entries, the least recently
    used station is evicted.

    Side effects: prints "hit, " or "miss, ", appends True/False to `hits`
    and appends the call's latency in milliseconds to `ms_latencies`.

    Raises whatever `pd.read_csv` raises if the download or parse fails; in
    that case the cache and the stats lists are left unchanged.
    """
    # perf_counter() is monotonic and high-resolution; time.time() is the
    # wall clock and can jump if the system clock is adjusted mid-measurement.
    start = time.perf_counter()
    hit = station in cache
    print("hit" if hit else "miss", end=", ")
    if hit:
        df = cache[station]
        # The only difference from FIFO: a hit refreshes the station's position.
        evict_order.move_to_end(station)
    else:
        df = pd.read_csv(f"https://pages.cs.wisc.edu/~harter/cs544/data/wi-stations/{station}.csv.gz",
                         names=["station", "date", "element", "value", "m", "q", "s", "obs"])
        cache[station] = df
        evict_order[station] = None

        # evict?
        if len(cache) > cache_size:
            victim_station, _ = evict_order.popitem(last=False) # least recently used
            del cache[victim_station]
    hits.append(hit)
    ms_latencies.append((time.perf_counter() - start) * 1000)

    return df

In [33]:
# Replay the same `workload` list through the LRU cache; get_station_lru
# records a hit/miss flag and a latency for every request.
for station in workload:
    df = get_station_lru(station)

miss, miss, miss, hit, hit, hit, hit, miss, miss, hit, hit, hit, miss, miss, hit, hit, miss, miss, hit, miss, hit, hit, hit, miss, miss, miss, miss, hit, miss, hit, miss, miss, miss, miss, miss, hit, hit, miss, hit, miss, miss, hit, miss, miss, miss, miss, miss, hit, miss, miss, hit, miss, miss, hit, hit, hit, hit, hit, hit, miss, hit, miss, miss, miss, miss, miss, hit, miss, miss, miss, miss, miss, hit, miss, hit, miss, hit, miss, hit, hit, hit, hit, miss, hit, miss, hit, miss, miss, miss, hit, hit, hit, miss, miss, miss, miss, hit, hit, miss, miss, 

In [34]:
# Share of requests answered from the cache.  Every entry of `hits` is
# True (hit) or False (miss), so counting the True entries counts the hits.
print(hits.count(True) / len(hits))

# Mean per-request latency, in milliseconds.
print(sum(ms_latencies) / len(ms_latencies), "ms")

0.43
90.22230863571167 ms
