In [10]:
import random
import requests
import pandas as pd

r = requests.get("https://pages.cs.wisc.edu/~harter/cs544/data/wi-stations/stations.txt")
r.raise_for_status()
stations = r.text.strip().split("\n")
stations = random.sample(stations, k=10)
workload = random.choices(stations, k=100, weights=[0.3, 0.2] + [0.5/8]*8)

In [11]:
workload[:10]

['US1WIKN0028',
 'US1WIKN0028',
 'US1WILN0004',
 'USC00479218',
 'US1WIKN0028',
 'US1WIKN0028',
 'US1WIWN0009',
 'US1WIWN0009',
 'USC00479218',
 'USC00479218']

In [12]:
station = 'USC00478267'
df = pd.read_csv(f"https://pages.cs.wisc.edu/~harter/cs544/data/wi-stations/{station}.csv.gz",
                         names=["station", "date", "element", "value", "m", "q", "s", "obs"])
df.head(3)

  df = pd.read_csv(f"https://pages.cs.wisc.edu/~harter/cs544/data/wi-stations/{station}.csv.gz",


Unnamed: 0,station,date,element,value,m,q,s,obs
0,USC00478267,19050301,PRCP,0,,,0,
1,USC00478267,19050302,PRCP,0,,,0,
2,USC00478267,19050303,PRCP,0,T,,0,


In [23]:
import time

In [29]:
# FIFO

cache_size = 3
cache = {}   # key=station name, value=DataFrame for that station
evict_order = []     # start of list contains items to be evicted (end of list is freshest)
# TODO: use a faster data struct for evict_order than is not O(N) for pop(0)

# stats
hits = [] # True(hit), False(miss)
ms_latencies = []

def get_station(station):
    start = time.time()
    if station in cache:
        print("hit", end=", ")
        hits.append(True)
        df = cache[station]
    else:
        print("miss", end=", ")
        hits.append(False)
        df = pd.read_csv(f"https://pages.cs.wisc.edu/~harter/cs544/data/wi-stations/{station}.csv.gz",
                             names=["station", "date", "element", "value", "m", "q", "s", "obs"], low_memory=False)

        cache[station] = df
        evict_order.append(station)
        
        # should we evict?
        if len(cache) > cache_size:
            victim = evict_order.pop(0)  # pop from the front
            cache.pop(victim)

    end = time.time()
    ms = (end-start) * 1000
    ms_latencies.append(ms)

    return df

for station in workload:
    df = get_station(station)
    #print(station, evict_order)

miss, hit, miss, miss, hit, hit, miss, hit, hit, hit, miss, miss, miss, hit, hit, miss, hit, hit, miss, hit, miss, miss, miss, hit, miss, hit, hit, miss, miss, miss, miss, hit, hit, miss, miss, miss, miss, hit, hit, miss, miss, hit, miss, hit, miss, hit, miss, hit, miss, miss, miss, miss, miss, miss, miss, miss, miss, hit, miss, miss, miss, hit, miss, hit, hit, miss, hit, miss, miss, hit, miss, miss, miss, miss, miss, miss, hit, miss, hit, miss, hit, hit, hit, hit, hit, miss, hit, miss, miss, miss, miss, miss, hit, hit, hit, hit, miss, hit, hit, hit, 

In [30]:
sum(hits) / len(hits)

0.43

In [31]:
sum(ms_latencies) / len(ms_latencies)

65.28526067733765

In [32]:
# LRU (exactly the same, EXCEPT for the "hit" case)

cache_size = 3
cache = {}   # key=station name, value=DataFrame for that station
evict_order = []     # start of list contains items to be evicted (end of list is freshest)
# TODO: use a faster data struct for evict_order than is not O(N) for pop(0)

# stats
hits = [] # True(hit), False(miss)
ms_latencies = []

def get_station(station):
    start = time.time()
    if station in cache:
        print("hit", end=", ")
        hits.append(True)
        df = cache[station]

        evict_order.remove(station)
        evict_order.append(station)
    else:
        print("miss", end=", ")
        hits.append(False)
        df = pd.read_csv(f"https://pages.cs.wisc.edu/~harter/cs544/data/wi-stations/{station}.csv.gz",
                             names=["station", "date", "element", "value", "m", "q", "s", "obs"], low_memory=False)

        cache[station] = df
        evict_order.append(station)
        
        # should we evict?
        if len(cache) > cache_size:
            victim = evict_order.pop(0)  # pop from the front
            cache.pop(victim)

    end = time.time()
    ms = (end-start) * 1000
    ms_latencies.append(ms)

    return df

for station in workload:
    df = get_station(station)
    #print(station, evict_order)

miss, hit, miss, miss, hit, hit, miss, hit, hit, hit, miss, miss, hit, hit, hit, miss, hit, miss, miss, hit, hit, hit, miss, hit, miss, hit, hit, miss, miss, miss, miss, hit, hit, miss, miss, miss, miss, hit, hit, miss, miss, hit, miss, hit, miss, miss, miss, miss, miss, miss, miss, miss, miss, miss, miss, miss, miss, hit, miss, miss, hit, hit, miss, hit, hit, miss, hit, miss, miss, hit, hit, miss, hit, miss, miss, miss, hit, miss, hit, miss, hit, hit, hit, hit, hit, miss, hit, hit, miss, hit, miss, miss, hit, hit, hit, hit, miss, hit, miss, hit, 

In [33]:
sum(hits) / len(hits)

0.47

In [34]:
sum(ms_latencies) / len(ms_latencies)

60.01779317855835