# CACHE

In [12]:
import random
import requests
import pandas as pd

# Download the list of station IDs (one ID per line of the response body).
# A timeout is passed because requests otherwise waits indefinitely on a stalled server.
r = requests.get("https://pages.cs.wisc.edu/~harter/cs544/data/wi-stations/stations.txt", timeout=30)
r.raise_for_status() # raises requests.HTTPError for 4xx/5xx responses

stations = r.text.strip().split("\n")
# Keep 10 random stations; the weights list below is written for exactly 10 entries.
stations = random.sample(stations, k=10)
# Skewed workload of 100 requests: the first two sampled stations get 30% and 20%
# of the probability mass, the remaining eight share the other 50% equally.
# NOTE(review): no random seed is set in this cell, so the sample and the workload
# change on every run unless an earlier cell seeds `random` — confirm.
workload = random.choices(stations, k=100, weights=[0.3, 0.2] + [0.5/8]*8)

In [13]:
# peek at the first 10 requests of the generated workload
workload[:10]

['USW00094855',
 'USC00477725',
 'USW00094855',
 'US1WIVR0004',
 'US1WIBY0032',
 'US1WIBY0032',
 'US1WIVR0004',
 'US1WIBY0032',
 'USC00476357',
 'US1WIMW0067']

In [14]:
# Load the gzipped CSV of one station to inspect the data;
# column names are supplied explicitly through `names=`.
station = 'US1WIBY0032'
df = pd.read_csv(
    f"https://pages.cs.wisc.edu/~harter/cs544/data/wi-stations/{station}.csv.gz",
    names=["station", "date", "element", "value", "m", "q", "s", "obs"],
)


In [15]:
# preview the first 3 rows of the station DataFrame loaded above
df.head(3)

Unnamed: 0,station,date,element,value,m,q,s,obs
0,US1WIBY0032,20210331,PRCP,0,T,,N,700.0
1,US1WIBY0032,20210401,PRCP,0,,,N,730.0
2,US1WIBY0032,20210402,PRCP,0,,,N,800.0


In [20]:
import time

## FIFO

In [25]:
# FIFO cache state (module-level, shared with get_station and the stats cells)

cache_size = 2  # maximum number of stations kept in the cache
cache = dict()  # station name -> DataFrame for that station
evict_order = list()  # insertion order: index 0 is the next victim, the end is the freshest
# TODO: use a faster datastructure for evict_order that is not O(N) for pop(0)


# per-request stats, one entry appended per get_station call
hits = list()  # True for a cache hit, False for a miss
ms_latencies = list()  # request latency in milliseconds


def get_station(station):
    """Return the DataFrame for `station`, served through the FIFO cache.

    On a hit the cached DataFrame is returned as-is; FIFO does not refresh the
    entry's position in `evict_order`. On a miss the station's CSV is downloaded,
    stored in `cache`, and, if the cache now exceeds `cache_size`, the entry that
    was inserted earliest is evicted.

    Side effects: appends one entry to `hits` (True = hit, False = miss) and one
    to `ms_latencies` (elapsed milliseconds) for every successful call.

    Raises: any exception from `pd.read_csv` propagates; in that case neither
    stat is recorded, so `hits` and `ms_latencies` always have the same length.
    """
    # perf_counter is monotonic and high-resolution, unlike the wall clock time.time()
    start = time.perf_counter()
    hit = station in cache
    if hit:
        df = cache[station]
    else:
        df = pd.read_csv(f"https://pages.cs.wisc.edu/~harter/cs544/data/wi-stations/{station}.csv.gz",
                         names=["station", "date", "element", "value", "m", "q", "s", "obs"])
        cache[station] = df
        evict_order.append(station)
        if len(cache) > cache_size:
            # EVICT the oldest inserted station (front of evict_order)
            victim = evict_order.pop(0)
            cache.pop(victim)
    ms = (time.perf_counter() - start) * 1000
    # record both stats together, only after the request succeeded
    hits.append(hit)
    ms_latencies.append(ms)
    return df


# Replay every request of the workload through the cache; get_station appends
# to hits and ms_latencies as it serves each request.
for station in workload:
    df = get_station(station)
    # print(station, evict_order)

  df = pd.read_csv(f"https://pages.cs.wisc.edu/~harter/cs544/data/wi-stations/{station}.csv.gz",
  df = pd.read_csv(f"https://pages.cs.wisc.edu/~harter/cs544/data/wi-stations/{station}.csv.gz",
  df = pd.read_csv(f"https://pages.cs.wisc.edu/~harter/cs544/data/wi-stations/{station}.csv.gz",
  df = pd.read_csv(f"https://pages.cs.wisc.edu/~harter/cs544/data/wi-stations/{station}.csv.gz",
  df = pd.read_csv(f"https://pages.cs.wisc.edu/~harter/cs544/data/wi-stations/{station}.csv.gz",
  df = pd.read_csv(f"https://pages.cs.wisc.edu/~harter/cs544/data/wi-stations/{station}.csv.gz",
  df = pd.read_csv(f"https://pages.cs.wisc.edu/~harter/cs544/data/wi-stations/{station}.csv.gz",
  df = pd.read_csv(f"https://pages.cs.wisc.edu/~harter/cs544/data/wi-stations/{station}.csv.gz",
  df = pd.read_csv(f"https://pages.cs.wisc.edu/~harter/cs544/data/wi-stations/{station}.csv.gz",
  df = pd.read_csv(f"https://pages.cs.wisc.edu/~harter/cs544/data/wi-stations/{station}.csv.gz",
  df = pd.read_csv(f"https://p

In [26]:
# hit rate: fraction of requests served from the cache (each True in hits counts as 1)
sum(hits) / len(hits)

0.32

In [27]:
# average latency per request, in milliseconds
sum(ms_latencies) / len(ms_latencies)

160.2871799468994

## LRU

In [28]:
# LRU cache state (module-level, shared with get_station and the stats cells)
# NOTE(review): the FIFO section uses cache_size = 2, so the hit rates and latencies
# of the two sections are not directly comparable — confirm this is intended.
cache_size = 5  # maximum number of stations kept in the cache
cache = dict()  # station name -> DataFrame for that station
evict_order = list()  # recency order: index 0 is the next victim, the end is the freshest
# TODO: use a faster datastructure for evict_order that is not O(N) for pop(0)


# per-request stats, one entry appended per get_station call
hits = list()  # True for a cache hit, False for a miss
ms_latencies = list()  # request latency in milliseconds


def get_station(station):
    """Return the DataFrame for `station`, served through the LRU cache.

    On a hit the cached DataFrame is returned and the station is moved to the
    end of `evict_order`, marking it as most recently used. On a miss the
    station's CSV is downloaded, stored in `cache`, and, if the cache now exceeds
    `cache_size`, the least recently used station (front of `evict_order`) is
    evicted.

    Side effects: appends one entry to `hits` (True = hit, False = miss) and one
    to `ms_latencies` (elapsed milliseconds) for every successful call.

    Raises: any exception from `pd.read_csv` propagates; in that case neither
    stat is recorded, so `hits` and `ms_latencies` always have the same length.
    """
    # perf_counter is monotonic and high-resolution, unlike the wall clock time.time()
    start = time.perf_counter()
    hit = station in cache
    if hit:
        df = cache[station]

        # refresh recency: move the station to the freshest end of the list
        evict_order.remove(station)
        evict_order.append(station)
    else:
        df = pd.read_csv(f"https://pages.cs.wisc.edu/~harter/cs544/data/wi-stations/{station}.csv.gz",
                         names=["station", "date", "element", "value", "m", "q", "s", "obs"])
        cache[station] = df
        evict_order.append(station)
        if len(cache) > cache_size:
            # EVICT the least recently used station (front of evict_order)
            victim = evict_order.pop(0)
            cache.pop(victim)
    ms = (time.perf_counter() - start) * 1000
    # record both stats together, only after the request succeeded
    hits.append(hit)
    ms_latencies.append(ms)
    return df


# Replay every request of the workload through the cache; get_station appends
# to hits and ms_latencies as it serves each request.
for station in workload:
    df = get_station(station)
    # print(station, evict_order)

  df = pd.read_csv(f"https://pages.cs.wisc.edu/~harter/cs544/data/wi-stations/{station}.csv.gz",
  df = pd.read_csv(f"https://pages.cs.wisc.edu/~harter/cs544/data/wi-stations/{station}.csv.gz",
  df = pd.read_csv(f"https://pages.cs.wisc.edu/~harter/cs544/data/wi-stations/{station}.csv.gz",
  df = pd.read_csv(f"https://pages.cs.wisc.edu/~harter/cs544/data/wi-stations/{station}.csv.gz",
  df = pd.read_csv(f"https://pages.cs.wisc.edu/~harter/cs544/data/wi-stations/{station}.csv.gz",
  df = pd.read_csv(f"https://pages.cs.wisc.edu/~harter/cs544/data/wi-stations/{station}.csv.gz",
  df = pd.read_csv(f"https://pages.cs.wisc.edu/~harter/cs544/data/wi-stations/{station}.csv.gz",
  df = pd.read_csv(f"https://pages.cs.wisc.edu/~harter/cs544/data/wi-stations/{station}.csv.gz",
  df = pd.read_csv(f"https://pages.cs.wisc.edu/~harter/cs544/data/wi-stations/{station}.csv.gz",
  df = pd.read_csv(f"https://pages.cs.wisc.edu/~harter/cs544/data/wi-stations/{station}.csv.gz",
  df = pd.read_csv(f"https://p

In [29]:
# hit rate: fraction of requests served from the cache (each True in hits counts as 1)
sum(hits) / len(hits)

0.59

In [30]:
# average latency per request, in milliseconds
sum(ms_latencies) / len(ms_latencies)

75.25295734405518