In [2]:
import pandas as pd
import requests, random

# Download the list of station IDs (the code treats the response as one ID per line).
# The timeout keeps this cell from hanging forever if the server never answers.
r = requests.get("https://pages.cs.wisc.edu/~harter/cs639/data/wi-stations/stations.txt", timeout=30)
r.raise_for_status()
# splitlines() copes with "\r\n" line endings; blank lines are dropped.
stations = [line.strip() for line in r.text.splitlines() if line.strip()]

# A dedicated, seeded generator makes the sampled stations and the workload
# identical on every Restart & Run All, without touching the global `random` state.
rng = random.Random(639)
stations = rng.sample(stations, k=10)
# Skewed access pattern: the first two sampled stations receive 30% and 20% of
# the requests; the remaining eight share the other 50% equally.
workload = rng.choices(stations, k=100, weights=[0.3, 0.2] + [0.5/8]*8)

In [4]:
# Peek at the first ten requests of the generated workload.
workload[0:10]

['US1WIGT0002',
 'US1WIWN0009',
 'USC00471720',
 'US1WIWN0009',
 'US1WIGT0002',
 'US1WIGT0002',
 'USC00471720',
 'US1WIWK0047',
 'US1WIGT0002',
 'US1WIWK0047']

In [5]:
# Load a single station's observations to inspect the file layout.
# Column names are supplied explicitly via `names`, so the first line of the
# file is treated as data rather than as a header.
df = pd.read_csv("https://pages.cs.wisc.edu/~harter/cs639/data/wi-stations/US1WIGT0002.csv.gz",
                 names=["station", "date", "element", "value", "m", "q", "s", "obs"])

In [7]:
# Show the first five rows of the sample station's data.
df.head(n=5)

Unnamed: 0,station,date,element,value,m,q,s,obs
0,US1WIGT0002,20151107,PRCP,0,,,N,700
1,US1WIGT0002,20151109,PRCP,0,,,N,700
2,US1WIGT0002,20151110,PRCP,0,,,N,700
3,US1WIGT0002,20151111,PRCP,0,,,N,700
4,US1WIGT0002,20151112,PRCP,180,,,N,700


In [20]:
import time

cache_size = 6     # maximum number of station DataFrames kept in the cache
cache = {}         # key is the station, value is the DF
evict_order = []   # stations, least recently used first; evict from the front
                   # (TODO: choose more efficient structures)

# stats: one entry per get_df call that returned successfully
hits = []         # 1 is a hit, 0 is a miss
latencies = []    # milliseconds

def get_df(station):
    """Return the DataFrame for `station`, going through a small LRU cache.

    On a hit the cached DataFrame is returned and the station becomes the most
    recently used entry. On a miss the station's CSV is downloaded with
    pd.read_csv, stored in `cache`, and least recently used entries are evicted
    until the cache holds at most `cache_size` items.

    Side effects: appends 1 (hit) or 0 (miss) to `hits` and the elapsed time in
    milliseconds to `latencies`. Both are recorded only once the lookup has
    succeeded, so the two lists always have the same length.

    Raises: whatever pd.read_csv raises when the download or parse fails; in
    that case neither the cache nor the stats are modified.
    """
    # perf_counter is a monotonic, high-resolution clock meant for measuring
    # intervals; wall-clock time.time() can jump if the system clock is adjusted.
    start = time.perf_counter()

    hit = station in cache
    if hit:
        # LRU updates the evict order whenever something is used (even the hit case)
        evict_order.remove(station)
        evict_order.append(station)
        df = cache[station]
    else:
        df = pd.read_csv(f"https://pages.cs.wisc.edu/~harter/cs639/data/wi-stations/{station}.csv.gz",
                         names=["station", "date", "element", "value", "m", "q", "s", "obs"])
        cache[station] = df
        evict_order.append(station)
        # Evict from the front (least recently used). Looping, rather than
        # evicting once, also shrinks the cache if cache_size was lowered
        # after entries had already been stored.
        while evict_order and len(cache) > cache_size:
            victim = evict_order.pop(0)
            cache.pop(victim)

    latencies.append((time.perf_counter() - start) * 1000)
    hits.append(1 if hit else 0)
    return df

# Replay every request in the workload through the cache; get_df records the
# hit/miss flag and the latency of each call as a side effect.
for station in workload:
    df = get_df(station)

# Summarise the run: fraction of requests served from the cache and the mean
# per-request latency in milliseconds.
hit_rate = sum(hits) / len(hits)
avg_latency = sum(latencies) / len(latencies)
print("Hit Rate:", hit_rate)
print("Avg Latency:", avg_latency)

Hit Rate: 0.7
Avg Latency: 37.075936794281006


In [23]:
# tail latency: the latency at a high percentile (e.g., the 90th or 99th, i.e. quantile 0.9 or 0.99)
import numpy as np
# Latency (ms) that 90% of the recorded requests did not exceed.
tail_latency = np.quantile(latencies, q=0.9)
print("90th percentile latency (tail latency):", tail_latency)

90th percentile latency (tail latency): 128.31897735595703


In [11]:
# Request one station through the cache; the returned DataFrame is the cell's output.
get_df("US1WIKN0002")

hit


Unnamed: 0,station,date,element,value,m,q,s,obs
0,US1WIKN0002,20070913,PRCP,0,,,N,800
1,US1WIKN0002,20070914,PRCP,0,,,N,800
2,US1WIKN0002,20070915,PRCP,0,,,N,800
3,US1WIKN0002,20070916,PRCP,0,,,N,815
4,US1WIKN0002,20070917,PRCP,0,,,N,800
...,...,...,...,...,...,...,...,...
485,US1WIKN0002,20080502,PRCP,0,,,N,800
486,US1WIKN0002,20080503,PRCP,99,,,N,846
487,US1WIKN0002,20080501,SNOW,0,,,N,930
488,US1WIKN0002,20080502,SNOW,0,,,N,800
