In [1]:
# make sure we are working in module directory
# (`!git ...` captures the command's output as a list of lines, so
#  repo_root[0] is the repository's top-level path)
repo_root = !git rev-parse --show-toplevel
module_path = repo_root[0] + "/backend/heatflask"
%cd $module_path

# Make cells wider
# (injects a CSS rule for `.container` into the notebook page)
from IPython.display import display, HTML
display(HTML("<style>.container { width:95% !important; }</style>"))

/home/efrem/dev/heatflask/backend/heatflask


In [2]:
# %%writefile Streams.py
"""
***  For Jupyter notebook ***
Paste one of these Jupyter magic directives to the top of a cell
 and run it, to do these things:
    %%cython --annotate    # Compile and run the cell
    %load Streams.py         # Load Streams.py file into this (empty) cell
    %%writefile Streams.py   # Write the contents of this cell to Streams.py
"""

import os
import time
import datetime
from logging import getLogger
import msgpack
import polyline
import asyncio

import DataAPIs
from DataAPIs import db
import Strava
import StreamCodecs

# Module-level logger; propagate=True lets records reach ancestor loggers' handlers
log = getLogger(__name__)
log.propagate = True

# Collection name handed to DataAPIs.init_collection / stats / drop
COLLECTION_NAME = "streams"
# Prefix prepended to an activity id to build its Redis key (see cache_key)
CACHE_PREFIX = "S:"

SECS_IN_HOUR = 60 * 60
SECS_IN_DAY = 24 * SECS_IN_HOUR

# Expiry settings, overridable through environment variables.
# MONGO_STREAMS_TTL is read as a number of days (default 10) and
# REDIS_STREAMS_TTL as a number of hours (default 4); both end up in seconds.
MONGO_TTL = int(os.environ.get("MONGO_STREAMS_TTL", 10)) * SECS_IN_DAY
REDIS_TTL = int(os.environ.get("REDIS_STREAMS_TTL", 4)) * SECS_IN_HOUR


# Minimal holder for module-wide state: `collection` starts as None and is
# filled in by get_collection() the first time it is called.
class Box:
    collection = None


# Single shared instance used by get_collection()
myBox = Box()


async def get_collection():
    """
    Return the streams collection handle, initializing it on first use.

    The handle comes from `DataAPIs.init_collection` (called with
    COLLECTION_NAME, MONGO_TTL and CACHE_PREFIX) and is remembered on
    `myBox.collection` so later calls skip the initialization.
    """
    if myBox.collection is not None:
        return myBox.collection

    myBox.collection = await DataAPIs.init_collection(
        COLLECTION_NAME,
        ttl=MONGO_TTL,
        cache_prefix=CACHE_PREFIX,
    )
    return myBox.collection


# Precision argument passed to both polyline.encode and polyline.decode below;
# the two must use the same value for a round trip to reproduce the input.
POLYLINE_PRECISION = 6


def encode_streams(activity_id, rjson):
    """
    Pack one activity's streams into a single msgpack blob.

    `rjson` must provide `["time"]["data"]`, `["altitude"]["data"]` and
    `["latlng"]["data"]`. Time and altitude go through
    `StreamCodecs.rld_encode`; latlng is polyline-encoded with
    POLYLINE_PRECISION. The packed mapping uses the short keys
    "id", "t", "a" and "p" (see decode_streams for the inverse).
    """
    time_encoded = StreamCodecs.rld_encode(rjson["time"]["data"])
    altitude_encoded = StreamCodecs.rld_encode(rjson["altitude"]["data"])
    path_encoded = polyline.encode(rjson["latlng"]["data"], POLYLINE_PRECISION)

    payload = {
        "id": activity_id,
        "t": time_encoded,
        "a": altitude_encoded,
        "p": path_encoded,
    }
    return msgpack.packb(payload)


def decode_streams(msgpacked_streams):
    """
    Inverse of encode_streams: unpack a msgpack blob into a dict of streams.

    Returns a dict with keys "id", "time", "altitude" and "latlng". Time and
    altitude are decoded with `StreamCodecs.rld_decode` (dtype "u2" and "i2"
    respectively); latlng is polyline-decoded with POLYLINE_PRECISION.
    """
    packed = msgpack.unpackb(msgpacked_streams)

    activity_id = packed["id"]
    time_stream = StreamCodecs.rld_decode(packed["t"], dtype="u2")
    altitude_stream = StreamCodecs.rld_decode(packed["a"], dtype="i2")
    latlng_stream = polyline.decode(packed["p"], POLYLINE_PRECISION)

    return {
        "id": activity_id,
        "time": time_stream,
        "altitude": altitude_stream,
        "latlng": latlng_stream,
    }


def mongo_doc(activity_id, stream_data, ts=None):
    """
    Build the MongoDB document stored for one activity's packed streams.

    Arguments:
        activity_id: activity id; coerced to int and used as `_id`.
        stream_data: the packed streams, stored under "mpk".
        ts: timestamp to store under "ts". When omitted (or falsy) the
            current time is used.

    The default timestamp is a naive datetime expressed in UTC, the same
    convention as the `utcnow()` value aiter_query writes to this field when
    it refreshes a document. Using local time here would skew the timestamp
    by the host's UTC offset on any machine not running in UTC.
    """
    if not ts:
        ts = datetime.datetime.now(datetime.timezone.utc).replace(tzinfo=None)

    return {
        "_id": int(activity_id),
        "mpk": stream_data,
        "ts": ts,
    }


def cache_key(aid):
    """Return the Redis key for activity `aid`: CACHE_PREFIX followed by the id."""
    return "{}{}".format(CACHE_PREFIX, aid)


async def strava_import(activity_ids, **user):
    """
    Async generator that fetches streams from Strava and stores them.

    For every `(activity_id, streams)` pair produced by
    `Strava.AsyncClient.get_many_streams`, the streams are packed with
    encode_streams, queued for the Redis cache (expiring after REDIS_TTL),
    collected as a MongoDB document, and yielded as `(activity_id, packed)`.
    Once the fetch loop ends, the queued Redis commands are executed and the
    collected documents are inserted into the streams collection.

    Arguments:
        activity_ids: ids handed to `get_many_streams`.
        **user: the user record; `_id` and `auth` are read here.

    Abort protocol: if the consumer sends a truthy value into the generator
    (via `asend`), the underlying Strava iterator is aborted and the loop
    stops; whatever was fetched up to that point is still stored.
    """
    uid = int(user["_id"])

    strava = Strava.AsyncClient(uid, **user["auth"])
    await strava.update_access_token()
    coll = await get_collection()

    mongo_docs = []
    # Naive UTC timestamp, matching the `utcnow()` value aiter_query writes to
    # the same "ts" field; local time would be off by the host's UTC offset.
    now = datetime.datetime.now(datetime.timezone.utc).replace(tzinfo=None)
    aiterator = strava.get_many_streams(activity_ids)

    async with db.redis.pipeline(transaction=True) as pipe:
        async for aid, streams in aiterator:
            packed = encode_streams(aid, streams)

            # queue packed streams to be redis cached
            pipe = pipe.setex(cache_key(aid), REDIS_TTL, packed)

            mongo_docs.append(mongo_doc(aid, packed, ts=now))

            abort_signal = yield aid, packed

            if abort_signal:
                await Strava.AsyncClient.abort(aiterator)
                break

        await pipe.execute()

    # insert_many rejects an empty document list, which is what we have when
    # Strava produced no streams for any of the requested activities.
    if mongo_docs:
        await coll.insert_many(mongo_docs)


async def aiter_query(activity_ids=None, user=None):
    """
    Async generator yielding `(activity_id, packed_streams)` for `activity_ids`.

    Sources are consulted in this order:
      1. Redis: every hit has its expiry reset to REDIS_TTL.
      2. MongoDB, for Redis misses: hits are copied into Redis and their
         "ts" field is refreshed.
      3. Strava, for anything still missing, and only when `user` is given.

    The Strava import is started (as a task awaiting its first item) before
    the locally found items are yielded, so the first remote fetch overlaps
    with the consumer working through the local results.

    Arguments:
        activity_ids: ids to look up; nothing is yielded if empty or None.
        user: user record passed to strava_import as keyword arguments, or
            None to skip the Strava step.

    Abort protocol: sending a truthy value into the generator (via `asend`)
    stops the query. Any Strava import in progress is told to abort and no
    further items are yielded.
    """
    if not activity_ids:
        return

    # First we check Redis cache
    t0 = time.perf_counter()
    keys = [cache_key(aid) for aid in activity_ids]
    redis_response = await db.redis.mget(keys)

    # Reset TTL for those cached streams that were hit
    async with db.redis.pipeline(transaction=True) as pipe:
        for k, val in zip(keys, redis_response):
            if val:
                pipe = pipe.expire(k, REDIS_TTL)
        await pipe.execute()

    t1 = time.perf_counter()
    local_result = [(a, s) for a, s in zip(activity_ids, redis_response) if s]
    log.debug(
        "retrieved %d streams from Redis in %d", len(local_result), (t1 - t0) * 1000
    )
    # activity IDs of cache misses
    activity_ids = [a for a, s in zip(activity_ids, redis_response) if not s]
    if activity_ids:
        # Next we query MongoDB for any cache misses
        t0 = time.perf_counter()
        streams = await get_collection()
        query = {"_id": {"$in": activity_ids}}
        exclusions = {"ts": False}

        cursor = streams.find(query, projection=exclusions)
        mongo_result = [(doc["_id"], doc["mpk"]) async for doc in cursor]
        local_result.extend(mongo_result)
        mongo_result_ids = [_id for _id, mpk in mongo_result]

        # Cache the mongo hits
        async with db.redis.pipeline(transaction=True) as pipe:
            for aid, s in mongo_result:
                pipe = pipe.setex(cache_key(aid), REDIS_TTL, s)
            await pipe.execute()

        # Update TTL for mongo hits
        await streams.update_many(
            {"_id": {"$in": mongo_result_ids}},
            {"$set": {"ts": datetime.datetime.utcnow()}},
        )
        elapsed = (time.perf_counter() - t0) * 1000
        log.debug("retrieved %d streams from Mongo in %d", len(mongo_result), elapsed)

        activity_ids = list(set(activity_ids) - set(mongo_result_ids))

    streams_import = None
    first_fetch = None
    if activity_ids and (user is not None):
        # Start a fetch process going. We will get back to this...
        t0 = time.perf_counter()
        streams_import = strava_import(activity_ids, **user)
        first_fetch = asyncio.create_task(streams_import.__anext__())

    # Yield all the results from Redis and Mongo
    for item in local_result:
        abort_signal = yield item
        if abort_signal:
            log.info("Local Streams query aborted")
            if streams_import:
                await Strava.AsyncClient.abort(streams_import)
            # The consumer asked us to stop: end the generator here instead of
            # falling through and yielding remote results after the abort.
            return

    if streams_import:
        # Now we yield results of fetches as they come in
        imported_items = []
        abort_signal = None
        try:
            item1 = await first_fetch
        except StopAsyncIteration:
            # The import ended without producing a single item. Letting this
            # escape would be turned into a RuntimeError by the async
            # generator machinery, so treat it as "nothing imported".
            pass
        else:
            imported_items.append(item1)
            abort_signal = yield item1

            if not abort_signal:
                async for item in streams_import:
                    imported_items.append(item)
                    abort_signal = yield item
                    if abort_signal:
                        break

        if abort_signal:
            # abort() is awaited at every other call site; without the await
            # the abort would never actually run.
            await Strava.AsyncClient.abort(streams_import)
            log.info("Remote Streams query aborted")

        t1 = time.perf_counter()
        log.debug(
            "retrieved %d streams from Strava in %d",
            len(imported_items),
            (t1 - t0) * 1000,
        )
        imported_ids = set(aid for aid, mpk in imported_items)
        missing_ids = set(activity_ids) - imported_ids
        if missing_ids:
            log.info("unable to import streams for %s", missing_ids)


async def query(**kwargs):
    """Run aiter_query(**kwargs) to completion and return its items as a list."""
    collected = []
    async for entry in aiter_query(**kwargs):
        collected.append(entry)
    return collected


async def delete(activity_ids):
    """
    Remove the streams for `activity_ids` from both MongoDB and Redis.

    Does nothing when `activity_ids` is empty or None. Returns None.
    """
    if activity_ids:
        collection = await get_collection()
        await collection.delete_many({"_id": {"$in": activity_ids}})

        redis_keys = [cache_key(aid) for aid in activity_ids]
        await db.redis.delete(*redis_keys)

async def clear_cache():
    """
    Delete every Redis key that matches the streams cache pattern.

    Returns the result of the Redis delete call, or None when no key matched.
    """
    pattern = cache_key("*")
    matching_keys = await db.redis.keys(pattern)
    if not matching_keys:
        return None
    return await db.redis.delete(*matching_keys)
    
def stats():
    """
    Return `DataAPIs.stats` for the streams collection.

    The value is passed through untouched; the notebook awaits it, so callers
    should treat it as an awaitable.
    """
    return DataAPIs.stats(COLLECTION_NAME)


def drop():
    """
    Return `DataAPIs.drop` for the streams collection.

    The value is passed through untouched.
    NOTE(review): presumably an awaitable like stats() -- confirm against DataAPIs.
    """
    return DataAPIs.drop(COLLECTION_NAME)

In [3]:
# Show DEBUG-level log records from the modules under test
import logging
logging.basicConfig(level="DEBUG")

# Open the database connections used by the rest of the notebook
await DataAPIs.connect()

# Number of activities to pull from the index for this experiment
N_FETCH = 15

# Collect the ids of N_FETCH indexed activities; they drive the streams query below
import Index
result = await Index.query(limit=N_FETCH)
activity_ids = [d["_id"] for d in result["docs"]]
result

DEBUG:DataAPIs:initialized MongoDB and Redis
DEBUG:Index:queried 15 activities in 3ms
DEBUG:Index:ts update in 2ms


{'docs': [{'_id': 6679845090,
   'ts': datetime.datetime(2022, 2, 16, 20, 8, 37, 380000),
   'U': 15972102,
   'N': 'Afternoon Walk',
   'D': 3052.6,
   'T': 4484,
   't': 'Walk',
   's': 1644816953,
   'o': -28800.0,
   '#a': 1,
   '#p': 2,
   'v': 'only_me',
   'c': False,
   'p': True,
   'B': {'SW': [37.71028137207031, -122.5067367553711],
    'NE': [37.72166061401367, -122.5013198852539]}},
  {'_id': 6678989222,
   'ts': datetime.datetime(2022, 2, 16, 20, 8, 37, 380000),
   'U': 15972102,
   'N': 'Barefoot Running Meetup',
   'D': 5222.8,
   'T': 1996,
   't': 'Run',
   's': 1644805008,
   'o': -28800.0,
   '#a': 1,
   '#p': 4,
   'v': 'everyone',
   'c': False,
   'p': False,
   'B': {'SW': [37.798580169677734, -122.26313781738281],
    'NE': [37.810691833496094, -122.249267578125]}},
  {'_id': 6673550638,
   'ts': datetime.datetime(2022, 2, 16, 20, 8, 37, 380000),
   'U': 15972102,
   'N': 'Morning Hike',
   'D': 11438.1,
   'T': 6935,
   't': 'Hike',
   's': 1644717719,
   'o':

In [4]:
import asyncio
import Users
import Strava

admin = await Users.get(Users.ADMIN[0])
admin

{'_id': 15972102,
 'access_count': 2,
 'auth': {'expires_at': 1644955224,
  'refresh_token': '<redacted>',
  'access_token': '<redacted>'},
 'city': 'Oakland',
 'country': 'United States',
 'firstname': '👣',
 'lastname': 'Efrem',
 'profile': 'https://dgalywyr863hv.cloudfront.net/pictures/athletes/15972102/9131294/7/medium.jpg',
 'state': 'California',
 'ts': datetime.datetime(2022, 2, 15, 18, 58, 32, 199000),
 'username': 'bfef'}

In [5]:
# Fetch packed streams for the activities listed above; passing `user`
# allows a Strava import for anything not found in Redis or MongoDB
q = await query(activity_ids=activity_ids, user=admin)

DEBUG:__main__:retrieved 15 streams from Redis in 6


In [6]:
# Split the (activity_id, packed_streams) pairs into two parallel tuples:
# a = activity ids, b = packed stream bytes
a,b = zip(*q)
# NOTE(review): echoing `b` writes every packed stream into the saved output
a,b

((6679845090,
  6678989222,
  6673550638,
  6663463299,
  6658564883,
  6653842193,
  6648182478,
  6643389712,
  6637042534,
  6636927789,
  6622734064,
  6618144574,
  6611297682,
  6606841366,
  6601470766),
 (b"\x84\xa2id\xcf\x00\x00\x00\x01\x8e&X\xe2\xa1t\xc49\x01\x00\x00\xff\x01\xff\xff\x01\xff\xff\x01\xff\xff\x01\xff\xff\x01\xff\xff\x01\xff\xff\x01\xff\xff\x01\xff\xff\x01\xff\xff\x01\xff\xff\x01\xff\xff\x01\xff\xff\x01\xff\xff\x01\xff\xff\x01\xff\xff\x01\xff\xff\x01\xff\xff\x01\x95\xa1a\xc5\x03\xbd\x00/\x00\x80\x00\x0e\x01\x00\x00\x00\x01\x80\x00\x07\x01\x80\x00\x13\xff\x80\x00\x05\x01\x80\x008\x01\x80\x00\x15\x01\x80\x00\x11\xff\x80\x00\x17\x01\x80\x00\x05\x01\x80\x00\x13\x01\x80\x00\x0b\xff\x80\x00\x13\xff\x80\x00M\x01\x80\x00/\xff\x80\x00\x0b\x01\x80\x00\x07\xff\x80\x005\xff\x80\x00\x05\x01\x00\x01\x80\x00\x05\x01\x80\x001\x01\x00\x00\x00\x01\x00\x00\x00\x01\x00\x00\x00\x01\x80\x00\x05\x01\x80\x00\x05\x01\x00\x00\x00\xff\x80\x00\x07\x01\x80\x00\x05\x01\x80\x00\x0b\xff\x00\x00

In [7]:
# Show statistics for the streams collection
await stats()

{'ns': 'heatflask.streams',
 'size': 152627,
 'count': 15,
 'avgObjSize': 10175,
 'storageSize': 208896,
 'freeStorageSize': 61440,
 'capped': False,
 'wiredTiger': {'metadata': {'formatVersion': 1},
  'creationString': 'access_pattern_hint=none,allocation_size=4KB,app_metadata=(formatVersion=1),assert=(commit_timestamp=none,durable_timestamp=none,read_timestamp=none,write_timestamp=off),block_allocation=best,block_compressor=snappy,cache_resident=false,checksum=on,colgroups=,collator=,columns=,dictionary=0,encryption=(keyid=,name=),exclusive=false,extractor=,format=btree,huffman_key=,huffman_value=,ignore_in_memory_cache_size=false,immutable=false,import=(enabled=false,file_metadata=,repair=false),internal_item_max=0,internal_key_max=0,internal_key_truncate=true,internal_page_max=4KB,key_format=q,key_gap=10,leaf_item_max=0,leaf_key_max=0,leaf_page_max=32KB,leaf_value_max=64MB,log=(enabled=true),lsm=(auto_throttle=true,bloom=true,bloom_bit_count=16,bloom_config=,bloom_hash_count=8,bloo

In [8]:
# Close the connections opened by DataAPIs.connect() earlier in the notebook
await DataAPIs.disconnect()