In [2]:
# make sure we are working in module directory
repo_root = !git rev-parse --show-toplevel
module_path = repo_root[0] + "/backend/heatflask"
%cd $module_path

# Make cells wider
from IPython.display import display, HTML
display(HTML("<style>.container { width:95% !important; }</style>"))

/home/efrem/dev/heatflask/backend/heatflask


In [2]:
# %load Index.py
"""
***  For Jupyter notebook ***
Paste one of these Jupyter magic directives to the top of a cell
 and run it, to do these things:
    %%cython --annotate    # Compile and run the cell
    %load Index.py         # Load Index.py file into this (empty) cell
    %%writefile Index.py   # Write the contents of this cell to Index.py
"""

import os
import polyline
import numpy as np
from logging import getLogger
import datetime
import time
from pymongo import DESCENDING

import DataAPIs
import Strava
import Utility

# Module-level logger; records are handed on to ancestor loggers' handlers.
log = getLogger(__name__)
log.propagate = True

# Name of the collection passed to DataAPIs (init_collection / stats / drop).
COLLECTION_NAME = "index"

# Time-unit helpers, in seconds.
SECS_IN_HOUR = 3600
SECS_IN_DAY = SECS_IN_HOUR * 24

# How long an Index entry is kept in MongoDB, in seconds.
# The INDEX_TTL environment variable is read as a number of days (default 10).
INDEX_TTL = SECS_IN_DAY * int(os.environ.get("INDEX_TTL", 10))


class Box:
    """Plain holder for module-wide mutable state."""

    # Cached collection handle; stays None until get_collection() fills it in.
    collection = None


# Single shared holder instance used by get_collection().
myBox = Box()


async def get_collection():
    """Return the index collection handle, creating it on first use.

    The handle is obtained once from ``DataAPIs.init_collection`` (with
    ``INDEX_TTL`` passed as ``ttl``) and cached on ``myBox`` so later calls
    reuse it.
    """
    cached = myBox.collection
    if cached is not None:
        return cached

    # First call: initialise and remember the handle.
    myBox.collection = await DataAPIs.init_collection(COLLECTION_NAME, ttl=INDEX_TTL)
    return myBox.collection


def polyline_bounds(poly):
    """Compute the lat/lng bounding box of an encoded polyline.

    Args:
        poly: encoded polyline string, as accepted by ``polyline.decode``.

    Returns:
        ``{"SW": (min_lat, min_lng), "NE": (max_lat, max_lng)}`` with plain
        Python floats, or ``None`` when the polyline cannot be decoded or
        decodes to no points.
    """
    try:
        latlngs = np.array(polyline.decode(poly), dtype=np.float32)
    except Exception:
        # Best effort: an undecodable polyline simply has no bounds.
        return None

    # An empty decode yields a 1-D array of shape (0,); column indexing on it
    # would raise IndexError, so treat "no points" like "undecodable".
    if latlngs.ndim != 2 or latlngs.shape[0] == 0 or latlngs.shape[1] < 2:
        return None

    # Column 0 is latitude, column 1 is longitude.
    mins = latlngs.min(axis=0)
    maxs = latlngs.max(axis=0)

    return {
        "SW": (float(mins[0]), float(mins[1])),
        "NE": (float(maxs[0]), float(maxs[1])),
    }


# MongoDB short field names speed up data transfer to/from
# remote DB server
# Each constant is the document key that mongo_doc() stores the named
# Strava SummaryActivity value under.
ACTIVITY_ID = "_id"  # activity id, stored as int
TIMESTAMP = "ts"  # when the entry was written; refreshed by query(update_ts=True)
USER_ID = "U"  # athlete["id"], stored as int
ACTIVITY_NAME = "N"  # name
DISTANCE_METERS = "D"  # distance
TIME_SECONDS = "T"  # elapsed_time (moving_time is not stored)
ACTIVITY_TYPE = "t"  # type
UTC_START_TIME = "s"  # start_date converted to an integer epoch timestamp
UTC_LOCAL_OFFSET = "o"  # utc_offset
N_ATHLETES = "#a"  # athlete_count
N_PHOTOS = "#p"  # total_photo_count
FLAG_COMMUTE = "c"  # commute
FLAG_PRIVATE = "p"  # private
LATLNG_BOUNDS = "B"  # polyline_bounds() of map["summary_polyline"]
VISIBILITY = "v"  # visibility


# see https://developers.strava.com/docs/reference/#api-models-SummaryActivity
def mongo_doc(
    # From Strava SummaryActivity record
    id=None,
    athlete=None,
    name=None,
    distance=None,
    moving_time=None,
    elapsed_time=None,
    type=None,
    start_date=None,
    utc_offset=None,
    athlete_count=None,
    total_photo_count=None,
    map=None,
    commute=None,
    private=None,
    visibility=None,
    # my additions
    _id=None,
    ts=None,
    **and_more
):
    """Build an index document from a Strava SummaryActivity record.

    The record is normally splatted in as keyword arguments; fields that are
    not named in the signature land in ``and_more`` and are ignored.
    ``moving_time`` is accepted but not stored.

    Args:
        _id: takes precedence over ``id`` as the activity id when truthy.
        ts: timestamp to store; defaults to the current UTC time.

    Returns:
        The document passed through ``Utility.cleandict`` (presumably drops
        ``None``-valued keys -- confirm in Utility), or ``None`` when the
        record has no start date or no summary polyline.
    """
    # Activities without a start time or a route are not indexed.
    if not start_date or not map:
        return None
    encoded_route = map.get("summary_polyline")
    if not encoded_route:
        return None

    start_epoch = int(Utility.to_datetime(start_date).timestamp())
    written_at = ts if ts else datetime.datetime.utcnow()

    fields = {
        TIMESTAMP: written_at,
        ACTIVITY_ID: int(_id or id),
        USER_ID: int(athlete["id"]),
        ACTIVITY_NAME: name,
        DISTANCE_METERS: distance,
        TIME_SECONDS: elapsed_time,
        ACTIVITY_TYPE: type,
        UTC_START_TIME: start_epoch,
        UTC_LOCAL_OFFSET: utc_offset,
        N_ATHLETES: athlete_count,
        N_PHOTOS: total_photo_count,
        VISIBILITY: visibility,
        FLAG_COMMUTE: commute,
        FLAG_PRIVATE: private,
        LATLNG_BOUNDS: polyline_bounds(encoded_route),
    }
    return Utility.cleandict(fields)


async def import_user_entries(**user):
    """Rebuild one user's index entries from their Strava activity list.

    Fetches the activity index from Strava, converts each record with
    ``mongo_doc``, deletes the user's existing entries and inserts the new
    ones. Existing entries are deleted before the insert, so a failing insert
    leaves the user with no entries.

    Args:
        **user: user record; must contain ``"_id"`` and an ``"auth"`` dict
            that is splatted into ``Strava.AsyncClient``.

    Returns:
        None. Timing and the inserted count are logged at DEBUG level.
    """
    t0 = time.perf_counter()

    uid = int(user["_id"])

    strava = Strava.AsyncClient(uid, **user["auth"])
    # Let the client bring the access token up to date before fetching
    # (presumably a no-op when the token is still valid -- confirm in Strava).
    await strava.update_access_token()
    now = datetime.datetime.utcnow()

    # Build a real list: insert_many needs a non-empty sequence, and a lazy
    # filter() object hides emptiness until the driver rejects it.
    docs = []
    async for A in strava.get_index():
        if A is None:
            continue
        doc = mongo_doc(**A, ts=now)
        if doc:
            docs.append(doc)

    t1 = time.perf_counter()
    fetch_time = (t1 - t0) * 1000

    index = await get_collection()
    await delete_user_entries(**user)

    # insert_many raises on an empty batch; a user with no indexable
    # activities simply ends up with no entries.
    count = 0
    if docs:
        insert_result = await index.insert_many(docs, ordered=False)
        count = len(insert_result.inserted_ids)

    insert_time = (time.perf_counter() - t1) * 1000
    log.debug(
        "fetched %s entries in %dms, insert_many %dms", count, fetch_time, insert_time
    )


async def delete_user_entries(**user):
    """Delete every index entry belonging to the given user.

    Args:
        **user: user record; only ``"_id"`` is read.

    Returns:
        The result of the collection's ``delete_many`` call.
    """
    athlete_id = int(user["_id"])
    collection = await get_collection()
    deletion = await collection.delete_many({USER_ID: athlete_id})
    return deletion


# Sort order applied by query(): latest activity start time first.
SORT_SPECS = [(UTC_START_TIME, DESCENDING)]


async def query(
    user_id=None,
    activity_ids=None,
    exclude_ids=None,
    after=None,
    before=None,
    limit=None,
    activity_type=None,
    commute=None,
    private=None,
    visibility=None,
    #
    update_ts=True,
):
    """Query the activity index.

    Args:
        user_id: restrict to this user's entries; when given, the user field
            is projected out of the returned documents.
        activity_ids: iterable of activity ids to restrict the query to.
        exclude_ids: iterable of activity ids the caller already has. Matching
            activities in this set are not returned, and ids in this set that
            did NOT match the query are reported under ``"triage"``.
        after, before: start-time bounds (``after`` inclusive, ``before``
            exclusive), converted with ``Utility.to_epoch``.
        limit: maximum number of documents; falsy means no limit.
        activity_type: a type string or an iterable of them.
        commute, private: when not None, match the flag exactly.
        visibility: a visibility string or an iterable of them
            (e.g. "everyone", "followers", "only_me").
        update_ts: when true, refresh the timestamp of every returned entry.

    Returns:
        dict with ``"docs"`` (list of matching documents, latest start time
        first) and, only when ``exclude_ids`` was given, ``"triage"`` (list
        of excluded ids that did not match the query).
    """

    def _as_list(values):
        # "$in" needs a BSON array: wrap a lone string, materialise any
        # other iterable (sets in particular cannot be BSON-encoded).
        return [values] if isinstance(values, str) else list(values)

    mongo_filter = {}
    projection = None

    # Kept as a set: only used for set arithmetic below, never sent to MongoDB.
    if exclude_ids:
        exclude_ids = set(int(aid) for aid in exclude_ids)

    limit = int(limit) if limit else 0

    if user_id:
        mongo_filter[USER_ID] = int(user_id)
        projection = {USER_ID: False}

    if before or after:
        mongo_filter[UTC_START_TIME] = Utility.cleandict(
            {
                "$lt": None if before is None else Utility.to_epoch(before),
                "$gte": None if after is None else Utility.to_epoch(after),
            }
        )

    if activity_ids:
        # De-duplicate via a set, then hand MongoDB a list.
        mongo_filter[ACTIVITY_ID] = {"$in": list({int(aid) for aid in activity_ids})}

    if activity_type:
        mongo_filter[ACTIVITY_TYPE] = {"$in": _as_list(activity_type)}

    if visibility:
        # ["everyone", "followers", "only_me"]
        mongo_filter[VISIBILITY] = {"$in": _as_list(visibility)}

    if private is not None:
        mongo_filter[FLAG_PRIVATE] = private

    if commute is not None:
        mongo_filter[FLAG_COMMUTE] = commute

    index = await get_collection()

    result = {}

    if exclude_ids:
        # First pass: fetch only the ids that match, so the full documents
        # are retrieved just for activities the caller does not have yet.
        t0 = time.perf_counter()
        cursor = index.find(
            filter=mongo_filter,
            projection={ACTIVITY_ID: True},
            sort=SORT_SPECS,
            limit=limit,
        )

        # These are the ids of activities that matched the query
        query_ids = {doc[ACTIVITY_ID] async for doc in cursor}

        to_fetch = list(query_ids - exclude_ids)
        to_delete = list(exclude_ids - query_ids)

        result["triage"] = to_delete
        mongo_filter = {ACTIVITY_ID: {"$in": to_fetch}}

        elapsed = (time.perf_counter() - t0) * 1000
        log.debug("queried %d ids in %dms", len(query_ids), elapsed)

    t0 = time.perf_counter()
    cursor = index.find(
        filter=mongo_filter,
        projection=projection,
        sort=SORT_SPECS,
        limit=limit,
    )

    docs = await cursor.to_list(length=None)
    result["docs"] = docs

    t1 = time.perf_counter()
    elapsed = (t1 - t0) * 1000
    log.debug("queried %d activities in %dms", len(docs), elapsed)

    # Nothing to touch when the query matched no documents.
    if update_ts and docs:
        await index.update_many(
            {ACTIVITY_ID: {"$in": [a[ACTIVITY_ID] for a in docs]}},
            {"$set": {TIMESTAMP: datetime.datetime.utcnow()}},
        )
        elapsed = (time.perf_counter() - t1) * 1000
        log.debug("ts update in %dms", elapsed)
    return result


def stats():
    """Return ``DataAPIs.stats`` for the index collection.

    The value is handed back untouched; the notebook awaits the equivalent
    ``DataAPIs.stats("index")`` call, so callers should await it as well.
    """
    index_stats = DataAPIs.stats(COLLECTION_NAME)
    return index_stats


def drop():
    """Return ``DataAPIs.drop`` for the index collection.

    The value is handed back untouched (presumably an awaitable, like
    ``stats`` -- confirm in DataAPIs).
    """
    drop_result = DataAPIs.drop(COLLECTION_NAME)
    return drop_result


# Human-readable activity type names. The five most common come first,
# the rest are alphabetical.
# Entries must not carry stray whitespace, or exact-match lookups fail.
ATYPE_SPECS = [
    "Ride",
    "Run",
    "Swim",
    "Walk",
    "Hike",
    "Alpine Ski",
    "Backcountry Ski",
    "Canoe",
    "Crossfit",
    "E-Bike Ride",
    "Elliptical",
    "Handcycle",
    "Ice Skate",
    "Inline Skate",
    "Kayak",
    "Kitesurf Session",
    "Nordic Ski",
    "Rock Climb",
    "Roller Ski",
    "Row",
    "Snowboard",
    "Snowshoe",
    "Stair Stepper",
    "Stand Up Paddle",
    "Surf",
    "Velomobile",
    "Virtual Ride",
    "Virtual Run",
    "Weight Training",
    "Windsurf Session",
    "Wheelchair",
    "Workout",
    "Yoga",
]


In [3]:
# Example Strava ActivitySummary
A = {'resource_state': 2,
 'athlete': {'id': 15972102, 'resource_state': 1},
 'name': 'Afternoon Shred',
 'distance': 3301.7,
 'moving_time': 1346,
 'elapsed_time': 1378,
 'total_elevation_gain': 50.1,
 'type': 'Surfing',
 'id': 6663463299,
 'start_date': '2022-02-10T21:49:17Z',
 'start_date_local': '2022-02-10T13:49:17Z',
 'timezone': '(GMT-08:00) America/Los_Angeles',
 'utc_offset': -28800.0,
 'location_city': None,
 'location_state': None,
 'location_country': 'United States',
 'achievement_count': 0,
 'kudos_count': 0,
 'comment_count': 0,
 'athlete_count': 1,
 'photo_count': 0,
 'map': {'id': 'a6663463299',
  'summary_polyline': 'cr{eFjowhVd@RVED]K[A[BMHKLAZBTJHAHGCEPK@k@LUFCVEP?DBL@DFd@FVNLCFIDCJWPMBa@G[O]EY?SBOd@a@FQBYH[AUIi@BQFILGNBPHNCh@WLARKDEBQCKBMNc@XMJBFH@f@Kr@c@lACN@XRf@?`@K`@@HDHZPV`@JFF@DIPCLg@NUn@YJALD\\N\\TF?HCN@JMVE@EBk@AWFYFIXSLGXENIJ]Jg@FKFCJHNVF^`@vANPLXRJJJP\\DDNBBERCF?FDVBHBHPBb@DTDfACz@Gp@IPHGBMAAGPORKZS\\e@~A_AXe@D]Cc@K_@OM?G@IJYhAMHi@Hc@TYHk@D[?g@LqAr@_@d@CRA\\Jl@?NEVOZWR_ANcAX[@QNO^CZ`@`BD`@@\\GnAQrAIb@OLI?GCKMGo@GSU]IU?UGQBc@EUe@o@Qm@E]RuBC[O_@IEMBUA]J{@^[DOASGEGIWIc@GMGEKA_@VQR{@|A]d@EBG?_@OKIQ@OCKIGY?i@DaBM]e@g@_@{@DYFQb@_ABWCMe@_@_@MSCs@Ps@Ce@?Cb@FC@FD@?E?BBB',
  'resource_state': 2},
 'trainer': False,
 'commute': False,
 'manual': False,
 'private': False,
 'visibility': 'everyone',
 'flagged': False,
 'gear_id': None,
 'start_latlng': [37.829625, -122.18629333],
 'end_latlng': [37.83219167, -122.187075],
 'start_latitude': 37.829625,
 'start_longitude': -122.187075,
 'average_speed': 2.453,
 'max_speed': 6.48,
 'has_heartrate': True,
 'average_heartrate': 115.2,
 'max_heartrate': 142.0,
 'heartrate_opt_out': False,
 'display_hide_heartrate_option': True,
 'elev_high': 428.1,
 'elev_low': 353.1,
 'upload_id': 7086217566,
 'upload_id_str': '7086217566',
 'external_id': '2022-02-10_22-13-35_2404963f-543a-485f-aca1-04e4184b7c08.tcx',
 'from_accepted_tag': False,
 'pr_count': 0,
 'total_photo_count': 0,
 'has_kudoed': False,
 'suffer_score': 7.0}

In [4]:
import logging
logging.basicConfig(level="DEBUG")
log = logging.getLogger()

await DataAPIs.connect()

mongo_doc(**A)

DEBUG:DataAPIs:initialized MongoDB and Redis


{'ts': datetime.datetime(2022, 2, 15, 19, 1, 3, 256413),
 '_id': 6663463299,
 'U': 15972102,
 'N': 'Afternoon Shred',
 'D': 3301.7,
 'T': 1378,
 't': 'Surfing',
 's': 1644558557,
 'o': -28800.0,
 '#a': 1,
 '#p': 0,
 'v': 'everyone',
 'c': False,
 'p': False,
 'B': {'SW': (37.82292175292969, -122.19149780273438),
  'NE': (37.83229064941406, -122.18283081054688)}}

In [5]:
import Users
efrem = await Users.get(Users.ADMIN[0])
efrem

{'_id': 15972102,
 'access_count': 2,
 'auth': {'expires_at': 1644955224,
  'refresh_token': '<redacted>',
  'access_token': '<redacted>'},
 'city': 'Oakland',
 'country': 'United States',
 'firstname': '👣',
 'lastname': 'Efrem',
 'profile': 'https://dgalywyr863hv.cloudfront.net/pictures/athletes/15972102/9131294/7/medium.jpg',
 'state': 'California',
 'ts': datetime.datetime(2022, 2, 15, 18, 58, 32, 199000),
 'username': 'bfef'}

In [6]:
await import_user_entries(**efrem)

DEBUG:Strava:access token is current
DEBUG:Strava:opening new aiohttp session
DEBUG:Strava:access token is current
DEBUG:Strava:getting user index
DEBUG:Strava:Page 1 requested
DEBUG:Strava:Page 2 requested
DEBUG:Strava:Page 3 requested
DEBUG:Strava:Page 4 requested
DEBUG:Strava:Page 5 requested
DEBUG:Strava:Page 6 requested
DEBUG:Strava:Page 7 requested
DEBUG:Strava:Page 8 requested
DEBUG:Strava:Page 9 requested
DEBUG:Strava:Page 10 requested
DEBUG:Strava:Page 1 retrieved in 4927
DEBUG:Strava:Page 11 requested
DEBUG:Strava:Page 3 retrieved in 5340
DEBUG:Strava:Page 12 requested
DEBUG:Strava:Page 2 retrieved in 5478
DEBUG:Strava:Page 13 requested
DEBUG:Strava:Page 8 retrieved in 5814
DEBUG:Strava:Page 14 requested
DEBUG:Strava:Page 6 retrieved in 6243
DEBUG:Strava:Page 15 requested
DEBUG:Strava:Page 9 retrieved in 6315
DEBUG:Strava:Page 16 requested
DEBUG:Strava:Page 7 retrieved in 6578
DEBUG:Strava:Page 17 requested
DEBUG:Strava:Page 10 retrieved in 6664
DEBUG:Strava:Page 18 requested

In [7]:
result = await query(after="2021")

DEBUG:root:queried 346 activities in 9ms
DEBUG:root:ts update in 22ms


In [8]:
result

{'docs': [{'_id': 6679845090,
   'ts': datetime.datetime(2022, 2, 15, 19, 1, 3, 452000),
   'U': 15972102,
   'N': 'Afternoon Walk',
   'D': 3052.6,
   'T': 4484,
   't': 'Walk',
   's': 1644816953,
   'o': -28800.0,
   '#a': 1,
   '#p': 2,
   'v': 'only_me',
   'c': False,
   'p': True,
   'B': {'SW': [37.71028137207031, -122.5067367553711],
    'NE': [37.72166061401367, -122.5013198852539]}},
  {'_id': 6678989222,
   'ts': datetime.datetime(2022, 2, 15, 19, 1, 3, 452000),
   'U': 15972102,
   'N': 'Barefoot Running Meetup',
   'D': 5222.8,
   'T': 1996,
   't': 'Run',
   's': 1644805008,
   'o': -28800.0,
   '#a': 1,
   '#p': 4,
   'v': 'everyone',
   'c': False,
   'p': False,
   'B': {'SW': [37.798580169677734, -122.26313781738281],
    'NE': [37.810691833496094, -122.249267578125]}},
  {'_id': 6673550638,
   'ts': datetime.datetime(2022, 2, 15, 19, 1, 3, 452000),
   'U': 15972102,
   'N': 'Morning Hike',
   'D': 11438.1,
   'T': 6935,
   't': 'Hike',
   's': 1644717719,
   'o': -2

In [9]:
import DataAPIs
await DataAPIs.stats("index")

{'ns': 'heatflask.index',
 'size': 752698,
 'count': 3531,
 'avgObjSize': 213,
 'storageSize': 446464,
 'freeStorageSize': 221184,
 'capped': False,
 'wiredTiger': {'metadata': {'formatVersion': 1},
  'creationString': 'access_pattern_hint=none,allocation_size=4KB,app_metadata=(formatVersion=1),assert=(commit_timestamp=none,durable_timestamp=none,read_timestamp=none,write_timestamp=off),block_allocation=best,block_compressor=snappy,cache_resident=false,checksum=on,colgroups=,collator=,columns=,dictionary=0,encryption=(keyid=,name=),exclusive=false,extractor=,format=btree,huffman_key=,huffman_value=,ignore_in_memory_cache_size=false,immutable=false,import=(enabled=false,file_metadata=,repair=false),internal_item_max=0,internal_key_max=0,internal_key_truncate=true,internal_page_max=4KB,key_format=q,key_gap=10,leaf_item_max=0,leaf_key_max=0,leaf_page_max=32KB,leaf_value_max=64MB,log=(enabled=true),lsm=(auto_throttle=true,bloom=true,bloom_bit_count=16,bloom_config=,bloom_hash_count=8,bloom

In [10]:
await DataAPIs.disconnect()