In [1]:
import os
import sys
import pickle
import ast
import importlib
# from multiprocessing import cpu_count, Pool

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from enum import Enum
from pathlib import Path
from uuid import UUID
from bson.objectid import ObjectId
from collections import defaultdict

%matplotlib inline

In [2]:
# The e-mission-server checkout is looked up two directory levels above the
# current working directory, inside `my_emission_server/`.
emission_path = Path.cwd().parent.parent.joinpath('my_emission_server', 'e-mission-server')

# Make the server checkout and the viz_scripts helpers importable, in that order.
sys.path.extend([str(emission_path), '../viz_scripts'])

In [3]:
import scaffolding

import emission.core.get_database as edb
import emission.storage.timeseries.abstract_timeseries as esta
import emission.storage.decorations.trip_queries as esdt

storage not configured, falling back to sample, default configuration
URL not formatted, defaulting to openpath_prod_ride2own
Connecting to database URL localhost


In [4]:
class Sections(Enum):
    """Identifiers for the per-section fields tracked for each trip."""

    DURATION = 0  # per-section durations
    DISTANCE = 1  # per-section distances
    MODE = 2      # per-section modes
    
# Thin, read-only view over the per-section lists collected for a single trip.
class SectionData:
    """Wraps a mapping keyed by ``Sections`` members and exposes a safe lookup."""

    def __init__(self, section_dict):
        # Stored by reference; the mapping is not copied.
        self._section_dict = section_dict

    def get_section_data(self, section: Sections):
        """Return the value stored under ``section``, or a new empty list if absent."""
        return self._section_dict[section] if section in self._section_dict else []
    
# Maps sensed motion-type names to the mode labels used in this analysis.
# TODO: only two motion types are mapped so far; the remaining ones still need
# labels (the previous dangling '' entry had no value and made this literal a
# SyntaxError, so it was removed).
SENSED_MODE_DICT = {
    'IN_VEHICLE': 'car',
    'BICYCLING': 'p_micro',
}

In [5]:
# Per database: the section fields that are taken from the stored section
# entries. Every key gets its own list object.
REQUIRED = {
    # Databases for which only durations are taken from the section entries.
    **{
        db: [Sections.DURATION]
        for db in ("Stage_database", "openpath_prod_durham")
    },
    # Databases for which all three fields are taken from the section entries.
    **{
        db: [Sections.DURATION, Sections.DISTANCE, Sections.MODE]
        for db in (
            "openpath_prod_mm_masscec",
            "openpath_prod_ride2own",
            "openpath_prod_uprm_nicr",
        )
    },
}

In [9]:
def get_section_data(df: pd.DataFrame, db_name: str):
    """Build one ``SectionData`` per trip row of ``df``.

    For every row the trip's sections are fetched with
    ``esdt.get_sections_for_trip``: first under ``'analysis/cleaned_section'``
    and, if that yields nothing, under ``'analysis/inferred_section'``. Fields
    listed in ``REQUIRED[db_name]`` are read from the fetched section entries;
    every other field is parsed with ``ast.literal_eval`` from the row's
    ``section_durations`` / ``section_distances`` / ``section_modes`` column.

    Args:
        df: Trip table. Each row needs ``user_id`` (parsed with ``UUID``) and
            ``cleaned_trip`` (parsed with ``ObjectId``), plus the ``section_*``
            column for any field that is not in ``REQUIRED[db_name]``.
        db_name: Key into ``REQUIRED``.

    Returns:
        A Series with one entry per row of ``df``: a ``SectionData`` object, or
        ``None`` when neither key returned any section for the trip.

    Raises:
        KeyError: If ``db_name`` is not a key of ``REQUIRED``.
    """
    # Resolve once rather than per row; an unknown db_name now fails before
    # any database query is issued.
    required = REQUIRED[db_name]

    # (field, attribute on a fetched entry's `.data`, fallback column in `df`)
    field_specs = (
        (Sections.DURATION, 'duration', 'section_durations'),
        (Sections.DISTANCE, 'distance', 'section_distances'),
        (Sections.MODE, 'sensed_mode', 'section_modes'),
    )

    def _fetch_sections(user_id, trip_id):
        # Prefer cleaned sections; fall back to inferred sections.
        for key in ('analysis/cleaned_section', 'analysis/inferred_section'):
            found = esdt.get_sections_for_trip(
                key=key, user_id=user_id, trip_id=trip_id
            )
            if found is not None and len(found) > 0:
                return found
        return None

    def _get_section_data(row):
        section_data = _fetch_sections(UUID(row.user_id), ObjectId(row.cleaned_trip))
        if section_data is None:
            return None

        section_dict = {}
        for section, attribute, column in field_specs:
            if section in required:
                section_dict[section] = [
                    getattr(entry.data, attribute) for entry in section_data
                ]
            else:
                # Fall back to the value stored on the trip row. The previous
                # code read it from `x`, a name that is undefined here.
                section_dict[section] = ast.literal_eval(row[column])

        return SectionData(section_dict)

    # A row-wise apply on an empty frame does not yield a Series, so return an
    # empty object Series explicitly to keep the return type stable.
    if len(df) == 0:
        return pd.Series(index=df.index, dtype=object)

    return df.apply(_get_section_data, axis=1)

In [10]:
# Single source of truth for the database under analysis: it selects both the
# filtered trips CSV and the REQUIRED entry used by get_section_data, so the two
# can no longer drift apart.
DB_NAME = 'openpath_prod_ride2own'

df = pd.read_csv(f'../data/filtered_data/trips__{DB_NAME}.csv')

section_info = get_section_data(df, DB_NAME)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sections.loc[:, 'user_id'] = sections.loc[:, 'user_id'].apply(lambda x: UUID(x))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sections.loc[:, 'cleaned_trip'] = sections.loc[:, 'cleaned_trip'].apply(lambda x: ObjectId(x))


In [11]:
# Display the Series returned by get_section_data (one entry per trip row).
section_info

0       <__main__.SectionData object at 0x165720d00>
1       <__main__.SectionData object at 0x165720880>
2       <__main__.SectionData object at 0x1657209d0>
3       <__main__.SectionData object at 0x165720f70>
4       <__main__.SectionData object at 0x165720fa0>
                            ...                     
5817    <__main__.SectionData object at 0x1659ebd60>
5818    <__main__.SectionData object at 0x1659eb400>
5819    <__main__.SectionData object at 0x1659ebf70>
5820    <__main__.SectionData object at 0x1659ebf10>
5821    <__main__.SectionData object at 0x1659eba60>
Length: 5822, dtype: object

In [19]:
# Collect every distinct mode value that occurs in any trip's sections.
# get_section_data() yields None for trips without section data, so those
# entries are skipped instead of raising AttributeError.
modes = {
    mode
    for trip_sections in section_info
    if trip_sections is not None
    for mode in trip_sections.get_section_data(Sections.MODE)
}

print(modes)

{<MotionTypes.UNKNOWN: 4>, <MotionTypes.AIR_OR_HSR: 11>, <MotionTypes.BICYCLING: 1>, <MotionTypes.WALKING: 7>, <MotionTypes.ON_FOOT: 2>, <MotionTypes.IN_VEHICLE: 0>}


In [None]:
def compute_argmax(db_name: str, df: pd.DataFrame, sections: pd.Series):
    """Extract the per-section distance, duration and mode lists for every trip.

    NOTE(review): the visible body only gathers the three lists for each trip;
    nothing is aggregated or returned yet.

    Args:
        db_name: Key into ``REQUIRED``.
        df: Trip table; its ``section_distances`` / ``section_modes`` columns
            are parsed with ``ast.literal_eval`` when the corresponding field
            is not in ``REQUIRED[db_name]``.
        sections: One ``SectionData`` per trip, iterated in the same order as
            the rows of ``df``.

    Raises:
        KeyError: If ``db_name`` is not a key of ``REQUIRED``.
    """
    # Evaluate once.
    required = REQUIRED[db_name]
    distance_mask = Sections.DISTANCE in required
    mode_mask = Sections.MODE in required

    # NOTE(review): `ix` is positional but is used as a label in `df.loc`, so
    # this assumes `df` has the default 0..n-1 index. A None entry in
    # `sections` will raise AttributeError below. Confirm both with the caller.
    for ix, section in enumerate(sections):
        # SectionData already holds lists, so only the string columns of `df`
        # need ast.literal_eval (calling it on a list raises ValueError).
        distance = (
            section.get_section_data(Sections.DISTANCE)
            if distance_mask
            else ast.literal_eval(df.loc[ix, 'section_distances'])
        )

        # Previously read Sections.DISTANCE by mistake.
        duration = section.get_section_data(Sections.DURATION)

        # Previously read Sections.DISTANCE and was gated by distance_mask.
        mode = (
            section.get_section_data(Sections.MODE)
            if mode_mask
            else ast.literal_eval(df.loc[ix, 'section_modes'])
        )
        
        