## Collect labeled trips from the database and expand user inputs.
You need to have the database running before running this notebook.

In [1]:
%load_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np
from uuid import UUID

import matplotlib.pyplot as plt

import confusion_matrix_handling as cm_handling
from confusion_matrix_handling import MODE_MAPPING_DICT
import get_EC
import helper_functions as hf

import sklearn.model_selection as skm

from sklearn.model_selection import KFold
from sklearn import linear_model

import scipy

# Unit conversion factor: 1 meter = 0.000621371 miles.
METERS_TO_MILES = 0.000621371

# Energy intensity table, read from a path relative to the working directory.
# The r prefix makes this a raw string, which only matters if the path is on Windows.
df_EI = pd.read_csv(
    r'Public_Dashboard/auxiliary_files/energy_intensity.csv'
)

In [3]:
# All the emission server functions for this notebook live in this module.
import database_related_functions as drf

# user_list is passed to the trip-fetching helpers below; os_map and
# uuid_program_map are later used to map each trip's user_id to an
# operating system and a program, respectively.
(
    user_list,
    os_map,
    uuid_program_map,
) = drf.get_participants_programs_and_operating_systems()

storage not configured, falling back to sample, default configuration
URL not formatted, defaulting to "Stage_database"
Connecting to database URL localhost
Number of participants with operating system information in each program:
{'sc': 26, 'fc': 32, '4c': 16, 'stage': 88, 'pc': 41, 'vail': 12, 'cc': 56, '84Q9SsrH': 1, 'cwZazZLJ': 1, 'CudLAeg8': 1, 'sxxcLqbK': 1, 'Q8T7QTXK': 1, '5KEGHHuf': 1, 'e9MaNVU7': 1, '7c797MRD': 1, 'rhBZukxY': 1, 'k36cxmfA': 1, 'FmxVf8u6': 1, 'pNpK4pFg': 1, 'F3jxHLSW': 1}


In [None]:
# Takes 6 to 14 minutes on Macbook Pro for all ceo data + stage + prepilot.
# Takes ~ 1 min 45 s to 2 min 45 s on Macbook Pro for all ceo data up to May 2022.
expanded_labeled_trips = drf.get_expanded_labeled_trips(user_list)

# Tag every trip with its user's operating system and program.
expanded_labeled_trips['os'] = expanded_labeled_trips['user_id'].map(os_map)
expanded_labeled_trips['program'] = expanded_labeled_trips['user_id'].map(uuid_program_map)

# Drop the source/raw-trip, formatted-time and location columns, and the
# exploded start/end local-datetime component columns.
expanded_labeled_trips = expanded_labeled_trips.drop(columns=[
    'source', 'end_fmt_time', 'end_loc', 'raw_trip',
    'start_fmt_time', 'start_loc',
    'start_local_dt_year', 'start_local_dt_month', 'start_local_dt_day',
    'start_local_dt_hour', 'start_local_dt_minute', 'start_local_dt_second',
    'start_local_dt_weekday', 'start_local_dt_timezone',
    'end_local_dt_year', 'end_local_dt_month', 'end_local_dt_day',
    'end_local_dt_hour', 'end_local_dt_minute', 'end_local_dt_second',
    'end_local_dt_weekday', 'end_local_dt_timezone',
])

# Assumes the `distance` column is in meters (per METERS_TO_MILES) -- TODO confirm.
expanded_labeled_trips['distance_miles'] = expanded_labeled_trips['distance'] * METERS_TO_MILES

# Group together the prepilot participants.
# 'pNpK4pFg' is included so that this list agrees with the one applied to
# confirmed_trips_df further down; otherwise any labeled trips from that
# participant would stay in their own single-user program instead of "prepilot".
prepilot_list = [
    '84Q9SsrH', 'cwZazZLJ', 'CudLAeg8', 'sxxcLqbK', 'Q8T7QTXK', '5KEGHHuf',
    'e9MaNVU7', '7c797MRD', 'rhBZukxY', 'k36cxmfA', 'FmxVf8u6', 'F3jxHLSW',
    'pNpK4pFg',
]
expanded_labeled_trips['program'] = expanded_labeled_trips['program'].replace(prepilot_list, "prepilot")

In [None]:
# Number of expanded labeled trips in each program.
expanded_labeled_trips['program'].value_counts()

In [None]:
# Total number of expanded labeled trips (row count of the DataFrame).
expanded_labeled_trips.shape[0]

In [None]:
# Persist the DataFrame with IPython's %store magic so it can be restored
# in another kernel/notebook with `%store -r expanded_labeled_trips`.
%store expanded_labeled_trips

### Checking the trip counts in each program.
These counts match the numbers shown by shankari around Feb 7, 2023, under "with minipilot", in this comment thread:
https://github.com/e-mission/em-public-dashboard/pull/47

In [4]:
# Fetch the confirmed trips for the participants in user_list through the
# database helper module (requires the database to be running).
confirmed_trips_df = drf.get_confirmed_trips(user_list)

In [5]:
# Persist the DataFrame with IPython's %store magic so it can be restored
# in another kernel/notebook with `%store -r confirmed_trips_df`.
# Note: this stores the frame as fetched, before the 'program' column is added below.
%store confirmed_trips_df

Stored 'confirmed_trips_df' (DataFrame)


In [13]:
# Program codes of the individual prepilot participants; they are collapsed
# into a single "prepilot" program below.
prepilot_list = [
    '84Q9SsrH', 'cwZazZLJ', 'CudLAeg8', 'sxxcLqbK', 'Q8T7QTXK', '5KEGHHuf', 'e9MaNVU7',
    '7c797MRD', 'rhBZukxY', 'k36cxmfA', 'FmxVf8u6', 'F3jxHLSW', 'pNpK4pFg',
]

# Look up each trip's program from its user_id, then group the prepilot codes.
confirmed_trips_df['program'] = (
    confirmed_trips_df['user_id']
    .map(uuid_program_map)
    .replace(prepilot_list, "prepilot")
)

print(f"Number of confirmed trips in the database: {len(confirmed_trips_df)}")

# Confirmed trip count per program.
confirmed_trips_df['program'].value_counts()

Number of confirmed trips in the database: 241123


cc          75184
pc          51182
stage       37302
fc          32429
sc          17984
4c          14417
vail         9133
prepilot     3492
Name: program, dtype: int64

In [14]:
# Labeling rate by program: expanded labeled trips divided by confirmed trips.
# The values are fractions (0-1), not percentages; the two count Series are
# aligned on the program name before dividing.
expanded_labeled_trips['program'].value_counts().div(
    confirmed_trips_df['program'].value_counts()
)

4c          0.365402
cc          0.383512
fc          0.362361
pc          0.349517
prepilot    0.694444
sc          0.505783
stage       0.290253
vail        0.695938
Name: program, dtype: float64