In [None]:
# Notebook-level parameters.
# `year` and `month` are passed to scaffolding.get_time_query further down;
# both are None here (presumably "no time restriction" — confirm in scaffolding).
year = None
month = None
# Program name passed to scaffolding.load_all_participant_trips further down.
program = "prepilot"
include_replaced_modes_as_valid = False # Flip this when we want to get results versus generate the replaced_mode correction graphs
input_dataset = "ONLY_SENSED" # "ONLY_LABELED", "ONLY_SENSED" or "BEST_AVAILABLE" for sensitivity analysis
# NOTE(review): not referenced by the cells visible in this section; meaning not shown here.
LABEL_ASSIST_THRESHOLD = 0.3

In [None]:
import pandas as pd

import emission.core.get_database as edb
import emission.core.wrapper.entry as ecwe
import emission.storage.decorations.analysis_timeseries_queries as esda
import emission.storage.decorations.trip_queries as esdt
import emission.storage.decorations.timeline as esdl
import emission.storage.timeseries.abstract_timeseries as esta
import emission.storage.timeseries.timequery as estt
import scaffolding
from uuid import UUID

%matplotlib inline

In [None]:
# Split UUIDs by program.
# The program of a user is the part of `user_email` before the first "_".
# The per-user value is stored in `ue_program` rather than `program`: `program`
# is the notebook parameter that is later passed to
# scaffolding.load_all_participant_trips, and assigning to it in this loop
# would silently replace the configured value.
program_uuid_map = {}
for ue in edb.get_uuid_db().find():
    ue_program = ue['user_email'].split("_")[0]
    if ue_program not in program_uuid_map:
        print(f"Found new program {ue_program}, creating new list")
        program_uuid_map[ue_program] = []
    # Single append path for both new and already-known programs.
    program_uuid_map[ue_program].append(str(ue['uuid']))

In [None]:
# One record per entry in the UUID database: the program prefix of the
# user_email, the full user_email (stored under "opcode") and the UUID as a string.
# Built with a comprehension so that no loop variable named `program` is
# assigned at notebook scope; `program` is the notebook parameter that is later
# passed to scaffolding.load_all_participant_trips.
uuid_program_list = [
    {
        "program": ue["user_email"].split("_")[0],
        "opcode": ue["user_email"],
        "user_id_str": str(ue["uuid"]),
    }
    for ue in edb.get_uuid_db().find()
]

In [None]:
# Display the program names found in the UUID database.
program_uuid_map.keys()

In [None]:
# Tabular form of the per-user records (columns: program, opcode, user_id_str).
uuid_program_df = pd.DataFrame(uuid_program_list)
# Preview the first five rows.
uuid_program_df.head(5)

In [None]:
# Row labels of uuid_program_df keyed by program name.
row_labels_by_program = uuid_program_df.groupby("program").groups
# Show the users registered under the '4c' program (KeyError if there is none).
uuid_program_df.loc[row_labels_by_program['4c']]

In [None]:
# UUID strings collected for the '4c' program (raises KeyError if that program is absent).
program_uuid_map['4c']

In [None]:
# Build the time query from the `year` / `month` parameters.
tq = scaffolding.get_time_query(year, month)
# Load the participant trips for `program` and the time query
# (presumably one row per trip — helper not visible here, confirm in scaffolding).
participant_ct_df = scaffolding.load_all_participant_trips(program, tq)

In [None]:
# String form of each trip's user_id, used as the join key against uuid_program_df.
participant_ct_df["user_id_str"] = participant_ct_df["user_id"].apply(str)
# Number of distinct users that have at least one loaded trip.
participant_ct_df["user_id_str"].nunique(dropna=False)

In [None]:
# Attach program and opcode to every trip; inner join, so trips whose user is
# not present in uuid_program_df are dropped.
trip_program_df = pd.merge(
    participant_ct_df,
    uuid_program_df,
    how="inner",
    on="user_id_str",
)

In [None]:
# Spot-check a reproducible random sample of the merged trips.
# The sample size is capped at the number of available rows: DataFrame.sample
# raises a ValueError when n is larger than the population (without replacement),
# which would break this cell for small datasets.
sample_size = min(50, len(trip_program_df))
trip_program_df.sample(n=sample_size, random_state=123)[["program", "user_id_str", "opcode", "_id", "start_fmt_time", "end_fmt_time"]]

In [None]:
# Subset of the merged trips returned by scaffolding.filter_labeled_trips
# (presumably the trips that carry user labels — helper not visible here).
labeled_ct = scaffolding.filter_labeled_trips(trip_program_df)

In [None]:
# Per-program counts of non-null `source` values: all trips vs. labeled trips.
trips_per_program = trip_program_df.groupby("program")["source"].count()
labeled_per_program = labeled_ct.groupby("program")["source"].count()
# The two Series are aligned on the program index; a program missing from
# labeled_ct ends up with NaN in labeled_trips.
label_summary_df = pd.DataFrame({
    "total_trips": trips_per_program,
    "labeled_trips": labeled_per_program,
})
label_summary_df

In [None]:
# Share of labeled trips per program, expressed as a percentage.
label_summary_df["label_pct"] = (
    label_summary_df["labeled_trips"]
    .div(label_summary_df["total_trips"])
    .mul(100)
)

In [None]:
# Display the per-program summary table.
label_summary_df

In [None]:
# One bar per program showing its labeling percentage.
label_summary_df["label_pct"].plot.bar(
    ylabel="% of labeled trips",
    title="Variation in labeling percentage across programs",
)

In [None]:
# Same summary as the per-program one, but broken down per (program, user).
program_user_keys = ["program", "user_id_str"]
trips_per_user = trip_program_df.groupby(program_user_keys)["source"].count()
labeled_per_user = labeled_ct.groupby(program_user_keys)["source"].count()
# Users absent from labeled_ct end up with NaN in labeled_trips.
label_user_summary_df = pd.DataFrame({
    "total_trips": trips_per_user,
    "labeled_trips": labeled_per_user,
})

In [None]:
# Share of labeled trips per user, expressed as a percentage.
label_user_summary_df["label_pct"] = (
    label_user_summary_df["labeled_trips"]
    .div(label_user_summary_df["total_trips"])
    .mul(100)
)
label_user_summary_df

In [None]:
# One bar per user; x tick labels are suppressed via an empty tick list.
label_user_summary_df["label_pct"].plot.bar(xticks=[])

In [None]:
# Move the (program, user_id_str) index levels into ordinary columns so that
# later cells can refer to "program" by column name.
flattened_label_user_summary_df = label_user_summary_df.reset_index()

In [None]:
# Display the flattened per-user summary.
flattened_label_user_summary_df

In [None]:
# Distribution of per-user labeling percentage, one box per program.
# The returned object is used as a matplotlib Axes in the calls below.
ax = flattened_label_user_summary_df.boxplot(column=['label_pct'], by="program")
# Blank the axes-level title and set the figure-level title explicitly.
ax.set_title("")
ax.set_ylabel("percentage of labeled trips")
ax.get_figure().suptitle("Labeling rate by program")

In [None]:
# Drop rows containing any NaN (e.g. users without a labeled-trip count),
# order the remaining users by labeling percentage, and draw one bar per user
# with the x tick labels suppressed.
users_without_nan = flattened_label_user_summary_df.dropna()
users_by_label_pct = users_without_nan.sort_values(by="label_pct")
users_by_label_pct.plot(kind="bar", y="label_pct", xticks=[])

In [None]:
import seaborn as sns

In [None]:
# Seaborn bar plot of per-user label_pct grouped by program
# (bar height and error bars follow seaborn's default estimator settings).
sns.barplot(data=flattened_label_user_summary_df, x="program", y="label_pct")