In [None]:
# Notebook parameters.
# year / month are passed to scaffolding.get_time_query() further down; both are None here.
year = None
month = None
# Program name passed to scaffolding.load_all_participant_trips() further down.
program = "prepilot"
# NOTE(review): the three settings below are not read anywhere in the visible part of this notebook — confirm they are still needed.
include_replaced_modes_as_valid = False # Flip this when we want to get results versus generate the replaced_mode correction graphs
input_dataset = "ONLY_SENSED" # "ONLY_LABELED", "ONLY_SENSED" or "BEST_AVAILABLE" for sensitivity analysis
LABEL_ASSIST_THRESHOLD = 0.3

In [None]:
import pandas as pd

import emission.core.get_database as edb
import emission.core.wrapper.entry as ecwe
import emission.storage.decorations.analysis_timeseries_queries as esda
import emission.storage.decorations.trip_queries as esdt
import emission.storage.decorations.timeline as esdl
import emission.storage.timeseries.abstract_timeseries as esta
import emission.storage.timeseries.timequery as estt
import scaffolding
from uuid import UUID

%matplotlib inline

### Load

In [None]:
def fix_minipilot_tokens():
    """Prefix the stored token of every known minipilot user with ``prepilot_``.

    For each hard-coded minipilot UUID, the matching document in the UUID
    database is looked up. Unless the part of its ``user_email`` before the
    first underscore is already ``prepilot``, the field is rewritten as
    ``"prepilot_" + <old token>``. Documents that already carry the prefix are
    left untouched, so re-running the function does not add a second prefix.

    UUIDs that have no document in the database are reported and skipped
    rather than aborting the whole run.

    Side effects: updates documents in the UUID database and prints one line
    per update or skipped UUID. Returns None.
    """
    minipilot_uuid_list = [
        UUID("576e37c7-ab7e-4c03-add7-02486bc3f42e"),
        UUID("8b563348-52b3-4e3e-b046-a0aaf4fcea15"),
        UUID("5079bb93-c9cf-46d7-a643-dfc86bb05605"),
        UUID("feabfccd-dd6c-4e8e-8517-9d7177042483"),
        UUID("113aef67-400e-4e21-a29f-d04e50fc42ea"),
        UUID("c8b9fe22-86f8-449a-b64f-c18a8d20eefc"),
        UUID("e7b24d99-324d-4d6d-b247-9edc87d3c848"),
        UUID("1044195f-af9e-43d4-9407-60594e5e9938"),
        UUID("898b1a5e-cdd4-4a0c-90e4-942fa298e456"),
        UUID("1d292b85-c549-409a-a10d-746e957582a0"),
        UUID("cb3222a7-1e72-4a92-8b7b-2c4795402497"),
        UUID("efdbea3b-eef6-48fc-9558-7585f4ad6f24"),
        UUID("960835ac-9d8a-421d-8b8a-bf816f8a4b92"),
    ]
    # Fetch the collection handle once instead of on every lookup/update.
    uuid_db = edb.get_uuid_db()
    for minipilot_uuid in minipilot_uuid_list:
        curr_ue = uuid_db.find_one({"uuid": minipilot_uuid})
        if curr_ue is None:
            # find_one returns None when nothing matches; indexing it would raise TypeError.
            print("no uuid entry found for %s, skipping" % minipilot_uuid)
            continue
        curr_token = curr_ue["user_email"]
        if curr_token.split("_")[0] != "prepilot":
            curr_update_result = uuid_db.update_one(
                {"uuid": minipilot_uuid},
                {"$set": {"user_email": "prepilot_"+curr_token}})
            print("update result for %s is %s" % (minipilot_uuid, curr_update_result.raw_result))

In [None]:
# Apply the token fix before users are grouped by program below; this call writes to the UUID database.
fix_minipilot_tokens()

In [None]:
# Split UUIDs by program
# The program is the part of the user's token before the first underscore.
# A dedicated loop variable (`ue_program`) is used so that the `program`
# notebook parameter, which is later passed to
# scaffolding.load_all_participant_trips(), is not overwritten by this loop.
program_uuid_map = {}
for ue in edb.get_uuid_db().find():
    ue_program = ue['user_email'].split("_")[0]
    if ue_program not in program_uuid_map:
        print(f"Found new program {ue_program}, creating new list")
        program_uuid_map[ue_program] = []
    program_uuid_map[ue_program].append(str(ue['uuid']))

In [None]:
# One record per user: program (token prefix), opcode (full token) and stringified UUID.
# Built with a comprehension so that no loop variable named `program` leaks into
# the notebook namespace and overwrites the `program` parameter used when loading trips.
uuid_program_list = [
    {
        "program": ue["user_email"].split("_")[0],
        "opcode": ue["user_email"],
        "user_id_str": str(ue["uuid"]),
    }
    for ue in edb.get_uuid_db().find()
]

In [None]:
# Show which program prefixes were found in the UUID database.
program_uuid_map.keys()

In [None]:
# Tabular view of the per-user records (columns: program, opcode, user_id_str).
uuid_program_df = pd.DataFrame(uuid_program_list)
uuid_program_df.head()

In [None]:
# Users whose program is '4c'. A boolean mask is used instead of
# groupby(...).groups['4c'] so the cell shows an empty frame, rather than
# raising KeyError, when the database has no '4c' user.
uuid_program_df[uuid_program_df.program == '4c']

In [None]:
# UUIDs registered under the '4c' program; an empty list (not a KeyError) when there are none.
program_uuid_map.get('4c', [])

In [None]:
# Load trips via scaffolding, using the time query built from `year`/`month`
# and the current value of `program`.
# NOTE(review): the filtering these scaffolding helpers apply is not visible here — confirm in scaffolding.
tq = scaffolding.get_time_query(year, month)
participant_ct_df = scaffolding.load_all_participant_trips(program, tq)

In [None]:
# String form of each trip's user id; this is the join key shared with uuid_program_df.
participant_ct_df["user_id_str"] = participant_ct_df.user_id.apply(str)
# Number of distinct users that have at least one trip.
participant_ct_df.user_id_str.nunique()

### Transform

In [None]:
# Attach program and opcode to every trip via the stringified user id.
# merge() defaults to an inner join, so trips whose user is absent from uuid_program_df are dropped.
trip_program_df = participant_ct_df.merge(uuid_program_df, on="user_id_str")

In [None]:
# Spot-check a reproducible random sample of the merged trips.
# The sample size is capped at the number of rows because DataFrame.sample
# raises ValueError when n exceeds the population (sampling without replacement).
sample_columns = ["program", "user_id_str", "opcode", "_id", "start_fmt_time", "end_fmt_time"]
trip_program_df.sample(n=min(50, len(trip_program_df)), random_state=123)[sample_columns]

In [None]:
# Keep only the trips that scaffolding.filter_labeled_trips selects
# (presumably those carrying user-provided labels — confirm in scaffolding).
labeled_ct = scaffolding.filter_labeled_trips(trip_program_df)

In [None]:
# Per-program trip counts: all merged trips versus the labeled subset.
trips_per_program = trip_program_df.groupby("program").source.count()
labeled_trips_per_program = labeled_ct.groupby("program").source.count()
label_summary_df = pd.DataFrame({
    "total_trips": trips_per_program,
    "labeled_trips": labeled_trips_per_program,
})
label_summary_df

In [None]:
# Share of labeled trips per program, expressed as a percentage.
label_summary_df["label_pct"] = label_summary_df.labeled_trips.div(label_summary_df.total_trips).mul(100)

In [None]:
# Display the per-program summary, now including the label_pct column.
label_summary_df

In [None]:
# Same counts as the program summary, but broken down per (program, user).
user_group_keys = ["program", "user_id_str"]
label_user_summary_df = pd.DataFrame({
    "total_trips": trip_program_df.groupby(user_group_keys).source.count(),
    "labeled_trips": labeled_ct.groupby(user_group_keys).source.count(),
})

In [None]:
# Share of labeled trips per user, expressed as a percentage.
label_user_summary_df["label_pct"] = label_user_summary_df.labeled_trips.div(label_user_summary_df.total_trips).mul(100)
label_user_summary_df

In [None]:
# Program-level summary without the "prepilot" row (drop returns a new frame; label_summary_df is unchanged).
label_summary_df.drop("prepilot")

In [None]:
# Per-user summary without the rows of the "prepilot" program (label_user_summary_df is unchanged).
label_user_summary_df.drop("prepilot")

### Plot

In [None]:
# Bar chart of the labeling percentage for every program.
label_summary_df.label_pct.plot.bar(
    ylabel="% of labeled trips",
    title="Variation in labeling percentage across programs",
)

In [None]:
# Same bar chart with the "prepilot" program left out.
non_prepilot_label_pct = label_summary_df.drop("prepilot").label_pct
non_prepilot_label_pct.plot.bar(
    ylabel="% of labeled trips",
    title="Variation in labeling percentage across programs",
)

In [None]:
# One bar per user; x tick labels are suppressed.
ax = label_user_summary_df.label_pct.plot.bar(xticks=[])
ax.set_ylabel("Percentage of trips labeled")
ax.set_xlabel("user")
ax.set_title("Percentage of trips labeled by each user")

In [None]:
# One bar per user, excluding users of the "prepilot" program.
non_prepilot_user_label_pct = label_user_summary_df.drop("prepilot").label_pct
ax = non_prepilot_user_label_pct.plot.bar(xticks=[])
ax.set_ylabel("Percentage of trips labeled")
ax.set_xlabel("user")
ax.set_title("Percentage of trips labeled by each user")

In [None]:
# Turn the (program, user_id_str) index levels back into ordinary columns for the plots below.
flattened_label_user_summary_df = label_user_summary_df.reset_index()

In [None]:
# Display the flattened per-user summary.
flattened_label_user_summary_df

In [None]:
# Flattened per-user summary with the "prepilot" program's rows removed first.
no_prepilot_flattened_label_user_summary_df = label_user_summary_df.drop("prepilot").reset_index()

In [None]:
# Display the flattened per-user summary that excludes "prepilot".
no_prepilot_flattened_label_user_summary_df

In [None]:
# Distribution of per-user labeling percentage within each program.
ax = flattened_label_user_summary_df.boxplot(column=['label_pct'], by="program")
ax.set_ylabel("percentage of labeled trips")
# Blank the per-axes title and set the figure-level title instead.
ax.set_title("")
fig = ax.get_figure()
fig.suptitle("Labeling rate by program")

In [None]:
# Distribution of per-user labeling percentage within each program, "prepilot" excluded.
ax = no_prepilot_flattened_label_user_summary_df.boxplot(column=['label_pct'], by="program")
ax.set_ylabel("percentage of labeled trips")
# Blank the per-axes title and set the figure-level title instead.
ax.set_title("")
fig = ax.get_figure()
fig.suptitle("Labeling rate by program")

In [None]:
# Users ordered by labeling percentage; rows containing NaN are dropped first.
users_by_label_pct = flattened_label_user_summary_df.dropna().sort_values(by="label_pct")
ax = users_by_label_pct.label_pct.plot.bar(y="label_pct", xticks=[])
ax.set_ylabel("Percentage of trips labeled")
ax.set_xlabel("user")
ax.set_title("Percentage of trips labeled by user")

In [None]:
# Non-prepilot users ordered by labeling percentage; rows containing NaN are dropped first.
non_prepilot_users_by_label_pct = no_prepilot_flattened_label_user_summary_df.dropna().sort_values(by="label_pct")
ax = non_prepilot_users_by_label_pct.label_pct.plot.bar(y="label_pct", xticks=[])
ax.set_ylabel("Percentage of trips labeled")
ax.set_xlabel("user")
ax.set_title("Percentage of trips labeled by user")

In [None]:
import seaborn as sns

In [None]:
# Mean per-user labeling percentage for each program.
# seaborn's barplot draws a 95% confidence interval of the mean as its default
# error bar, not the variance, so the title states what is actually plotted.
ax = sns.barplot(data=flattened_label_user_summary_df, x="program", y="label_pct")
ax.set_ylabel("Percentage of trips labeled")
ax.set_title("Mean labeling rate and 95% confidence interval per program")

In [None]:
# Mean per-user labeling percentage for each program, "prepilot" excluded.
# seaborn's barplot draws a 95% confidence interval of the mean as its default
# error bar, not the variance, so the title states what is actually plotted.
ax = sns.barplot(data=no_prepilot_flattened_label_user_summary_df, x="program", y="label_pct")
ax.set_ylabel("Percentage of trips labeled")
ax.set_title("Mean labeling rate and 95% confidence interval per program")

### Describe

In [None]:
def describe_label_info(df):
    """Print trip and user counts describing how well ``df`` is labeled.

    ``df`` must provide the columns ``user_id``, ``user_input``,
    ``inferred_labels``, ``program`` and ``source``. The function prints, in
    order: overall totals, trips with / without a user label, how the unlabeled
    trips split by presence of inferred labels, the 'stage' versus other
    programs split, and a per-program dict of trip and unique-user counts.

    Returns None; the result is the printed text only.
    """
    def _n_users(frame):
        # Number of distinct user ids present in the frame.
        return len(frame.user_id.unique())

    def _lacks_inferred_labels(labels):
        # Anything that is not exactly a list (e.g. NaN) counts as having no inferred labels.
        return type(labels) is not list or len(labels) == 0

    print(f"Total number of trips {len(df)} from {_n_users(df)} unique users")

    # A trip counts as user-labeled when its user_input is not the empty dict.
    user_labeled_df = df[df.user_input != {}]
    print(f"Number of trips with at least one label {len(user_labeled_df)} from {_n_users(user_labeled_df)} unique users")

    no_user_label_ct_df = df[df.user_input == {}]
    print(f"Trips without user specified labels {len(no_user_label_ct_df)} from {_n_users(no_user_label_ct_df)} users")

    no_inferred_mask = no_user_label_ct_df.inferred_labels.apply(_lacks_inferred_labels)
    print(f"Trips without user label but with inferred label {len(no_user_label_ct_df[~no_inferred_mask])}")
    print(f"Trips without user label or inferred label {len(no_user_label_ct_df[no_inferred_mask])}")

    stage_df = df[df.program == 'stage']
    real_program_df = df[df.program != 'stage']
    print(f"Number of trips in stage = {len(stage_df)} from {_n_users(stage_df)} unique users")
    print(f"Number of trips in real programs = {len(real_program_df)} from {_n_users(real_program_df)} unique users")

    by_program = df.groupby("program")
    count_df = pd.DataFrame({
        "trips": by_program.source.count(),
        "unique_users": by_program.user_id.unique().apply(len),
    })
    print("Program specific counts: ", count_df.to_dict())

In [None]:
# test_df = pd.DataFrame({"inferred_labels": [[], np.NaN, [1,2], [3.4]]})

In [None]:
# test_df

In [None]:
# is_empty_check = lambda ll: len(ll) == 0 if type(ll) == list else True
# test_df.inferred_labels.apply(is_empty_check)

In [None]:
# Summary over all merged trips.
describe_label_info(trip_program_df)

In [None]:
# Same summary, excluding trips of the "prepilot" program.
describe_label_info(trip_program_df[trip_program_df.program != "prepilot"])

In [None]:
# Summary restricted to the trips returned by scaffolding.filter_labeled_trips.
describe_label_info(labeled_ct)

In [None]:
# Labeled-trip summary, excluding trips of the "prepilot" program.
describe_label_info(labeled_ct[labeled_ct.program != "prepilot"])

In [None]:
# Client OS versions (Android and iOS)

In [None]:
# Flatten the profile documents into columns, then attach program/opcode via the stringified user id.
profile_docs = list(edb.get_profile_db().find())
profile_df = pd.json_normalize(profile_docs)
profile_df["user_id_str"] = profile_df.user_id.apply(str)
profile_program_df = profile_df.merge(uuid_program_df, on="user_id_str")

In [None]:
# OS version mix of Android users outside the 'stage' program.
android_real_os_versions = profile_program_df.query("curr_platform == 'android' & program != 'stage'").client_os_version
android_real_os_versions.value_counts().plot.pie()

In [None]:
# OS version mix of all iOS users, 'stage' included.
ios_os_versions = profile_program_df.query("curr_platform == 'ios'").client_os_version
ios_os_versions.value_counts().plot.pie()

In [None]:
# OS version mix of iOS users outside the 'stage' program.
ios_real_os_versions = profile_program_df.query("curr_platform == 'ios' & program != 'stage'").client_os_version
ios_real_os_versions.value_counts().plot.pie()

In [None]:
# NOTE(review): the participant list is the literal `[...]` here, i.e. a one-element list holding Ellipsis.
# It presumably has to be filled with user_id_str values before the isin() filters below can match anything — confirm.
high_income_participants = [...]
# Program/opcode rows for the selected participants.
uuid_program_df[uuid_program_df.user_id_str.isin(high_income_participants)]

In [None]:
# Convert each trip's start timestamp into a timezone-aware (UTC) datetime.
# pandas does the conversion because `arrow` is not imported anywhere in the
# visible notebook, so the earlier arrow.get() call raised NameError on a fresh kernel.
# NOTE(review): assumes start_ts holds epoch seconds — confirm against the trip schema.
participant_ct_df["start_ts_dt"] = pd.to_datetime(participant_ct_df.start_ts, unit="s", utc=True)

In [None]:
# Earliest trip start for each selected participant.
participant_ct_df[participant_ct_df.user_id_str.isin(high_income_participants)].groupby("user_id_str").start_ts_dt.min()

In [None]:
# Latest trip start for each selected participant.
participant_ct_df[participant_ct_df.user_id_str.isin(high_income_participants)].groupby("user_id_str").start_ts_dt.max()

In [None]:
# Trip distance over time, plotted separately for each selected participant.
participant_ct_df[participant_ct_df.user_id_str.isin(high_income_participants)].groupby('user_id_str').plot(x="start_ts_dt", y="distance", subplots=True)

In [None]:
# Load the survey export (presumably the English-language survey — confirm); the path is relative to the working directory.
en_survey = pd.read_csv("en_survey.csv")

In [None]:
def convert_to_uuid_format(us):
    """Return ``us`` as a canonical UUID string, or None if it is not a valid UUID.

    The canonical form is the lowercase, hyphenated string produced by
    ``str(UUID(us))``.

    Besides malformed strings (ValueError), non-string values are also mapped
    to None: ``UUID()`` raises AttributeError for values such as ints or
    floats and TypeError for None, which can occur when a survey column
    contains entries that were not parsed as text.
    """
    try:
        return str(UUID(us))
    except (ValueError, AttributeError, TypeError):
        return None
    
# Normalise the survey's user id column with convert_to_uuid_format.
# Rows with a missing id are dropped before conversion, so after index alignment they hold NaN in user_id_str.
en_survey["user_id_str"] = en_survey['Unique User ID (auto-filled, do not edit)'].dropna().apply(convert_to_uuid_format)

In [None]:
# Survey responses belonging to the selected participants.
en_survey[en_survey.user_id_str.isin(high_income_participants)]