# Connect to DB

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import time
import numpy as np
import datetime
import copy
import seaborn as sns
import scipy
import psycopg2

In [2]:
# Prompt for the database credentials.
# The password is read with getpass so it is never echoed into the notebook
# output (a plain input() call would display it as it is typed).
from getpass import getpass

from IPython.display import clear_output

username = input("Username: ")
password = getpass("Password: ")

# Clear the cell output so the typed username does not stay on screen.
clear_output()

In [None]:
# Open the Postgres connection with the credentials entered earlier and
# create a cursor on it.
conn = psycopg2.connect(
    host="portal-production-postgres.c1trxszive18.us-west-2.rds.amazonaws.com",
    port="5432",
    database="portal_production_postgres",
    user=username,
    password=password,
)

cursor = conn.cursor()

# Extracting Delorean Incidents

In [None]:
# Load every api_incident row whose JSON `data` field has
# cooldown_tag = 'True' (the rows this notebook treats as Delorean incidents).
sql = """
SELECT
*
FROM api_incident
WHERE 
data->>'cooldown_tag' = 'True'
"""

delorean_incidents = pd.read_sql(sql=sql, con=conn)

In [None]:
import plotly.express as px

# Histogram of the Delorean incidents per title, one colour per title.
fig = px.histogram(
    data_frame=delorean_incidents,
    x="title",
    color="title",
    barmode="group",
    height=400,
)
fig.show()

In [None]:
from enum import Enum, unique


@unique
class ReviewType(Enum):
    """Review outcome assigned to an incident from its feedback counts."""

    DEFAULT_UNKNOWN = 0
    TRUE_POSITIVE = 1
    FALSE_POSITIVE = 2
    UNSURE = 3


def positive_negative_to_type(valid_count, invalid_count, unsure_count):
    """Map an incident's feedback counts onto a ``ReviewType``.

    Args:
        valid_count: Number of "valid" feedback entries.
        invalid_count: Number of "invalid" feedback entries.
        unsure_count: Number of "unsure" feedback entries. Accepted for
            symmetry with the other counts but not consulted.

    Returns:
        ``TRUE_POSITIVE`` when there is valid feedback and no invalid
        feedback, ``FALSE_POSITIVE`` when there is invalid feedback and no
        valid feedback, and ``UNSURE`` in every other case. That includes
        conflicting feedback and no feedback at all; ``DEFAULT_UNKNOWN`` is
        never returned.
    """
    if valid_count > 0 and invalid_count == 0:
        category = ReviewType.TRUE_POSITIVE
    elif invalid_count > 0 and valid_count == 0:
        category = ReviewType.FALSE_POSITIVE
    else:
        category = ReviewType.UNSURE
    return category


# Tag every Delorean incident with the ReviewType derived from its
# valid / invalid / unsure feedback counts.
delorean_incidents["review_category"] = delorean_incidents.apply(
    lambda incident: positive_negative_to_type(
        valid_count=incident["valid_feedback_count"],
        invalid_count=incident["invalid_feedback_count"],
        unsure_count=incident["unsure_feedback_count"],
    ),
    axis=1,
)

The origin of the data will always be the true regular incident counts, so let's get those.

In [None]:
# Distinct camera UUIDs (taken from the JSON `data` field) that have at least
# one incident with cooldown_tag = 'True'.
sql = """
SELECT
data->>'camera_uuid' as c_uuid
FROM api_incident
WHERE 
data->>'cooldown_tag' = 'True'
group by
c_uuid
"""

camera_uuids = pd.read_sql(sql=sql, con=conn)

In [None]:
# Incidents with cooldown_tag = 'False' that
#   * come from a camera with at least one cooldown_tag = 'True' incident, and
#   * were created after the earliest cooldown_tag = 'True' incident.
# The camera UUID is also exposed as an extra `c_uuid` column.
sql = """
SELECT *,
data->>'camera_uuid' as c_uuid
from api_incident
where
data->>'camera_uuid' in (
SELECT
data->>'camera_uuid' as c_uuid
FROM api_incident
WHERE 
data->>'cooldown_tag' = 'True'
group by
c_uuid 
) and created_at >
(
SELECT
MIN(created_at)
FROM api_incident
WHERE 
data->>'cooldown_tag' = 'True'
)
and 
data->>'cooldown_tag' = 'False'
"""

all_non_delorean_incidents = pd.read_sql(sql=sql, con=conn)

In [None]:
# Tag every regular (non-Delorean) incident with the ReviewType derived from
# its valid / invalid / unsure feedback counts.
all_non_delorean_incidents["review_category"] = all_non_delorean_incidents.apply(
    lambda incident: positive_negative_to_type(
        valid_count=incident["valid_feedback_count"],
        invalid_count=incident["invalid_feedback_count"],
        unsure_count=incident["unsure_feedback_count"],
    ),
    axis=1,
)

In [None]:
def incident_data_to_camera_uuid(data):
    """Return the ``camera_uuid`` entry of an incident's ``data`` mapping.

    Raises:
        KeyError: If ``data`` has no ``"camera_uuid"`` key.
    """
    camera_uuid = data["camera_uuid"]
    return camera_uuid

In [None]:
# Copy the camera UUID out of each incident's `data` payload into its own
# column, for both the regular and the Delorean incidents.
all_non_delorean_incidents["camera_uuid"] = all_non_delorean_incidents.apply(
    lambda incident: incident_data_to_camera_uuid(incident["data"]),
    axis=1,
)
delorean_incidents["camera_uuid"] = delorean_incidents.apply(
    lambda incident: incident_data_to_camera_uuid(incident["data"]),
    axis=1,
)

In [None]:
def evaluate_fp_rate(rows):
    """Compute the false-positive and unsure rates of a set of incidents.

    Args:
        rows: DataFrame with a ``review_category`` column holding
            ``ReviewType`` members.

    Returns:
        A ``(fp_rate, unsure_rate)`` tuple: the fraction of rows categorised
        as ``FALSE_POSITIVE`` and as ``UNSURE``. Both denominators are the
        total row count, so true positives lower both rates. For an empty
        frame the rates are undefined and ``(nan, nan)`` is returned instead
        of dividing by zero.
    """
    n = len(rows)
    if n == 0:
        undefined = float("nan")
        return (undefined, undefined)

    categories = rows["review_category"]
    # int() keeps the results plain Python floats, as len()-based counts did.
    fp = int((categories == ReviewType.FALSE_POSITIVE).sum())
    u_count = int((categories == ReviewType.UNSURE).sum())
    return (fp / n, u_count / n)


# (fp_rate, unsure_rate) of the regular incidents per (camera, title) pair.
grouped_non_delorean_results = all_non_delorean_incidents.groupby(
    by=["camera_uuid", "title"]
).apply(evaluate_fp_rate)

In [None]:
def incident_group_results(grouping_generator):
    """Summarise each group of incidents as one row of rates and counts.

    Args:
        grouping_generator: Iterable of ``(group_key, DataFrame)`` pairs,
            e.g. a pandas ``groupby`` object.

    Returns:
        DataFrame with one row per group and the columns ``GroupKey``
        (``str()`` of the group key), ``count`` (rows in the group),
        ``fp_rate`` and ``unsure_rate`` (from ``evaluate_fp_rate``).
    """

    def summarize(group_key, group_df):
        fp_rate, unsure_rate = evaluate_fp_rate(group_df)
        return {
            "GroupKey": str(group_key),
            "count": len(group_df),
            "fp_rate": fp_rate,
            "unsure_rate": unsure_rate,
        }

    summaries = [
        summarize(group_key, group_df)
        for group_key, group_df in grouping_generator
    ]
    return pd.DataFrame.from_dict(summaries)


# Per (camera, title) summary of the Delorean incidents.
g_delorean_results = incident_group_results(
    grouping_generator=delorean_incidents.groupby(by=["camera_uuid", "title"])
)

In [None]:
# False-positive rate of the Delorean incidents, one bar per group.
fig = px.bar(
    data_frame=g_delorean_results,
    x="GroupKey",
    y="fp_rate",
    height=1000,
)
fig.show()

In [None]:
# Per (camera, title) summary of the regular (non-Delorean) incidents.
g_non_delorean_results = incident_group_results(
    grouping_generator=all_non_delorean_incidents.groupby(
        by=["camera_uuid", "title"]
    )
)

In [None]:
# False-positive rate of the regular incidents, one bar per group.
fig = px.bar(
    data_frame=g_non_delorean_results,
    x="GroupKey",
    y="fp_rate",
    height=1000,
)
fig.show()

In [None]:
# Label each summary with its source so rows can be told apart once the two
# frames are combined. A scalar assignment broadcasts the label to every row;
# no row-wise apply() is needed just to produce a constant.
g_non_delorean_results["origin"] = "non_delorean"
g_delorean_results["origin"] = "delorean"

In [None]:
# Stack the Delorean and regular summaries into a single frame.
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported
# replacement and, like append, keeps each frame's original index.
merged = pd.concat([g_delorean_results, g_non_delorean_results])

In [None]:
# Delorean vs. regular false-positive rate side by side for every group.
fig = px.bar(
    data_frame=merged,
    x="GroupKey",
    y="fp_rate",
    color="origin",
    barmode="group",
    height=1000,
)
fig.show()

In [None]:
def grouper_flat(merged_df_grouper):
    """Pivot per-origin summary rows into one wide row per group.

    Args:
        merged_df_grouper: Iterable of ``(group_id, DataFrame)`` pairs where
            each frame has the columns ``origin``, ``fp_rate``, ``count`` and
            ``GroupKey``.

    Returns:
        DataFrame with one row per group that has at least two rows. Rows
        whose ``origin`` is ``"delorean"`` fill ``delorean_fp_rate`` /
        ``delorean_count``; all other rows fill ``regular_fp_rate`` /
        ``regular_count``. Groups with fewer than two rows are dropped.
    """
    flattened = []
    for _group_id, group_rows in merged_df_grouper:
        # Only groups seen on both sides can be compared; skip the rest.
        if len(group_rows) >= 2:
            record = {}
            for _, row in group_rows.iterrows():
                is_delorean = row["origin"] == "delorean"
                rate_key = "delorean_fp_rate" if is_delorean else "regular_fp_rate"
                count_key = "delorean_count" if is_delorean else "regular_count"
                record[rate_key] = row["fp_rate"]
                record[count_key] = row["count"]
                record["GroupKey"] = row["GroupKey"]
            flattened.append(record)
    return pd.DataFrame.from_dict(flattened)


# One wide row per group that appears in both the Delorean and regular data.
flat_merged = grouper_flat(
    merged_df_grouper=merged.groupby(by="GroupKey")
)

In [None]:
# Regular vs. Delorean false-positive rate per group; the dotted black line is
# the diagonal from (0, 0) to (1, 1), i.e. where both rates are equal.
fig = px.scatter(
    data_frame=flat_merged,
    x="regular_fp_rate",
    y="delorean_fp_rate",
    color="GroupKey",
    hover_data=["delorean_count", "regular_count"],
)
fig.add_scatter(
    x=[0, 1],
    y=[0, 1],
    line=dict(dash="dot", color="black"),
)
fig.show()

In [None]:
# Absolute gap between the Delorean and the regular false-positive rate of
# each group, computed column-wise instead of with a row-by-row apply().
flat_merged["extrapolation_error"] = (
    flat_merged["delorean_fp_rate"] - flat_merged["regular_fp_rate"]
).abs()

In [None]:
from scipy.stats import binomtest

# Distribution of the absolute extrapolation error, normalised to percent.
fig = px.histogram(
    data_frame=flat_merged,
    x="extrapolation_error",
    histnorm="percent",
)
fig.show()

In [None]:
from scipy.stats import binomtest


def calculate_error(false_positives, n, extrapolated_rate):
    """Return error-bar lengths and a p-value for an observed FP count.

    Args:
        false_positives: Observed number of false positives (integer).
        n: Number of trials the count was observed in (integer).
        extrapolated_rate: Hypothesised false-positive probability that the
            observation is tested against.

    Returns:
        ``((low_error, high_error), pvalue)``. ``low_error`` and
        ``high_error`` are the distances from the observed rate
        ``false_positives / n`` down to the lower and up to the upper bound
        of the 95% confidence interval reported by
        ``binomtest(...).proportion_ci``. ``pvalue`` is the p-value of the
        binomial test against ``extrapolated_rate``.
    """
    binomial_test = binomtest(false_positives, n=n, p=extrapolated_rate)
    confidence_interval = binomial_test.proportion_ci(confidence_level=0.95)
    observed_rate = false_positives / n
    low_error = observed_rate - confidence_interval.low
    high_error = confidence_interval.high - observed_rate
    return (low_error, high_error), binomial_test.pvalue


def calculate_error_for_row(row):
    """Run ``calculate_error`` for one row of the flattened comparison table.

    Args:
        row: Mapping/Series with ``regular_fp_rate`` (the hypothesised rate),
            ``delorean_fp_rate`` (the observed rate) and ``delorean_count``
            (the number of observations behind the observed rate).

    Returns:
        The ``((low_error, high_error), pvalue)`` tuple of
        ``calculate_error``.
    """
    extrapolated = row["regular_fp_rate"]
    n = int(row["delorean_count"])
    # The false-positive count is rebuilt as count * rate. That product can
    # land just below the true integer (49 * (1 / 49) == 0.9999999999999999),
    # so round to the nearest integer instead of truncating toward zero.
    fp_count = int(round(n * row["delorean_fp_rate"]))
    return calculate_error(fp_count, n, extrapolated)


def _show_fp_rate_scatter(flat_merged, **error_bar_kwargs):
    """Show the regular-vs-Delorean FP-rate scatter with a dotted diagonal."""
    fig = px.scatter(
        flat_merged,
        x="regular_fp_rate",
        y="delorean_fp_rate",
        color="GroupKey",
        hover_data=["delorean_count", "regular_count", "p_value"],
        **error_bar_kwargs,
    )
    # Diagonal from (0, 0) to (1, 1): points on it have equal rates.
    fig.add_scatter(x=[0, 1], y=[0, 1], line={"dash": "dot", "color": "black"})
    fig.show()


def evaluate_group(delorean_grouper, regular_grouper):
    """Compare Delorean and regular false-positive rates per group and plot.

    Args:
        delorean_grouper: Iterable of ``(group_key, DataFrame)`` pairs for the
            Delorean incidents (e.g. a pandas ``groupby``).
        regular_grouper: The same for the regular incidents, grouped by the
            same keys so that the ``GroupKey`` strings line up.

    Returns:
        None. Shows four figures: the FP-rate scatter with error bars, the
        same scatter without error bars, a percent-normalised histogram of
        the absolute rate difference, and its empirical CDF.
    """
    g_non_delorean_results = incident_group_results(regular_grouper)
    g_delorean_results = incident_group_results(delorean_grouper)
    # Scalar assignment broadcasts the label to every row.
    g_non_delorean_results["origin"] = "non_delorean"
    g_delorean_results["origin"] = "delorean"
    # DataFrame.append was removed in pandas 2.0; pd.concat is the supported
    # replacement and keeps the same row order and index.
    merged = pd.concat([g_delorean_results, g_non_delorean_results])
    flat_merged = grouper_flat(merged.groupby("GroupKey"))

    # Each element is ((low_error, high_error), p_value).
    error_rates = flat_merged.apply(calculate_error_for_row, axis=1)
    # add error bars
    flat_merged["e_minus"] = error_rates.apply(lambda x: x[0][0])
    flat_merged["e_plus"] = error_rates.apply(lambda x: x[0][1])
    flat_merged["p_value"] = error_rates.apply(lambda x: x[1])

    _show_fp_rate_scatter(
        flat_merged, error_y="e_plus", error_y_minus="e_minus"
    )
    _show_fp_rate_scatter(flat_merged)

    flat_merged["extrapolation_error"] = (
        flat_merged["delorean_fp_rate"] - flat_merged["regular_fp_rate"]
    ).abs()
    fig = px.histogram(
        flat_merged, x="extrapolation_error", histnorm="percent"
    )
    fig.show()

    fig = px.ecdf(flat_merged, x="extrapolation_error")
    fig.show()

In [None]:
# Compare the rates grouped by incident title only.
evaluate_group(
    delorean_grouper=delorean_incidents.groupby(by=["title"]),
    regular_grouper=all_non_delorean_incidents.groupby(by=["title"]),
)

In [None]:
# Compare the rates grouped by title and camera.
evaluate_group(
    delorean_grouper=delorean_incidents.groupby(by=["title", "camera_uuid"]),
    regular_grouper=all_non_delorean_incidents.groupby(
        by=["title", "camera_uuid"]
    ),
)

In [None]:
# Add the weekday of each incident's creation time as its own column.
# NOTE(review): assumes `created_at` holds datetime-like values exposing
# .weekday() — confirm against the query result dtypes.
delorean_incidents["day_of_week"] = delorean_incidents.apply(
    lambda incident: incident["created_at"].weekday(),
    axis=1,
)
all_non_delorean_incidents["day_of_week"] = all_non_delorean_incidents.apply(
    lambda incident: incident["created_at"].weekday(),
    axis=1,
)

In [None]:
# Compare the rates grouped by title and day of week.
evaluate_group(
    delorean_grouper=delorean_incidents.groupby(by=["title", "day_of_week"]),
    regular_grouper=all_non_delorean_incidents.groupby(
        by=["title", "day_of_week"]
    ),
)

In [None]:
# Compare the rates grouped by title, day of week and camera.
evaluate_group(
    delorean_grouper=delorean_incidents.groupby(
        by=["title", "day_of_week", "camera_uuid"]
    ),
    regular_grouper=all_non_delorean_incidents.groupby(
        by=["title", "day_of_week", "camera_uuid"]
    ),
)

In [None]:
# add the organization too
delorean_incidents["organization_location"] = delorean_incidents.apply(
    lambda row: "".join(row["data"]["camera_uuid"].split("/")[:2]), axis=1
)
all_non_delorean_incidents[
    "organization_location"
] = all_non_delorean_incidents.apply(
    lambda row: "".join(row["data"]["camera_uuid"].split("/")[:2]), axis=1
)

In [None]:
# Compare the rates grouped by title and the organization_location key.
evaluate_group(
    delorean_grouper=delorean_incidents.groupby(
        by=["title", "organization_location"]
    ),
    regular_grouper=all_non_delorean_incidents.groupby(
        by=["title", "organization_location"]
    ),
)

In [None]:
# add the organization too
delorean_incidents["organization_name"] = delorean_incidents.apply(
    lambda row: "".join(row["data"]["camera_uuid"].split("/")[:1]), axis=1
)
all_non_delorean_incidents[
    "organization_name"
] = all_non_delorean_incidents.apply(
    lambda row: "".join(row["data"]["camera_uuid"].split("/")[:1]), axis=1
)

In [None]:
# Compare the rates grouped by title and organization name.
evaluate_group(
    delorean_grouper=delorean_incidents.groupby(
        by=["title", "organization_name"]
    ),
    regular_grouper=all_non_delorean_incidents.groupby(
        by=["title", "organization_name"]
    ),
)