In [None]:
import os

# Run from the server root so the project-local imports and the relative
# paths used below resolve. Set the CITRUS_SERVER_PATH environment variable to
# point at the checkout on this machine; the original hard-coded location is
# kept as the fallback so existing setups behave exactly as before.
os.chdir(os.environ.get("CITRUS_SERVER_PATH", "/home/andyzh45/citrus/server"))

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import database
import tba_communicator as tba
import utils
from pathlib import Path

# Params to edit
# =====================================
DB_NAME = "test2025capt"
READ_CLOUD = False
# =====================================

OUT_PATH = f"data/data_audit/{DB_NAME}_data_audit.html"
# parents=True so a fresh checkout without a `data/` directory does not fail.
Path(utils.create_abs_path("data/data_audit/")).mkdir(parents=True, exist_ok=True)

# Strip the "test" marker only when it is actually a prefix; a substring check
# would chop the first four characters off any name that merely contains it.
TEST_PREFIX = "test"
EVENT_KEY = DB_NAME[len(TEST_PREFIX) :] if DB_NAME.startswith(TEST_PREFIX) else DB_NAME
SCHEMA = utils.read_schema("schema/data_accuracy.yml")

db = database.BetterDatabase(DB_NAME, READ_CLOUD)
# `raw_data` keeps the documents as returned by the database; `data` is the
# same content as a DataFrame for plotting.
raw_data = db.get_documents("data_accuracy")
data = pd.DataFrame(raw_data)

TEAM_LIST = [doc["team_number"] for doc in db.get_documents("obj_team")]

In [None]:
def graph_hist(var, title, data_to_graph=data, ax=None):
    """Draw a green histogram of one error column, captioned with its mean.

    Args:
        var: Name of the column of `data_to_graph` to plot. Columns whose name
            contains "point" are labeled in points; all others in game pieces.
        title: Title placed on the axes.
        data_to_graph: DataFrame to plot. Defaults to the notebook-level
            `data` frame as it existed when this function was defined.
        ax: Optional matplotlib Axes to draw on. When None, seaborn draws on
            the current axes.

    Returns:
        None. The plot is drawn as a side effect.
    """
    is_points = "point" in var

    # histplot treats ax=None as "use the current axes", so one call covers
    # both the default and the caller-supplied case.
    g = sns.histplot(data_to_graph, x=var, color="green", ax=ax)
    g.set(
        title=title,
        xlabel="Point Difference (Error)"
        if is_points
        else "Game Piece Difference (Error)",
        ylabel="Number of Alliances",
    )
    # Caption the figure that owns these axes. plt.suptitle would write to the
    # *current* figure, which is the wrong one when `ax` belongs to another.
    g.figure.suptitle(
        f"Avg. error: {round(data_to_graph[var].mean(), 1)} {'pts' if is_points else 'pieces'}",
        fontsize=10,
        y=0.85,
    )

# Overall Accuracy

## Totals

In [None]:
# Histogram of the total_point_diff column over the full data_accuracy frame.
graph_hist(var="total_point_diff", title="Total Point Differences")

In [None]:
# Histogram of the total_piece_diff column over the full data_accuracy frame.
graph_hist(var="total_piece_diff", title="Total Gamepiece Differences")

In [None]:
# Histogram of the total_coral_diff column over the full data_accuracy frame.
graph_hist(var="total_coral_diff", title="Total Coral Count Differences")

In [None]:
# Histogram of the total_net_diff column over the full data_accuracy frame.
graph_hist(var="total_net_diff", title="Total Net Count Differences")

In [None]:
# Same histogram, restricted to rows whose net difference is strictly positive.
positive_net_rows = data.loc[data["total_net_diff"] > 0]
graph_hist(
    var="total_net_diff",
    title="Total Net Count Differences, Conditional on n > 0",
    data_to_graph=positive_net_rows,
)

In [None]:
# Histogram of the total_processor_diff column over the full data_accuracy frame.
graph_hist(var="total_processor_diff", title="Total Processor Count Differences")

In [None]:
# Same histogram, restricted to rows whose processor difference is strictly positive.
positive_processor_rows = data.loc[data["total_processor_diff"] > 0]
graph_hist(
    var="total_processor_diff",
    title="Total Processor Count Differences, Conditional on n > 0",
    data_to_graph=positive_processor_rows,
)

## Auto

In [None]:
# Histogram of the auto_coral_diff column over the full data_accuracy frame.
graph_hist(var="auto_coral_diff", title="Auto Coral Differences")

## Tele

In [None]:
# Histogram of the tele_coral_diff column over the full data_accuracy frame.
graph_hist(var="tele_coral_diff", title="Tele Coral Differences")

## Endgame

In [None]:
# Histogram of the endgame_point_diff column over the full data_accuracy frame.
graph_hist(var="endgame_point_diff", title="Endgame Point Differences")

# Team Error Bias

In [None]:
# Mean total_piece_diff per team, averaged over every data_accuracy document
# whose `team_numbers` contains that team.
by_team = {}
for team in TEAM_LIST:
    team_diffs = [
        doc["total_piece_diff"] for doc in raw_data if team in doc["team_numbers"]
    ]
    # A team with no documents is recorded as NaN explicitly; np.mean([]) gives
    # the same value but also emits a RuntimeWarning into the report.
    by_team[team] = np.mean(team_diffs) if team_diffs else np.nan
by_team_df = pd.DataFrame(list(by_team.items()), columns=["team_number", "diff"])

# Ten teams with the largest mean error (NaN rows sort last and so drop out
# whenever ten or more teams have data).
g = sns.barplot(
    by_team_df.sort_values("diff", ascending=False).iloc[:10],
    x="team_number",
    y="diff",
    color="green",
)
g.set(
    title="Teams with Highest Avg. Errors",
    xlabel="Team Number",
    ylabel="Gamepiece Difference (Error)",
)

In [None]:
# Fetch the pickability collection once instead of issuing one query per team,
# keeping the first document seen for each team (the one `[0]` would select).
pickability_by_team = {}
for pickability_doc in db.get_documents("pickability"):
    pickability_by_team.setdefault(pickability_doc["team_number"], pickability_doc)

rating_to_team_err = []

for team in TEAM_LIST:
    # A team without a pickability document gets NaN (and is therefore not
    # drawn) rather than aborting the whole cell with an IndexError.
    if team in pickability_by_team:
        rating = pickability_by_team[team]["first_pickability"]
    else:
        rating = np.nan
    rating_to_team_err.append(
        {
            "team_number": team,
            "avg_diff": by_team[team],
            "rating": rating,
        }
    )

g = sns.scatterplot(
    pd.DataFrame(rating_to_team_err), x="rating", y="avg_diff", color="green"
)
# `avg_diff` is a mean of total_piece_diff, so the unit is game pieces, not points.
g.set(
    title="Team Avg. Error vs. Pickability",
    xlabel="First Pickability",
    ylabel="Avg. Error (pieces)",
)

In [None]:
# Join each predicted_aim document with a data_accuracy document of the same
# match so error can be plotted against the score fields of predicted_aim.
score_data = []

for alliance in db.get_documents("predicted_aim"):
    accuracy_docs = db.get_documents(
        "data_accuracy", {"match_number": alliance["match_number"]}
    )
    if not accuracy_docs:
        # No accuracy data for this match; skip it instead of raising
        # IndexError on the `[0]` below.
        continue
    # NOTE(review): the lookup is keyed on match_number only, so every
    # predicted_aim document of a match is paired with the same (first)
    # data_accuracy document. If data_accuracy holds one document per alliance,
    # this should also match on the alliance — confirm against the schema.
    alliance_data = accuracy_docs[0]
    alliance_data.update(alliance)
    score_data.append(alliance_data)

g = sns.scatterplot(
    pd.DataFrame(score_data), x="actual_score", y="total_piece_diff", color="green"
)
g.set(title="Match Error vs. Match Score", xlabel="Match Score", ylabel="Match Error")

# Scout Disagreements

In [None]:
sims = db.get_documents("unconsolidated_totals")

# The weights are the same for every document, so look them up once.
piece_weights = SCHEMA["--diffs"]["total_piece_diff"]["tim_weights"]
scheduled_matches = {int(m) for m in utils.get_match_schedule().keys()}

# Single pass over the documents: collect the weighted piece total reported by
# each scout, grouped per (match, team). Only scheduled matches are kept.
reported_by_robot = {}
for sim in sims:
    if sim["match_number"] not in scheduled_matches:
        continue
    robot_key = (sim["match_number"], sim["team_number"])
    reported_by_robot.setdefault(robot_key, []).append(
        utils.calc_weighted_sum(sim, piece_weights)
    )

# One row per robot-in-match, ordered by match number; `range` is the spread
# between the highest and lowest reported value for that robot.
disagreements = []
for (match_number, team), reported_values in sorted(
    reported_by_robot.items(), key=lambda item: item[0][0]
):
    max_ = max(reported_values)
    min_ = min(reported_values)
    disagreements.append(
        {
            "match_number": match_number,
            "team_number": team,
            "reported_values": reported_values,
            "min": min_,
            "max": max_,
            "range": (max_ - min_),
        }
    )

disagreements = pd.DataFrame(disagreements)

g = sns.histplot(disagreements, x="range", color="green")
g.set(
    title="Scout Disagreements",
    xlabel="Range of Reported Values",
    ylabel="Number of Robots",
)
plt.suptitle(
    f"Median disagreement: {round(disagreements['range'].median(), 1)} pieces", y=0.85
)

In [None]:
# Mean disagreement range per team, as a two-column frame.
by_team_df = disagreements.groupby("team_number", as_index=False)["range"].mean()

# Ten teams with the largest mean disagreement.
most_disputed = by_team_df.sort_values("range", ascending=False).head(10)
g = sns.barplot(data=most_disputed, x="team_number", y="range", color="green")
g.set(
    title="Teams with Highest Disagreements",
    xlabel="Team Number",
    ylabel="Average Disagreement",
)

# Errors Across Alliances

In [None]:
# Teams whose alliances (and the alliances opposing them) are examined here.
TOP_TEAMS = ("1678", "1323", "604")


def has_top_team(doc):
    """Return True when any team in TOP_TEAMS appears in `doc["team_numbers"]`."""
    return any(top_team in doc["team_numbers"] for top_team in TOP_TEAMS)


# Documents for alliances that include a top team, and the matches they span.
top_alliance_docs = [doc for doc in raw_data if has_top_team(doc)]
top_matches = list({doc["match_number"] for doc in top_alliance_docs})
top_match_data = pd.DataFrame(top_alliance_docs)

# Documents from those same matches for alliances without any top team.
opposing_data = pd.DataFrame(
    [
        doc
        for doc in raw_data
        if doc["match_number"] in top_matches and not has_top_team(doc)
    ]
)

# Title is derived from TOP_TEAMS so it cannot drift from the filter; the
# phrasing reads naturally for three or more teams.
top_team_label = ", ".join(TOP_TEAMS[:-1]) + f", or {TOP_TEAMS[-1]}"
graph_hist(
    "total_piece_diff",
    f"Avg. Match Error of Alliances with {top_team_label}",
    top_match_data,
)

In [None]:
# Histogram of total_piece_diff for the opposing-alliance frame.
graph_hist(
    var="total_piece_diff",
    title="Avg. Match Error of Opposing Alliances",
    data_to_graph=opposing_data,
)

# Error Persistence

In [None]:
team = "1678"

# Every data_accuracy document whose alliance includes the chosen team.
team_data = pd.DataFrame([doc for doc in raw_data if team in doc["team_numbers"]])

g = sns.lineplot(
    data=team_data,
    x="match_number",
    y="total_piece_diff",
    color="green",
)
g.set(
    title=f"Team {team}'s Error Over Time",
    xlabel="Match Number",
    ylabel="Total Piece Difference (Error)",
)

# SPR Analysis

In [None]:
# Distribution of the scout_precision field across scout_precision documents.
scout_precision_docs = db.get_documents("scout_precision")
spr_data = pd.DataFrame(scout_precision_docs)

g = sns.histplot(data=spr_data, x="scout_precision", color="green")
g.set(
    title="SPR Distribution",
    xlabel="SPR",
    ylabel="Number of Scouts",
)

In [None]:
# Hand the notebook path and OUT_PATH (the per-database HTML path built in the
# setup cell) to the project's knit helper.
# NOTE(review): presumably this renders the notebook to HTML at OUT_PATH — confirm in utils.
utils.knit_ipynb("src/data_audit.ipynb", OUT_PATH)