# BravaStar(Forno) - Abyss NMEAOut Analysis

This notebook turns NMEAOut data into comprehensive tables, making them easier to plot.

In [None]:
import pathlib, pandas as pd, sys, os
import matplotlib.pyplot as plt
# Make the sibling ``src`` directory (one level above the working directory)
# importable so the project-local modules used below can be found.
sys.path.append(os.fspath(pathlib.Path.cwd().parent / "src"))

In [None]:
from adcp_parser import parse_file, save_tables_html

In [None]:
RAW_FILE = "../data/01_06_2025.txt"   # raw input handed to parse_file(); relative path — point it at your own .txt export

In [None]:
# Parse the raw file into the two tables this notebook calls ``ca`` and ``cs``
# (presumably the current-meter and ADCP tables — confirm against adcp_parser).
ca, cs = parse_file(RAW_FILE)

# display() renders each preview as its own rich table; a bare tuple as the
# cell's last expression would fall back to a plain-text repr of both frames.
display(ca.head(), cs.head())

In [None]:
from adcp_parser import parse_file, save_tables_html

# Export both parsed tables as HTML.
#   out_dir  – destination folder (the original note says it is created if missing)
#   css_path – stylesheet generated for these tables
save_tables_html(
    ca,
    cs,
    out_dir="html",
    css_path="../styles/tables.css",
)



<h1>Comparing Bins</h1>
<h3>Here we take all bins (from the Aquadopp and the ADCP Signature) and arrange them by depth along the time axis. We then run a simple analysis of how often each bin holds valid data and compare selected bins.</h3>
<h4> This comparison spans over 

In [None]:
from adcp_grids import build_metric_grids
from adcp_parser import parse_file

# Combine both parsed tables into a speed grid and a direction grid.
# Later cells index the speed grid as speed_mat.loc["BINn"] and compare its
# column labels with timestamps, i.e. rows are bins and columns are sample times.
# max_bin=29 presumably caps the grid at BIN29 — confirm against adcp_grids.
speed_mat, dir_mat = build_metric_grids(ca, cs, max_bin=29)

# Rich preview of the speed grid; chain .style.format("{:.2f}") for fixed decimals.
display(speed_mat)


In [None]:
# Report, for each bin being compared, how many samples in the window are non-missing.
import numpy as np, pandas as pd

window = speed_mat   # whole record; assign a time-sliced frame here to narrow the check
for bin_label in ("BIN0", "BIN1"):
    # count() tallies the non-NaN entries of the bin's row
    n_valid = window.loc[bin_label].count()
    print(f"{bin_label:4s}  has {n_valid} finite points")


<h1>Comparator Cell</h1>
<h3>Comparing the Aquadopp current-meter bin (BIN0) with the first ADCP bin (BIN1)</h3>

In [None]:
from adcp_stats import compare_bins

# Bins (rows of the speed matrix) to overlay in the comparison plot.
BINS = ["BIN0", "BIN1"]

# Example window bounds kept for reference; they are not passed to the call below.
# START = "2025-05-30 21:00:00"
# END   = "2025-05-30 22:00:00"

# Three colours are supplied although only two bins are requested; how
# compare_bins treats the spare entry is not visible from this notebook.
compare_bins(
    speed_mat,
    bins=BINS,
    markersize=2,
    linewidth=1.5,
    colors=["#e74c3c", "#3498db", "#2ecc71"],
)

In [None]:
# ╔═╡  Interactive compare_bins  — break at NaNs ╞══════════════════════════════
import plotly.express as px
import pandas as pd

# ─── PARAMETERS ──────────────────────────────────────────────────────────────
BINS   = ["BIN0", "BIN1", "BIN5"]         # row labels of speed_mat to plot; any number of bins may be listed
START  = None                             # inclusive window start, e.g. "2025-05-30 00:00:00"; None = no lower bound
END    = None                             # inclusive window end, e.g. "2025-05-31 23:59:59"; None = no upper bound
# ─────────────────────────────────────────────────────────────────────────────

# helper from earlier
def _slice_window(df, start=None, end=None):
    if start: df = df.loc[:, df.columns >= pd.to_datetime(start)]
    if end:   df = df.loc[:, df.columns <= pd.to_datetime(end)]
    return df

# Restrict to the requested time window, then keep only the chosen bins (rows).
# NOTE: reindex() inserts an all-NaN row for any label in BINS that speed_mat
# does not contain, so a mistyped bin shows up as an empty trace, not an error.
df = _slice_window(speed_mat, START, END).reindex(index=BINS)

# Reshape wide (bins × timestamps) → long (timestamp, bin, speed_ms).
# NaNs are kept on purpose (no .dropna()): each missing sample must remain in
# the series so Plotly can break the line at that point.
# The timestamp axis is named explicitly before reset_index(), because
# reset_index() only produces a column called "index" when the axis is unnamed;
# relying on that name would raise a KeyError in melt() for a named axis.
tidy = (
    df.T
    .rename_axis(index="timestamp")
    .reset_index()
    .melt(id_vars="timestamp", var_name="bin", value_name="speed_ms")
)

fig = px.line(
    tidy,
    x="timestamp",
    y="speed_ms",
    color="bin",
    title="Speed comparison – gaps not connected",
    markers=True,               # draw a marker on every valid sample
)

# connectgaps=False is already Plotly's default for line traces; it is stated
# explicitly so the intent (never bridge NaN gaps) survives future edits.
fig.update_traces(
    connectgaps=False,
    marker_size=4,
)
fig.update_yaxes(title="Speed (m/s)")
fig.update_xaxes(title="Timestamp")
fig.show()


<h1>Validity Report Cell</h1>

In [None]:
from adcp_stats import validity_report

# Optional restrictions kept for reference; none of them is passed below.
# (The original note says bins=None means "all bins".)
# BINS  = ["BIN0", "BIN5", "BIN10"]
# START = "2025-05-30 21:00:00"
# END   = "2025-05-30 22:00:00"

# Build the report from the full speed matrix.
report_df = validity_report(speed_mat)

# Render every cell with one decimal place.
display(report_df.style.format("{:.1f}"))


<h1>Histogram of the Percentage of Valid Data per Bin</h1>

In [None]:
from adcp_stats import histogram_validity

# Example window bounds, currently unused (the call below passes only speed_mat):
# START = "2025-05-30 21:00:00"
# END   = "2025-05-30 22:00:00"

# Called on the whole speed matrix; per the section heading this plots the
# percentage of valid data per bin (behaviour defined in adcp_stats).
histogram_validity(speed_mat)


## Checking the frequency of invalid data for each hour of the day

In [None]:
# ╔═╡  Rank “worst” time-frames + histogram by hour-of-day ╞═════════════════════
import matplotlib.pyplot as plt
import pandas as pd
from adcp_stats import _slice_window      # private helper of adcp_stats, reused here

# ─── parameters you can tweak ────────────────────────────────────────────────
START      = None                 # e.g. "2025-05-30 00:00:00"
END        = None                 # e.g. "2025-06-01 23:59:59"
TOP_N      = 15                   # how many worst timestamps to list
# ─────────────────────────────────────────────────────────────────────────────

# 1) focus on the requested time-window
win = _slice_window(speed_mat, start=START, end=END)

# 2) count invalid (NaN) bins at each timestamp → Series indexed by timestamp
invalid_per_ts = win.isna().sum(axis=0)
invalid_per_ts.name = "invalid_bins"

# 3) ranking – timestamps with the most missing bins first
top = invalid_per_ts.sort_values(ascending=False).head(TOP_N)
display(top.to_frame().style.set_caption(f"Top {TOP_N} timestamps with most invalid bins"))

# 4) histogram → total invalid-bin hits aggregated by hour-of-day (0-23)
hourly_sum = invalid_per_ts.groupby(invalid_per_ts.index.hour).sum()

# Draw on a dedicated figure: Series.plot without ``ax`` reuses the current
# axes whenever a figure is already open, which would overlay this chart on it.
fig, ax = plt.subplots(figsize=(10, 4))
hourly_sum.plot.bar(ax=ax)
ax.set_xlabel("Hour of day")
ax.set_ylabel("Total invalid-bin hits")
ax.set_title("Invalid data distribution across daytime hours")

# Add 10 % headroom only when there is something to scale to: a peak of 0
# would request identical limits and a NaN peak (empty window) is rejected
# by matplotlib.
peak = hourly_sum.max()
if pd.notna(peak) and peak > 0:
    ax.set_ylim(0, peak * 1.1)

ax.grid(True, axis="y")
fig.tight_layout()
plt.show()


In [None]:
# ╔═╡  Rank & hist  |  ADCP × Aquadopp, shown separately ╞══════════════════════
import matplotlib.pyplot as plt
import pandas as pd
from adcp_stats import _slice_window   # private helper of adcp_stats, reused here

# ─── general parameters ─────────────────────────────────────────────────────
START  = None     # e.g. "2025-05-30 00:00:00"
END    = None     # e.g. "2025-06-01 23:59:59"
TOP_N  = 15       # how many timestamps to list in each ranking
# ────────────────────────────────────────────────────────────────────────────

# Row labels of speed_mat belonging to each instrument.
SUBSETS = {
    "ADCP  (BIN1-29)": [f"BIN{i}" for i in range(1, 30)],
    "Aquadopp (BIN0)": ["BIN0"],
}

for label, bins in SUBSETS.items():
    print(f"\n=== {label} ===")

    # 1) chosen bins, restricted to the time window
    win = _slice_window(speed_mat.loc[bins], start=START, end=END)

    # 2) number of invalid (NaN) bins per timestamp
    invalid_per_ts = win.isna().sum(axis=0)
    invalid_per_ts.name = "invalid_bins"

    # 3) ranking – timestamps with the most missing bins first
    top = invalid_per_ts.sort_values(ascending=False).head(TOP_N)
    display(top.to_frame().style.set_caption(
        f"Top {TOP_N} timestamps com mais bins inválidos  [{label}]"))

    # 4) total invalid-bin hits aggregated by hour-of-day
    hourly_sum = invalid_per_ts.groupby(invalid_per_ts.index.hour).sum()

    # One figure per subset. Without an explicit ``ax`` pandas' Series.plot
    # draws on the current axes, so the second subset would be painted over
    # the first one instead of getting its own chart.
    fig, ax = plt.subplots(figsize=(10, 4))
    hourly_sum.plot.bar(ax=ax, color="#c0392b" if "ADCP" in label else "#2980b9")
    ax.set_xlabel("Hora do dia")
    ax.set_ylabel("Total de bins inválidos")
    ax.set_title(f"Distribuição de falhas por hora – {label}")

    # 10 % headroom, skipped when the peak is 0 or NaN (no invalid data or an
    # empty window) because matplotlib cannot use such limits.
    peak = hourly_sum.max()
    if pd.notna(peak) and peak > 0:
        ax.set_ylim(0, peak * 1.1)

    ax.grid(True, axis="y")
    fig.tight_layout()
    plt.show()   # emit this subset's chart right after its ranking table


In [None]:
# ╔═╡  Rank + 2 hist side-by-side  (ADCP × Aquadopp) ╞═══════════════════════════
import matplotlib.pyplot as plt
import pandas as pd
from adcp_stats import _slice_window             # private helper of adcp_stats, reused here

# ─── parameters ──────────────────────────────────────────────────────────────
START = None              # e.g. "2025-05-30 00:00:00"
END   = None              # e.g. "2025-06-01 23:59:59"
TOP_N = 15                # how many timestamps to list in each ranking
# ─────────────────────────────────────────────────────────────────────────────

# Row labels of speed_mat belonging to each instrument.
SETS = {
    "Signature  (BIN1-29)": [f"BIN{i}" for i in range(1, 30)],
    "Aquadopp (BIN0)": ["BIN0"],
}

rank_tables   = {}   # label → top-N timestamps by number of invalid bins
hourly_sums   = {}   # label → invalid-bin hits summed per hour-of-day

for label, bins in SETS.items():
    win = _slice_window(speed_mat.loc[bins], start=START, end=END)

    # number of invalid (NaN) bins per timestamp
    invalid_per_ts = win.isna().sum(axis=0)
    invalid_per_ts.name = "invalid_bins"

    # — ranking —
    rank = invalid_per_ts.sort_values(ascending=False).head(TOP_N)
    rank_tables[label] = rank

    # — sum per hour-of-day —
    hourly_sums[label] = invalid_per_ts.groupby(invalid_per_ts.index.hour).sum()

# 1) show the rankings
for label in SETS:
    display(rank_tables[label].to_frame().style.set_caption(
        f"Top {TOP_N} timestamps – {label}"))

# 2) two histograms side by side on a shared y-axis
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(14, 5), sharey=True)

# Colours are paired with the SETS keys by position, so renaming a label in
# SETS cannot leave this mapping with a stale key.
colors = dict(zip(SETS, ("#c0392b", "#2980b9")))

# Common upper limit, computed once for both panels; Series.max() skips NaN
# peaks coming from empty windows.
y_top = pd.Series([s.max() for s in hourly_sums.values()], dtype=float).max()

for ax, label in zip(axes, SETS):
    hourly_sums[label].plot.bar(ax=ax, color=colors[label])
    ax.set_title(label)
    ax.set_xlabel("Hora do dia")
    ax.set_ylabel("Total de bins inválidos" if ax is axes[0] else "")
    # 10 % headroom; skipped when there is nothing to scale to (peak 0 or NaN),
    # since matplotlib cannot use such limits.
    if pd.notna(y_top) and y_top > 0:
        ax.set_ylim(0, y_top * 1.1)
    ax.grid(True, axis="y")

fig.suptitle("Distribuição de falhas por hora do dia", fontsize=14, y=1.02)
plt.tight_layout()
plt.show()


<h2>Invalid-Data Correlation between Bins</h2>

In [None]:
# ╔═╡  Correlation / co-occurrence of invalid bins ╞═══════════════════════════
import pandas as pd, numpy as np, matplotlib.pyplot as plt, seaborn as sns
from adcp_stats import _slice_window        # private helper of adcp_stats, reused here

# ─── PARAMETERS ──────────────────────────────────────────────────────────────
BINS  = None                 # e.g. ["BIN0","BIN1","BIN5"]  or None = all
START = None                 # e.g. "2025-05-30 00:00:00"
END   = None                 # e.g. "2025-06-02 23:59:59"
# ─────────────────────────────────────────────────────────────────────────────

win = _slice_window(speed_mat, START, END)
if BINS is not None:
    # NOTE: reindex() adds an all-NaN row for labels missing from speed_mat,
    # which then counts as "always invalid" in everything below.
    win = win.reindex(index=BINS)

invalid = win.isna()                         # bool DataFrame: True = invalid
bins    = invalid.index
n_bins  = len(bins)

# ── 1) Conditional-probability matrix  P(j invalid | i invalid)  ────────────
# flags @ flags.T counts, for every pair (i, j), the timestamps at which both
# bins are invalid; dividing row i by the number of timestamps at which bin i
# is invalid gives the conditional probability. A single matrix product
# replaces a Python loop over all bin pairs.
flags = invalid.to_numpy(dtype=np.int64)
joint_counts = flags @ flags.T
invalid_counts = flags.sum(axis=1).astype(float)
invalid_counts[invalid_counts == 0] = np.nan   # bin never invalid → row undefined (NaN)
cond_df = pd.DataFrame(joint_counts / invalid_counts[:, None],
                       index=bins, columns=bins)

display(cond_df.style.format("{:.2f}")
                    .set_caption("Conditional probability "
                                 "P(bin j invalid | bin i invalid)"))

# ── 2) ϕ / Pearson correlation of invalid flags  ────────────────────────────
# Transposed so that each bin becomes a column; Pearson r on 0/1 flags is the
# phi coefficient. Bins whose flag never changes yield NaN.
phi_df = invalid.T.corr(method="pearson")

# ── 3) Heat-map  ─────────────────────────────────────────────────────────────
plt.figure(figsize=(6 + 0.25*n_bins, 5))     # widen the canvas with the number of bins
sns.heatmap(phi_df, vmin=-1, vmax=1, cmap="coolwarm",
            square=True, cbar_kws={"label": "ϕ / Pearson r"})
plt.title("Correlation of invalid occurrences across bins")
plt.yticks(rotation=0)
plt.xticks(rotation=90)
plt.tight_layout()
plt.show()


# 🚦 Análise de Co-ocorrência de Dados Inválidos entre Bins

> **Propósito**  
> Verificar se os *buracos de dados* tendem a acontecer **ao mesmo tempo** em diferentes bins do ADCP.  


---

## Intuição

1. **Mapa de probabilidades condicionais**  
   *Para cada par (bin-i, bin-j)*, medimos, entre as amostras em que o bin-i estava inválido, a fração em que o **bin-j** também estava inválido **na mesma amostra de tempo**, isto é, `P(bin-j inválido | bin-i inválido)`.  
   * Leitura: se `P(BIN7 inválido | BIN3 inválido) = 0,75`, significa que, em 75 % das vezes em que o BIN 3 falhou, o BIN 7 falhou junto.

2. **Matriz de correlação ϕ (Pearson para variáveis binárias)**  
   *Transformamos* a tabela `speed_mat` em um DataFrame booleano (`True → NaN`, `False → valor válido`).  
   * Calculamos ϕ (= correlação de Pearson para 0/1).  
   * ϕ ≈ 1 → os dois bins compartilham quase sempre o mesmo estado (válido/ inválido).  
   * ϕ ≈ 0 → falhas independentes.  
   * ϕ < 0 → pouco provável aqui; indicaria que um falha quando o outro não falha.

3. **Heat-map**  
   Cores quentes (vermelho) destacam pares de bins que apresentam *muitas* falhas simultâneas; bons candidatos a investigação conjunta.



In [None]:
import plotly.express as px

# Integer flags for the map: 0 = valid sample, 1 = invalid (NaN) sample.
# ``invalid`` is the boolean frame built in the correlation cell, so this map
# inherits that cell's BINS / START / END selection.
z = invalid.astype(int)

fig = px.imshow(
    z,
    aspect="auto",
    color_continuous_scale=[[0, "green"], [1, "red"]],
    # Pin the colour range so 0 is always green and 1 always red, even when the
    # window is entirely valid or entirely invalid (a constant matrix would
    # otherwise leave the auto-scaled range degenerate).
    zmin=0,
    zmax=1,
    origin="upper",
    # The y-axis shows the row labels of ``invalid`` (bin names), not metres.
    labels=dict(x="Timestamp", y="Bin", color="Inválido"),
)

# First row at the top; origin="upper" already implies this, kept explicit.
fig.update_yaxes(autorange="reversed")
fig.update_xaxes(tickangle=-45)

fig.update_layout(
    title="Dado inválido (vermelho) — mapa tempo × profundidade",
    coloraxis_showscale=False,
    height=700,                                # tall canvas so every bin row stays legible
    # width=1200,                              # (optional) fix the width as well
)
fig.show()
