# Saving Suspect Samples

This notebook contains the code that extracts sequence samples from the data and saves only the suspect ones. The purpose of this code is to study the behavior of suspect examples across the data.

In [None]:
from qa_qc.ai_utils import get_file_names
import pandas as pd
import os
from qa_qc.ai_utils import generate_time_windows, QartodFlags
from tqdm import tqdm

# Name of the CSV column holding the measured variable; it is passed to
# `usecols` when each chunk file is loaded.
# NOTE(review): "eov" presumably stands for Essential Ocean Variable - confirm.
eov_col_name = 'temperature'
# Name of the CSV column holding the QC flag for that variable; its value on
# the "current" row of each window is compared against QartodFlags.SUSPECT.
eov_flag_name = 'qc_flag_temperature'

In [None]:
# Extract every time window whose centre row is flagged SUSPECT from the
# chunked CSV files of one dataset folder, and save them all to a single CSV
# next to that folder.
parent_ = "D://CIOOS-Full-Data/chunking/"
chunk_dir = os.path.join(parent_,"Inverness/")

# Forwarded to generate_time_windows as `window_hours`.
# NOTE(review): presumably the span, in hours, of the past/future context
# around each row - confirm against qa_qc.ai_utils.
window_hour = 12
file_names = get_file_names(chunk_dir)
suspect_df = []
for file_name in tqdm(file_names):
    print(f"Processing : [{file_name}]")
    df = pd.read_csv(file_name, usecols=['time', eov_flag_name, eov_col_name])
    df['time'] = pd.to_datetime(df['time'])

    # Each item is a (current, past, future) triple; `current_` is a single
    # row, `past_` and `future_` are the surrounding frames.
    lst_of_seq_ = generate_time_windows(df, window_hours=window_hour, min_rows_in_chunk=10)
    for current_, past_, future_ in lst_of_seq_:
        label = current_[eov_flag_name]
        if label == QartodFlags.SUSPECT:
            # Rebuild the window in chronological order: past rows, the
            # flagged row (turned into a one-row frame), then future rows.
            su_df = pd.concat([past_.reset_index(drop=True), current_.to_frame().T.reset_index(drop=True), future_.reset_index(drop=True)])
            suspect_df.append(su_df)


if suspect_df:
    df__ = pd.concat(suspect_df, axis=0)
    # Windows around neighbouring suspect rows overlap, so the same row can
    # be collected several times; keep one copy of each.
    df__.drop_duplicates(inplace=True)
    df__.to_csv(os.path.join(parent_, f"{os.path.basename(os.path.dirname(chunk_dir))}__SUSPECT.csv"), index=False)
else:
    # pd.concat raises ValueError on an empty list, so report the empty
    # result instead of crashing after all files have been processed.
    print(f"No suspect samples found in [{chunk_dir}]; nothing was saved.")

# dir_ = "D:/CIOOS-Full-Data/chunking/"
# for rt, dir_, files in os.walk(dir_):
#     for fl in files:
#         if "SUSPECT.csv" in fl:
#             fapath_ = os.path.join(rt, fl)
#             df_ = pd.read_csv(fapath_)
#             print(df_.shape)
#             df_.drop_duplicates(inplace=True)
#             fapath_ = fapath_.replace(".csv","_noduplicate.csv")
#             df_.to_csv(fapath_, index=False)

# Plotting Charts of the Datasets

In [2]:
import plotly.express as px
import pandas as pd
import os
import re
from ai_utils import get_file_names
import plotly.io as pio
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Load every chunk file of one dataset folder into a single frame, tagging
# each row with a numeric ID that identifies the file it came from.
dir_ = "D:/CIOOS-Full-Data/chunking/Antigonish/"
fnames_ = get_file_names(dir_)
# fnames_ = ["Cape Breton County Water Quality Data.csv-1.csv", "Cape Breton County Water Quality Data.csv-2.csv"]
eov_col_name = 'temperature'
eov_flag_name = 'qc_flag_temperature'
columns_to_read = ['time', eov_flag_name, eov_col_name]

lst_of_dfs = []
map__ = {}  # numeric file ID -> path of the file it was assigned to
for file_id, listed_name in enumerate(fnames_):
    # Re-anchor the listed name directly under dir_.
    csv_path = os.path.join(dir_, os.path.basename(listed_name))
    frame = pd.read_csv(csv_path, usecols=columns_to_read)
    frame['time'] = pd.to_datetime(frame['time'])
    frame['ID'] = file_id
    map__[file_id] = csv_path
    lst_of_dfs.append(frame)

new_df_ = pd.concat(lst_of_dfs)

# Build one stacked subplot per source file, each showing that file's
# measured variable over time as a single line.

# Order rows by file ID and then by time so each line is drawn chronologically.
new_df_ = new_df_.sort_values(by=['ID', 'time'])

# The number of distinct IDs fixes the number of subplot rows.
unique_ids = sorted(new_df_['ID'].unique())
num_plots = len(unique_ids)

# Each subplot is titled with the path of the file behind its ID.
titles = [f"ID: {map__[file_id]}" for file_id in unique_ids]
fig = make_subplots(
    rows=num_plots,
    cols=1,
    shared_xaxes=False,
    vertical_spacing=0.002,
    subplot_titles=titles,
)

# Subplot rows are 1-based, so start the counter at 1.
for row_number, file_id in enumerate(unique_ids, start=1):
    rows_for_file = new_df_.loc[new_df_['ID'] == file_id]
    trace = go.Scatter(
        x=rows_for_file['time'],
        y=rows_for_file[eov_col_name],
        mode='lines',
        name=f'ID {file_id}',
        showlegend=False,
    )
    fig.add_trace(trace, row=row_number, col=1)

# Figure height grows with the subplot count (400 px per row).
fig.update_layout(
    title="Scrollable Subplots: Temperature by ID",
    height=400 * num_plots,
)


# Render the figure built above inline in the notebook.
from plotly.offline import init_notebook_mode, iplot

# Enable Plotly in the notebook
# NOTE(review): per the Plotly docs, connected=True loads plotly.js from a CDN
# rather than embedding it - confirm the notebook is never used offline.
init_notebook_mode(connected=True)

# Display the figure with the default renderer.
fig.show()
# # Save scrollable HTML
# output_path = os.path.join(dir_, "visualize_scrollable_subplots.html")
# fig.write_html(output_path)
# 
# print(f"Saved scrollable subplot HTML to: {output_path}")