In [6]:
import mlflow
import numpy as np
import pandas as pd
import plotly.graph_objects as go
from dash.dependencies import State
from mlflow.tracking import MlflowClient
from sklearn.preprocessing import MinMaxScaler

from src.functions import load_stations_from_path, preprocces, \
    create_dataset

In [7]:
test_path = '../data/labeled_benchmark/test'

# Index every loaded station frame by the station code stored in its first row.
testing_stations = {
    station.iloc[0]['station_code']: station
    for station in load_stations_from_path(test_path)
}

# Add calendar and hydrological-year columns in place on each station frame.
for station_code, df in testing_stations.items():
    measure_dates = pd.to_datetime(df['measure_date'])
    df['measure_date'] = measure_dates
    df['year'] = measure_dates.dt.year
    df['month'] = measure_dates.dt.month
    # Rows from October onwards are counted towards the following hydro year.
    df['hydro_year'] = df['year'].mask(df['month'] >= 10, df['year'] + 1)

def scale(data, scaler_class):
    """Return scaled copies of every station frame in `data`.

    Args:
        data: mapping of station code -> DataFrame containing the columns
            'HS', 'TSS_30MIN_MEAN', 'RSWR_30MIN_MEAN', 'TA_30MIN_MEAN' and
            'VW_30MIN_MEAN'.
        scaler_class: zero-argument callable returning an object with a
            `fit_transform` method (e.g. a scikit-learn scaler class).

    Returns:
        A new dict with the same keys. Each value is a copy of the input
        frame whose five measurement columns were replaced by the output of
        a fresh scaler fitted on that station only. Input frames are not
        modified.
    """
    columns = ['HS', 'TSS_30MIN_MEAN', 'RSWR_30MIN_MEAN', 'TA_30MIN_MEAN', 'VW_30MIN_MEAN']

    def _rescaled(frame):
        # One scaler instance per station, so stations never share fit state.
        fitted_scaler = scaler_class()
        result = frame.copy()
        result[columns] = fitted_scaler.fit_transform(result[columns])
        return result

    return {code: _rescaled(frame) for code, frame in data.items()}

# Alternative z-score scaling, currently disabled (StandardScaler is not
# imported in the cells shown here):
# testing_stations_zscore = scale(testing_stations, StandardScaler)
# Per-station min-max scaled copies; the Dash app further down plots from these.
testing_stations_minmax = scale(testing_stations, MinMaxScaler)

In [8]:
# Load the trained model that a specific MLflow run logged under the
# artifact path "model".
mlflow.set_tracking_uri('http://localhost:5000')

experiment_name = 'Benchmark'
run_name = '2023-12-12_16:24:28_truncate_all_if_one_missing'

client = MlflowClient()
experiment = client.get_experiment_by_name(experiment_name)
if experiment is None:
    # get_experiment_by_name returns None for an unknown name; fail here with
    # a clear message instead of an AttributeError on the next line.
    raise LookupError(f"MLflow experiment '{experiment_name}' was not found")

runs = client.search_runs(
    experiment_ids=[experiment.experiment_id],
    filter_string=f"tags.mlflow.runName = '{run_name}'",
)
if not runs:
    raise LookupError(
        f"No run named '{run_name}' was found in MLflow experiment '{experiment_name}'"
    )

# If several runs share the name, the first one returned by the search is used.
selected_run = runs[0]
run_id = selected_run.info.run_id
model_uri = f"runs:/{run_id}/model"
model = mlflow.tensorflow.load_model(model_uri)

Downloading artifacts:   0%|          | 0/11 [00:00<?, ?it/s]

2023/12/21 22:52:02 INFO mlflow.store.artifact.artifact_repo: The progress bar can be disabled by setting the environment variable MLFLOW_ENABLE_ARTIFACTS_PROGRESS_BAR to false


In [9]:
# Windowing and batching parameters passed to create_dataset().
SEQUENCE_LENGTH = 90
TARGET_START_INDEX = SEQUENCE_LENGTH - 1
DATASET_BATCH_SIZE = 64

# Label column and model input columns. The order of FEATURE_COLUMNS matters:
# train_mean / train_std below are applied to these columns by position.
TARGET_COLUMN = 'no_snow'
FEATURE_COLUMNS = [
    'HS',
    'day_sin', 'day_cos',
    'month_sin', 'month_cos',
    'TSS_30MIN_MEAN', 'RSWR_30MIN_MEAN', 'TA_30MIN_MEAN', 'VW_30MIN_MEAN',
]

# Values taken from the original dataset
train_mean = np.array([
    89.38116858114648,       # HS
    0.0011084782603573208,   # day_sin
    -0.017600892344483886,   # day_cos
    -0.0054428754913079195,  # month_sin
    0.005005952200941785,    # month_cos
    -1.2958664264976802,     # TSS_30MIN_MEAN
    86.82415770740349,       # RSWR_30MIN_MEAN
    1.733119096445471,       # TA_30MIN_MEAN
    1.8837531498159192,      # VW_30MIN_MEAN
])
train_std = np.array([
    105.46005140685408,      # HS
    0.7132728403500057,      # day_sin
    0.7006645667270192,      # day_cos
    0.7042157273701615,      # month_sin
    0.7099475507870721,      # month_cos
    9.356596509160058,       # TSS_30MIN_MEAN
    84.9729233123369,        # RSWR_30MIN_MEAN
    7.32290085363148,        # TA_30MIN_MEAN
    1.6545282612533645,      # VW_30MIN_MEAN
])

# Model-input frames: preprocess a copy of every station, then standardise the
# feature columns with train_mean / train_std (matched to FEATURE_COLUMNS by
# position).
testing_stations_real = {}
for station_code, df in testing_stations.items():
    prepared = preprocces(df.copy())
    prepared[FEATURE_COLUMNS] = (prepared[FEATURE_COLUMNS] - train_mean) / train_std
    testing_stations_real[station_code] = prepared

# Set to True to also compute per-station and averaged evaluation metrics.
evaluate = False


def _find_metric(results, keyword):
    """Return the value of the first entry in `results` whose name contains `keyword`.

    Looking metrics up by name instead of by position means a value can never
    be printed or averaged under the wrong label.

    Raises:
        KeyError: if no metric name contains `keyword`.
    """
    for metric_name, metric_value in results.items():
        if keyword in metric_name:
            return metric_value
    raise KeyError(f"no metric containing '{keyword}' among {sorted(results)}")


# One row per evaluated station: [loss, accuracy, precision, recall, f1_score].
evaluation_rows = []

for station_code, df in testing_stations_real.items():
    # NOTE(review): every frame in testing_stations_real was already passed
    # through preprocces() and standardised when that dict was built; it goes
    # through preprocces() a second time here. Confirm that preprocces() is
    # safe to apply twice (e.g. that it does not recompute any of
    # FEATURE_COLUMNS from raw values and thereby undo the standardisation).
    station = preprocces(df)

    features = station[FEATURE_COLUMNS]
    targets = station[TARGET_COLUMN]

    test_dataset = create_dataset(
        features, targets, SEQUENCE_LENGTH, TARGET_START_INDEX, DATASET_BATCH_SIZE, shuffle=False
    )

    # Threshold the model output at 0.5 and left-pad with False for the first
    # SEQUENCE_LENGTH - 1 rows (presumably the rows without a full input
    # window; confirm against create_dataset).
    window_preds = model.predict(test_dataset, verbose=0).reshape((-1,)) > 0.5
    preds = np.concatenate([np.full(SEQUENCE_LENGTH - 1, False), window_preds])

    display_df = testing_stations_minmax[station_code]
    if len(preds) != len(display_df):
        # Happens when the model-input frame and the display frame do not
        # have the same number of rows.
        raise ValueError(
            f"station {station_code}: {len(preds)} predictions for {len(display_df)} rows"
        )
    display_df['preds'] = preds

    if evaluate:
        # return_dict=True gives {metric name: value}; metric names are
        # assumed to contain 'accuracy', 'precision' and 'recall'
        # (e.g. 'precision_2') - _find_metric raises if one is missing.
        evaluation_results = model.evaluate(test_dataset, verbose=0, return_dict=True)
        loss = evaluation_results['loss']
        accuracy = _find_metric(evaluation_results, 'accuracy')
        precision = _find_metric(evaluation_results, 'precision')
        recall = _find_metric(evaluation_results, 'recall')
        f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) != 0 else 0
        print(f"station: {station_code}, loss: {loss:.4f} - accuracy: {accuracy:.4f} - recall: {recall:.4f} - precision: {precision:.4f}")
        evaluation_rows.append([loss, accuracy, precision, recall, f1_score])

# Shape (n_evaluated_stations, 5); stays (0, 5) when evaluate is False.
all_evaluation_results = np.array(evaluation_rows, dtype=float).reshape(-1, 5)

if evaluate:
    print('test_avg_loss', f"{np.mean(all_evaluation_results[:, 0]):.4f}")
    print('test_avg_accuracy', f"{np.mean(all_evaluation_results[:, 1]):.4f}")
    print('test_avg_precision', f"{np.mean(all_evaluation_results[:, 2]):.4f}")
    print('test_avg_recall', f"{np.mean(all_evaluation_results[:, 3]):.4f}")
    print('test_avg_f1_score', f"{np.mean(all_evaluation_results[:, 4]):.4f}")

In [15]:
import dash
from dash import dcc, html, Input, Output
import plotly.graph_objs as go

# Dash application instance; the layout and the callbacks defined further down
# in this cell are attached to it.
app = dash.Dash(__name__)

# Column name -> label shown next to its checkbox in the feature checklist.
# Insertion order is the order in which the checkboxes are listed.
_feature_labels = {
    'HS': 'HS',
    'TSS_30MIN_MEAN': 'TSS_30MIN_MEAN (snow surface temperature, mean over last 30 minutes)',
    'RSWR_30MIN_MEAN': 'RSWR_30MIN_MEAN (reflected short wave radiation, mean over last 30 minutes)',
    'TA_30MIN_MEAN': 'TA_30MIN_MEAN  (air temperature, mean over last 30 minute)',
    'VW_30MIN_MEAN': 'VW_30MIN_MEAN (wind speed, vectorial mean over last 30 minutes)',
    'preds': 'preds',
    'no_snow': 'no_snow',
}

# Parallel lists: var_descriptions[i] is the label for features_to_display[i].
features_to_display = list(_feature_labels.keys())
var_descriptions = list(_feature_labels.values())
def find_consecutive_date_ranges(dates):
    """Group dates into runs of neighbouring days.

    Args:
        dates: sized collection of date-like values (need not be sorted)
            whose pairwise differences expose a `.days` attribute.

    Returns:
        A list of (start, end) tuples in ascending order. A date joins the
        current run when it lies at most one whole day (`.days <= 1`) after
        the run's last date; otherwise it opens a new run. An empty input
        yields an empty list.
    """
    if len(dates) == 0:
        return []

    ordered = sorted(dates)
    # Each run is kept as a mutable [start, end] pair while it is being grown.
    runs = [[ordered[0], ordered[0]]]
    for current in ordered[1:]:
        open_run = runs[-1]
        if (current - open_run[1]).days <= 1:
            open_run[1] = current
        else:
            runs.append([current, current])

    return [tuple(run) for run in runs]


# Build the individual controls first, then assemble the page: chart on top,
# station/year dropdowns below it, feature checklist at the bottom.
station_codes = list(testing_stations_minmax.keys())

time_series_chart = dcc.Graph(id='time-series-chart', figure=go.Figure())

station_dropdown = dcc.Dropdown(
    id='station-dropdown',
    options=[{'label': code, 'value': code} for code in station_codes],
    value=station_codes[0],
    style={'width': '48%', 'display': 'inline-block'}
)

# Static initial choices; the 'options' of this dropdown are also the output
# of the update_year_options callback.
year_dropdown = dcc.Dropdown(
    id='year-dropdown',
    options=[{'label': year, 'value': year} for year in range(1990, 2023)],
    value=2006,
    style={'width': '48%', 'display': 'inline-block'}
)

feature_checklist = dcc.Checklist(
    id='feature-checklist',
    options=[
        {'label': description, 'value': feature}
        for feature, description in zip(features_to_display, var_descriptions)
    ],
    value=['HS', 'no_snow', 'preds'],
    style={'padding': 10}
)

app.layout = html.Div([
    html.Div([time_series_chart], style={'width': '60%', 'margin': 'auto'}),
    html.Div([station_dropdown, year_dropdown]),
    html.Div([feature_checklist]),
])

@app.callback(
    Output('year-dropdown', 'options'),
    [Input('station-dropdown', 'value')]
)
def update_year_options(selected_station):
    """Return the year-dropdown options for the selected station.

    The options are the station's hydrological years ('hydro_year'), because
    that is the column update_graph filters on; offering calendar years would
    make some hydro years unselectable.

    Args:
        selected_station: value of the station dropdown; None when the
            dropdown has been cleared.

    Returns:
        A list of {'label': year, 'value': year} dicts in ascending order, or
        an empty list when no known station is selected.
    """
    if selected_station not in testing_stations_minmax:
        return []

    selected_station_df = testing_stations_minmax[selected_station]

    # dropna() guards against rows whose measure_date could not be parsed;
    # int() turns numpy integers into plain Python ints for the JSON payload.
    years = sorted(selected_station_df['hydro_year'].dropna().unique())
    year_options = [{'label': int(year), 'value': int(year)} for year in years]

    return year_options

def _no_snow_shapes(no_snow_dates, y_top=1):
    """Build Plotly layout shapes that mark runs of no-snow dates.

    Args:
        no_snow_dates: dates flagged as no-snow (may be empty).
        y_top: upper y coordinate shared by all shapes. Defaults to 1, the
            top of the min-max scaled value range plotted by update_graph.

    Returns:
        A list of shape dicts: a thin vertical line for a run made of a
        single date, a translucent red rectangle for a longer run.
    """
    shapes = []
    for start, end in find_consecutive_date_ranges(no_snow_dates):
        if start == end:
            # A rectangle of zero width would be invisible, so draw a line.
            shapes.append({
                'type': 'line',
                'x0': start,
                'y0': 0,
                'x1': start,
                'y1': y_top,
                'line': {
                    'color': 'rgba(255, 0, 0, 0.2)',
                    'width': 1,
                },
            })
        else:
            shapes.append({
                'type': 'rect',
                'x0': start,
                'y0': 0,
                'x1': end,
                'y1': y_top,
                'fillcolor': 'red',
                'opacity': 0.2,
                'line': {'width': 0},
            })
    return shapes


@app.callback(
    Output('time-series-chart', 'figure'),
    [Input('station-dropdown', 'value'),
     Input('year-dropdown', 'value'),
     Input('feature-checklist', 'value')],
    [State('time-series-chart', 'figure')]
)
def update_graph(selected_station, selected_hydro_year, selected_features, existing_figure):
    """Redraw the time-series chart for one station and hydrological year.

    Args:
        selected_station: key into testing_stations_minmax; None when the
            station dropdown has been cleared.
        selected_hydro_year: value compared against the 'hydro_year' column.
        selected_features: checked entries of the feature checklist.
            Measurement columns become line traces, 'no_snow' becomes red
            background shapes and 'preds' becomes green markers at the HS
            value of every row predicted True.
        existing_figure: the figure currently displayed (as a dict); its
            layout is reused for the new figure.

    Returns:
        The new figure, or dash.no_update when no known station is selected.
    """
    if selected_station not in testing_stations_minmax:
        # A cleared dropdown delivers None; keep the current chart instead of
        # failing with a KeyError.
        return dash.no_update
    selected_features = selected_features or []

    selected_station_df = testing_stations_minmax[selected_station]
    filtered_df = selected_station_df[selected_station_df['hydro_year'] == selected_hydro_year]

    data = [
        go.Scatter(x=filtered_df['measure_date'], y=filtered_df[feature], name=feature)
        for feature in selected_features
        if feature not in ('no_snow', 'preds')
    ]

    shapes = []
    if 'no_snow' in selected_features:
        no_snow_dates = filtered_df.loc[filtered_df['no_snow'], 'measure_date'].sort_values()
        shapes = _no_snow_shapes(no_snow_dates)

    if 'preds' in selected_features:
        preds_data = filtered_df[filtered_df['preds']]
        data.append(
            go.Scatter(x=preds_data['measure_date'], y=preds_data['HS'], mode='markers', name='Predictions', marker=dict(color='green', size=5))
        )

    # Reuse the current layout, but start from an empty shape list so shapes
    # from the previous render are not carried over into this one.
    layout = dict((existing_figure or {}).get('layout') or {})
    layout['shapes'] = []

    figure = go.Figure(data=data, layout=layout)
    figure.update_layout(
        xaxis_title='Date',
        yaxis_title='Snow Height',
        shapes=shapes,
        template="plotly_white",
        showlegend=True,
    )

    return figure

# Start the Dash server from the notebook kernel with debug mode enabled; the
# recorded output of this cell shows it serving on http://127.0.0.1:8050/.
if __name__ == '__main__':
    app.run(jupyter_mode="external", debug=True)

Dash app running on http://127.0.0.1:8050/
