In [None]:
%load_ext autoreload
%autoreload 2
import os

import mlflow
import numpy as np
from dotenv import load_dotenv

from src.functions import create_model, plot_data, check_gpus, create_train_val_datasets, load_stations_from_path, \
    create_test_datasets, plot_keras_history, get_features_and_targets
from src.utils import now_formatted, setup_logger, format_with_border, measure_execution_time

In [None]:
# Environment check imported from src.functions, run before any training starts
# (presumably reports which GPUs the framework can see — TODO confirm).
check_gpus()

In [None]:
# Load settings from a local .env file into the process environment.
load_dotenv()

# Fail fast when the environment is incomplete: os.getenv() returns None for an
# unset variable, which would otherwise yield the tracking URI
# 'http://localhost:None' and None data paths that only fail further down.
required_env_vars = ('MLFLOW_PORT', 'LS_LABELED_TRAIN_DATA_PATH', 'LS_LABELED_TEST_DATA_PATH')
missing_env_vars = [name for name in required_env_vars if not os.getenv(name)]
if missing_env_vars:
    raise RuntimeError(
        f"Missing required environment variable(s): {', '.join(missing_env_vars)}"
    )

mlflow_port = os.getenv('MLFLOW_PORT')
mlflow_uri = f'http://localhost:{mlflow_port}'
mlflow_experiment_name = 'Benchmark'
train_path = os.getenv('LS_LABELED_TRAIN_DATA_PATH')
test_path = os.getenv('LS_LABELED_TEST_DATA_PATH')
log_file_path = '/tmp/benchmark.log'

# Point MLflow at the local tracking server and enable TensorFlow autologging
# for every run started below.
mlflow.set_tracking_uri(mlflow_uri)
mlflow.set_experiment(mlflow_experiment_name)
mlflow.tensorflow.autolog()

In [None]:
# Data / windowing configuration.
# Every UPPER_CASE name defined at notebook level is picked up by
# log_parameters() and logged to MLflow as `benchmark_<lower-cased name>`.
SEQUENCE_LENGTH = 20
# Passed to the dataset builders together with SEQUENCE_LENGTH, and used to drop
# the first rows of each test station before plotting predictions.
TARGET_START_INDEX = SEQUENCE_LENGTH - 1
# Input columns; their count is the last dimension of MODEL_INPUT_SHAPE.
FEATURE_COLUMNS = ['HS', 'day_sin', 'day_cos', 'month_sin', 'month_cos']
TARGET_COLUMN = 'no_snow'
# Not referenced by the visible cells apart from being logged as a parameter.
DATE_COLUMN = 'measure_date'
# Passed to create_train_val_datasets (presumably the training fraction of the
# train/validation split — TODO confirm).
SPLIT_PERCENTAGE = 0.8
# Batch size handed to the train/validation and test dataset builders.
DATASET_BATCH_SIZE = 64

# Model configuration
# Architecture string interpreted by create_model (format defined there).
MODEL_ARCHITECTURE = "128(l)-64-8(d)-1"
MODEL_INPUT_SHAPE = (SEQUENCE_LENGTH, len(FEATURE_COLUMNS))
MODEL_DROPOUT_RATE = 0.5
MODEL_OPTIMIZER = 'adam'
MODEL_METRICS = ['accuracy']
MODEL_LOSS = 'binary_crossentropy'
# Passed as `batch_size` to model.fit in the experiment cell.
MODEL_BATCH_SIZE = 64
MODEL_EPOCHS = 20

def log_parameters(logging, mlflow):
    """Log every UPPER_CASE configuration constant of this module/notebook.

    Each qualifying global is written to the logger as ``NAME: value`` and to
    MLflow as the parameter ``benchmark_<name in lower case>``.

    Only plain values are treated as configuration. Upper-case globals that are
    callables (classes, functions) or modules are skipped, so an import such as
    ``from imblearn.over_sampling import SMOTE`` in another cell does not end
    up as a bogus ``benchmark_smote`` parameter when the run is repeated.

    Args:
        logging: Logger-like object exposing ``info(message)``.
        mlflow: Object exposing ``log_param(key, value)`` (the mlflow module).
    """
    from types import ModuleType  # local import keeps the cell self-contained

    # Iterate over a snapshot so changes to the global namespace during the
    # loop cannot raise "dictionary changed size during iteration".
    for var_name, value in list(globals().items()):
        if not var_name.isupper():
            continue
        if callable(value) or isinstance(value, ModuleType):
            # Not a configuration value (imported class/function/module).
            continue
        logging.info(f'{var_name}: {value}')
        mlflow.log_param(f'benchmark_{var_name.lower()}', value)

In [None]:
# Training stations stay in the sequence returned by the loader; test stations
# are indexed by the `station_code` value found in their first row.
training_stations = load_stations_from_path(train_path)
testing_stations = dict(
    (station_frame.iloc[0]['station_code'], station_frame)
    for station_frame in load_stations_from_path(test_path)
)

In [None]:
# One MLflow run per execution of this cell: prepare data, train, evaluate on
# every test station, and attach the prediction plot and log file to the run.
with mlflow.start_run(run_name=now_formatted()):
    logging, tmp_log_file = setup_logger(log_file_path=log_file_path)
    logging.info(format_with_border('Starting experiment'))

    # Record the UPPER_CASE configuration constants with the run.
    log_parameters(logging, mlflow)

    logging.info(format_with_border('Preparing Training Data'))
    # `mean` and `std` returned here are passed on to create_test_datasets below.
    train_dataset, val_dataset, mean, std, num_train_samples, num_val_samples, _ = create_train_val_datasets(
        training_stations, SPLIT_PERCENTAGE, FEATURE_COLUMNS, TARGET_COLUMN, SEQUENCE_LENGTH, TARGET_START_INDEX, DATASET_BATCH_SIZE
    )

    logging.info(f"Training samples: {num_train_samples}")
    logging.info(f"Validation samples: {num_val_samples}")
    mlflow.log_param('benchmark_training_samples', num_train_samples)
    mlflow.log_param('benchmark_validation_samples', num_val_samples)

    # Build the model with the configured dropout rate so that the value logged
    # by log_parameters() is the one actually used (previously a literal 0.5).
    model = create_model(
        MODEL_ARCHITECTURE, MODEL_INPUT_SHAPE, logging=None, dropout_rate=MODEL_DROPOUT_RATE, summary=False
    )
    model.compile(
        optimizer=MODEL_OPTIMIZER,
        metrics=MODEL_METRICS,
        loss=MODEL_LOSS
    )

    logging.info(format_with_border('Fitting Model'))

    # The decorator makes fit_model() return (fit result, elapsed time).
    @measure_execution_time
    def fit_model():
        # NOTE(review): the datasets are built with DATASET_BATCH_SIZE; if they
        # are already-batched dataset objects, `batch_size` may have no effect
        # here — confirm against the Keras Model.fit documentation.
        return model.fit(
            train_dataset,
            epochs=MODEL_EPOCHS,
            batch_size=MODEL_BATCH_SIZE,
            validation_data=val_dataset
        )

    # `history` is kept at notebook level: the next cell plots it.
    history, elapsed_fitting_time = fit_model()
    logging.info(f'Model fitting completed in {elapsed_fitting_time}')
    mlflow.log_param('benchmark_model_fitting_time', elapsed_fitting_time)

    logging.info(format_with_border('Evaluating Model on Test Data'))
    test_datasets = create_test_datasets(
        testing_stations.values(), FEATURE_COLUMNS, TARGET_COLUMN, SEQUENCE_LENGTH, TARGET_START_INDEX, DATASET_BATCH_SIZE, mean, std
    )
    # TODO: Convert to shared function
    # Materialise the (station code, frame) pairs once instead of rebuilding the
    # key and value lists on every iteration. The j-th dataset is matched with
    # the j-th station, as the datasets were built from testing_stations.values().
    station_items = list(testing_stations.items())
    evaluation_rows = []
    for j, dataset in enumerate(test_datasets):
        # With the single 'accuracy' metric, evaluate() yields [loss, accuracy].
        evaluation_results = model.evaluate(dataset, verbose=0)
        station_name, test_df = station_items[j]
        logging.info(
            f'Station: {station_name}, Samples: {len(test_df)}, Loss: {evaluation_results[0]:.2f}, Accuracy: {evaluation_results[1]:.2f}'
        )
        evaluation_rows.append(evaluation_results)

    # Build the (n_stations, 2) result array in one step rather than copying it
    # with np.append on every iteration; keep the (0, 2) shape when empty.
    all_evaluation_results = (
        np.array(evaluation_rows, dtype=float) if evaluation_rows else np.empty((0, 2), float)
    )

    mlflow.log_metric('test_avg_loss', np.mean(all_evaluation_results[:, 0]))
    mlflow.log_metric('test_avg_accuracy', np.mean(all_evaluation_results[:, 1]))

    # Plotting
    # Threshold the flattened model outputs at 0.5 to get boolean predictions.
    predictions = [model.predict(td, verbose=0).reshape((-1,)) > 0.5 for td in test_datasets]
    fig = plot_data(
        # Drop the first TARGET_START_INDEX rows of each station frame before plotting.
        [test_station[TARGET_START_INDEX:] for test_station in testing_stations.values()],
        predictions=predictions,
        show=False
    )
    mlflow.log_figure(fig, 'prediction_results.png')
    mlflow.log_artifact(tmp_log_file)

In [None]:
# Plot the `history` object returned by model.fit() in the experiment cell, which
# must have been run first (presumably per-epoch loss/metric curves — TODO confirm).
history_plot = plot_keras_history(history)

In [None]:
# SMOTE experiment (TEST): if it does not work out, this cell should be removed.
# The imports stay local to the cell so that removing it removes the dependencies too.
from imblearn.over_sampling import SMOTE
from matplotlib import pyplot as plt
from sklearn.utils.class_weight import compute_class_weight

# Use separate names for the returned statistics so that the `mean`/`std`
# produced by the training cell are not overwritten by this single-station,
# HS-only experiment.
features, targets, hs_mean, hs_std = get_features_and_targets(
    training_stations[0], len(training_stations[0]), ['HS'], ['no_snow']
)

smote = SMOTE(random_state=42)
x_resample, y_resample = smote.fit_resample(features, targets)

# Only the last expression of a cell is rendered, so print the diagnostics
# explicitly instead of leaving them as bare expressions mid-cell.
print('Samples before/after SMOTE:', len(targets), len(y_resample))
print('Class counts before SMOTE:', np.unique(targets, return_counts=True))
print('Class counts after SMOTE:', np.unique(y_resample, return_counts=True))
print(training_stations[0].head())

# Plot the original and the resampled feature values as one flattened series each.
for plot_title, series in (('Original', features), ('Smote', x_resample)):
    flat_series = series.flatten()
    plt.figure(figsize=(20, 5))
    plt.plot(range(len(flat_series)), flat_series)
    plt.title(plot_title)
    plt.show()

# Balanced class weights, computed per station and averaged per class label.
weights = []
weights_by_class = {}
for ts in training_stations:
    # The label column is cast to int in place, as before.
    ts['no_snow'] = ts['no_snow'].astype(int)
    station_classes = np.unique(ts['no_snow'])
    station_weights = compute_class_weight('balanced', classes=station_classes, y=ts['no_snow'])
    weights.append(station_weights)
    # Key by the actual class label: a station containing only one class
    # returns a single weight, which must not be mistaken for class 0 and must
    # not make the per-station arrays ragged.
    for class_label, class_weight in zip(station_classes, station_weights):
        weights_by_class.setdefault(int(class_label), []).append(class_weight)

# {class label: mean weight over the stations in which that class occurs}
class_weights = {
    class_label: np.mean(label_weights)
    for class_label, label_weights in sorted(weights_by_class.items())
}