# Imports

In [6]:
import plotly.express as px
import pandas as pd
import glob
import plotly.graph_objects as go
import pandas as pd

import plotly.io as pio
pio.renderers.default = "browser"

## Load Data

In [7]:
# Collect every results CSV under mlflow_results/. glob.glob() returns paths
# in arbitrary, filesystem-dependent order, so sort them to make the row order
# of the combined frame reproducible between runs and machines.
file_paths = sorted(glob.glob('mlflow_results/*.csv'))

# Fail early with an actionable message. Without this guard pd.concat() would
# raise an opaque "No objects to concatenate" ValueError; the exception type
# is kept the same so the failure mode only becomes clearer.
if not file_paths:
    raise ValueError(
        "No CSV files matched 'mlflow_results/*.csv'; "
        "check the working directory and that the results exist."
    )

# Read each file into its own DataFrame
data_frames = [pd.read_csv(file_path) for file_path in file_paths]

# Stack the per-file frames into one DataFrame with a fresh 0..n-1 index
combined_data = pd.concat(data_frames, ignore_index=True)


# df_grouped = df.groupby(['instance_family', 'Years used to predict'], as_index=False).mean()

## RMSE Plot

In [8]:
# Sort key for every "Years used to predict" label: the first (earliest) year
# listed in the label.
year_ranges = {
    "[2021]": 2021,
    "[2020 2021]": 2020,
    "[2019 2020 2021]": 2019,
    "[2018 2019 2020 2021]": 2018,
    "[2017 2018 2019 2020 2021]": 2017,
    "[2016 2017 2018 2019 2020 2021]": 2016,
}

# Keep only the grouping keys and the metric being plotted
df = combined_data.loc[
    :, ['instance_family', 'Years used to predict', 'validation_RMSE']]

# Average the validation RMSE per (instance family, year range), attach the
# numeric sort key, and order the rows by it. Labels that are missing from
# year_ranges get NaN as their key.
df_grouped = (
    df.groupby(['instance_family', 'Years used to predict'], as_index=False)
    .mean()
    .assign(**{
        'Year Numeric': lambda frame: frame['Years used to predict'].map(
            year_ranges),
    })
    .sort_values(by='Year Numeric')
)

# One line per instance family. sort=False keeps the families in order of
# first appearance and the rows of each family in their sorted order.
fig = go.Figure()

for instance_family, df_instance in df_grouped.groupby(
        'instance_family', sort=False):
    trace = go.Scatter(
        x=df_instance['Years used to predict'],
        y=df_instance['validation_RMSE'],
        mode='lines+markers',
        name=instance_family,
    )
    fig.add_trace(trace)

# Titles, a horizontal legend anchored at the top-left, and x-based hover
fig.update_layout(
    title='RMSE by Instance Family and Years Used for Prediction',
    xaxis_title='Years Used for Prediction',
    yaxis_title='RMSE',
    legend={
        'orientation': 'h',
        'yanchor': 'bottom',
        'y': 1,
        'xanchor': 'left',
        'x': 0,
    },
    legend_title='Instance Family',
    hovermode='x',
)

fig.show()

## MAE Plot

In [12]:
# Sort key for every "Years used to predict" label: the first (earliest) year
# listed in the label.
year_ranges = {
    "[2021]": 2021,
    "[2020 2021]": 2020,
    "[2019 2020 2021]": 2019,
    "[2018 2019 2020 2021]": 2018,
    "[2017 2018 2019 2020 2021]": 2017,
    "[2016 2017 2018 2019 2020 2021]": 2016,
}

# Keep only the grouping keys and the metric being plotted
df = combined_data.loc[
    :, ['instance_family', 'Years used to predict', 'validation_MAE']]

# Average the validation MAE per (instance family, year range), attach the
# numeric sort key, and order the rows by it. Labels that are missing from
# year_ranges get NaN as their key.
df_grouped = (
    df.groupby(['instance_family', 'Years used to predict'], as_index=False)
    .mean()
    .assign(**{
        'Year Numeric': lambda frame: frame['Years used to predict'].map(
            year_ranges),
    })
    .sort_values(by='Year Numeric')
)

# One line per instance family. sort=False keeps the families in order of
# first appearance and the rows of each family in their sorted order.
fig = go.Figure()

for instance_family, df_instance in df_grouped.groupby(
        'instance_family', sort=False):
    trace = go.Scatter(
        x=df_instance['Years used to predict'],
        y=df_instance['validation_MAE'],
        mode='lines+markers',
        name=instance_family,
    )
    fig.add_trace(trace)

# Titles, a horizontal legend anchored at the top-left, and x-based hover
fig.update_layout(
    title='MAE by Instance Family and Years Used for Prediction',
    xaxis_title='Years Used for Prediction',
    yaxis_title='MAE',
    legend={
        'orientation': 'h',
        'yanchor': 'bottom',
        'y': 1,
        'xanchor': 'left',
        'x': 0,
    },
    legend_title='Instance Family',
    hovermode='x',
)

fig.show()

## MAPE Plot

In [10]:
# Sort key for every "Years used to predict" label: the first (earliest) year
# listed in the label.
year_ranges = {
    "[2021]": 2021,
    "[2020 2021]": 2020,
    "[2019 2020 2021]": 2019,
    "[2018 2019 2020 2021]": 2018,
    "[2017 2018 2019 2020 2021]": 2017,
    "[2016 2017 2018 2019 2020 2021]": 2016,
}

# Keep only the grouping keys and the metric being plotted
df = combined_data.loc[
    :, ['instance_family', 'Years used to predict', 'validation_MAPE']]

# Average the validation MAPE per (instance family, year range), attach the
# numeric sort key, and order the rows by it. Labels that are missing from
# year_ranges get NaN as their key.
df_grouped = (
    df.groupby(['instance_family', 'Years used to predict'], as_index=False)
    .mean()
    .assign(**{
        'Year Numeric': lambda frame: frame['Years used to predict'].map(
            year_ranges),
    })
    .sort_values(by='Year Numeric')
)

# One line per instance family. sort=False keeps the families in order of
# first appearance and the rows of each family in their sorted order.
fig = go.Figure()

for instance_family, df_instance in df_grouped.groupby(
        'instance_family', sort=False):
    trace = go.Scatter(
        x=df_instance['Years used to predict'],
        y=df_instance['validation_MAPE'],
        mode='lines+markers',
        name=instance_family,
    )
    fig.add_trace(trace)

# Titles, a horizontal legend anchored at the top-left, and x-based hover
fig.update_layout(
    title='MAPE by Instance Family and Years Used for Prediction',
    xaxis_title='Years Used for Prediction',
    yaxis_title='MAPE',
    legend={
        'orientation': 'h',
        'yanchor': 'bottom',
        'y': 1,
        'xanchor': 'left',
        'x': 0,
    },
    legend_title='Instance Family',
    hovermode='x',
)

fig.show()