In [None]:
import os
import matplotlib
import plotly
import plotly.express as px
import seaborn as sns
import pandas as pd

In [None]:
# Name of the design-of-experiments run; it is the prefix of the results file
# that gets loaded ("<DOE_NAME>_res.csv").
DOE_NAME = "doe2"

# Columns of the results table that are treated as experimental factors.
factors = [
    "max_time",
    "seq_len",
    "est_frac",
    "est_direction",
    "est_type",
    "est_hidden_size",
]

# Column used as the response when sorting and in the main-effects plots.
response = "FIT"

In [None]:
# Load the results table of the selected DOE run from "<DOE_NAME>_res.csv"
# (path is relative to the current working directory).
df_res = pd.read_csv(f"{DOE_NAME}_res.csv")

In [None]:
# Rank the runs from best to worst on the response column; sort_values places
# rows with a missing response last. A new frame is bound to df_res instead of
# sorting in place.
df_res = df_res.sort_values(by=response, ascending=False)

# Runs with no recorded metrics (presumably failed or diverged -- TODO confirm)
# get a large RMSE penalty and a FIT of 0.
df_res["RMSE"] = df_res["RMSE"].fillna(1000)

# Floor FIT at 0% (easier to interpret). clip() is used rather than multiplying
# by the boolean mask (FIT > 0): that product turns -inf into NaN (inf * 0) and
# negative values into -0.0, whereas clip maps both to 0.
df_res["FIT"] = df_res["FIT"].fillna(0.0).clip(lower=0)


In [None]:
# Cast every factor column to the pandas "category" dtype in a single astype
# call, so that each factor is handled as a discrete variable downstream.
df_res = df_res.astype(dict.fromkeys(factors, "category"))

In [None]:
# First 20 rows; df_res is sorted by descending FIT, so these are the best runs.
df_res.head(20)

In [None]:
# Last 20 rows of the FIT-sorted table, i.e. the worst runs.
df_res.tail(20)

In [None]:
# Main effects on the full result set: one point-plot panel per factor, each
# with the response on the y axis.
g = sns.PairGrid(
    df_res,
    x_vars=factors,
    y_vars=response,
    height=5,
    aspect=0.5,
)
# NOTE(review): recent seaborn releases deprecate pointplot's `scale` argument
# and the grid's `.fig` attribute (replaced by `.figure`) -- confirm against the
# installed version before upgrading.
g.map(sns.pointplot, scale=1.3)
# Also hide the left spine of every panel.
sns.despine(fig=g.fig, left=True)

In [None]:
# FIT of every run in a grid of panels: one column per est_type value, one row
# per est_direction value; marker colour encodes seq_len and the remaining
# factors are available on hover.
fig = px.scatter(
    df_res,
    y="FIT",
    facet_row="est_direction",
    facet_col="est_type",
    color="seq_len",
    hover_data=["est_frac", "max_time", "est_hidden_size"],
)
fig.show()
# Observation: bad performance is generally associated with seq_len=40,
# est_type != Zero, est_direction=backward.
# Some kind of overfitting?

In [None]:
# "Good" subset: drop every run that has seq_len == 40 or max_time == 300
# (300 is the setting selected into df_5min further down).
df_good = df_res.loc[~((df_res["seq_len"] == 40) | (df_res["max_time"] == 300))]

In [None]:
# Best 10 runs of the filtered subset (row order is inherited from the
# FIT-sorted df_res).
df_good.head(10)

In [None]:
# Worst 10 runs of the filtered subset.
df_good.tail(10)

In [None]:
# Same faceted FIT scatter as for the full table, restricted to df_good
# (runs with seq_len != 40 and max_time != 300).
fig = px.scatter(
    df_good,
    y="FIT",
    facet_row="est_direction",
    facet_col="est_type",
    color="seq_len",
    hover_data=["est_frac", "max_time", "est_hidden_size"],
)
fig.show()

In [None]:
# Filtered results (df_good: seq_len != 40 and max_time != 300): main effects,
# one point-plot panel per factor with the response on the y axis.
g = sns.PairGrid(
    df_good,
    x_vars=factors,
    y_vars=response,
    height=5,
    aspect=0.5,
)
# NOTE(review): recent seaborn releases deprecate pointplot's `scale` argument
# and the grid's `.fig` attribute (replaced by `.figure`) -- confirm against the
# installed version before upgrading.
g.map(sns.pointplot, scale=1.3)
# Also hide the left spine of every panel.
sns.despine(fig=g.fig, left=True)

In [None]:
# Runs with max_time == 300 (the 5-minute setting, going by the variable name;
# presumably seconds -- TODO confirm the unit).
df_5min = df_res.loc[df_res["max_time"] == 300]

In [None]:
# Best 10 runs among those with max_time == 300 (rows keep the FIT-descending
# order of df_res).
df_5min.head(10)

In [None]:
# Worst 10 runs among those with max_time == 300.
df_5min.tail(10)

In [None]:
# Faceted FIT scatter restricted to the max_time == 300 runs: columns are
# est_type values, rows are est_direction values, colour encodes seq_len.
fig = px.scatter(
    df_5min,
    y="FIT",
    facet_row="est_direction",
    facet_col="est_type",
    color="seq_len",
    hover_data=["est_frac", "max_time", "est_hidden_size"],
)
fig.show()
# With just 5 minutes, Zero estimator works better?

In [None]:
# Runs with max_time == 1800 (the 30-minute setting, going by the variable
# name; presumably seconds -- TODO confirm the unit).
df_30min = df_res.loc[df_res["max_time"] == 1800]

In [None]:
# Faceted FIT scatter restricted to the max_time == 1800 runs: columns are
# est_type values, rows are est_direction values, colour encodes seq_len.
fig = px.scatter(
    df_30min,
    y="FIT",
    facet_row="est_direction",
    facet_col="est_type",
    color="seq_len",
    hover_data=["est_frac", "max_time", "est_hidden_size"],
)
fig.show()

In [None]:
# For each est_type value, find the index label of its highest-FIT run, then
# display those rows of df_res.
max_loc = df_res["FIT"].groupby(df_res["est_type"]).idxmax()
df_res.loc[max_loc]

In [None]:
# Runs whose est_type is exactly the string "ZERO".
df_zero = df_res.loc[df_res["est_type"] == "ZERO"]

In [None]:
# Display every run with est_type == "ZERO" (rows keep the FIT-descending order
# of df_res).
df_zero

In [None]:
# Mean FIT for each (est_type, est_direction, est_hidden_size) combination.
df_res.groupby(["est_type", "est_direction", "est_hidden_size"])[["FIT"]].mean()

In [None]:
# Conclusions: 
# - Pretty good models in 30 mins (not 100 hours...)
# - Backward estimation (together with seq_len=40, est_type != Zero) occasionally fails, forward is more stable
# - Feedforward estimator slightly better than LSTM on average (but best models are LSTM)
# - Est-hidden-size not too important...
# Let us avoid backward-est and focus