In [31]:
import mlflow
import pandas as pd
import numpy as np
import torch
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.io as pio
import plotly.graph_objects as go
from pgd_optim_pytorch._utils import filter_df
%matplotlib inline

In [32]:
# Point MLflow at the local mlruns folder.
# The path is relative, so it is resolved against the kernel's working directory.
TRACKING_URI = "../../../mlruns"
mlflow.set_tracking_uri(TRACKING_URI)

# Genetic Mutations - max gamma

We will check how frequently pSAGD can find the global minimum of the CIB when $\gamma = 1$.
The chosen hyperparameters were: `lr = 1e1, 1e2, 1e3, 1e4, 1e5, 1e6, 1e7`, `temperature = 10.0` and `cooling_rate = 0.99`.

# Load and pre-process the data

In [33]:
# Look up the experiment of interest by name.
# get_experiment_by_name returns None (it does not raise) when no experiment has
# that name, so fail here with an explicit message instead of an AttributeError
# on the attribute access below.
EXPERIMENT_NAME = "Mutations - max gamma"
experiment = mlflow.get_experiment_by_name(EXPERIMENT_NAME)
if experiment is None:
    raise ValueError(
        f"MLflow experiment {EXPERIMENT_NAME!r} was not found; check the tracking URI."
    )
experiment_id = experiment.experiment_id
print(experiment_id)

908478878173762075


In [34]:
# Load the runs from the local CSV cache when it exists; otherwise query MLflow
# once and write the cache so that later executions reuse it.
RUNS_CACHE_CSV = 'mutations_runs_max_gamma.csv'

try:
    runs_df = pd.read_csv(RUNS_CACHE_CSV)
except FileNotFoundError:
    # Extract df from mlflow directly.
    # search_runs documents experiment_ids as a list of IDs, so wrap the single ID.
    runs_df = mlflow.search_runs(experiment_ids=[experiment_id], max_results=100000)
    # Export the DataFrame to a CSV file
    runs_df.to_csv(RUNS_CACHE_CSV, index=False)

print(len(runs_df))
print(runs_df.columns)

700
Index(['run_id', 'experiment_id', 'status', 'artifact_uri', 'start_time',
       'end_time', 'metrics.diff_loss', 'metrics.CIB loss',
       'metrics.converged', 'metrics.HcYdoT', 'metrics.HT', 'metrics.HY',
       'metrics.Final CIB loss', 'metrics.VI of T and T_',
       'metrics.Expected optimal HTcondX', 'metrics.Expected optimal HT',
       'metrics.penalty', 'metrics.Expected optimal CIB loss',
       'metrics.diff_q', 'metrics.HTcondX', 'metrics.Expected optimal HcYdoT',
       'params.gamma', 'params.end_lr_factor', 'params.by', 'params.Learned q',
       'params.lr', 'params.temperature', 'params.Expected optimal q',
       'params.eps', 'params.beta', 'params.bxi', 'params.max iter',
       'params.bs', 'params.cooling rate', 'tags.mlflow.user',
       'tags.optimizer', 'tags.mlflow.runName', 'tags.mlflow.source.type',
       'tags.mlflow.source.name', 'tags.loss',
       'tags.mlflow.source.git.commit'],
      dtype='object')


In [35]:
# Duration of each run in seconds, derived from its start and end timestamps.
run_start = pd.to_datetime(runs_df['start_time'], format='ISO8601')
run_end = pd.to_datetime(runs_df['end_time'], format='ISO8601')
runs_df['metrics.duration'] = (run_end - run_start).dt.total_seconds()

In [36]:
# Keep only the columns used in this analysis.
selected_columns = [
    'metrics.duration',
    'params.bxi',
    'params.by',
    'params.bs',
    'params.gamma',
    'params.beta',
    'params.lr',
    'params.temperature',
    'metrics.VI of T and T_',
    'metrics.Final CIB loss',
    'metrics.HT',
    'metrics.HTcondX',
    'metrics.HY',
    'metrics.HcYdoT',
    'metrics.converged',
    'params.Learned q',
    'tags.optimizer',
    'tags.loss',
]

# .copy() makes `runs` an independent frame rather than a selection of runs_df,
# so the column assignments made on it in later cells do not trigger
# SettingWithCopyWarning.
runs = runs_df[selected_columns].copy()

In [37]:
# List of columns that should be float (named as they appear in the MLflow export)
float_columns = ['params.bxi', 'params.by', 'params.bs', 'params.gamma', 'params.beta', 'params.lr', 'params.temperature']

# Remove prefixes.
# Only a known MLflow namespace prefix is stripped, so re-running this cell leaves
# the already-stripped names untouched instead of turning them into NaN.
# rename() also returns a new frame, so the assignments below act on an
# independent object rather than on a selection of runs_df.
MLFLOW_PREFIXES = ('metrics.', 'params.', 'tags.')
runs = runs.rename(
    columns=lambda name: name.split('.', 1)[1] if name.startswith(MLFLOW_PREFIXES) else name
)

# Convert the hyperparameter columns to numeric dtype.
# Plain column assignment replaces the columns (and their dtype); `.loc[:, cols] = ...`
# would try to write the values into the existing columns and can keep object dtype.
numeric_columns = [name.split('.', 1)[1] for name in float_columns]
runs[numeric_columns] = runs[numeric_columns].apply(pd.to_numeric, errors='raise')

# Add MI, Ic columns
runs['MI_XT'] = runs['HT'] - runs['HTcondX']
runs['IcYdoT'] = runs['HY'] - runs['HcYdoT']
runs.head()

Unnamed: 0,duration,bxi,by,bs,gamma,beta,lr,temperature,VI of T and T_,Final CIB loss,HT,HTcondX,HY,HcYdoT,converged,Learned q,optimizer,loss,MI_XT,IcYdoT
0,10.419,0.3,0.1,0.5,1.0,inf,10000000.0,10.0,3.030305e-07,-0.780195,1.432582,-0.0,2.014693,1.234498,1.0,"tensor([[[[[1., 1.],\n [1., 1.]],\n\...",pSAGD,wCIB+penalty,1.432582,0.780195
1,11.156,0.3,0.1,0.5,1.0,inf,10000000.0,10.0,-1.738066e-07,-0.780195,1.432582,-0.0,2.014693,1.234498,1.0,"tensor([[[[[0., 0.],\n [0., 0.]],\n\...",pSAGD,wCIB+penalty,1.432582,0.780195
2,4.299,0.3,0.1,0.5,1.0,inf,10000000.0,10.0,-1.738066e-07,-0.780195,1.432582,-0.0,2.014693,1.234498,1.0,"tensor([[[[[0., 0.],\n [0., 0.]],\n\...",pSAGD,wCIB+penalty,1.432582,0.780195
3,7.303,0.3,0.1,0.5,1.0,inf,10000000.0,10.0,3.030305e-07,-0.780195,1.432582,-0.0,2.014693,1.234498,1.0,"tensor([[[[[1., 1.],\n [1., 1.]],\n\...",pSAGD,wCIB+penalty,1.432582,0.780195
4,9.209,0.3,0.1,0.5,1.0,inf,10000000.0,10.0,3.030305e-07,-0.780195,1.432582,-0.0,2.014693,1.234498,1.0,"tensor([[[[[1., 1.],\n [1., 1.]],\n\...",pSAGD,wCIB+penalty,1.432582,0.780195


## Frequencies for wCIB

In [38]:
# Accumulator for the per-(temperature, lr) summary rows built in the next cell.
results = list()

In [39]:
# Success rate and mean duration for every (temperature, lr) combination.
# A run counts as successful when its 'VI of T and T_' metric is below this
# threshold (the printed message reports this as finding the ground truth abstraction).
VI_SUCCESS_THRESHOLD = 1e-5

# Start from an empty list on every execution so that re-running this cell
# does not append a second copy of each row.
results = []

# The sorted learning rates do not depend on the temperature, so compute them once.
sorted_lrs = np.sort(runs['lr'].unique())

for temp in np.sort(runs['temperature'].unique()):
    for lr in sorted_lrs:
        df_fixedlrtemp = filter_df(runs, lr=lr, temperature=temp)
        # Summing the boolean mask counts the True entries and is 0 when there are none.
        num_successful_runs = int((df_fixedlrtemp['VI of T and T_'] < VI_SUCCESS_THRESHOLD).sum())
        total_num_runs = len(df_fixedlrtemp)
        print(f"For lr={lr:.1e} and temperature={temp:.1e}, "
              + f"the ground truth abstraction was found in {num_successful_runs} of the {total_num_runs} runs."
        )
        # Guard against an empty selection to avoid dividing by zero.
        success_rate = num_successful_runs / total_num_runs if total_num_runs > 0 else 0
        average_duration = df_fixedlrtemp['duration'].mean()
        results.append({'temperature': temp, 'lr': lr, 'success_rate': success_rate, 'average_duration': average_duration})

# Create a DataFrame from the results
results_df = pd.DataFrame(results)
results_df

For lr=1.0e+01 and temperature=1.0e+01, the ground truth abstraction was found in 34 of the 100 runs.
For lr=1.0e+02 and temperature=1.0e+01, the ground truth abstraction was found in 56 of the 100 runs.
For lr=1.0e+03 and temperature=1.0e+01, the ground truth abstraction was found in 83 of the 100 runs.
For lr=1.0e+04 and temperature=1.0e+01, the ground truth abstraction was found in 93 of the 100 runs.
For lr=1.0e+05 and temperature=1.0e+01, the ground truth abstraction was found in 55 of the 100 runs.
For lr=1.0e+06 and temperature=1.0e+01, the ground truth abstraction was found in 95 of the 100 runs.
For lr=1.0e+07 and temperature=1.0e+01, the ground truth abstraction was found in 95 of the 100 runs.


Unnamed: 0,temperature,lr,success_rate,average_duration
0,10.0,10.0,0.34,18.55853
1,10.0,100.0,0.56,11.93835
2,10.0,1000.0,0.83,8.07042
3,10.0,10000.0,0.93,7.19983
4,10.0,100000.0,0.55,26.11997
5,10.0,1000000.0,0.95,8.02744
6,10.0,10000000.0,0.95,8.00984


### Conclusion

For $\gamma=1$, one can use very large values of the learning rate to achieve a high probability of finding the global minimum, i.e. the ground truth encoder.
(For a learning rate of $10^5$ this is not true. Examining the runs, it seems that numerical instability is responsible for this. We did not delve into this technical issue further since it was not relevant for our goals.)
By looking at the values of the encoder during the learning process, one sees that, for large learning rates, the search space is effectively reduced to the boundary of the constraint space $\Delta$.
Since we know that the ground truth is in $\Delta$, it stands to reason that constraining our search to $\Delta$ improves our results, as is observed.

For learning rates of the order of $100.0$, the rest of the search space is also explored.
This informs our decision of which learning rates to use for our ensembles: we will use both high and low learning rates when dealing with other values of $\gamma$, to also allow for an exploration of the search space.