In [1]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Reload imported modules before executing each cell, so edits to the
# project's .py files are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import os
import warnings

# Silence every warning raised in this kernel.
# NOTE(review): this also hides deprecation warnings from pandas and other libraries.
warnings.filterwarnings('ignore')
# Set the same policy through the environment — presumably so that worker subprocesses
# started from this kernel inherit it; confirm that this is the intent.
os.environ["PYTHONWARNINGS"] = "ignore"

In [3]:
# Make the repository root the working directory so that relative paths and
# `source.*` imports used below resolve. The guard keeps the cell safe to re-run:
# once we are in the root folder, no further chdir happens.
# os.path.basename is used instead of splitting on '/' so the check also works
# with non-POSIX path separators.
cur_folder_name = os.path.basename(os.getcwd())
if cur_folder_name != "virny-flow-experiments":
    os.chdir("../../..")

print('Current location: ', os.getcwd())

Current location:  /Users/denys_herasymuk/Research/NYU/VirnyFlow_Project/Code/virny-flow-experiments


# Scalability Experiment Visualizations (Heart Dataset)

In [4]:
import pandas as pd
from duckdb import query as sqldf
from virny_flow.core.custom_classes.core_db_client import CoreDBClient
from virny_flow.configs.constants import EXP_CONFIG_HISTORY_TABLE, ALL_EXPERIMENT_METRICS_TABLE
from source.visualizations.use_case_queries import get_best_lps_per_exp_config
from source.visualizations.scalability_viz import create_speedup_plot, create_performance_plot_v2, display_table_with_results_heart

## Prepare data for visualizations

In [5]:
# Env file passed to the DB client and to get_best_lps_per_exp_config;
# resolved relative to the current working directory (presumably holds DB credentials).
SECRETS_PATH = os.path.join(os.getcwd(), "scripts", "configs", "secrets.env")
# Experiment and dataset identifiers.
EXP_NAME = 'scalability_exp'
DATASET_NAME = 'heart'
# Passed as the last argument of display_table_with_results_heart
# (presumably the sensitive attribute used for the fairness metric — confirm).
GROUP = "gender"
# System identifiers: used as keys of EXP_CONFIG_NAMES and, in get_system_metrics,
# as the `system_name` filter value of the DB query.
VIRNY_FLOW = 'virny_flow'
ALPINE = 'alpine_meadow'
AUTOSKLEARN = 'autosklearn'
# Maps each system to {exp_config_name: worker count}. The worker count is what
# gets written to the `num_workers` column of the loaded metric frames.
# NOTE(review): the `n2_w32` / `n4_w32` configs map to 64 / 128 — presumably
# <nodes> x 32 workers per node; confirm.
# NOTE(review): only VirnyFlow has a 1-worker config and configs above 32 workers.
EXP_CONFIG_NAMES = {
    VIRNY_FLOW: {
        'scalability_exp_heart_w1_vf': 1,
        'scalability_exp_heart_w2_vf': 2,
        'scalability_exp_heart_w4_vf': 4,
        'scalability_exp_heart_w8_vf': 8,
        'scalability_exp_heart_w16_vf': 16,
        'scalability_exp_heart_w32_vf': 32,
        'scalability_exp_heart_n2_w32_vf': 64,
        'scalability_exp_heart_n4_w32_vf': 128,
    },
    ALPINE: {
        'scalability_exp_heart_w2_am': 2,
        'scalability_exp_heart_w4_am': 4,
        'scalability_exp_heart_w8_am': 8,
        'scalability_exp_heart_w16_am': 16,
        'scalability_exp_heart_w32_am': 32,
    },
    AUTOSKLEARN: {
        'scalability_exp_heart_w2_askl': 2,
        'scalability_exp_heart_w4_askl': 4,
        'scalability_exp_heart_w8_askl': 8,
        'scalability_exp_heart_w16_askl': 16,
        'scalability_exp_heart_w32_askl': 32,
    },
}

# Single DB client shared by the metric-loading functions below.
# NOTE(review): no matching close/disconnect call appears in the visible cells.
db_client = CoreDBClient(SECRETS_PATH)
db_client.connect()

In [6]:
def get_virny_flow_metrics(db_client):
    """
    Load the best logical-pipeline metrics of every VirnyFlow exp config and
    attach the execution time of the corresponding run.

    Parameters
    ----------
    db_client
        Connected client exposing
        ``read_metric_df_from_db(collection_name=..., query=...)``.

    Returns
    -------
    pd.DataFrame
        Distinct rows of the best-LP metrics frame (with a ``num_workers``
        column added) joined with ``exp_config_execution_time`` on
        ``exp_config_name`` and ``run_num``.
    """
    num_workers_per_exp_config = EXP_CONFIG_NAMES[VIRNY_FLOW]
    exp_config_names = list(num_workers_per_exp_config.keys())

    # NOTE: `best_lp_metrics_per_exp_config_df` and `virny_flow_all_runtime_df` are
    # referenced by name inside the SQL query below, so these two local names must
    # not be renamed without updating the query text.
    best_lp_metrics_per_exp_config_df = get_best_lps_per_exp_config(secrets_path=SECRETS_PATH,
                                                                    exp_config_names=exp_config_names)
    best_lp_metrics_per_exp_config_df['num_workers'] = (
        best_lp_metrics_per_exp_config_df['exp_config_name'].map(num_workers_per_exp_config)
    )

    # Read the execution history of each exp config, then concatenate once
    # instead of growing a dataframe inside the loop.
    runtime_dfs = [
        db_client.read_metric_df_from_db(collection_name=EXP_CONFIG_HISTORY_TABLE,
                                         query={'exp_config_name': exp_config_name,
                                                'deletion_flag': False})
        for exp_config_name in exp_config_names
    ]
    virny_flow_all_runtime_df = pd.concat(runtime_dfs) if runtime_dfs else pd.DataFrame()

    # Lower-case column names so they match the names used in the SQL join.
    virny_flow_all_runtime_df.columns = [col.lower() for col in virny_flow_all_runtime_df.columns]

    virny_flow_metrics_df = sqldf("""
        SELECT DISTINCT t1.*, t2.exp_config_execution_time
        FROM best_lp_metrics_per_exp_config_df AS t1
        JOIN virny_flow_all_runtime_df AS t2
          ON t1.exp_config_name = t2.exp_config_name
         AND t1.run_num = t2.run_num
    """).to_df()

    return virny_flow_metrics_df


def get_system_metrics(db_client, system_name: str):
    """
    Load the metrics of every exp config of a baseline system and pivot the
    subgroup values into columns.

    Parameters
    ----------
    db_client
        Connected client exposing
        ``read_metric_df_from_db(collection_name=..., query=...)``.
    system_name : str
        Key of ``EXP_CONFIG_NAMES``; also used as the ``system_name`` filter
        of the DB query.

    Returns
    -------
    pd.DataFrame
        One row per combination of the non-subgroup columns, with one column per
        distinct ``subgroup`` value holding the matching ``metric_value``, plus a
        ``num_workers`` column. Empty if the system has no exp configs or no rows.
    """
    num_workers_per_exp_config = EXP_CONFIG_NAMES[system_name]

    # Collect all pivoted frames and concatenate once at the end instead of
    # growing dataframes inside the loops.
    pivoted_metrics_dfs = []
    for exp_config_name in num_workers_per_exp_config:
        # Only records tagged 'OK' for this system and exp config are read.
        system_runtime_df = db_client.read_metric_df_from_db(collection_name=ALL_EXPERIMENT_METRICS_TABLE,
                                                             query={'exp_config_name': exp_config_name,
                                                                    'system_name': system_name,
                                                                    'tag': 'OK'})

        # Lower-case column names so the lookups below are case-independent.
        system_runtime_df.columns = [col.lower() for col in system_runtime_df.columns]
        system_runtime_df['num_workers'] = system_runtime_df['exp_config_name'].map(num_workers_per_exp_config)

        # Every column except the pivoted pair identifies a row of the result.
        index_cols = [col for col in system_runtime_df.columns
                      if col not in ('subgroup', 'metric_value')]

        # Create columns based on values in the subgroup column, one run at a time.
        for run_num in system_runtime_df['run_num'].unique():
            subset_df = system_runtime_df[system_runtime_df['run_num'] == run_num]
            pivoted_metrics_dfs.append(
                subset_df.pivot(columns='subgroup', values='metric_value', index=index_cols).reset_index()
            )

    return pd.concat(pivoted_metrics_dfs) if pivoted_metrics_dfs else pd.DataFrame()

In [7]:
# Best-LP metrics joined with execution times for all VirnyFlow exp configs (queries the DB).
virny_flow_metrics_df = get_virny_flow_metrics(db_client)

Extracting metrics for scalability_exp_heart_w1_vf...
best_pps_per_lp_and_run_num_df.shape: (216, 15)
best_lp_per_run_all.shape: (171, 15)
Extracted metrics for scalability_exp_heart_w1_vf

Extracting metrics for scalability_exp_heart_w2_vf...
best_pps_per_lp_and_run_num_df.shape: (171, 15)
best_lp_per_run_all.shape: (135, 15)
Extracted metrics for scalability_exp_heart_w2_vf

Extracting metrics for scalability_exp_heart_w4_vf...
best_pps_per_lp_and_run_num_df.shape: (153, 15)
best_lp_per_run_all.shape: (108, 15)
Extracted metrics for scalability_exp_heart_w4_vf

Extracting metrics for scalability_exp_heart_w8_vf...
best_pps_per_lp_and_run_num_df.shape: (207, 15)
best_lp_per_run_all.shape: (153, 15)
Extracted metrics for scalability_exp_heart_w8_vf

Extracting metrics for scalability_exp_heart_w16_vf...
best_pps_per_lp_and_run_num_df.shape: (297, 15)
best_lp_per_run_all.shape: (198, 15)
Extracted metrics for scalability_exp_heart_w16_vf

Extracting metrics for scalability_exp_heart_w32

In [8]:
# Pivoted per-run metrics for all Alpine Meadow exp configs (queries the DB).
alpine_meadow_metrics_df = get_system_metrics(db_client, system_name=ALPINE)

In [9]:
# Pivoted per-run metrics for all auto-sklearn exp configs (queries the DB).
autosklearn_metrics_df = get_system_metrics(db_client, system_name=AUTOSKLEARN)

## Display Results

In [10]:
# Build the per-system result tables for the Equalized_Odds_TNR metric and the GROUP attribute.
# The system identifiers come from the config constants so that they stay in sync with
# the keys of EXP_CONFIG_NAMES.
# NOTE(review): display_table_with_results_heart presumably also renders the table;
# its return value is the frame consumed by the cells below.
virny_flow_final_metrics_df = display_table_with_results_heart(virny_flow_metrics_df, VIRNY_FLOW, 'Equalized_Odds_TNR', GROUP)
alpine_meadow_final_metrics_df = display_table_with_results_heart(alpine_meadow_metrics_df, ALPINE, 'Equalized_Odds_TNR', GROUP)
autosklearn_final_metrics_df = display_table_with_results_heart(autosklearn_metrics_df, AUTOSKLEARN, 'Equalized_Odds_TNR', GROUP)

In [11]:
# Split the VirnyFlow results into the 1-worker baseline and the multi-worker runs,
# then stack the multi-worker runs with the results of the other systems.
# `virny_flow_final_metrics_df` is deliberately NOT overwritten: reassigning it to its
# filtered self would make this cell non-idempotent (a second execution would leave
# `one_worker_metrics_df` empty).
one_worker_metrics_df = virny_flow_final_metrics_df[virny_flow_final_metrics_df['num_workers'] == 1]
virny_flow_multi_worker_metrics_df = virny_flow_final_metrics_df[virny_flow_final_metrics_df['num_workers'] != 1]
final_metrics_df = pd.concat([virny_flow_multi_worker_metrics_df, alpine_meadow_final_metrics_df, autosklearn_final_metrics_df])

In [12]:
# Speed-up plot against the 1-worker VirnyFlow baseline; the dataset name comes
# from the config constant instead of a repeated literal.
create_speedup_plot(one_worker_metrics_df, final_metrics_df, dataset=DATASET_NAME)

In [13]:
# Performance plot for the F1 metric over the combined multi-system results.
create_performance_plot_v2(final_metrics_df, metric_name="F1")

In [14]:
# Performance plot for the Equalized_Odds_TNR metric over the combined multi-system results.
create_performance_plot_v2(final_metrics_df, metric_name="Equalized_Odds_TNR")