In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import itertools
import tqdm

import results_analysis_utils
import setup_stability_score

In [None]:
# Function for computing all 4 evaluation measures described in the paper

def get_evaluation_measure(transferability_scores, actual_performances, transferability_metric, target_dataset, sources_id):
  """Builds one result row per evaluation measure for a single experiment.

  Three correlation scores ('w-kendall', 'kendall', 'pearson') come from
  results_analysis_utils.get_correlation_scores and the 'rel' score comes from
  results_analysis_utils.relative_top_accuracy. Both helpers receive the actual
  performances first and the transferability scores second.

  Args:
    transferability_scores: Scores given by the transferability metric, one per
      source.
    actual_performances: Observed performances, in the same source order.
    transferability_metric: Identifier of the metric, copied into every row.
    target_dataset: Identifier of the target dataset, copied into every row.
    sources_id: Identifier of the source set, copied into every row.

  Returns:
    A list of four rows, ordered 'w-kendall', 'kendall', 'pearson', 'rel', each
    of the form
    [transferability_metric, target_dataset, score, measure_name, sources_id]
    with the score converted to a Python float.
  """
  named_scores = []
  for correlation_name in ('w-kendall', 'kendall', 'pearson'):
    correlation = results_analysis_utils.get_correlation_scores(
        actual_performances, transferability_scores, correlation_name)
    named_scores.append((correlation_name, correlation))

  named_scores.append(
      ('rel', results_analysis_utils.relative_top_accuracy(
          actual_performances, transferability_scores)))

  # The float() conversion happens only once every measure has been computed.
  return [
      [transferability_metric, target_dataset, float(score), measure_name, sources_id]
      for measure_name, score in named_scores
  ]

In [None]:
# Function for generating and evaluating experiments by subsampling over available resources

def evaluate_subsampled_experiments(data_df: pd.DataFrame, sampling_ratio: float, scenario: str, frozen_backbone: bool=False) -> pd.DataFrame:
  """Evaluate subsampled experiments keeping sampling_ratio percentage of elements.

  Every combination of int(sampling_ratio * N) sources out of the N unique
  sources is enumerated. For each transferability metric, target dataset and
  source combination, the evaluation measures returned by
  get_evaluation_measure are collected.

  Args:
    data_df: Results table. The columns read here are 'Transferability Metric',
      'Target Dataset', 'Transferability Metric Score', the sampled column
      ('Source Architecture' or 'Source Dataset'), the performance column
      ('Test Accuracy' or 'Test Mean IoU') and, for classification only,
      'Trainable Backbone'.
    sampling_ratio: Fraction of the unique sources kept in each combination.
    scenario: Either 'classification' or 'semantic segmentation'.
    frozen_backbone: Value matched against 'Trainable Backbone'; only used in
      the 'classification' scenario.

  Returns:
    A DataFrame with columns 'Transferability Metric', 'Target Dataset',
    'Evaluation Measure Score', 'Evaluation Measure' and 'Sources ID', where
    'Sources ID' is the index of the source combination.

  Raises:
    AssertionError: If scenario is not one of the two supported values.
    IndexError: If a sampled source has no row for a (metric, target) pair.
  """

  assert scenario in ['classification', 'semantic segmentation']
  sampling_what = 'Source Architecture' if scenario == 'classification' else 'Source Dataset'
  actual_performance = 'Test Accuracy' if scenario == 'classification' else 'Test Mean IoU'

  if scenario == 'classification':  # Filter based on backbone choice
    # NOTE(review): this keeps rows whose 'Trainable Backbone' equals
    # frozen_backbone, so frozen_backbone=False keeps Trainable Backbone == False.
    # If that column is True for fine-tuned backbones the flag reads inverted;
    # confirm against the CSV before relying on the name.
    data_df = data_df[data_df['Trainable Backbone'] == frozen_backbone]

  source_pool = data_df[sampling_what].unique()
  # int() truncates, so the combination size is rounded down.
  num_sources2sample = int(sampling_ratio * len(source_pool))
  source_combinations = list(itertools.combinations(source_pool, num_sources2sample))

  transferability_metrics = data_df['Transferability Metric'].unique()
  target_datasets = data_df['Target Dataset'].unique()
  evaluation_results = []

  for transf_metric in transferability_metrics:
    data_df_metric = data_df[data_df['Transferability Metric'] == transf_metric]
    for target_dataset in target_datasets:
      data_df_metric_target = data_df_metric[data_df_metric['Target Dataset'] == target_dataset]

      # The per-source values do not depend on the combination, so resolve them
      # once here instead of re-filtering the DataFrame for every source of every
      # combination. Keeping the first row per source matches `.values[0]`.
      first_rows = data_df_metric_target.drop_duplicates(subset=sampling_what, keep='first')
      sources = first_rows[sampling_what].values
      score_by_source = dict(zip(sources, first_rows['Transferability Metric Score'].values))
      performance_by_source = dict(zip(sources, first_rows[actual_performance].values))

      for i, source_set in enumerate(
          tqdm.tqdm(source_combinations,
                    desc=f'Calculating evaluation measures for {transf_metric} applied to the target dataset {target_dataset}.')):
        # If present, filter case when source == target
        source_set = [s for s in source_set if s != target_dataset]
        try:
          # Get transferability scores for all sources in the source set
          transferability_scores = [score_by_source[source] for source in source_set]
          # Get actual performance for all sources in the source set
          actual_performances = [performance_by_source[source] for source in source_set]
        except KeyError as err:
          # Same exception type as the former `.values[0]` lookup, but naming
          # the missing entry.
          raise IndexError(
              f'No row for source {err.args[0]!r} with metric {transf_metric!r} '
              f'and target dataset {target_dataset!r}.') from err
        # Get evaluation measures for a particular target, metric and source set
        evaluation_measures = get_evaluation_measure(transferability_scores, actual_performances, transf_metric, target_dataset, i)
        evaluation_results.extend(evaluation_measures)

  evaluation_results_df = pd.DataFrame(
      evaluation_results,
      columns=['Transferability Metric', 'Target Dataset', 'Evaluation Measure Score', 'Evaluation Measure', 'Sources ID'])

  return evaluation_results_df

In [None]:
# Maps the identifiers used in the result tables to display names for plotting.
# Values containing LaTeX are raw strings so backslashes reach matplotlib as-is.

naming_conversion = {
    # Transferability metrics
    'leep': 'LEEP',
    'nleep': r'$\mathcal{N}$LEEP',
    'gbc': 'GBC',
    'hscore': 'H-score',
    'logme': 'LogME',
    # Datasets
    'cifar10': 'CIFAR10',
    'cifar100': 'CIFAR100',
    'imagenette': 'Imagenette',
    'oxford_iiit_pet': 'Oxford Pets',
    'caltech_birds2011': 'Caltech Birds',
    'stanford_dogs': 'Stanford Dogs',
    'oxford_flowers102': 'Oxford Flowers',
    'sun397': 'SUN',
    'dtd': 'DTD',
    # Evaluation measures
    'w-kendall': r'$\tau_w$',
    'kendall': r'$\tau$',
    'pearson': r'$\rho$',
    'rel': r'$Rel@1$',
    # Datasets
    'pcontext': 'Pascal Context',
    'ade': 'ADE20K',
    'coco': 'COCO',
    'kitti': 'KITTI',
    'city': 'CityScapes',
    'idd': 'IDD',
    'bdd': 'BDD',
    'mapillary': 'MVD',
    'isprs': 'ISPRS',
    'isaid': 'iSAID',
    'sunrgbd': 'SUN RGB-D',
    'scannet': 'ScanNet',
    'suim': 'SUIM',
    'vkitti2': 'vKITTI2',
    'vgallery': 'vGallery',
    'camvid': 'CamVid',
    'pvoc': 'Pascal VOC'
}

In [None]:
def compare_metrics_pairs(data_df: pd.DataFrame, evaluation_measure: str='w-kendall'):
  """Qualitative analysis to compare metric pairs, method 4.2 in the paper.

  Draws one scatter panel per unordered pair of transferability metrics. Within
  a panel, the x and y coordinates are the 'Evaluation Measure Score' values of
  the two metrics, with one colour per target dataset and a dashed red diagonal
  spanning the range of the x values.

  The grid has five columns and as many rows as the number of pairs requires
  (two rows for ten pairs). The legend is attached to the last panel drawn and
  unused panels are hidden.

  Args:
    data_df: Evaluation results with columns 'Evaluation Measure',
      'Transferability Metric', 'Target Dataset' and 'Evaluation Measure Score'.
      The rows of the two metrics of a pair are paired positionally, so they are
      assumed to be aligned and equally long (TODO confirm for new inputs).
    evaluation_measure: Value of 'Evaluation Measure' to plot.
  """

  # Filter chosen evaluation measure for plotting
  data_df = data_df[data_df['Evaluation Measure'] == evaluation_measure]

  # Extract color map for plotting
  cmap = plt.cm.tab20
  colors = [cmap(i) for i in range(0, cmap.N, 2)][::-1] + [cmap(i) for i in range(1, cmap.N, 2)][::-1]

  transferability_metrics = data_df['Transferability Metric'].unique()
  target_datasets = data_df['Target Dataset'].unique()
  transf_metrics_pairs = list(itertools.combinations(transferability_metrics, 2))

  # Size the grid from the number of pairs instead of assuming exactly ten.
  num_cols = 5
  num_pairs = len(transf_metrics_pairs)
  num_rows = max(1, -(-num_pairs // num_cols))  # Ceiling division
  # squeeze=False keeps `ax` two-dimensional even for a single row.
  _, ax = plt.subplots(num_rows, num_cols, figsize=(15, 2.5 * num_rows), squeeze=False)
  plt.subplots_adjust(wspace=0.6, hspace=0.5)

  for i, (metric_a, metric_b) in enumerate(transf_metrics_pairs):
    # Loop over all possible transferability metric pairs
    panel = ax[i // num_cols][i % num_cols]
    panel.set_xlabel(naming_conversion[metric_a], fontsize=15)
    panel.set_ylabel(naming_conversion[metric_b], fontsize=15)
    for k, target in enumerate(target_datasets):
      # Plot scatterplot of specific target datasets
      target_data = data_df[data_df['Target Dataset'] == target]
      x = target_data[target_data['Transferability Metric'] == metric_a]['Evaluation Measure Score'].values
      y = target_data[target_data['Transferability Metric'] == metric_b]['Evaluation Measure Score'].values
      # `color=` (not `c=`) so an RGBA tuple is never mistaken for four values
      # to colour-map; the palette cycles when there are more targets than colours.
      panel.scatter(x, y, s=0.3, color=colors[k % len(colors)])
    # Scores over all targets define the diagonal and the y-axis limits.
    x = data_df[data_df['Transferability Metric'] == metric_a]['Evaluation Measure Score'].values
    y = data_df[data_df['Transferability Metric'] == metric_b]['Evaluation Measure Score'].values
    if i == num_pairs - 1:  # Last plot
      # Added before the diagonal is drawn so the labels map onto the scatters.
      panel.legend([naming_conversion[t] for t in target_datasets], bbox_to_anchor=(0., -0.4), markerscale=14, fontsize=14, ncol=5)
    panel.plot([min(x), max(x)], [min(x), max(x)], color='r', linestyle='dashed', linewidth=1.5)
    panel.set_ylim([min(y), max(y)])

  # Hide the panels that received no metric pair.
  for unused in range(num_pairs, num_rows * num_cols):
    ax[unused // num_cols][unused % num_cols].set_visible(False)

  plt.show()

In [None]:
# Result tables (CSV files)

classification_path = 'source_model_architectures_table.csv'
semantic_seg_path = 'source_datasets_table.csv'

def load_results(filename):
  """Reads a results CSV into a DataFrame, using its first column as the index."""
  return pd.read_csv(filename, index_col=0)

In [None]:
# Read the semantic segmentation results table

segmentation_results_df = load_results(filename=semantic_seg_path)

# Evaluate every source combination that keeps 90% of the sources.
# frozen_backbone is only consulted in the 'classification' scenario.

sampled_segmentation_results_df = evaluate_subsampled_experiments(
    data_df=segmentation_results_df,
    sampling_ratio=0.9,
    scenario='semantic segmentation',
    frozen_backbone=False,
)

In [None]:
# Qualitative analysis: scatter plots comparing each pair of transferability metrics

evaluation_measure = 'w-kendall'
compare_metrics_pairs(
    data_df=sampled_segmentation_results_df,
    evaluation_measure=evaluation_measure)

In [None]:
# Quantitative analysis - Setup Stability Score, letting one component vary at a
# time while the remaining components are held fixed

components = ['Target Dataset', 'Sources ID', 'Evaluation Measure']
ss_score = {}

for varying in components:
  fixing = [component for component in components if component != varying]
  ss_score[varying] = setup_stability_score.get_setup_stability_score(
      sampled_segmentation_results_df, varying=varying, fixing=fixing)

ss_score