# Semantic Evaluation

### Load Data

You must specify:

- Which models to evaluate
- Which test set to evaluate on

In [1]:
%store -r probs_0
%store -r probs_16

models = {'w2v': probs_16.argmax(axis=1),
          '1hot': probs_0.argmax(axis=1)
}

img_loader_str = '5_1-train_100.p'

import pickle

img_loader_color = pickle.load(open('pickle_jar/{}'.format(img_loader_str), 'rb'))

labels = img_loader_color.test_labels.argmax(axis=1)

evaluation_metrics = ['aff_wordnet_path',
                      'aff_wordnet_wup',
                      'aff_wordnet_zhao',
                      'aff_gist_100',
                      'aff_gist_100_clean',
                      'aff_gist_1260',
                      'aff_gist_1260_clean',
                      'aff_w2v_1.0.p',
                      'aff_w2v_0.5.p',
                      'aff_w2v_0.25.p']

### Construct Evaluation Matrix

In [2]:
df = pd.DataFrame(np.zeros([len(models), len(evaluation_metrics)]), index=models, columns=evaluation_metrics)

df

Unnamed: 0,aff_wordnet_path,aff_wordnet_wup,aff_wordnet_zhao,aff_gist_100,aff_gist_100_clean,aff_gist_1260,aff_gist_1260_clean,aff_w2v_1.0.p,aff_w2v_0.5.p,aff_w2v_0.25.p
w2v,0,0,0,0,0,0,0,0,0,0
1hot,0,0,0,0,0,0,0,0,0,0


### Fill in Evaluation Matrix

In [3]:
from soft_labels import get_soft_labels_from_file

def evaluate(aff_mat, preds):
    """Compute the soft accuracy for predictions on the affinity matrix provided"""
    
    return np.mean([aff_mat[label][pred] for pred, label in zip(preds, labels)])

def evaluate_all(models, evaluation_metrics):
    """Evaluate each model on all the evaluation metrics
    
    Parameters
    ----------
    models : dict from model name to predictions
    evaluation_metrics : list of evaluation metric to evaluate on
    
    - Model names are irrelevant
    - Evaluation metrics *must* be the name of an affinity matrix in data_files
    - Hardcode in class set 5_1 for now
    
    """
    for model_name, preds in models.items():
        for evaluation_metric in evaluation_metrics:
            # Load evaluation matrix
            #
            if evaluation_metric.endswith('.p'):
                aff_mat = pickle.load(open('data_files/5_1/{}'.format(evaluation_metric, 'rb')))
            else:
                aff_mat = get_soft_labels_from_file('data_files/5_1/{}'.format(evaluation_metric))

            df.ix[model_name, evaluation_metric] = evaluate(aff_mat, preds)
            
    return df

### Evaluate Models

In [4]:
evaluate_all(models, evaluation_metrics)

Unnamed: 0,aff_wordnet_path,aff_wordnet_wup,aff_wordnet_zhao,aff_gist_100,aff_gist_100_clean,aff_gist_1260,aff_gist_1260_clean,aff_w2v_1.0.p,aff_w2v_0.5.p,aff_w2v_0.25.p
w2v,0.624362,0.755722,0.742397,0.98354,0.70297,0.984362,0.708035,0.741076,0.665538,0.627769
1hot,0.622404,0.74834,0.735746,0.983558,0.702357,0.984631,0.711471,0.687699,0.638849,0.614425


### Only Examples Which Were Missclassified

In [5]:
missed_models = {model_name: preds[preds != labels] for model_name, preds in models.items()}

df_missed = df.copy()

evaluate_all(missed_models, evaluation_metrics)

Unnamed: 0,aff_wordnet_path,aff_wordnet_wup,aff_wordnet_zhao,aff_gist_100,aff_gist_100_clean,aff_gist_1260,aff_gist_1260_clean,aff_w2v_1.0.p,aff_w2v_0.5.p,aff_w2v_0.25.p
w2v,0.2995,0.59544,0.550523,0.952458,0.38476,0.953228,0.38395,0.5069,0.375401,0.309652
1hot,0.361664,0.566278,0.533101,0.955525,0.432775,0.958236,0.438576,0.460857,0.388965,0.353019
