In [26]:
import json
import pandas as pd
import numpy as np
from scipy.stats import pearsonr, spearmanr
import codecs

In [53]:
# Directory the human-annotated `*-eval.json` files are read from.
data_path = "data/evaluation_data"
# Directory the metric's `*_test.txt.score` files are read from.
eval_path = "checkpoints/run_123456"

## DailyDialog-Eval

In [54]:
# Human judgements: load the annotation file with a context manager so the
# handle is closed deterministically, and decode explicitly as UTF-8.
with open('{}/dailydialog-eval.json'.format(data_path), 'r', encoding='utf-8') as eval_file:
    dailydialog_zhao_eval = json.load(eval_file)
# One value per item: the mean of its 'appropriateness' annotations.
appropriateness_scores = [np.mean(item['annotations']['appropriateness']) for item in dailydialog_zhao_eval]

# Metric output: tab-separated, no header row; column index 2 is used as the score.
machine_scores = pd.read_csv("{}/dailydialog_test.txt.score".format(eval_path), sep='\t', header=None)
original_scores = list(machine_scores[2])

# Fail early with a readable message instead of an opaque scipy error.
# NOTE(review): assumes score-file rows are in the same order as the JSON items - confirm.
if len(appropriateness_scores) != len(original_scores):
    raise ValueError("dailydialog: {} human scores but {} machine scores".format(
        len(appropriateness_scores), len(original_scores)))

# Index [0] keeps the correlation coefficient and drops the p-value.
dailydialog_pearson = pearsonr(appropriateness_scores, original_scores)[0]
dailydialog_spearman = spearmanr(appropriateness_scores, original_scores)[0]

## Persona-Eval

In [55]:
# Human judgements: load the annotation file with a context manager so the
# handle is closed deterministically, and decode explicitly as UTF-8.
with open('{}/persona-eval.json'.format(data_path), 'r', encoding='utf-8') as eval_file:
    persona_zhao_eval = json.load(eval_file)
# One value per item: the mean of its 'appropriateness' annotations.
appropriateness_scores = [np.mean(item['annotations']['appropriateness']) for item in persona_zhao_eval]

# Metric output: tab-separated, no header row; column index 2 is used as the score.
machine_scores = pd.read_csv("{}/persona_test.txt.score".format(eval_path), sep='\t', header=None)
original_scores = list(machine_scores[2])

# Fail early with a readable message instead of an opaque scipy error.
# NOTE(review): assumes score-file rows are in the same order as the JSON items - confirm.
if len(appropriateness_scores) != len(original_scores):
    raise ValueError("persona: {} human scores but {} machine scores".format(
        len(appropriateness_scores), len(original_scores)))

# Index [0] keeps the correlation coefficient and drops the p-value.
persona_pearson = pearsonr(appropriateness_scores, original_scores)[0]
persona_spearman = spearmanr(appropriateness_scores, original_scores)[0]

## Empathetic-Eval

In [56]:
# Metric output: tab-separated, no header row; column index 2 is used as the score.
machine_scores = pd.read_csv("{}/empathetic_test.txt.score".format(eval_path), sep='\t', header=None)

# Human judgements: load the annotation file with a context manager so the
# handle is closed deterministically, and decode explicitly as UTF-8.
with open('{}/empathetic-eval.json'.format(data_path), 'r', encoding='utf-8') as eval_file:
    empathetic_grade_eval = json.load(eval_file)
# One value per item: the mean of its 'relevance' annotations.
relevance_scores = [np.mean(item['annotations']['relevance']) for item in empathetic_grade_eval]
original_scores = list(machine_scores[2])

# Fail early with a readable message instead of an opaque scipy error.
# NOTE(review): assumes score-file rows are in the same order as the JSON items - confirm.
if len(relevance_scores) != len(original_scores):
    raise ValueError("empathetic: {} human scores but {} machine scores".format(
        len(relevance_scores), len(original_scores)))

# Index [0] keeps the correlation coefficient and drops the p-value.
empathetic_pearson = pearsonr(relevance_scores, original_scores)[0]
empathetic_spearman = spearmanr(relevance_scores, original_scores)[0]

## Movie-Eval

In [57]:
# Human judgements: load the annotation file with a context manager so the
# handle is closed deterministically, and decode explicitly as UTF-8.
with open('{}/movie-eval.json'.format(data_path), 'r', encoding='utf-8') as eval_file:
    humod_eval = json.load(eval_file)
# One value per item: the mean of its 'relevance' annotations.
relevance_scores = [np.mean(item['annotations']['relevance']) for item in humod_eval]

# Metric output: tab-separated, no header row; column index 2 is used as the score.
machine_scores = pd.read_csv("{}/movie_test.txt.score".format(eval_path), sep='\t', header=None)
original_scores = list(machine_scores[2])

# Fail early with a readable message instead of an opaque scipy error.
# NOTE(review): assumes score-file rows are in the same order as the JSON items - confirm.
if len(relevance_scores) != len(original_scores):
    raise ValueError("movie: {} human scores but {} machine scores".format(
        len(relevance_scores), len(original_scores)))

# Index [0] keeps the correlation coefficient and drops the p-value.
movie_pearson = pearsonr(relevance_scores, original_scores)[0]
movie_spearman = spearmanr(relevance_scores, original_scores)[0]

## Topical-Eval

In [58]:
# Human judgements: load the annotation file with a context manager so the
# handle is closed deterministically, and decode explicitly as UTF-8.
with open('{}/topical-eval.json'.format(data_path), 'r', encoding='utf-8') as eval_file:
    topical_usr_eval = json.load(eval_file)
# One value per item: the mean of its 'Overall' annotations
# (this file uses a capitalised key, unlike the twitter file's 'overall').
overall_scores = [np.mean(item['annotations']['Overall']) for item in topical_usr_eval]

# Metric output: tab-separated, no header row; column index 2 is used as the score.
machine_scores = pd.read_csv("{}/topical_test.txt.score".format(eval_path), sep='\t', header=None)
original_scores = list(machine_scores[2])

# Fail early with a readable message instead of an opaque scipy error.
# NOTE(review): assumes score-file rows are in the same order as the JSON items - confirm.
if len(overall_scores) != len(original_scores):
    raise ValueError("topical: {} human scores but {} machine scores".format(
        len(overall_scores), len(original_scores)))

# Index [0] keeps the correlation coefficient and drops the p-value.
topical_pearson = pearsonr(overall_scores, original_scores)[0]
topical_spearman = spearmanr(overall_scores, original_scores)[0]

## Twitter-Eval

In [59]:
# Human judgements: load the annotation file with a context manager so the
# handle is closed deterministically, and decode explicitly as UTF-8.
with open('{}/twitter-eval.json'.format(data_path), 'r', encoding='utf-8') as eval_file:
    dstc6_eval = json.load(eval_file)
# One value per item: the mean of its 'overall' annotations
# (lower-case key here, unlike the topical file's 'Overall').
overall_scores = [np.mean(item['annotations']['overall']) for item in dstc6_eval]

# Metric output: tab-separated, no header row; column index 2 is used as the score.
machine_scores = pd.read_csv("{}/twitter_test.txt.score".format(eval_path), sep='\t', header=None)
original_scores = list(machine_scores[2])

# Fail early with a readable message instead of an opaque scipy error.
# NOTE(review): assumes score-file rows are in the same order as the JSON items - confirm.
if len(overall_scores) != len(original_scores):
    raise ValueError("twitter: {} human scores but {} machine scores".format(
        len(overall_scores), len(original_scores)))

# Index [0] keeps the correlation coefficient and drops the p-value.
twitter_pearson = pearsonr(overall_scores, original_scores)[0]
twitter_spearman = spearmanr(overall_scores, original_scores)[0]

## Display Results

In [60]:
# One row per dataset: (label, pearson, spearman), in the order they are printed.
# Driving both sections from a single table keeps the labels from diverging;
# the previous copy-pasted prints spelled the persona label "pearsona-eval".
correlation_table = [
    ("dailydialog-eval", dailydialog_pearson, dailydialog_spearman),
    ("persona-eval", persona_pearson, persona_spearman),
    ("topical-eval", topical_pearson, topical_spearman),
    ("movie-eval", movie_pearson, movie_spearman),
    ("empathetic-eval", empathetic_pearson, empathetic_spearman),
    ("twitter-eval", twitter_pearson, twitter_spearman),
]

# Each section prints its heading, one line per dataset, then the unweighted
# mean of that section's six coefficients.
for heading, column in (("Pearson Correlation-------------------", 1),
                        ("Spearman Correlation-------------------", 2)):
    print(heading)
    for row in correlation_table:
        print("{}: {}".format(row[0], row[column]))
    print("average: {}".format(np.mean([row[column] for row in correlation_table])))

Pearson Correlation-------------------
dailydialog-eval: 0.4784728726938251
pearsona-eval: 0.5367195606372421
topical-eval: 0.45747352246578243
movie-eval: 0.47885848893428434
empathetic-eval: 0.44757469869600314
twitter-eval: 0.22713030880960522
average: 0.4377049087061238
Spearman Correlation-------------------
dailydialog-eval: 0.571580485869982
pearsona-eval: 0.6226206814690305
topical-eval: 0.5108527999992152
movie-eval: 0.5348361880117385
empathetic-eval: 0.3837658913492567
twitter-eval: 0.24708022044924793
average: 0.47845604452474505
