# NLG Evaluations

Erik McGuire

CSC594-810, Winter 19-20

### Preliminaries

In [None]:
import pandas as pd
import numpy as np
import statistics as st
from string import punctuation as punk
from ipywidgets import *
import nltk
from nltk.tokenize import word_tokenize

* Import [NLG-eval](https://arxiv.org/abs/1706.09799) for its suite of automated metrics: BLEU, ROUGE, METEOR, CIDEr, Skip Thought, GloVe:

In [None]:
!git clone https://github.com/Maluuba/nlg-eval.git
%cd nlg-eval
!pip install .

In [None]:
%cd drive/My Drive/csc594-ADL

In [None]:
# NLGEval functions run from here, looking at below setup path as relative data_path.%cd ../drive/'My Drive'/csc594-ADL
!nlg-eval --setup "nlgeval"

In [None]:
from nlgeval import NLGEval
from nlgeval import compute_metrics, compute_individual_metrics

In [None]:
!pip install vaderSentiment

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

In [None]:
nltk.download('punkt')

In [None]:
!pip install spacy-readability
from spacy_readability import Readability

#### Imports and installations for entity coreference

In [None]:
!pip install stanza==1.0.0
import stanza

In [None]:
# spaCy plus pprint (pprint is used by run_corefs to print the coreference tables).
import spacy, pprint
# NOTE(review): this downloads the medium model (en_core_web_md) while the next
# line imports the small one (en_core_web_sm) — presumably the runtime already
# ships en_core_web_sm; confirm, or download the model that is actually imported.
!python -m spacy download en_core_web_md
import en_core_web_sm

In [None]:
# Import client module
from stanza.server import CoreNLPClient

In [None]:
%cd drive/My Drive/csc594-ADL

In [None]:
!echo "Downloading CoreNLP..."
# !wget "http://nlp.stanford.edu/software/stanford-corenlp-full-2018-10-05.zip" -O corenlp.zip
!unzip corenlp.zip
!mv ./stanford-corenlp-full-2018-10-05 ./corenlp
import os
os.environ["CORENLP_HOME"] = "./corenlp"


In [None]:
# Construct a CoreNLPClient with some basic annotators, a memory allocation of 4GB, and port number 9001
client = CoreNLPClient(annotators=['coref', 'tokenize','ssplit', 'pos', 'lemma', 'ner'], memory='4G', endpoint='http://localhost:9001')
print(client)

# Start the background server and wait for some time
# Note that in practice this is totally optional, as by default the server will be started when the first annotation is performed
client.start()
import time; time.sleep(10)

In [None]:
client.stop()

### Select model for evaluations

In [None]:
def model_c(model: str) -> tuple:
    """Choose model, load story data.

    Args:
        model: Model identifier used to build the endings file names
            (``endings/{model}_corr_ends.txt`` and
            ``endings/{model}_gen_ends.txt``).

    Returns:
        A 5-tuple ``(stories_split, model, stories, corr_ends, gendings)``:
        the rows of ``datasets/story_bodies.txt`` as lists of comma-separated
        fields, the model identifier, each row's fields concatenated into one
        string, the lines of the correct-endings file and the lines of the
        generated-endings file (lines keep their trailing newline).
    """
    print(f"Selected model: {model}")
    gold_refs = f'endings/{model}_corr_ends.txt'
    story_refs = 'datasets/story_bodies.txt'
    hypotheses = f'endings/{model}_gen_ends.txt'

    # Each row is one story; the comma separator splits it into its fields.
    stories_split = pd.read_csv(story_refs, sep=",", header=None).values.tolist()
    stories = ["".join(story) for story in stories_split]
    with open(gold_refs) as gold_file:
        corr_ends = gold_file.readlines()

    with open(hypotheses) as endings_file:
        gendings = endings_file.readlines()
    return stories_split, model, stories, corr_ends, gendings

# Interactive selector over (display label, model id) pairs; the chosen model id
# is passed to model_c as `model` (presumably rendered as a dropdown by ipywidgets).
model_chooser = interactive(model_c, model=[('Base', 'gpt2'),
                                            ('Base to SCT', 'b_SCT'), 
                                            ('ConceptNet to SCT', 'cn_SCT_new'), #
                                            ('ConceptNet to Sentiment', 'cn_sentiment'), #
                                            ('CN to SCT to Sentiment', 'cn_SCT_sentiment'),
                                            ('CN to Sentiment to SCT', 'cn_sentiment_SCT'),
                                            ('ROC to SCT to Sentiment', 'roc1617_SCT_sentiment'),
                                            ('ROC to Sentiment to SCT', 'roc1617_sentiment_SCT'),
                                            ('ROC to SCT', 'roc1617_SCT'),
                                            ('ROC to Sentiment', 'roc1617_sentiment')])
# Show the widget; later cells read model_c's return value from model_chooser.result.
display(model_chooser)

In [None]:
#Correct mishap with newlines being inserted between ending segments, throwing off row count.
gen_stories = f'stories/seg_results_cn_SCT_new.txt'
gdf = pd.read_csv(gen_stories, sep='\t')

In [None]:
gdf.dropna(axis=0, inplace=True, thresh=2)
gdf.fillna(axis=0, inplace=True, value="_none_")


In [None]:
# Unpack model_c's 5-tuple; the endings read from the *_gen_ends.txt file are kept
# only as gendings_alt because the endings actually used come from gdf below.
stories_split, fname, stories, corr_ends, gendings_alt = model_chooser.result 

# Use the GenEnding column of the segmented-results frame as the generated endings.
gendings = gdf.GenEnding.tolist()
for ix, gending in enumerate(gendings):
    # An ending that is blank or merely copies the correct ending is replaced by the
    # '_none_' placeholder.
    # NOTE(review): corr_ends[ix] assumes gdf has no more rows than corr_ends — confirm.
    if gending.strip() == corr_ends[ix].strip() or len(gending.strip()) == 0:
        gendings[ix] = '_none_'

# Print the first two stories (one sentence per line) with both endings as a sanity check.
for i in range(2):
    print("\nStory body sample:\n", ".\n ".join(stories[i].split(". ")), 
            "\n\nCorrect ending: ", corr_ends[i], 
          "\nGenerated ending: ", gendings[i])

### Entity Coreference
* Roemmele \[[PDF](https://roemmele.github.io/publications/fiction_generation.pdf)]

* Analyze entity coherence between story prompts, generated endings. 
* Metric code adapted from Melissa Roemmele \[[GitHub](https://github.com/roemmele/narrative-prediction/)]
* Stanford CoreNLP code adapted from \[[GitHub](https://github.com/stanfordnlp/stanfordnlp)]

#### Definitions adapted from Roemmele

In [None]:
def get_noun_chunk_complexity(gen_seqs):
    '''Measure how many noun chunks each generated sequence has and how long they are.

    gen_seqs is first normalised by check_seqs_format to a list of groups of
    sequences. Every sequence is parsed with the module-level `encoder`; the
    result dict holds the per-sequence chunk counts ('n_chunks'), the mean chunk
    length in tokens ('chunk_lengths', 0 when a sequence has no chunk), both
    divided by the sequence's token count ('norm_*'), and the overall means of
    those four arrays.'''
    counts_by_group = []
    lengths_by_group = []
    sizes_by_group = []  # token count per sequence, used for the normalised scores
    for group in check_seqs_format(gen_seqs):
        docs = [encoder(text) for text in group]
        chunks_per_doc = [list(doc.noun_chunks) for doc in docs]
        sizes_by_group.append([len(doc) for doc in docs])
        counts_by_group.append([len(chunks) for chunks in chunks_per_doc])
        # a sequence without any noun chunk gets a mean chunk length of 0
        lengths_by_group.append([np.mean([len(chunk) for chunk in chunks]) if chunks else 0
                                 for chunks in chunks_per_doc])

    n_chunks = np.array(counts_by_group)
    chunk_lengths = np.array(lengths_by_group)
    seq_lengths = np.array(sizes_by_group)
    norm_n_chunks = n_chunks * 1. / seq_lengths
    norm_chunk_lengths = chunk_lengths * 1. / seq_lengths

    return {
        'n_chunks': n_chunks,
        'chunk_lengths': chunk_lengths,
        'norm_n_chunks': norm_n_chunks,
        'norm_chunk_lengths': norm_chunk_lengths,
        'mean_n_chunks': np.mean(n_chunks),
        'mean_chunk_lengths': np.mean(chunk_lengths),
        'norm_mean_n_chunks': np.mean(norm_n_chunks),
        'norm_mean_chunk_lengths': np.mean(norm_chunk_lengths),
    }

def segment(seq, clauses=False):
    '''Split a text into a list of stripped sentence strings.

    seq: the text to split.
    clauses: when true, delegate to segment_into_clauses instead of splitting
        into sentences with the module-level spaCy `encoder`.
    '''
    if clauses:
        # NOTE(review): segment_into_clauses is not defined in the visible part of
        # this notebook — confirm it is available before passing clauses=True.
        return segment_into_clauses(seq)  # segment into clauses rather than just sentences
    # Span.string is deprecated (and removed in spaCy 3); .text yields the same
    # result here because the surrounding whitespace is stripped anyway.
    return [sent.text.strip() for sent in encoder(seq).sents]
    
def check_seqs_format(seqs):
    '''Normalise generated sequences to a list of lists.

    The coreference/noun-chunk functions expect multiple generated sequences per
    context sequence. A flat list/tuple of sequences is converted so that every
    sequence becomes its own one-element list; input whose first element is
    already a list/tuple is returned unchanged, as is empty input.

    Raises AssertionError when seqs is not a list or tuple.
    '''
    assert isinstance(seqs, (list, tuple)), "seqs must be a list or tuple"
    if not seqs:
        # nothing to inspect: avoid the IndexError seqs[0] would raise
        return seqs
    if not isinstance(seqs[0], (list, tuple)):
        seqs = [[seq] for seq in seqs]
    return seqs
    
def get_corefs(context_seqs, gen_seqs, verbose=False):
    '''Collect, for every generated sequence, the coreference chains that link it to its context.

    Each context is concatenated with each of its generated sequences and sent to
    the module-level CoreNLP `client` with the coref annotator. Every chain that
    has at least one mention inside the context is returned as a dict with the
    keys 'rep_mention', 'context_mentions' and 'gen_mentions', whose entries are
    (sentence number, mention text) tuples. The result is nested as
    [context][generated sequence][chain]. With verbose=True a progress line is
    printed every 500 contexts.'''
    assert(len(context_seqs) == len(gen_seqs))
    assert(type(gen_seqs) in (list, tuple) and type(context_seqs) in (list, tuple))

    grouped_gen_seqs = check_seqs_format(gen_seqs)

    all_corefs = []
    for idx, (context, candidates) in enumerate(zip(context_seqs, grouped_gen_seqs)):
        # sentences numbered above this count belong to the generated text
        context_sent_count = len(segment(context))
        per_candidate = []
        for candidate in candidates:
            chains_found = []
            parse = client.annotate(context + " " + candidate,
                                    properties={'annotators': 'coref', 'outputFormat': 'json'})
            if type(parse) is dict:
                for chain in parse['corefs'].values():
                    entity = {'rep_mention': None, 'context_mentions': [], 'gen_mentions': []}
                    for mention in chain:
                        located = (mention['sentNum'], mention['text'])
                        if mention['isRepresentativeMention']:
                            entity['rep_mention'] = located
                        if mention['sentNum'] > context_sent_count:
                            entity['gen_mentions'].append(located)
                        else:
                            entity['context_mentions'].append(located)
                    # keep only chains anchored in the context, not ones living
                    # entirely inside the generated sequence
                    if entity['context_mentions']:
                        chains_found.append(entity)
            per_candidate.append(chains_found)
        if verbose and idx % 500 == 0:
            print("processed coreferences in", idx, "sequences...")
        all_corefs.append(per_candidate)

    return all_corefs


def get_coref_counts(context_seqs, gen_seqs):
    '''Count coreferring mentions and noun-chunk entities for each generated sequence.

    Returns a dict with:
      'corefs'             - per generated sequence, the number of its mentions that belong to
                             a coreference chain returned by get_corefs (as a numpy array)
      'prev_mention_sents' - per generated sequence, the sentence number of the previous mention
                             for each of those mentions
      'ents'               - per generated sequence, the noun-chunk count from
                             get_noun_chunk_complexity, raised to at least the coref count
      'res_rates'          - corefs / ents, with NaNs replaced via np.nan_to_num
      'mean_ents', 'mean_corefs', 'mean_res_rates' - the means of the above
                             ('mean_ents' is taken before 'ents' is raised to the coref count)'''
    assert(len(context_seqs) == len(gen_seqs))
    counts = {'corefs': [], 'prev_mention_sents': []}

    corefs = get_corefs(context_seqs, gen_seqs)

    for gen_corefs in corefs:
        gen_coref_counts = []
        #gen_ent_counts = []
        gen_prev_mention_sents = []
        for seq_corefs in gen_corefs:
            # total number of generated-text mentions over all chains of this sequence
            coref_counts = sum([len(coref['gen_mentions']) for coref in seq_corefs])
            gen_coref_counts.append(coref_counts)
            prev_mentions = []
            for coref in seq_corefs:
                # find the sentence position (number) of the most recent previous mention of each coreferring entity;
                # if an entity is the first mention in the generated sequence, look for a coreference in the preceding context sequence;
                # if none found or the entity is not the first mention, the previous mention position is the number of the generated sentence itself
                # coref_prev_mentions = []
                for mention_idx, mention in enumerate(coref['gen_mentions']):
                    if mention_idx > 0:
                        prev_mentions.append(coref['gen_mentions'][mention_idx - 1][0])
                    # NOTE(review): get_corefs only keeps chains with context mentions, so this
                    # branch presumably never fires — confirm
                    elif not coref['context_mentions']:
                        prev_mentions.append(mention[0])
                    else:
                        prev_mentions.append(coref['context_mentions'][-1][0])
            gen_prev_mention_sents.append(prev_mentions)
        counts['corefs'].append(gen_coref_counts)
        counts['prev_mention_sents'].append(gen_prev_mention_sents)

    counts['ents'] = get_noun_chunk_complexity(gen_seqs)['n_chunks']
    #counts['ents'] = np.array(counts['ents'])
    counts['mean_ents'] = np.mean(counts['ents'])
    counts['corefs'] = np.array(counts['corefs'])
    counts['ents'] = np.maximum(counts['ents'], counts['corefs'])  # don't let the number of coreferences exceed the number of entities
    counts['mean_corefs'] = np.mean(counts['corefs'])
    # a 0/0 division yields NaN, which nan_to_num turns into 0
    counts['res_rates'] = np.nan_to_num(counts['corefs'] * 1. / counts['ents'])
    counts['mean_res_rates'] = np.mean(counts['res_rates'])

    return counts

def run_corefs(gen_seqs: dict, context_seqs, stat_sig):
    """Run the coreference analysis for every model and print a summary table.

    gen_seqs maps a model name to its generated sequences; context_seqs are the
    shared story contexts. When stat_sig is true, eval_all_diffs is additionally
    run on the per-model 'ents', 'corefs' and 'res_rates' values and the
    resulting p-values are printed. Returns a DataFrame indexed by model with the
    columns from 'mean_ents' through 'mean_res_rates', minus 'res_rates'."""
    print("\nCOREFERENCE")
    per_model = {}
    for name, seqs in gen_seqs.items():
        per_model[name] = get_coref_counts(context_seqs, seqs)
    coref_counts = {'models': per_model, 'p-values': {}}

    corefdf = pd.DataFrame.from_dict(per_model, orient='index')
    summary_columns = ['mean_ents', 'mean_corefs', 'mean_res_rates']
    pprint.pprint(corefdf[summary_columns])

    if stat_sig:
        for measure in ('ents', 'corefs', 'res_rates'):
            by_model = {name: analysis[measure] for name, analysis in per_model.items()}
            coref_counts['p-values'][measure] = eval_all_diffs(by_model)
        print("\np-values:")
        pprint.pprint(pd.DataFrame.from_dict(coref_counts['p-values'], orient='index'))

    mean_columns = corefdf.loc[:, 'mean_ents':'mean_res_rates']
    return mean_columns.drop('res_rates', axis=1)

encoder = en_core_web_sm.load()

#### Run corefs

In [None]:
story_refs = f'datasets/story_bodies.txt'
#ending_refs = f'endings/cn_SCT_new_gen_ends.txt'
    
# Read with a tab separator and keep column 0 as the context strings
# (presumably so each whole story line stays in one field — confirm the file has no tabs).
context_seqs = pd.read_csv(story_refs, header=None, sep='\t')[0].values.tolist()
# run_corefs expects a {model name: generated sequences} mapping.
gen_seqs = {}
#gen_seqs[fname] = pd.read_csv(ending_refs, header=None, sep='_nodel_').values.tolist()
gen_seqs[fname] = gendings
corefdf = run_corefs(gen_seqs, context_seqs, stat_sig=False)
# Save the per-model mean entity/coreference table.
corefdf.to_csv(f"evals/{fname}.csv")

## Readability
* [Novikova](https://arxiv.org/pdf/1707.06875.pdf) et al.

In [None]:
nlp = spacy.load('en')
read = Readability()
nlp.add_pipe(read, last=True)

In [None]:
# Flesch reading ease of each correct ending vs. the generated ending, read
# straight from the selected model's endings files.
readability = []  # defined up front so readf can always be built below
if model_chooser.result:
    # model_chooser.result is the 5-tuple returned by model_c; its second item is
    # the model name (binding the whole tuple to fname would break every
    # f-string path built from fname).
    fname = model_chooser.result[1]
    right_refs = f'endings/{fname}_corr_ends.txt'
    hypothesis = f'endings/{fname}_gen_ends.txt'
    refs = [right_refs]
    with open(hypothesis) as hypfile:
        gen_endings = hypfile.read().splitlines()
    with open(right_refs) as corrfile:
        corr_endings = corrfile.read().splitlines()
    for corr, gen in zip(corr_endings, gen_endings):
        refdoc = nlp(corr)
        gendoc = nlp(gen)
        readability.append([refdoc._.flesch_kincaid_reading_ease, gendoc._.flesch_kincaid_reading_ease])
readf = pd.DataFrame(readability, columns=['corr_ease', 'gen_ease'])

In [None]:
# Reading ease of each (correct ending, generated ending) pair, using the
# endings already held in memory rather than the endings files.
readability = []
for corr, gen in zip(corr_ends, gendings):
    refdoc, gendoc = nlp(corr), nlp(gen)
    readability.append([doc._.flesch_kincaid_reading_ease for doc in (refdoc, gendoc)])
readf = pd.DataFrame(data=readability, columns=['corr_ease', 'gen_ease'])

In [None]:
readfm = readf.mean(axis=0)

## NLG-Eval

* Sharma et al. \[[PDF](https://arxiv.org/pdf/1706.09799.pdf)]

In [None]:
"""n = NLGEval(metrics_to_omit=['SkipThoughtCS', 
                             'VectorExtremaCosineSimilarity',
                             'EmbeddingAverageCosineSimilairty', 
                             'GreedyMatchingScore', 
                             'EmbeddingAverageCosineSimilairty'])"""

n = NLGEval(metrics_to_omit=['Bleu_1', 'Bleu_2', 'Bleu_3', 'Bleu_4', 'CIDEr', 'ROUGE_L', 'METEOR'])

#metrics_dict = n.compute_metrics([corr_ends], gendings)

metrics_dict = n.compute_metrics([stories], gendings) # Each element of stories is the story prompt (joined as a string) for the corresponding ending in endings.

In [None]:
mdf = pd.DataFrame.from_records([metrics_dict])

---

## Distinct

---



Li, Jiwei, Michel Galley, Chris Brockett, Jianfeng Gao, and Bill Dolan. "[A Diversity-Promoting Objective Function for Neural Conversation Models.](https://www.aclweb.org/anthology/N16-1014/)" In Proceedings of the 2016 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, pp. 110-119. 2016.

---

"We report degree of diversity by calculating the number of distinct unigrams and bigrams in generated responses. The value is scaled by total number of generated tokens to avoid favoring long sentences."

"... *distinct-1* and *distinct-2* are respectively the number of distinct unigrams and bigrams divided by total number of generated words." - Interestingly, See et al. \[[PDF](https://arxiv.org/pdf/1909.10705.pdf), p. 5] describe and [implement](https://github.com/abisee/story-generation-eval/blob/master/metrics.py) it differently from the authors, with division by the total number of _n_-grams. The original authors' [code](https://github.com/YifanZhou95/diversity-promoting-dialogue-system/blob/e4a83359c22299999dab4bc7882c63b1dec51cf3/MMI_antiLM.ipynb) \[_distinctEval_()] uses tokens.

In [None]:
def distinct_n(hyp) -> pd.DataFrame:
    """Compute mean distinct-1..4 over the lines of a hypotheses file.

    For every line, the number of distinct n-grams (n = 1..4) is divided by the
    line's token count (1 is used for a line without tokens); the per-line ratios
    are then averaged.

    Args:
        hyp: Path of a text file with one generated sequence per line.

    Returns:
        A one-row DataFrame with the columns ``Distinct_1`` .. ``Distinct_4``.
        All scores are 0.0 when the file has no lines.
    """
    with open(f"{hyp}") as hfile:
        lines = hfile.read().splitlines()

    orders = (1, 2, 3, 4)
    ratios = {order: [] for order in orders}
    for line in lines:
        tokens = word_tokenize(line)
        n_tokens = len(tokens) or 1  # avoid dividing by zero on an empty line
        for order in orders:
            distinct = set(nltk.ngrams(tokens, order))
            ratios[order].append(len(distinct) / n_tokens)

    # st.mean raises on an empty list, so an empty file scores 0.0 instead
    scores = {f'Distinct_{order}': (st.mean(vals) if vals else 0.0)
              for order, vals in ratios.items()}
    return pd.DataFrame(scores, index=[0])

Attempt at reproducing _repetition-4_ metric as described by [Shao](https://www.aclweb.org/anthology/D19-1321.pdf) et al.; [Guan](https://arxiv.org/pdf/2001.05139.pdf) et al.:

In [None]:
def repetition_n(gendings) -> float:
    """Compute the fraction of generated texts that repeat at least one 4-gram.

    Args:
        gendings: Sequence of generated texts.

    Returns:
        num_repeating_texts / num_texts as a value between 0 and 1
        (0.0 when gendings is empty).
    """
    total = len(gendings)
    if total == 0:
        return 0.0
    repeating = 0
    for text in gendings:
        fourgrams = list(nltk.ngrams(word_tokenize(text), 4))
        # Without repetition every 4-gram is unique, so both lengths are equal.
        if len(set(fourgrams)) != len(fourgrams):
            repeating += 1
    return repeating / total


### Get distinct-1, 2, 3, 4 scores.

In [None]:
ddf = distinct_n(f'endings/cn_SCT_new_gen_ends.txt')
display(ddf)

In [None]:
repetition_n(gendings)

### Alternate BLEU:

In [None]:
# corpus_bleu expects, for every hypothesis, a list of tokenized references plus
# the tokenized hypothesis itself; here the single reference of each generated
# ending is its story prompt.
bleu_references = [[word_tokenize(story)] for story in stories]
bleu_hypotheses = [word_tokenize(gending) for gending in gendings]
nltk.translate.bleu_score.corpus_bleu(bleu_references, bleu_hypotheses)

## Write evaluations to file.

Evaluate endings with respect to references, and write endings' scores to evaluations file:

&nbsp;&nbsp;&nbsp;&nbsp;<small><u>Note</u>: For some cases, temporarily edited *nlgeval*'s \_\_init\_\_.py to return individual story results.</small>

In [None]:
mdf.drop("EmbeddingAverageCosineSimilairty", axis=1).to_csv(f"evals/{fname}_story_end_sg_evals.txt", 
                                                            mode='w', index=False)

In [None]:
mdf.head()

In [None]:
readf.to_csv(f"evals/{fname}_readability.txt", mode='w', index=False)

In [None]:
ddf.to_csv(f"evals/cn_SCT_new_distinct_evals.txt", index=False)

In [None]:
def write_dist_nlg(fname: str, metrics: pd.DataFrame = None, ddf: pd.DataFrame = None,
                   glove=False, skip=False, overlap=False) -> None:
    """Write BLEU-1, 2, METEOR, ROUGE-L, CIDEr, Distinct-1, 2, 3, 4, embedding scores to file.

    Args:
        fname: Model name used as the prefix of the files written under ``evals/``.
        metrics: One-row frame of NLG-eval scores; when omitted, the
            notebook-level ``mdf`` is used.
        ddf: Optional one-row frame of distinct-n scores that is joined onto the
            metrics; skipped when omitted or empty.
        glove, skip, overlap: Select which file(s) to write:
            overlap only -> ``{fname}_all_evals.txt`` (all columns);
            glove without overlap -> ``{fname}_sg_glove_evals.txt`` (columns from
            ``EmbeddingAverageCosineSimilarity`` onwards);
            skip without overlap -> ``{fname}_sg_skip_evals.txt`` (SkipThoughtCS);
            all three -> ``{fname}_o_skip_glove_evals.txt`` (all columns).
    """
    # Honour the frame handed in by the caller instead of silently reading the global.
    base = mdf if metrics is None else metrics
    df = base if ddf is None or ddf.empty else base.join(ddf)
    # The misspelled column duplicates the correctly spelled one; drop it if present.
    df = df.drop(columns=['EmbeddingAverageCosineSimilairty'], errors='ignore')
    if overlap and not glove and not skip:
        df.to_csv(f"evals/{fname}_all_evals.txt", index=False)
    if glove and not overlap:
        df.loc[:, 'EmbeddingAverageCosineSimilarity': ].to_csv(f"evals/{fname}_sg_glove_evals.txt", index=False)
    if skip and not overlap:
        pd.DataFrame(data=[df.SkipThoughtCS.values],
                     columns=['SkipThoughtCS']).to_csv(f"evals/{fname}_sg_skip_evals.txt",
                                                       index=False)
    if skip and overlap and glove:
        df.to_csv(f"evals/{fname}_o_skip_glove_evals.txt", index=False)

In [None]:
write_dist_nlg(fname=fname, metrics=mdf, 
               glove=False, skip=False, overlap=True,
               ddf=ddf)

#### Originally to create data in seg_evaluations notebook

In [None]:
def display_overlap_all(models: list = []) -> list:
    """Load (or build) the combined overlap-metrics table for all models and style it.

    Reads evals/models_overlap.txt when it can; otherwise concatenates the
    per-model evals/{model}_corpus_all.txt files and writes the combined file.
    Returns the styled table when its first column is "Models", otherwise the
    string "Built table. Run cell again.".

    NOTE(review): the `models` argument is overwritten by the hard-coded list
    below, and the `-> list` annotation does not match either return value.
    """
    models = ['gpt2',
              'b_SCT', 
              'cn_SCT', 
              'cn_SCT_sentiment', 
              'cn_sentiment_SCT', 
              'roc1617_SCT_sentiment', 
              'roc1617_sentiment_SCT', 
              'roc1617_SCT']
    # Row labels for the combined table, parallel to `models`.
    model_keys = ['Base',
                  'Base_SCT',
                  'ConceptNet_SCT',
                  'CN_SCT_Sentiment',
                  'CN_Sentiment_SCT',
                  'ROC_SCT_Sentiment',
                  'ROC_Sentiment_SCT',
                  'ROC_SCT']
    df_list = []
    try:
        corpus_all_df = pd.read_csv(f'evals/models_overlap.txt', sep=',')
        # A file written by the except-branch below (index=True) has not been
        # relabelled yet: name its first column "Models", drop the second and save it.
        if not corpus_all_df.columns.values[0] == "Models":
            corpus_all_df.columns.values[0] = "Models"
            corpus_all_df.columns.values[1] = "drop"
            corpus_all_df = corpus_all_df.drop("drop", axis=1)
            corpus_all_df.to_csv(f'evals/models_overlap.txt', sep=',', index=False)
    # NOTE(review): bare except — any failure above (not only a missing file)
    # triggers a rebuild from the per-model files.
    except:
        for mod in models:
            corpus_df = pd.read_csv(f"evals/{mod}_corpus_all.txt", sep=',')
            df_list.append(corpus_df)
        corpus_all_df = pd.concat(df_list, axis=0, keys=model_keys)
        corpus_all_df.to_csv(f'evals/models_overlap.txt', sep=',', index=True)
    # bold_max / ital_min are styling helpers not defined in the visible part of this file.
    corpus_all_df = corpus_all_df.style.hide_index().\
                            apply(bold_max).\
                            apply(ital_min)
    if corpus_all_df.columns.values[0] == "Models":
        return corpus_all_df
    else:
        # Freshly built table: the relabelling only happens on the next read.
        return "Built table. Run cell again."

def display_distinct_all(models: list = []) -> list:
    """Load (or build) the combined distinct-n table for all models and style it.

    Reads evals/models_distinct.txt when it can; otherwise concatenates the
    per-model evals/{model}_distinct_evals.txt files and writes the combined
    file. Returns the styled table when its first column is "Models", otherwise
    the string "Built table. Run cell again.".

    NOTE(review): the `models` argument is overwritten by the hard-coded list
    below, and the `-> list` annotation does not match either return value.
    """
    models = ['gpt2',
              'b_SCT', 
              'cn_SCT', 
              'cn_SCT_sentiment', 
              'cn_sentiment_SCT', 
              'roc1617_SCT_sentiment', 
              'roc1617_sentiment_SCT', 
              'roc1617_SCT']
    # Row labels for the combined table, parallel to `models`.
    model_keys = ['Base',
                  'Base_SCT',
                  'ConceptNet_SCT',
                  'CN_SCT_Sentiment',
                  'CN_Sentiment_SCT',
                  'ROC_SCT_Sentiment',
                  'ROC_Sentiment_SCT',
                  'ROC_SCT']
    df_list = []
    try:
        distinct_all_df = pd.read_csv(f'evals/models_distinct.txt', sep=',')
        # A file written by the except-branch below (index=True) has not been
        # relabelled yet: name its first column "Models", drop the second and save it.
        if not distinct_all_df.columns.values[0] == "Models":
            distinct_all_df.columns.values[0] = "Models"
            distinct_all_df.columns.values[1] = "drop"
            distinct_all_df = distinct_all_df.drop("drop", axis=1)
            distinct_all_df.to_csv(f'evals/models_distinct.txt', sep=',', index=False)
    # NOTE(review): bare except — any failure above (not only a missing file)
    # triggers a rebuild from the per-model files.
    except:
        for mod in models:
            distinct_df = pd.read_csv(f"evals/{mod}_distinct_evals.txt", sep=',')
            df_list.append(distinct_df)
        distinct_all_df = pd.concat(df_list, axis=0, keys=model_keys)
        distinct_all_df.to_csv(f'evals/models_distinct.txt', sep=',', index=True)
    # bold_max / ital_min are styling helpers not defined in the visible part of this file.
    distinct_all_df = distinct_all_df.style.hide_index().\
                            apply(bold_max).\
                            apply(ital_min)
    if distinct_all_df.columns.values[0] == "Models":
        return distinct_all_df
    else:
        # Freshly built table: the relabelling only happens on the next read.
        return "Built table. Run cell again."


def display_ppl_all(models: list = []) -> list:
    """Load (or build) the combined perplexity table for all models.

    Reads evals/models_ppl.txt when it can; otherwise averages each per-model
    evals/{model}_ppl.csv, concatenates the averages and writes the combined
    file. Returns the table when its first column is "Models", otherwise the
    string "Built table. Run cell again.".

    NOTE(review): the `models` argument is overwritten by the hard-coded list
    below, and the `-> list` annotation does not match either return value.
    """
    models = ['gpt2',
              'b_SCT', 
              'cn_SCT', 
              'cn_SCT_sentiment', 
              'cn_sentiment_SCT', 
              'roc1617_SCT_sentiment', 
              'roc1617_sentiment_SCT', 
              'roc1617_SCT']
    # Row labels for the combined table, parallel to `models`.
    model_keys = ['Base',
                  'Base_SCT',
                  'ConceptNet_SCT',
                  'CN_SCT_Sentiment',
                  'CN_Sentiment_SCT',
                  'ROC_SCT_Sentiment',
                  'ROC_Sentiment_SCT',
                  'ROC_SCT']
    df_list = []
    try:
        all_ppl_df = pd.read_csv(f'evals/models_ppl.txt', sep=',')
        try:
            # Not yet relabelled: name the first three columns, drop the text
            # columns and save the cleaned table.
            if not all_ppl_df.columns.values[0] == "Models":
                all_ppl_df.columns.values[0] = "Models"
                all_ppl_df.columns.values[1] = "Type"
                all_ppl_df.columns.values[2] = "PPL"
                all_ppl_df = all_ppl_df.drop("Story", axis=1)
                all_ppl_df = all_ppl_df.drop("CorrectEnding", axis=1)
                all_ppl_df = all_ppl_df.drop("GenEnding", axis=1)
                all_ppl_df.to_csv(f'evals/models_ppl.txt', sep=',', index=False)
        # NOTE(review): failures while relabelling (e.g. a missing column) are
        # silently ignored, possibly leaving the table half-modified.
        except:
            pass
    # NOTE(review): bare except — any failure reading the combined file triggers a rebuild.
    except:
        for mod in models:
            try:
                all_ppl = pd.read_csv(f"evals/{mod}_ppl.csv", sep=',')
                df_list.append(all_ppl.mean(axis=0))
            # NOTE(review): a model whose file cannot be read is skipped, which
            # presumably leaves df_list shorter than model_keys — confirm how
            # pd.concat handles the mismatch.
            except:
                continue
        all_ppl_df = pd.concat(df_list, axis=0, keys=model_keys)
        all_ppl_df.to_csv(f'evals/models_ppl.txt', sep=',', index=True)

    # NOTE(review): after a rebuild, all_ppl_df is a concatenation of per-model
    # means and may not expose `.columns` — confirm this check works on that path.
    if all_ppl_df.columns.values[0] == "Models":
        return all_ppl_df
    else:
        return "Built table. Run cell again."

## Sentiment

In [None]:
analyser = SentimentIntensityAnalyzer()

In [None]:
def get_sentiment(p: float) -> str:
    """Label compound scores heuristically.

    Args:
        p: Compound sentiment score.

    Returns:
        'POS' when p >= 0.05, 'NEG' when p <= -0.05, otherwise 'NEU'.
    """
    if p >= 0.05:
        return 'POS'
    if p <= -0.05:
        return 'NEG'
    return 'NEU'  # -0.05 < p < 0.05

def get_avg_senti_label(story: list) -> str:
    """Compute average sentiment; via VADER paragraph demo.

    Args:
        story: The sentences of one story/context.

    Returns:
        The get_sentiment label ('POS', 'NEG' or 'NEU') of the mean compound
        score of the sentences, rounded to 4 decimals. An empty story is
        labelled as a score of 0.0.
    """
    if not story:
        # no sentences: avoid dividing by zero and treat the story as neutral
        return get_sentiment(0.0)
    total = sum(analyser.polarity_scores(sentence)['compound'] for sentence in story)
    return get_sentiment(round(total / len(story), 4))

def get_matches(prompts, endings):
    """Check whether each ending's sentiment label agrees with its story's average label.

    Returns a tuple (matches, story_scores, ending_scores): matches holds the
    shared label where story and ending agree and the string 'None' where they
    differ; story_scores and ending_scores are the individual labels.
    """
    # Label of the averaged sentence compound scores of every story.
    story_scores = [get_avg_senti_label(sentences) for sentences in prompts]
    ending_scores = [get_sentiment(analyser.polarity_scores(ending)['compound'])
                     for ending in endings]
    matches = [end_label if story_label == end_label else 'None'
               for story_label, end_label in zip(story_scores, ending_scores)]
    return matches, story_scores, ending_scores

In [None]:
# Sentiment label agreement between every story and its generated ending.
gen_matches, story_scores, ending_scores = get_matches(stories_split, gendings)
# Share of endings whose label matches the story's as POS or NEG; matching 'NEU'
# pairs and mismatches ('None') are not counted.
gen_p = (gen_matches.count('POS') + gen_matches.count('NEG'))/len(gen_matches)
#corr_matches, story_scores, ending_scores = get_matches(stories_split, corr_ends)
#corr_p = (corr_matches.count('POS') + corr_matches.count('NEG'))/len(corr_matches)

In [None]:
d = {'Model': [f'{fname}'], 'Matches': [gen_p]}
df = pd.DataFrame.from_dict(d)
df

In [None]:
df.to_csv(f'evals/{fname}_sent_matches.txt', index=False, sep=',')

In [None]:
models_sent_df = pd.read_csv('evals/models_sent_matches.txt', sep=',')

In [None]:
print(models_sent_df.to_latex(index=False))