## Imports

In [0]:
# Drive mounting is disabled by default. Uncomment the two lines below to
# mount Google Drive when running on Colab.
# from google.colab import drive
# drive.mount('/content/drive')

In [0]:
import string, csv, re, random, logging, torch, io
from string import punctuation as punk
from ipywidgets import *
import pandas as pd
import numpy as np

> We must navigate to the main project folder in the mounted My Drive.

> Assumes the following structure:
<pre>.
├── content
│   ├──drive                         # Mounted drive folder.
│   │   └── My Drive                 # Mounted drive folder.
│   │       └── csc594-ADL           # Main project folder.
│   │           ├── datasets         # ConceptNet and ROCStories.
│   │           ├── endings          # Correct and generated endings per model.
│   │           ├── evals            # Evaluation results for stories and endings per model.
│   │           ├── models           # Pretrained models, tokenizers, vocabulary, etc.
│   │           ├── scripts          # Scripts for training and generation.
│   │           └── stories          # Combined story bodies and generated endings per model.
│   ├── sample_data                  # Default Colab folder.
│   └── transformers                 # Installed from HuggingFace.
└── ...
</pre>

In [0]:
# Change into the main project folder so the relative "evals/..." paths used
# below resolve. The path is relative to the current working directory
# (presumably Colab's default /content -- confirm), so re-running this cell
# after it has already succeeded will fail.
%cd drive/My Drive/csc594-ADL

---
## Definitions
---

In [0]:
def bold_max(s):
    """Bold the largest value(s) of a column; intended for ``Styler.apply``.

    Args:
        s: One column of the styled frame, as a pandas Series.

    Returns:
        A list with one CSS string per entry of ``s``: ``'font-weight: bold'``
        where the entry equals the column maximum (ties are all marked) and
        ``''`` everywhere else.  The ``"Models"`` label column is never styled.
    """
    if s.name == "Models":
        # Label column: return explicit "no style" entries instead of comparing
        # the labels against False, which would mark 0/False entries as maxima.
        return [''] * len(s)
    is_max = s == s.max()
    return ['font-weight: bold' if v else '' for v in is_max]

def ital_min(s):
    """Italicize the smallest value(s) of a column; intended for ``Styler.apply``.

    Args:
        s: One column of the styled frame, as a pandas Series.

    Returns:
        A list with one CSS string per entry of ``s``: ``'font-style: italic'``
        where the entry equals the column minimum (ties are all marked) and
        ``''`` everywhere else.  The ``"Models"`` label column is never styled.
    """
    if s.name == "Models":
        # Label column: return explicit "no style" entries instead of comparing
        # the labels against False, which would mark 0/False entries as minima.
        return [''] * len(s)
    is_min = s == s.min()
    return ['font-style: italic' if v else '' for v in is_min]

def display_df_evals(model_type: str, eval_type: str, show_full: bool):
    """Display one model's evaluation results for the selected metric.

    Results are read from ``evals/<model_type>_<file>`` (relative to the
    current working directory) and shown with ``display``.

    Args:
        model_type: File-name prefix of the model, e.g. ``'gpt2'``.
        eval_type: Metric to show.  ``'Skip-Thought/GloVe'`` shows the GloVe
            ``VectorExtremaCosineSimilarity`` column joined with the
            Skip-Thought results; any other value is looked up in
            ``metric_files`` below.  Unknown values show nothing.
        show_full: For ``'Perplexity'`` and ``'Distinct'``, show everything
            instead of only the first rows (``head()``).

    Returns:
        The full perplexity DataFrame when ``eval_type`` is ``'Perplexity'``
        and its file could be read; otherwise an empty DataFrame.
    """
    print(f'Evaluations for {model_type}:')
    print("\n------------\n")
    # Metric name -> candidate file suffixes, tried in order.
    metric_files = {
        'Corpus-level overlap': ('corpus_all.txt',),
        'Entity coreference': ('trunc_corefs.csv', 'corefs.csv'),
        'Distinct': ('distinct_evals.txt',),
        'Skip-Thought': ('sg_skip_evals.txt',),
        'GloVe': ('sg_glove_evals.txt',),
        'Averaged truncated Skip/GloVe': ('o_skip_glove_evals.txt',),
        'Perplexity': ('ppl.csv',),
        'Flesch-Kincaid readability': ('joint_readability.txt',),
    }
    evals = {}
    if eval_type == 'Skip-Thought/GloVe':
        # The combined view needs BOTH source files; they are loaded here
        # explicitly because neither name equals the combined eval_type.
        glove = _read_first_eval(model_type, metric_files['GloVe'])
        skip = _read_first_eval(model_type, metric_files['Skip-Thought'])
        if (glove is None or skip is None
                or 'VectorExtremaCosineSimilarity' not in glove.columns):
            print(f"Skip-Thought/GloVe evaluations weren't found. \n")
        else:
            # Suffixes only take effect if both files share a column name,
            # in which case a plain join would raise.
            evals['Skip/GloVe'] = glove[['VectorExtremaCosineSimilarity']].join(
                skip, lsuffix='_glove', rsuffix='_skip')
    elif eval_type in metric_files:
        table = _read_first_eval(model_type, metric_files[eval_type])
        if table is None:
            print(f"{eval_type} evaluations weren't found. \n")
        else:
            evals[eval_type] = table

    ppl_df = pd.DataFrame()
    for m, table in evals.items():
        try:
            if 'Entity' in m and table.columns[0] != "Model":
                table = table.rename(columns={table.columns[0]: "Model"})
            if 'Distinct' in m:
                # numeric_only keeps text columns from raising on newer pandas.
                table = table.mean(axis=0, numeric_only=True)
            if 'Perplexity' in m:
                # Returned in full, even when only the head is displayed.
                ppl_df = table
            if ('Perplexity' in m or 'Distinct' in m) and not show_full:
                table = table.head()
            if 'truncated' in m or 'readability' in m:
                table = table.mean(numeric_only=True)
            print(f"{m}:\n")
            display(table)
            print("\n------------\n")
        except Exception as err:
            # Best effort: report the problem instead of hiding it silently.
            print(f"Could not display {m} evaluations: {err}\n")
    return ppl_df


def _read_first_eval(model_type, file_names):
    """Return the first readable ``evals/<model_type>_<name>`` CSV, or None."""
    for name in file_names:
        try:
            return pd.read_csv(f"evals/{model_type}_{name}", sep=',')
        except (OSError, ValueError):
            # Missing, empty or unparsable file: try the next candidate.
            continue
    return None

def display_overlap_all(models: list = None):
    """Return a styled table of corpus-level overlap scores for all models.

    The combined table is cached in ``evals/models_overlap.txt``.  If that
    file cannot be read, the table is built from the per-model
    ``evals/<model>_corpus_all.txt`` files and cached, all in a single call.
    A cache whose first column is not named ``"Models"`` (index columns were
    written along with the data) is normalised and rewritten.

    Args:
        models: Ignored; the model list is fixed below.  Kept so existing
            calls that pass it keep working.

    Returns:
        A pandas ``Styler`` with the index hidden, each column's largest
        value in bold and its smallest value in italics.

    Raises:
        FileNotFoundError: If the table has to be built and a per-model
            file is missing.
    """
    model_files = ['gpt2',
                   'b_SCT',
                   'cn_SCT',
                   'cn_SCT_sentiment',
                   'cn_sentiment_SCT',
                   'roc1617_SCT_sentiment',
                   'roc1617_sentiment_SCT',
                   'roc1617_SCT']
    model_keys = ['Base',
                  'Base_SCT',
                  'ConceptNet_SCT',
                  'CN_SCT_Sentiment',
                  'CN_Sentiment_SCT',
                  'ROC_SCT_Sentiment',
                  'ROC_Sentiment_SCT',
                  'ROC_SCT']
    cache_path = 'evals/models_overlap.txt'

    table = None
    try:
        cached = pd.read_csv(cache_path, sep=',')
    except (OSError, ValueError):
        # Missing, empty or unparsable cache: rebuild below.
        cached = None
    if cached is not None:
        if cached.columns[0] == "Models":
            table = cached
        elif cached.shape[1] >= 2:
            # Index-including layout: column 0 holds the model label and
            # column 1 the per-file row number, which is dropped.
            table = cached.drop(columns=cached.columns[1])
            table = table.rename(columns={table.columns[0]: "Models"})
            table.to_csv(cache_path, sep=',', index=False)

    if table is None:
        frames = [pd.read_csv(f"evals/{mod}_corpus_all.txt", sep=',')
                  for mod in model_files]
        stacked = pd.concat(frames, axis=0, keys=model_keys)
        labels = list(stacked.index.get_level_values(0))
        table = stacked.reset_index(drop=True)
        table.insert(0, "Models", labels)
        table.to_csv(cache_path, sep=',', index=False)

    styler = table.style
    # Styler.hide_index() no longer exists in pandas 2; prefer hide() when available.
    if hasattr(styler, "hide"):
        styler = styler.hide(axis="index")
    else:
        styler = styler.hide_index()
    return styler.apply(bold_max).apply(ital_min)

def display_distinct_all(models: list = None):
    """Return a styled table of distinct scores for all models.

    The combined table is cached in ``evals/models_distinct.txt``.  If that
    file cannot be read, the table is built from the per-model
    ``evals/<model>_distinct_evals.txt`` files and cached, all in a single
    call.  A cache whose first column is not named ``"Models"`` (index
    columns were written along with the data) is normalised and rewritten.

    Args:
        models: Ignored; the model list is fixed below.  Kept so existing
            calls that pass it keep working.

    Returns:
        A pandas ``Styler`` with the index hidden, each column's largest
        value in bold and its smallest value in italics.

    Raises:
        FileNotFoundError: If the table has to be built and a per-model
            file is missing.
    """
    model_files = ['gpt2',
                   'b_SCT',
                   'cn_SCT',
                   'cn_SCT_sentiment',
                   'cn_sentiment_SCT',
                   'roc1617_SCT_sentiment',
                   'roc1617_sentiment_SCT',
                   'roc1617_SCT']
    model_keys = ['Base',
                  'Base_SCT',
                  'ConceptNet_SCT',
                  'CN_SCT_Sentiment',
                  'CN_Sentiment_SCT',
                  'ROC_SCT_Sentiment',
                  'ROC_Sentiment_SCT',
                  'ROC_SCT']
    cache_path = 'evals/models_distinct.txt'

    table = None
    try:
        cached = pd.read_csv(cache_path, sep=',')
    except (OSError, ValueError):
        # Missing, empty or unparsable cache: rebuild below.
        cached = None
    if cached is not None:
        if cached.columns[0] == "Models":
            table = cached
        elif cached.shape[1] >= 2:
            # Index-including layout: column 0 holds the model label and
            # column 1 the per-file row number, which is dropped.
            table = cached.drop(columns=cached.columns[1])
            table = table.rename(columns={table.columns[0]: "Models"})
            table.to_csv(cache_path, sep=',', index=False)

    if table is None:
        frames = [pd.read_csv(f"evals/{mod}_distinct_evals.txt", sep=',')
                  for mod in model_files]
        stacked = pd.concat(frames, axis=0, keys=model_keys)
        labels = list(stacked.index.get_level_values(0))
        table = stacked.reset_index(drop=True)
        table.insert(0, "Models", labels)
        table.to_csv(cache_path, sep=',', index=False)

    styler = table.style
    # Styler.hide_index() no longer exists in pandas 2; prefer hide() when available.
    if hasattr(styler, "hide"):
        styler = styler.hide(axis="index")
    else:
        styler = styler.hide_index()
    return styler.apply(bold_max).apply(ital_min)


def display_ppl_all(models: list = None):
    """Return the table of mean perplexities for all models.

    The table is cached in ``evals/models_ppl.txt`` with the columns
    ``Models``, ``Type`` and ``PPL``.  If that file cannot be read, the table
    is built from the per-model ``evals/<model>_ppl.csv`` files: every numeric
    column of a file is averaged and contributes one row (``Type`` is the
    column name, ``PPL`` its mean).  Models whose file cannot be read are
    skipped, and the remaining rows keep their own model label.

    Args:
        models: Ignored; the model list is fixed below.  Kept so existing
            calls that pass it keep working.

    Returns:
        A DataFrame whose first column is ``"Models"``.

    Raises:
        ValueError: If the table has to be built and no per-model file
            yields any numeric column.
    """
    model_files = ['gpt2',
                   'b_SCT',
                   'cn_SCT',
                   'cn_SCT_sentiment',
                   'cn_sentiment_SCT',
                   'roc1617_SCT_sentiment',
                   'roc1617_sentiment_SCT',
                   'roc1617_SCT']
    model_keys = ['Base',
                  'Base_SCT',
                  'ConceptNet_SCT',
                  'CN_SCT_Sentiment',
                  'CN_Sentiment_SCT',
                  'ROC_SCT_Sentiment',
                  'ROC_Sentiment_SCT',
                  'ROC_SCT']
    cache_path = 'evals/models_ppl.txt'

    try:
        cached = pd.read_csv(cache_path, sep=',')
    except (OSError, ValueError):
        # Missing, empty or unparsable cache: rebuild below.
        cached = None
    if cached is not None:
        if cached.columns[0] != "Models":
            # Cache written with index columns: name the first (up to) three
            # columns positionally, then drop the text columns if present.
            new_names = dict(zip(cached.columns[:3], ["Models", "Type", "PPL"]))
            cached = cached.rename(columns=new_names)
            cached = cached.drop(columns=["Story", "CorrectEnding", "GenEnding"],
                                 errors="ignore")
            cached.to_csv(cache_path, sep=',', index=False)
        return cached

    rows = []
    # Pair each file prefix with its label so a skipped model cannot shift
    # the labels of the models that follow it.
    for file_prefix, label in zip(model_files, model_keys):
        try:
            per_story = pd.read_csv(f"evals/{file_prefix}_ppl.csv", sep=',')
        except (OSError, ValueError):
            continue
        means = per_story.mean(axis=0, numeric_only=True)
        for column, value in means.items():
            rows.append({"Models": label, "Type": column, "PPL": value})
    if not rows:
        raise ValueError("No per-model perplexity results could be read from evals/.")
    all_ppl_df = pd.DataFrame(rows, columns=["Models", "Type", "PPL"])
    all_ppl_df.to_csv(cache_path, sep=',', index=False)
    return all_ppl_df

## Evaluations

In [0]:
# Build each control on its own, then wire them to display_df_evals.
# Dropdown options are (label shown to the user, value passed to the function).
model_dropdown = Dropdown(
    options=[
        ('Base', 'gpt2'),
        ('Base to SCT', 'b_SCT'),
        ('ConceptNet to SCT', 'cn_SCT'),
        ('CN to SCT to Sentiment', 'cn_SCT_sentiment'),
        ('CN to Sentiment to SCT', 'cn_sentiment_SCT'),
        ('ROC to SCT to Sentiment', 'roc1617_SCT_sentiment'),
        ('ROC to Sentiment to SCT', 'roc1617_sentiment_SCT'),
        ('ROC to SCT', 'roc1617_SCT'),
    ],
    description="Model type",
)
metric_dropdown = Dropdown(
    options=[
        'Corpus-level overlap',
        'Entity coreference',
        'Distinct',
        'Skip-Thought/GloVe',
        'Averaged truncated Skip/GloVe',
        'Perplexity',
        'Flesch-Kincaid readability',
    ],
    description="Metric",
)
full_checkbox = Checkbox(value=False, description="Show head ... tail")

display_dfe = interactive(
    display_df_evals,
    model_type=model_dropdown,
    eval_type=metric_dropdown,
    show_full=full_checkbox,
)

# Fixed height for the widget's layout.
display_dfe.layout.height = "450px"
display(display_dfe)

Display the models' average overlap and distinct scores, evaluated on endings generated for story prompts from the ROCStories test set:

In [0]:
# Load the combined table; per column, the largest value is shown in bold
# and the smallest in italics.
all_df = pd.read_csv('evals/models_base_all.txt')
(
    all_df.style
    .apply(bold_max)
    .apply(ital_min)
)

In [0]:
# Show the perplexity table stored in evals/models_ppl.txt (the file that
# display_ppl_all reads and writes).
ppl_all_df = pd.read_csv('evals/models_ppl.txt')
display(ppl_all_df)

In [0]:
# Show the table stored in evals/models_corefs.txt (presumably the entity
# coreference results for all models -- confirm against the file).
all_corefs = pd.read_csv('evals/models_corefs.txt')
display(all_corefs)