#### Setup

In [1]:
import os
path = os.path.abspath(os.path.join("..", "language-modeling"))
%cd $path

/media/pips/Data/Projects/NLU-UniTN-2022/project/language-modeling


In [2]:
import yaml
import math
import pickle
import numpy as np
import pandas as pd
from collections import Counter
from matplotlib import pyplot as plt

from sklearn.metrics import classification_report

import torch
import torch.nn.functional as F

from IPython.display import display

from cli import inference


  from .autonotebook import tqdm as notebook_tqdm


In [3]:
with open("configs/inference.yaml", "r") as f:
    inf_config = yaml.safe_load(f)

In [4]:
def load_pickle(filename: str):
    with open(filename, "rb") as f:
        return pickle.load(f)

In [5]:
def plot_ppl_per_len(df):
    df = df[['lengths', 'loss']]
    df['lengths'] = df.apply(lambda x: list(x["lengths"])[0], axis=1)
    df = df.groupby("lengths").mean()
    df["ppl"] = df.apply(lambda x: math.exp(x["loss"]), axis=1)
    df.plot(
        y="ppl", 
        title="Perplexity by sequence length", 
        kind="bar",
        xlabel="Sentence length",
        ylabel="Average perplexity",
        rot=0,
        xticks=range(0, 70, 10),
        legend=False,)
    plt.show()

In [6]:
def fetch_predictions(outputs):
    outputs = torch.tensor(outputs) if len(outputs.shape) > 1 \
                                    else torch.tensor(outputs).unsqueeze(0)
    softmax = F.softmax(outputs, dim=1)
    predictions = torch.argmax(softmax, dim=1)
    return predictions.numpy()

def get_counts(df, col, lang):
    if col == "outputs":
        df[col] = df.apply(lambda pred: fetch_predictions(pred[col]), axis=1)
        
    counts = Counter()
    for row in df[col]:
        if col == "targets" and row.size < 2:
            row = [int(row)]
        counts.update(row)
    counts = dict(sorted(counts.items(), key=lambda x: x[1], reverse=True))
    counts = {lang[int(k)]: v  for (k, v) in counts.items()}
    return counts

In [7]:
def plot_targets_vs_preds(targets, preds):
    targets_df = pd.DataFrame(targets.items(), columns=["word", "target_count"])
    total = targets_df["target_count"].sum()
    targets_df["target_freq"] = targets_df["target_count"] / total
    targets_df["pred_count"] = pd.Series(preds.values())
    targets_df["pred_count"] = targets_df["pred_count"].fillna(0).astype(int)
    targets_df["diff"] = targets_df["pred_count"] - targets_df["target_count"]
    targets_df["pred_freq"] = targets_df["pred_count"] / total
    return targets_df

In [8]:
def report(df, targets_counts, outputs_counts, lang, mode="most"):
    if mode == "most_present":
        ids = set([lang[w] for w in list(targets_counts.keys())[:20]])
    elif mode == "most_guessed":
        ids = set([lang[w] for w in list(outputs_counts.keys())[:20]])
    elif mode == "least_guessed":
        ids = set([lang[w] for w in list(outputs_counts.keys())[-20:]])
    targets = []
    preds = []
    for _, row in df.iterrows():
        rt = row["targets"].squeeze().tolist()
        rp = row["outputs"].squeeze().tolist()
        try:
            for t, p in zip(rt, rp):
                if (t in ids or p in ids):
                    targets.append(t)
                    preds.append(p)
        except TypeError:
            targets.append(rt)
            preds.append(rp[0]) 
    report = classification_report(targets, preds, labels=list(ids), target_names=[lang.ids2words[i] for i in ids], output_dict=True)
    display(pd.DataFrame(report).transpose())

In [13]:
def evaluate(experiment_name: str):
    results = load_pickle(f"results/outputs/{experiment_name}.pkl")
    lang = load_pickle("lang.pkl")
    df = pd.DataFrame(results)
    # display(df)
    # plot_ppl_per_len(df)
    targets = get_counts(df, "targets", lang)
    outputs = get_counts(df, "outputs", lang)
    # diff = plot_targets_vs_preds(targets, outputs)
    # pd.options.display.float_format = '{:.2%}'.format
    # display(diff)
    pd.options.display.float_format = '{:.2}'.format
    report(df, targets, outputs, lang, mode="most_present")
    report(df, targets, outputs, lang, mode="most_guessed")
    report(df, targets, outputs, lang, mode="least_guessed")
    del df

#### merity_ad_nohh_1024

In [10]:
experiment_name = "merity_ad_nohh_1024"

In [13]:
with open(f"configs/{experiment_name}.yaml") as config_file:
    config = yaml.safe_load(config_file)
print(inference(config, inf_config, "the"))

T: 1.00	=>	the surprising pace came in the july N causing paper in the u.s. effect a consumer <unk> was pegged at $ N million
T: 0.80	=>	the treasury will raise about $ N billion in short-term bills and annuities
T: 0.75	=>	the accord also includes a $ N million air force contract for power to consolidate the second-largest airline
T: 0.70	=>	the lack of <unk> money is too profitable
T: 0.50	=>	the real estate unit is n't a real estate



In [14]:
print(inference(config, inf_config, "the price"))

T: 1.00	=>	the price is at induce price
T: 0.80	=>	the price was unchanged to $ N million
T: 0.75	=>	the price of $ N a share
T: 0.70	=>	the price for the s&p N was equivalent to N N
T: 0.50	=>	the price was $ N



In [15]:
print(inference(config, inf_config, "it may"))

T: 1.00	=>	it may not always lock on european market
T: 0.80	=>	it may be far greater and more than N N of those in the stock exchange and many markets
T: 0.75	=>	it may be n't a separate <unk>
T: 0.70	=>	it may be <unk> by a <unk> plan
T: 0.50	=>	it may not be a <unk>



In [14]:
evaluate(experiment_name)

TypeError: 'int' object is not subscriptable