In [38]:
import os
import math
import json
import numpy as np
from scipy.optimize import curve_fit
import plotly.express as px
import plotly.graph_objects as go

In [39]:
def combine_attrs(list1, list2):
    """Pair up two parallel sequences into 'a, b' strings.

    For every position in ``list1`` the element is converted with ``str`` and
    joined, with a comma and a space, to the ``str`` of the element at the
    same position in ``list2``.  The result has one entry per element of
    ``list1``; ``list2`` is looked up by index.
    """
    combined = []
    for position, first in enumerate(list1):
        second = list2[position]
        combined.append(', '.join((str(first), str(second))))
    return combined

def data(json_file):
    """Load one result file and annotate it with settings parsed from its name.

    The file's base name (without ``.json``) is split on ``_``; the first four
    parts are read as distribution, unigram/bigram tag, vocabulary size (int)
    and a softmax flag (``True`` only when the part is exactly ``'softmax'``).

    Args:
        json_file: Path to a JSON file containing at least a ``train_losses``
            list of lists.

    Returns:
        The parsed dict with the added keys ``dist``, ``uni_or_bi``,
        ``vocab_size``, ``softmax``, ``settings`` and ``losses`` (the
        ``train_losses`` sub-lists flattened in order).  ``train_losses``
        itself is removed.

    Raises:
        IndexError: if the file name has fewer than four ``_``-separated parts.
        ValueError: if the vocabulary-size part is not an integer.
        KeyError: if the JSON has no ``train_losses`` entry.
    """
    # Use a context manager so the file handle is closed deterministically
    # instead of being left for the garbage collector.
    with open(json_file, encoding='utf-8') as handle:
        j = json.load(handle)

    settings = os.path.basename(json_file).replace('.json', '').split('_')
    j['dist'] = settings[0]
    j['uni_or_bi'] = settings[1]
    j['vocab_size'] = int(settings[2])
    j['softmax'] = settings[3] == 'softmax'
    j['settings'] = settings

    # Flatten the nested loss lists into one sequence, then drop the original.
    j['losses'] = [loss for chunk in j['train_losses'] for loss in chunk]
    del j['train_losses']

    # Report files that also carry validation losses.
    if 'val_losses' in j:
        print(json_file)
    return j

# Collect every JSON result file.  os.listdir returns names in arbitrary
# order, so sort them: the order of the rows below (and with it the order in
# which plotly first encounters each colour group) is then reproducible
# across machines and runs.
json_files = sorted(
    os.path.join('results', x) for x in os.listdir('results') if x.endswith('.json')
)

jsons = [data(json_file) for json_file in json_files]

# One parallel list per plotted attribute; index i always refers to jsons[i].
test_set_perplexity = []
entropy = []
dist = []
uni_or_bi = []
vocab_size = []
softmax = []

for j in jsons:
    # Keep the smallest of the test set perplexities recorded for the run.
    test_set_perplexity.append(min(j['test_set_perplexities']))
    entropy.append(j['entropy'])
    dist.append(j['dist'])
    uni_or_bi.append(j['uni_or_bi'])
    vocab_size.append(j['vocab_size'])
    softmax.append(j['softmax'])

# Natural log of the perplexity, used as the average cross-entropy.
# NOTE(review): assumes perplexity was computed as exp(mean cross-entropy) - confirm.
test_set_avg_cross_entropy = [math.log(x) for x in test_set_perplexity]

# Column-name -> values mapping handed to plotly express (a plain dict, not a
# pandas DataFrame).
df = {
    'Test set perplexity': test_set_perplexity,
    'Test set average cross-entropy': test_set_avg_cross_entropy,
    'Entropy': entropy,
    'Distribution': dist,
    'Uni- or bigram': uni_or_bi,
    'Vocab size': vocab_size,
    'Softmax': softmax
}

In [40]:
# curve fitting
def exponential(x, a, b, c):
    """Exponential model ``a * exp(b * x) + c`` used as a curve-fit target.

    ``x`` may be a scalar or a numpy array; the result has the same shape.
    """
    scaled_growth = a * np.exp(b * x)
    return scaled_growth + c

def linear(x, a, b):
    """Straight-line model ``a * x + b`` used as a curve-fit target.

    ``x`` may be a scalar or a numpy array; the result has the same shape.
    """
    sloped = a * x
    return sloped + b

def _by_ngram(values, kind):
    """Return the entries of ``values`` whose row is tagged ``kind`` in ``uni_or_bi``."""
    return [value for value, tag in zip(values, uni_or_bi) if tag == kind]


# Split each measurement into its unigram and bigram subsets.
uni_ppls = _by_ngram(test_set_perplexity, 'unigrams')
bi_ppls = _by_ngram(test_set_perplexity, 'bigrams')

uni_ces = _by_ngram(test_set_avg_cross_entropy, 'unigrams')
bi_ces = _by_ngram(test_set_avg_cross_entropy, 'bigrams')

uni_ents = _by_ngram(entropy, 'unigrams')
bi_ents = _by_ngram(entropy, 'bigrams')

# Each curve_fit result is kept whole; element [0] holds the fitted
# parameters that the plotting cells unpack.
# Perplexity against entropy: exponential model.
uni_ppl_vs_ent = curve_fit(exponential, uni_ents, uni_ppls)
bi_ppl_vs_ent = curve_fit(exponential, bi_ents, bi_ppls)

# Cross-entropy against entropy: linear model.
uni_ce_vs_ent = curve_fit(linear, uni_ents, uni_ces)
bi_ce_vs_ent = curve_fit(linear, bi_ents, bi_ces)

In [41]:
# Test set perplexity against entropy, coloured by unigram/bigram, with the
# fitted exponential curves overlaid.
fig = px.scatter(
    df,
    x='Entropy',
    y='Test set perplexity',
    title='Test set perplexity vs. entropy',
    labels={'x': 'Entropy', 'y': 'Test set perplexity', 'color': 'Uni- or bigram'},
    color=uni_or_bi,
    hover_data={'Vocab size': True, 'Softmax': True, 'Distribution': True}
)
# Place the legend at x=0.015, y=0.97 (near the top-left in plotly's
# normalized legend coordinates).
fig.update_layout(
    legend=dict(
        x=0.015,
        y=0.97
    )
)

# Evaluate each fitted exponential over the entropy range of its own subset.
x1 = np.linspace(min(uni_ents), max(uni_ents), 100)
y1 = exponential(x1, *uni_ppl_vs_ent[0])
x2 = np.linspace(min(bi_ents), max(bi_ents), 100)
y2 = exponential(x2, *bi_ppl_vs_ent[0])

# Build the legend text from the fitted parameters.  The sign of the offset
# is taken from the value itself rather than hard-coded as '-', so a positive
# offset is no longer mislabelled.
uni_a, uni_b, uni_c = uni_ppl_vs_ent[0]
bi_a, bi_b, bi_c = bi_ppl_vs_ent[0]
uni_sign = '+' if uni_c >= 0 else '-'
bi_sign = '+' if bi_c >= 0 else '-'

fig.add_trace(go.Scatter(
    x=x1,
    y=y1,
    mode='lines',
    name=f'unigrams: {uni_a:.2f} * exp({uni_b:.2f} * x) {uni_sign} {abs(uni_c):.2f}')
)
fig.add_trace(go.Scatter(
    x=x2,
    y=y2,
    mode='lines',
    name=f'bigrams: {bi_a:.2f} * exp({bi_b:.2f} * x) {bi_sign} {abs(bi_c):.2f}')
)

# write_html does not create missing directories; make sure the target exists.
os.makedirs('plots', exist_ok=True)
fig.write_html('plots/uni_or_bi.html')





In [42]:
# Test set perplexity against entropy, coloured by vocabulary size, with the
# fitted exponential curves overlaid.
fig = px.scatter(
    df,
    x='Entropy',
    y='Test set perplexity',
    title='Test set perplexity vs. entropy',
    labels={'x': 'Entropy', 'y': 'Test set perplexity', 'color': 'Vocab size'},
    color=[str(x) for x in vocab_size],
    # Largest vocabulary first.  Passed as a de-duplicated list (sorted
    # numerically before the str conversion) rather than a one-shot
    # `reversed` iterator, matching the documented "list of str" form.
    category_orders={'color': [str(x) for x in sorted(set(vocab_size), reverse=True)]},
    hover_data={'Uni- or bigram': True, 'Softmax': True, 'Distribution': True}
)
# Place the legend at x=0.015, y=0.97 (near the top-left in plotly's
# normalized legend coordinates).
fig.update_layout(
    legend=dict(
        x=0.015,
        y=0.97
    )
)

# Recompute the exponential curves here instead of relying on x1/y1/x2/y2
# left behind by another cell: those names are rebound to the linear fit
# further down, so out-of-order execution would draw the wrong curves.
x1 = np.linspace(min(uni_ents), max(uni_ents), 100)
y1 = exponential(x1, *uni_ppl_vs_ent[0])
x2 = np.linspace(min(bi_ents), max(bi_ents), 100)
y2 = exponential(x2, *bi_ppl_vs_ent[0])

# Legend text with the offset's real sign (previously hard-coded as '-').
uni_a, uni_b, uni_c = uni_ppl_vs_ent[0]
bi_a, bi_b, bi_c = bi_ppl_vs_ent[0]
uni_sign = '+' if uni_c >= 0 else '-'
bi_sign = '+' if bi_c >= 0 else '-'

fig.add_trace(go.Scatter(
    x=x1,
    y=y1,
    mode='lines',
    name=f'unigrams: {uni_a:.2f} * exp({uni_b:.2f} * x) {uni_sign} {abs(uni_c):.2f}')
)
fig.add_trace(go.Scatter(
    x=x2,
    y=y2,
    mode='lines',
    name=f'bigrams: {bi_a:.2f} * exp({bi_b:.2f} * x) {bi_sign} {abs(bi_c):.2f}')
)

# write_html does not create missing directories; make sure the target exists.
os.makedirs('plots', exist_ok=True)
fig.write_html('plots/vocab.html')





In [43]:
# Test set perplexity against entropy, coloured by the combination of
# unigram/bigram and distribution, with the fitted exponential curves overlaid.
fig = px.scatter(
    df,
    x='Entropy',
    y='Test set perplexity',
    title='Test set perplexity vs. entropy',
    labels={'x': 'Entropy', 'y': 'Test set perplexity', 'color': 'Uni- or bigram and distribution'},
    color=combine_attrs(uni_or_bi, dist),
    hover_data={'Vocab size': True, 'Softmax': True}
)
# Place the legend at x=0.015, y=0.97 (near the top-left in plotly's
# normalized legend coordinates).
fig.update_layout(
    legend=dict(
        x=0.015,
        y=0.97
    )
)

# Recompute the exponential curves here instead of relying on x1/y1/x2/y2
# left behind by another cell: those names are rebound to the linear fit
# further down, so out-of-order execution would draw the wrong curves.
x1 = np.linspace(min(uni_ents), max(uni_ents), 100)
y1 = exponential(x1, *uni_ppl_vs_ent[0])
x2 = np.linspace(min(bi_ents), max(bi_ents), 100)
y2 = exponential(x2, *bi_ppl_vs_ent[0])

# Legend text with the offset's real sign (previously hard-coded as '-').
uni_a, uni_b, uni_c = uni_ppl_vs_ent[0]
bi_a, bi_b, bi_c = bi_ppl_vs_ent[0]
uni_sign = '+' if uni_c >= 0 else '-'
bi_sign = '+' if bi_c >= 0 else '-'

fig.add_trace(go.Scatter(
    x=x1,
    y=y1,
    mode='lines',
    name=f'unigrams: {uni_a:.2f} * exp({uni_b:.2f} * x) {uni_sign} {abs(uni_c):.2f}')
)
fig.add_trace(go.Scatter(
    x=x2,
    y=y2,
    mode='lines',
    name=f'bigrams: {bi_a:.2f} * exp({bi_b:.2f} * x) {bi_sign} {abs(bi_c):.2f}')
)

# write_html does not create missing directories; make sure the target exists.
os.makedirs('plots', exist_ok=True)
fig.write_html('plots/uni_or_bi_dist.html')





In [44]:
# Test set average cross-entropy against entropy, coloured by unigram/bigram,
# with the fitted straight lines overlaid.
fig = px.scatter(
    df,
    x='Entropy',
    y='Test set average cross-entropy',
    title='Test set average cross-entropy vs. entropy',
    labels={'x': 'Entropy', 'y': 'Test set average cross-entropy', 'color': 'Uni- or bigram'},
    color='Uni- or bigram',
    hover_data={'Vocab size': True, 'Softmax': True, 'Distribution': True}
)
# Place the legend at x=0.015, y=0.97 (near the top-left in plotly's
# normalized legend coordinates).
fig.update_layout(
    legend=dict(
        x=0.015,
        y=0.97
    )
)

# Evaluate each fitted line over the entropy range of its own subset.
x1 = np.linspace(min(uni_ents), max(uni_ents), 100)
y1 = linear(x1, *uni_ce_vs_ent[0])
x2 = np.linspace(min(bi_ents), max(bi_ents), 100)
y2 = linear(x2, *bi_ce_vs_ent[0])

# Legend text: render the intercept with an explicit sign so a negative
# value reads "- 0.12" instead of "+ -0.12".
uni_slope, uni_intercept = uni_ce_vs_ent[0]
bi_slope, bi_intercept = bi_ce_vs_ent[0]
uni_sign = '+' if uni_intercept >= 0 else '-'
bi_sign = '+' if bi_intercept >= 0 else '-'

fig.add_trace(go.Scatter(
    x=x1,
    y=y1,
    mode='lines',
    name=f'unigrams: {uni_slope:.2f} * x {uni_sign} {abs(uni_intercept):.2f}')
)
fig.add_trace(go.Scatter(
    x=x2,
    y=y2,
    mode='lines',
    name=f'bigrams: {bi_slope:.2f} * x {bi_sign} {abs(bi_intercept):.2f}')
)

# write_html does not create missing directories; make sure the target exists.
os.makedirs('plots', exist_ok=True)
fig.write_html('plots/uni_or_bi_ce.html')





In [45]:
# Test set average cross-entropy against entropy, coloured by vocabulary size,
# with the fitted straight lines overlaid.
fig = px.scatter(
    df,
    x='Entropy',
    y='Test set average cross-entropy',
    title='Test set average cross-entropy vs. entropy',
    labels={'x': 'Entropy', 'y': 'Test set average cross-entropy', 'color': 'Vocab size'},
    color=[str(x) for x in vocab_size],
    # Largest vocabulary first.  Passed as a de-duplicated list (sorted
    # numerically before the str conversion) rather than a one-shot
    # `reversed` iterator, matching the documented "list of str" form.
    category_orders={'color': [str(x) for x in sorted(set(vocab_size), reverse=True)]},
    hover_data={'Uni- or bigram': True, 'Softmax': True, 'Distribution': True}
)
# Place the legend at x=0.015, y=0.97 (near the top-left in plotly's
# normalized legend coordinates).
fig.update_layout(
    legend=dict(
        x=0.015,
        y=0.97
    )
)

# Recompute the fitted lines here instead of relying on x1/y1/x2/y2 left
# behind by another cell; those names are also used for the exponential fit,
# so out-of-order execution would draw the wrong curves.
x1 = np.linspace(min(uni_ents), max(uni_ents), 100)
y1 = linear(x1, *uni_ce_vs_ent[0])
x2 = np.linspace(min(bi_ents), max(bi_ents), 100)
y2 = linear(x2, *bi_ce_vs_ent[0])

# Legend text: render the intercept with an explicit sign so a negative
# value reads "- 0.12" instead of "+ -0.12".
uni_slope, uni_intercept = uni_ce_vs_ent[0]
bi_slope, bi_intercept = bi_ce_vs_ent[0]
uni_sign = '+' if uni_intercept >= 0 else '-'
bi_sign = '+' if bi_intercept >= 0 else '-'

fig.add_trace(go.Scatter(
    x=x1,
    y=y1,
    mode='lines',
    name=f'unigrams: {uni_slope:.2f} * x {uni_sign} {abs(uni_intercept):.2f}')
)
fig.add_trace(go.Scatter(
    x=x2,
    y=y2,
    mode='lines',
    name=f'bigrams: {bi_slope:.2f} * x {bi_sign} {abs(bi_intercept):.2f}')
)

# write_html does not create missing directories; make sure the target exists.
os.makedirs('plots', exist_ok=True)
fig.write_html('plots/vocab_ce.html')





In [46]:
# Test set average cross-entropy against entropy, coloured by the combination
# of unigram/bigram and distribution, with the fitted straight lines overlaid.
fig = px.scatter(
    df,
    x='Entropy',
    y='Test set average cross-entropy',
    title='Test set average cross-entropy vs. entropy',
    labels={'x': 'Entropy', 'y': 'Test set average cross-entropy', 'color': 'Uni- or bigram and distribution'},
    color=combine_attrs(uni_or_bi, dist),
    hover_data={'Vocab size': True, 'Softmax': True}
)
# Place the legend at x=0.015, y=0.97 (near the top-left in plotly's
# normalized legend coordinates).
fig.update_layout(
    legend=dict(
        x=0.015,
        y=0.97
    )
)

# Recompute the fitted lines here instead of relying on x1/y1/x2/y2 left
# behind by another cell; those names are also used for the exponential fit,
# so out-of-order execution would draw the wrong curves.
x1 = np.linspace(min(uni_ents), max(uni_ents), 100)
y1 = linear(x1, *uni_ce_vs_ent[0])
x2 = np.linspace(min(bi_ents), max(bi_ents), 100)
y2 = linear(x2, *bi_ce_vs_ent[0])

# Legend text: render the intercept with an explicit sign so a negative
# value reads "- 0.12" instead of "+ -0.12".
uni_slope, uni_intercept = uni_ce_vs_ent[0]
bi_slope, bi_intercept = bi_ce_vs_ent[0]
uni_sign = '+' if uni_intercept >= 0 else '-'
bi_sign = '+' if bi_intercept >= 0 else '-'

fig.add_trace(go.Scatter(
    x=x1,
    y=y1,
    mode='lines',
    name=f'unigrams: {uni_slope:.2f} * x {uni_sign} {abs(uni_intercept):.2f}')
)
fig.add_trace(go.Scatter(
    x=x2,
    y=y2,
    mode='lines',
    name=f'bigrams: {bi_slope:.2f} * x {bi_sign} {abs(bi_intercept):.2f}')
)

# write_html does not create missing directories; make sure the target exists.
os.makedirs('plots', exist_ok=True)
fig.write_html('plots/uni_or_bi_dist_ce.html')



