In [25]:
import os
import math
import json
import numpy as np
from scipy.optimize import curve_fit
import plotly
import plotly.express as px
import plotly.graph_objects as go

In [None]:
def combine_attrs(list1, list2):
    """Pair two sequences element-wise into ``'<a>, <b>'`` label strings.

    Walks the indices of ``list1`` and looks the same index up in ``list2``;
    both elements are converted with ``str`` before being joined. The result
    has ``len(list1)`` entries.
    """
    labels = []
    for idx in range(len(list1)):
        first, second = str(list1[idx]), str(list2[idx])
        labels.append(first + ', ' + second)
    return labels

def data(json_file):
    """Load one results file and annotate it with settings parsed from its name.

    The file's base name (with ``.json`` removed) is split on ``_``. The first
    four fields are read positionally as distribution, ``uni_or_bi`` setting,
    vocabulary size and softmax flag; the full field list is then only checked
    for the presence of ``'lstm'`` and ``'embd256'``.

    Args:
        json_file: Path to a JSON results file whose top level is an object
            containing a ``train_losses`` list of lists.

    Returns:
        The parsed JSON dict with the extra keys ``dist``, ``uni_or_bi``,
        ``vocab_size``, ``softmax``, ``settings``, ``losses`` (``train_losses``
        flattened into one list), ``model_type`` and ``embd_size``. The
        ``train_losses`` key is removed.

    Raises:
        IndexError: If the file name has fewer than four ``_``-separated fields.
        ValueError: If the third field is not an integer.
        KeyError: If the file has no ``train_losses`` entry.
    """
    # Use a context manager so the file handle is closed deterministically
    # (the bare open() previously left it to the garbage collector).
    with open(json_file) as f:
        j = json.load(f)

    settings = os.path.basename(json_file).replace('.json', '').split('_')
    j['dist'] = settings[0]
    j['uni_or_bi'] = settings[1]
    j['vocab_size'] = int(settings[2])
    j['softmax'] = settings[3] == 'softmax'
    j['settings'] = settings

    # Flatten the per-chunk training losses into one flat list.
    j['losses'] = [loss for chunk in j['train_losses'] for loss in chunk]

    # Anything not tagged 'lstm' / 'embd256' falls back to 'trf' / 64.
    j['model_type'] = 'lstm' if 'lstm' in settings else 'trf'
    j['embd_size'] = 256 if 'embd256' in settings else 64

    del j['train_losses']

    # Prints the path of any file that carries a 'val_losses' entry.
    if 'val_losses' in j:
        print(json_file)
    return j

# Collect every JSON results file under 'results/'. os.listdir returns entries
# in arbitrary order, so sort the names to keep row order (and therefore the
# trace/legend order of the plots built from it) reproducible across runs.
json_files = [
    os.path.join('results', name)
    for name in sorted(os.listdir('results'))
    if name.endswith('.json')
]

# Parse each results file into an annotated dict via data().
jsons = [data(json_file) for json_file in json_files]

# Column lists: one entry per results dict, in the same order as `jsons`.
# The best (lowest) test-set perplexity recorded in each run is used.
test_set_perplexity = [min(run['test_set_perplexities']) for run in jsons]
dist = [run['dist'] for run in jsons]
uni_or_bi = [run['uni_or_bi'] for run in jsons]
vocab_size = [run['vocab_size'] for run in jsons]
softmax = [run['softmax'] for run in jsons]
model_type = [run['model_type'] for run in jsons]
embd_size = [run['embd_size'] for run in jsons]

# Entropy of 'bigrams' runs is halved; all other runs keep the stored value.
# NOTE(review): presumably this expresses bigram entropy per token - confirm.
entropy = [
    run['entropy'] / 2 if run['uni_or_bi'] == 'bigrams' else run['entropy']
    for run in jsons
]

# Natural log of the perplexity.
test_set_avg_cross_entropy = list(map(math.log, test_set_perplexity))

# Plain dict of equal-length columns; the keys double as the column names the
# plotting cells refer to.
df = dict([
    ('Test set perplexity', test_set_perplexity),
    ('Test set average cross-entropy', test_set_avg_cross_entropy),
    ('Entropy', entropy),
    ('Distribution', dist),
    ('Uni- or bigram', uni_or_bi),
    ('Vocab size', vocab_size),
    ('Softmax', softmax),
    ('Model type', model_type),
    ('Embedding size', embd_size),
])

In [27]:
# curve fitting
def exponential(x, a, b, c):
    """Evaluate the model ``a * exp(b * x) + c``.

    ``x`` comes first and the parameters follow, which is the call shape
    ``curve_fit`` is given in this notebook. ``np.exp`` is used, so ``x`` may
    be a scalar or a NumPy array.
    """
    growth = np.exp(b * x)
    return a * growth + c

def linear(x, a, b):
    """Evaluate the straight line ``a * x + b`` (slope ``a``, intercept ``b``).

    ``x`` comes first and the parameters follow, which is the call shape
    ``curve_fit`` is given in this notebook.
    """
    scaled = a * x
    return scaled + b

# Evenly spaced entropy grid (100 points) spanning the observed range; both
# fitted curves are evaluated on it.
x_ent = np.linspace(min(entropy), max(entropy), 100)

# Index 0 of each curve_fit result holds the fitted parameters. The full
# results stay available under their original names.
ppl_vs_ent = curve_fit(exponential, entropy, test_set_perplexity)
ce_vs_ent = curve_fit(linear, entropy, test_set_avg_cross_entropy)
ppl_a, ppl_b, ppl_c = ppl_vs_ent[0]
ce_slope, ce_intercept = ce_vs_ent[0]

y_ppl = exponential(x_ent, ppl_a, ppl_b, ppl_c)
y_ce = linear(x_ent, ce_slope, ce_intercept)

# Both fit lines share one colour from Plotly's default qualitative palette
# (index 13, wrapped around the palette length).
default_colors = plotly.colors.qualitative.Plotly
next_color = default_colors[13 % len(default_colors)]

# Line traces whose legend names spell out the fitted formulas.
ppl_fit = go.Scatter(
    x=x_ent,
    y=y_ppl,
    mode='lines',
    name=f'{ppl_a:.2f} * exp({ppl_b:.2f} * x) + {ppl_c:.2f}',
)
ppl_fit.update(marker_color=next_color)

ce_fit = go.Scatter(
    x=x_ent,
    y=y_ce,
    mode='lines',
    name=f'{ce_slope:.2f} * x + {ce_intercept:.2f}',
)
ce_fit.update(marker_color=next_color)
print()




In [28]:
# plotting options
# Legend settings passed to fig.update_layout(legend=...) by every plot cell:
# horizontal orientation, positioned at y=-0.15.
legend_dict = dict(
    orientation='h',
    y=-0.15,
)

# Marker settings passed to fig.update_traces(marker=...) for the scatter points.
marker_dict = dict(
    size=12,
    opacity=0.5,
)

# Every plot cell writes its figure to 'plots/<name>.html'. Create the output
# directory up front so a fresh checkout survives Restart & Run All.
os.makedirs('plots', exist_ok=True)

In [None]:
# Test-set perplexity vs. entropy, coloured by the uni-/bigram setting.
fig = px.scatter(
    df,
    x='Entropy',
    y='Test set perplexity',
    color=uni_or_bi,
    title='Test set perplexity vs. entropy',
    labels=dict(x='Entropy', y='Test set perplexity', color='Uni- or bigram'),
    hover_data=dict.fromkeys(
        ['Vocab size', 'Softmax', 'Distribution', 'Model type', 'Embedding size'],
        True,
    ),
)
# Marker styling is applied before the fit line is added, so it only touches
# the scatter traces.
fig.update_layout(legend=legend_dict).update_traces(marker=marker_dict)
fig.add_trace(ppl_fit)
# Reverse the trace order so the fitted curve comes first.
fig.data = tuple(reversed(fig.data))
fig.write_html('plots/uni_or_bi.html')





In [None]:
# Test-set perplexity vs. entropy, coloured by vocabulary size.
# Sizes are cast to str before being used as the colour values
# (presumably so Plotly Express treats them as discrete categories).
vocab_labels = [str(v) for v in vocab_size]
# category_orders expects a list of values. The previous reversed(...) was a
# one-shot iterator (with duplicates), which is exhausted after a single pass
# and has no list methods. Sort numerically before casting, largest first.
vocab_order = [str(v) for v in sorted(set(vocab_size), reverse=True)]

fig = px.scatter(
    df,
    x='Entropy',
    y='Test set perplexity',
    title='Test set perplexity vs. entropy',
    labels={'x': 'Entropy', 'y': 'Test set perplexity', 'color': 'Vocab size'},
    color=vocab_labels,
    category_orders={'color': vocab_order},
    hover_data={'Uni- or bigram': True, 'Softmax': True, 'Distribution': True, 'Model type': True, 'Embedding size': True},
)
fig.update_layout(legend=legend_dict)
# Marker styling is applied before the fit line is added, so it only touches
# the scatter traces.
fig.update_traces(marker=marker_dict)
fig.add_trace(ppl_fit)
# Reverse the trace order so the fitted curve comes first.
fig.data = fig.data[::-1]
fig.write_html('plots/vocab.html')





In [None]:
# Test-set perplexity vs. entropy, coloured by the combined
# "<uni-/bigram>, <distribution>" label of each run.
fig = px.scatter(
    df,
    x='Entropy',
    y='Test set perplexity',
    color=combine_attrs(uni_or_bi, dist),
    title='Test set perplexity vs. entropy',
    labels=dict(
        x='Entropy',
        y='Test set perplexity',
        color='Uni- or bigram and distribution',
    ),
    hover_data=dict.fromkeys(
        ['Vocab size', 'Softmax', 'Model type', 'Embedding size'],
        True,
    ),
)
# Marker styling is applied before the fit line is added, so it only touches
# the scatter traces.
fig.update_layout(legend=legend_dict).update_traces(marker=marker_dict)
fig.add_trace(ppl_fit)
# Reverse the trace order so the fitted curve comes first.
fig.data = tuple(reversed(fig.data))
fig.write_html('plots/uni_or_bi_dist.html')





In [None]:
# Test-set perplexity vs. entropy, coloured by the 'Model type' column.
fig = px.scatter(
    df,
    x='Entropy',
    y='Test set perplexity',
    color='Model type',
    title='Test set perplexity vs. entropy',
    labels=dict(x='Entropy', y='Test set perplexity', color='Model type'),
    hover_data=dict.fromkeys(
        ['Vocab size', 'Softmax', 'Uni- or bigram', 'Distribution', 'Embedding size'],
        True,
    ),
)
# Marker styling is applied before the fit line is added, so it only touches
# the scatter traces.
fig.update_layout(legend=legend_dict).update_traces(marker=marker_dict)
fig.add_trace(ppl_fit)
# Reverse the trace order so the fitted curve comes first.
fig.data = tuple(reversed(fig.data))
fig.write_html('plots/model_type.html')





In [None]:
# Test-set perplexity vs. entropy, coloured by the 'Embedding size' column.
# NOTE(review): this column holds the ints 64/256, so Plotly Express may render
# a continuous colour scale here; the vocab-size plots cast to str first.
# Confirm which behaviour is intended.
fig = px.scatter(
    df,
    x='Entropy',
    y='Test set perplexity',
    color='Embedding size',
    title='Test set perplexity vs. entropy',
    labels=dict(x='Entropy', y='Test set perplexity', color='Embedding size'),
    hover_data=dict.fromkeys(
        ['Vocab size', 'Softmax', 'Uni- or bigram', 'Distribution', 'Model type'],
        True,
    ),
)
# Marker styling is applied before the fit line is added, so it only touches
# the scatter traces.
fig.update_layout(legend=legend_dict).update_traces(marker=marker_dict)
fig.add_trace(ppl_fit)
# Reverse the trace order so the fitted curve comes first.
fig.data = tuple(reversed(fig.data))
fig.write_html('plots/embd_size.html')

In [None]:
# Test-set average cross-entropy vs. entropy, coloured by the
# 'Uni- or bigram' column.
fig = px.scatter(
    df,
    x='Entropy',
    y='Test set average cross-entropy',
    color='Uni- or bigram',
    title='Test set average cross-entropy vs. entropy',
    labels=dict(
        x='Entropy',
        y='Test set average cross-entropy',
        color='Uni- or bigram',
    ),
    hover_data=dict.fromkeys(
        ['Vocab size', 'Softmax', 'Distribution', 'Model type', 'Embedding size'],
        True,
    ),
)
# Marker styling is applied before the fit line is added, so it only touches
# the scatter traces.
fig.update_layout(legend=legend_dict).update_traces(marker=marker_dict)
fig.add_trace(ce_fit)
# Reverse the trace order so the fitted line comes first.
fig.data = tuple(reversed(fig.data))
fig.write_html('plots/uni_or_bi_ce.html')





In [None]:
# Test-set average cross-entropy vs. entropy, coloured by vocabulary size.
# Sizes are cast to str before being used as the colour values
# (presumably so Plotly Express treats them as discrete categories).
vocab_labels = [str(v) for v in vocab_size]
# category_orders expects a list of values. The previous reversed(...) was a
# one-shot iterator (with duplicates), which is exhausted after a single pass
# and has no list methods. Sort numerically before casting, largest first.
vocab_order = [str(v) for v in sorted(set(vocab_size), reverse=True)]

fig = px.scatter(
    df,
    x='Entropy',
    y='Test set average cross-entropy',
    title='Test set average cross-entropy vs. entropy',
    labels={'x': 'Entropy', 'y': 'Test set average cross-entropy', 'color': 'Vocab size'},
    color=vocab_labels,
    category_orders={'color': vocab_order},
    hover_data={'Uni- or bigram': True, 'Softmax': True, 'Distribution': True, 'Model type': True, 'Embedding size': True},
)
fig.update_layout(legend=legend_dict)
# Marker styling is applied before the fit line is added, so it only touches
# the scatter traces.
fig.update_traces(marker=marker_dict)
fig.add_trace(ce_fit)
# Reverse the trace order so the fitted line comes first.
fig.data = fig.data[::-1]
fig.write_html('plots/vocab_ce.html')





In [None]:
# Test-set average cross-entropy vs. entropy, coloured by the combined
# "<uni-/bigram>, <distribution>" label of each run.
fig = px.scatter(
    df,
    x='Entropy',
    y='Test set average cross-entropy',
    color=combine_attrs(uni_or_bi, dist),
    title='Test set average cross-entropy vs. entropy',
    labels=dict(
        x='Entropy',
        y='Test set average cross-entropy',
        color='Uni- or bigram and distribution',
    ),
    hover_data=dict.fromkeys(
        ['Vocab size', 'Softmax', 'Model type', 'Embedding size'],
        True,
    ),
)
# Marker styling is applied before the fit line is added, so it only touches
# the scatter traces.
fig.update_layout(legend=legend_dict).update_traces(marker=marker_dict)
fig.add_trace(ce_fit)
# Reverse the trace order so the fitted line comes first.
fig.data = tuple(reversed(fig.data))
fig.write_html('plots/uni_or_bi_dist_ce.html')





In [None]:
# Test-set average cross-entropy vs. entropy, coloured by the 'Model type' column.
fig = px.scatter(
    df,
    x='Entropy',
    y='Test set average cross-entropy',
    title='Test set average cross-entropy vs. entropy',
    # The colour label previously read 'Uni- or bigram', a copy-paste leftover
    # from the uni-/bigram cell; this plot is coloured by model type.
    labels={'x': 'Entropy', 'y': 'Test set average cross-entropy', 'color': 'Model type'},
    color='Model type',
    hover_data={'Vocab size': True, 'Softmax': True, 'Uni- or bigram': True, 'Distribution': True, 'Embedding size': True},
)
fig.update_layout(legend=legend_dict)
# Marker styling is applied before the fit line is added, so it only touches
# the scatter traces.
fig.update_traces(marker=marker_dict)
fig.add_trace(ce_fit)
# Reverse the trace order so the fitted line comes first.
fig.data = fig.data[::-1]
fig.write_html('plots/model_type_ce.html')





In [None]:
# Test-set average cross-entropy vs. entropy, coloured by the
# 'Embedding size' column.
fig = px.scatter(
    df,
    x='Entropy',
    y='Test set average cross-entropy',
    color='Embedding size',
    title='Test set average cross-entropy vs. entropy',
    labels=dict(
        x='Entropy',
        y='Test set average cross-entropy',
        color='Embedding size',
    ),
    hover_data=dict.fromkeys(
        ['Vocab size', 'Softmax', 'Uni- or bigram', 'Distribution', 'Model type'],
        True,
    ),
)
# Marker styling is applied before the fit line is added, so it only touches
# the scatter traces.
fig.update_layout(legend=legend_dict).update_traces(marker=marker_dict)
fig.add_trace(ce_fit)
# Reverse the trace order so the fitted line comes first.
fig.data = tuple(reversed(fig.data))
fig.write_html('plots/embd_size_ce.html')