In [11]:
import os
import math
import json
import numpy as np
from scipy.optimize import curve_fit
import plotly
import plotly.express as px
import plotly.graph_objects as go
import glob
import pandas as pd
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
import statsmodels.api as sm

In [12]:
# Load the modelling table and order the rows by ascending test-set
# cross-entropy. The original row labels are kept (the index is not reset).
orig_df = (
    pd.read_json('var_results/data_for_modeling.json')
    .sort_values(by=['Test set average cross-entropy'])
)
orig_df

Unnamed: 0,Test set perplexity,Test set average cross-entropy,Entropy,Distribution,Uni- or bigram,Vocab size,Softmax,Model type,Embedding size
92,1.471989,0.386615,2.302584,manual,unigrams,8193,False,lstm,256
99,1.472562,0.387004,2.302584,manual,unigrams,8193,False,trf,256
89,1.481061,0.392758,2.302584,manual,unigrams,8193,False,lstm,64
95,1.489251,0.398274,2.302584,manual,unigrams,8193,False,trf,64
90,1.621255,0.483200,2.303436,manual,unigrams,283,False,lstm,64
...,...,...,...,...,...,...,...,...,...
76,8682.717773,9.069090,9.210342,uniform,unigrams,10000,False,lstm,64
306,8684.697266,9.069318,9.201830,normal,bigrams,10000,False,lstm,64
139,8718.180664,9.073166,8.960391,normal,bigrams,10000,True,lstm,256
226,8752.186523,9.077059,9.201830,normal,bigrams,10000,False,lstm,256


# Curve fitting

In [13]:
# curve fitting
def exponential(x, a, b, c):
    """Evaluate the exponential model ``a * exp(b * x) + c``.

    Args:
        x: Scalar or array-like input passed to ``np.exp`` after scaling by ``b``.
        a: Multiplicative amplitude of the exponential term.
        b: Rate inside the exponent.
        c: Additive offset.

    Returns:
        The model value(s) at ``x``.
    """
    growth = np.exp(b * x)
    return a * growth + c

def linear(x, a, b):
    """Evaluate the straight line ``a * x + b``.

    Args:
        x: Scalar or array-like input.
        a: Slope.
        b: Intercept.

    Returns:
        The line value(s) at ``x``.
    """
    scaled = a * x
    return scaled + b

def combine_attrs(list1, list2):
    """Join two equally long sequences element-wise into ``'<a>, <b>'`` strings.

    The inputs are paired positionally (first with first, second with
    second, ...). Iterating instead of indexing with ``list1[i]`` matters for
    pandas Series: integer keys on an integer-indexed Series are label
    lookups, so after a sort the result would come back in label order rather
    than row order.

    Args:
        list1: Iterable supplying the left-hand part of each label.
        list2: Iterable supplying the right-hand part of each label.

    Returns:
        A list of strings, one per pair, in iteration order. If the inputs
        differ in length the result stops at the shorter one.
    """
    return [f'{left!s}, {right!s}' for left, right in zip(list1, list2)]

# Pull the modelling columns out of the frame once; later cells reuse these names.
(
    entropy,
    test_set_perplexity,
    test_set_avg_cross_entropy,
    dist,
    uni_or_bi,
    vocab_size,
    softmax,
    model_type,
    embd_size,
) = (
    orig_df[column]
    for column in (
        'Entropy',
        'Test set perplexity',
        'Test set average cross-entropy',
        'Distribution',
        'Uni- or bigram',
        'Vocab size',
        'Softmax',
        'Model type',
        'Embedding size',
    )
)

# 100 evenly spaced entropy values spanning the observed range, used to draw the fits.
x_ent = np.linspace(min(entropy), max(entropy), 100)

# curve_fit returns (optimal parameters, covariance); element [0] holds the parameters.
ppl_vs_ent = curve_fit(exponential, entropy, test_set_perplexity)
ce_vs_ent = curve_fit(linear, entropy, test_set_avg_cross_entropy)

ppl_a, ppl_b, ppl_c = ppl_vs_ent[0]
ce_slope, ce_intercept = ce_vs_ent[0]

y_ppl = exponential(x_ent, ppl_a, ppl_b, ppl_c)
y_ce = linear(x_ent, ce_slope, ce_intercept)

# Line traces for the two fitted curves; the legend entry spells out the fitted equation.
ppl_fit = go.Scatter(
    x=x_ent,
    y=y_ppl,
    mode='lines',
    name=f'{ppl_a:.2f} * exp({ppl_b:.2f} * x) + {ppl_c:.2f}',
)
ce_fit = go.Scatter(
    x=x_ent,
    y=y_ce,
    mode='lines',
    name=f'{ce_slope:.2f} * x + {ce_intercept:.2f}',
)

# Identity line y = x, labelled as the theoretical limit.
y_equals_x = go.Scatter(
    x=x_ent,
    y=x_ent,
    mode='lines',
    name='Theoretical limit',
    marker={'color': 'red'},
)

# Give both fitted curves the same colour from Plotly's default qualitative palette.
default_colors = plotly.colors.qualitative.Plotly
next_color = default_colors[13 % len(default_colors)]
for fit_trace in (ppl_fit, ce_fit):
    fit_trace.update(marker_color=next_color)
print()




# R^2

In [14]:
# Coefficient of determination of the linear cross-entropy-vs-entropy fit.
predicted_ce = linear(np.array(entropy), *ce_vs_ent[0])
residuals = test_set_avg_cross_entropy - predicted_ce
ss_res = np.sum(residuals**2)

# Total sum of squares around the mean observed cross-entropy.
centered_ce = test_set_avg_cross_entropy - np.mean(test_set_avg_cross_entropy)
ss_tot = np.sum(centered_ce**2)

r_squared = 1 - (ss_res / ss_tot)
print(f'Overall R^2: {r_squared}')

Overall R^2: 0.984227875005013


# Plots

In [15]:
# Shared layout settings applied to every figure below.
# Horizontal legend positioned at y=-0.15.
legend_dict = {'orientation': 'h', 'y': -0.15}

# Marker size and opacity applied to the scatter traces.
marker_dict = {'size': 12, 'opacity': 0.5}

In [16]:
def _write_entropy_scatter(y_col, title, color, color_label, hover_cols,
                           overlay_traces, out_path, category_orders=None):
    """Scatter one metric against entropy, overlay fitted curves and save as HTML.

    Args:
        y_col: Name of the ``orig_df`` column plotted on the y axis.
        title: Figure title.
        color: Column name, or a list with one value per row of ``orig_df``
            (in row order), used to colour the points.
        color_label: Label mapped to the ``'color'`` key of ``labels``.
        hover_cols: Column names added to the hover tooltip, in this order.
        overlay_traces: Traces added after the scatter points.
        out_path: Path of the HTML file to write.
        category_orders: Optional ``category_orders`` mapping forwarded to
            ``px.scatter``; omitted from the call when ``None``.

    Returns:
        The figure that was written.
    """
    extra_kwargs = {}
    if category_orders is not None:
        extra_kwargs['category_orders'] = category_orders

    fig = px.scatter(
        orig_df,
        x='Entropy',
        y=y_col,
        title=title,
        labels={'x': 'Entropy', 'y': y_col, 'color': color_label},
        color=color,
        hover_data={col: True for col in hover_cols},
        **extra_kwargs,
    )
    fig.update_layout(legend=legend_dict)
    # Styled before the overlays are added, so only the scatter points are affected.
    fig.update_traces(marker=marker_dict)
    for trace in overlay_traces:
        fig.add_trace(trace)
    # Reverse the trace order so the overlay traces come first in fig.data.
    fig.data = fig.data[::-1]
    fig.update_yaxes(scaleanchor='x', scaleratio=1)
    fig.write_html(out_path)
    return fig


# write_html does not create missing directories; make a fresh run work.
os.makedirs('plots', exist_ok=True)

# Per-row colour labels are built from positional lists so they line up with
# the (sorted) row order of orig_df regardless of its index labels.
vocab_color = [str(v) for v in vocab_size]
# A real list (not a one-shot `reversed` iterator) so it can be reused for both figures.
vocab_order = [str(v) for v in sorted(vocab_size, reverse=True)]
embd_color = [str(v) for v in embd_size]
uni_dist_color = combine_attrs(list(uni_or_bi), list(dist))

# One entry per colouring scheme. 'hover_ce' overrides 'hover' for the
# cross-entropy figure where the tooltip order differs.
color_specs = [
    dict(
        slug='uni_or_bi',
        label='Uni- or bigram',
        color='Uni- or bigram',
        hover=['Vocab size', 'Softmax', 'Distribution', 'Model type', 'Embedding size'],
    ),
    dict(
        slug='vocab',
        label='Vocab size',
        color=vocab_color,
        category_orders={'color': vocab_order},
        hover=['Uni- or bigram', 'Softmax', 'Distribution', 'Model type', 'Embedding size'],
    ),
    dict(
        slug='uni_or_bi_dist',
        label='Uni- or bigram and distribution',
        color=uni_dist_color,
        hover=['Vocab size', 'Softmax', 'Model type', 'Embedding size'],
    ),
    dict(
        slug='model_type',
        label='Model type',
        color='Model type',
        hover=['Vocab size', 'Softmax', 'Uni- or bigram', 'Distribution', 'Embedding size'],
    ),
    dict(
        slug='embd_size',
        label='Embedding size',
        color=embd_color,
        hover=['Vocab size', 'Softmax', 'Uni- or bigram', 'Distribution', 'Model type'],
    ),
    dict(
        slug='softmax',
        label='Softmax',
        color='Softmax',
        hover=['Vocab size', 'Uni- or bigram', 'Distribution', 'Model type', 'Embedding size'],
        hover_ce=['Vocab size', 'Embedding size', 'Uni- or bigram', 'Distribution', 'Model type'],
    ),
]

# (y column, title, overlay traces, file-name suffix, hover key) per metric.
metric_specs = [
    (
        'Test set perplexity',
        'Test set perplexity vs. entropy',
        [ppl_fit],
        '',
        'hover',
    ),
    (
        'Test set average cross-entropy',
        f'Test set average cross-entropy vs. entropy, R^2={r_squared:.4f}',
        [ce_fit, y_equals_x],
        '_ce',
        'hover_ce',
    ),
]

# All perplexity figures are written first, then all cross-entropy figures.
for y_col, title, overlays, suffix, hover_key in metric_specs:
    for spec in color_specs:
        fig = _write_entropy_scatter(
            y_col=y_col,
            title=title,
            color=spec['color'],
            color_label=spec['label'],
            hover_cols=spec.get(hover_key, spec['hover']),
            overlay_traces=overlays,
            out_path=f"plots/{spec['slug']}{suffix}.html",
            category_orders=spec.get('category_orders'),
        )

# Linear models

In [17]:
def encode_and_fit(df_arg):
    """Encode/scale a modelling frame and fit an OLS on the cross-entropy.

    Works on a copy of ``df_arg``:

    * 'Distribution', 'Uni- or bigram', 'Softmax' and 'Model type' are
      ordinal-encoded when present; absent columns are skipped.
    * Every other column except 'Test set perplexity' is standard-scaled.
      This includes the dependent variable 'Test set average cross-entropy'.
    * An OLS of 'Test set average cross-entropy' on the remaining columns
      plus a constant is fitted, and its ``summary2()`` is printed.

    NOTE(review): ordinal encoding assigns an integer per category, so
    multi-level categoricals enter the regression as a single numeric
    column; confirm this is intended.

    Args:
        df_arg: DataFrame containing 'Test set average cross-entropy',
            'Test set perplexity' and the predictor columns to use.

    Returns:
        The fitted statsmodels OLS results object.
    """
    categorical_cols = ['Distribution', 'Uni- or bigram', 'Softmax', 'Model type']
    unscaled_cols = categorical_cols + ['Test set perplexity']

    df = df_arg.copy()

    enc = OrdinalEncoder()
    scaler = StandardScaler()

    # Callers pass different column subsets, so only encode what is there.
    # Errors raised while encoding an existing column are no longer swallowed.
    for col in categorical_cols:
        if col in df.columns:
            df[col] = enc.fit_transform(df[[col]])

    for col in df.columns:
        if col not in unscaled_cols:
            df[col] = scaler.fit_transform(df[[col]])

    reg = sm.OLS(
        df['Test set average cross-entropy'],
        sm.add_constant(df.drop(columns=['Test set average cross-entropy', 'Test set perplexity']))
    )
    res = reg.fit()
    print(res.summary2())
    print('\n\n\n')
    return res

In [18]:
# Univariate regressions: one OLS per predictor against the cross-entropy.
outcome_cols = ['Test set average cross-entropy', 'Test set perplexity']
single_predictors = (
    'Entropy',
    'Distribution',
    'Uni- or bigram',
    'Vocab size',
    'Softmax',
    'Model type',
    'Embedding size',
)
for var in single_predictors:
    print('independent variable:', var)
    res = encode_and_fit(orig_df[[var] + outcome_cols])

independent variable: Entropy
                        Results: Ordinary least squares
Model:              OLS                            Adj. R-squared:     0.984    
Dependent Variable: Test set average cross-entropy AIC:                -541.6398
Date:               2025-03-07 21:25               BIC:                -533.5784
No. Observations:   416                            Log-Likelihood:     272.82   
Df Model:           1                              F-statistic:        2.583e+04
Df Residuals:       414                            Prob (F-statistic): 0.00     
R-squared:          0.984                          Scale:              0.015848 
-------------------------------------------------------------------------------------
              Coef.       Std.Err.         t          P>|t|        [0.025      0.975]
-------------------------------------------------------------------------------------
const        -0.0000        0.0062       -0.0000      1.0000      -0.0121      0.0121
Ent

In [19]:
# NOTE(review): `vars` shadows the builtin of the same name; the name is kept
# here because later cells may reference it.
vars = ['Distribution', 'Uni- or bigram', 'Vocab size', 'Softmax', 'Model type', 'Embedding size']

# For each value of each attribute, fit an OLS on the matching rows using the
# remaining attributes as predictors.
for var in vars:
    for unique_val in orig_df[var].unique():
        print('data subset:', var, '=', unique_val)
        excluded = {var}
        if var == 'Distribution':
            # Presumably these predictors take a single value within the
            # listed distributions - TODO confirm against the data.
            if unique_val in ('manual', 'long_range'):
                excluded.add('Uni- or bigram')
            if unique_val in ('manual', 'uniform', 'uneven'):
                excluded.add('Softmax')
        independent_vars = [name for name in vars if name not in excluded]
        subset = orig_df[orig_df[var] == unique_val]
        res = encode_and_fit(subset[independent_vars + ['Test set average cross-entropy', 'Test set perplexity']])

data subset: Distribution = manual
                       Results: Ordinary least squares
Model:              OLS                            Adj. R-squared:     0.328  
Dependent Variable: Test set average cross-entropy AIC:                43.4775
Date:               2025-03-07 21:25               BIC:                46.5679
No. Observations:   16                             Log-Likelihood:     -17.739
Df Model:           3                              F-statistic:        3.440  
Df Residuals:       12                             Prob (F-statistic): 0.0519 
R-squared:          0.462                          Scale:              0.71688
---------------------------------------------------------------------------------
                    Coef.     Std.Err.       t       P>|t|      [0.025     0.975]
---------------------------------------------------------------------------------
const              -0.0010      0.2993    -0.0035    0.9973    -0.6533     0.6512
Vocab size         -0.6799   


kurtosistest only valid for n>=20 ... continuing anyway, n=16



                        Results: Ordinary least squares
Model:              OLS                            Adj. R-squared:     0.613   
Dependent Variable: Test set average cross-entropy AIC:                467.8415
Date:               2025-03-07 21:25               BIC:                488.8245
No. Observations:   244                            Log-Likelihood:     -227.92 
Df Model:           5                              F-statistic:        77.93   
Df Residuals:       238                            Prob (F-statistic): 3.76e-48
R-squared:          0.621                          Scale:              0.38877 
----------------------------------------------------------------------------------
                     Coef.     Std.Err.       t       P>|t|      [0.025     0.975]
----------------------------------------------------------------------------------
const               -0.1959      0.0844    -2.3227    0.0210    -0.3621    -0.0298
Distribution         0.2814      0.0595     4.7268  


omni_normtest is not valid with less than 8 observations; 4 samples were given.


omni_normtest is not valid with less than 8 observations; 4 samples were given.


omni_normtest is not valid with less than 8 observations; 4 samples were given.


omni_normtest is not valid with less than 8 observations; 4 samples were given.



                        Results: Ordinary least squares
Model:              OLS                            Adj. R-squared:     0.602   
Dependent Variable: Test set average cross-entropy AIC:                67.6848 
Date:               2025-03-07 21:25               BIC:                76.4792 
No. Observations:   32                             Log-Likelihood:     -27.842 
Df Model:           5                              F-statistic:        10.39   
Df Residuals:       26                             Prob (F-statistic): 1.47e-05
R-squared:          0.666                          Scale:              0.41062 
----------------------------------------------------------------------------------
                     Coef.     Std.Err.       t       P>|t|      [0.025     0.975]
----------------------------------------------------------------------------------
const               -0.5005      0.2603    -1.9232    0.0655    -1.0355     0.0344
Distribution         0.6509      0.1493     4.3604  

# Spearman rhos

In [20]:
# Pairwise Spearman rank correlations between entropy, vocabulary size and cross-entropy.
spearman_cols = ['Entropy', 'Vocab size', 'Test set average cross-entropy']
orig_df.loc[:, spearman_cols].corr(method='spearman')

Unnamed: 0,Entropy,Vocab size,Test set average cross-entropy
Entropy,1.0,0.930072,0.9948
Vocab size,0.930072,1.0,0.908688
Test set average cross-entropy,0.9948,0.908688,1.0


# Mixed Effects

In [21]:
import statsmodels.formula.api as smf

In [22]:
# Patsy formula: cross-entropy on all predictors, with the intercept removed (`- 1`).
# Q('...') quotes column names that contain spaces or hyphens.
fixed_effects = (
    'Entropy',
    'Distribution',
    'Uni- or bigram',
    'Vocab size',
    'Softmax',
    'Model type',
    'Embedding size',
)
formula = (
    "Q('Test set average cross-entropy') ~ "
    + ' + '.join(f"Q('{name}')" for name in fixed_effects)
    + ' - 1'
)

In [23]:
# Mixed linear model with the rows grouped by their 'Entropy' value.
mixed_spec = smf.mixedlm(formula, data=orig_df, groups=orig_df['Entropy'])
model = mixed_spec.fit()
print(model.summary())

                      Mixed Linear Model Regression Results
Model:            MixedLM Dependent Variable: Q('Test set average cross-entropy')
No. Observations: 416     Method:             REML                               
No. Groups:       104     Scale:              0.0006                             
Min. group size:  4       Log-Likelihood:     670.2246                           
Max. group size:  4       Converged:          Yes                                
Mean group size:  4.0                                                            
----------------------------------------------------------------------------------
                                  Coef.   Std.Err.     z     P>|z|  [0.025  0.975]
----------------------------------------------------------------------------------
Q('Distribution')[long_range]     -0.595     0.049  -12.087  0.000  -0.691  -0.498
Q('Distribution')[manual]         -1.677     0.074  -22.689  0.000  -1.822  -1.532
Q('Distribution')[normal]        

In [24]:
# Step 1: regress cross-entropy on entropy (with intercept) and keep the residuals.
y = orig_df['Test set average cross-entropy']
x = sm.add_constant(orig_df['Entropy'])
entropy_ols = sm.OLS(y, x).fit()
# NOTE: this adds a column to orig_df in place.
orig_df['Entropy residual'] = entropy_ols.resid

# Step 2: model the residuals with the remaining predictors (no intercept),
# again grouping the rows by their 'Entropy' value.
residual_predictors = (
    'Distribution',
    'Uni- or bigram',
    'Vocab size',
    'Softmax',
    'Model type',
    'Embedding size',
)
formula = (
    "Q('Entropy residual') ~ "
    + ' + '.join(f"Q('{name}')" for name in residual_predictors)
    + ' - 1'
)

residual_spec = smf.mixedlm(formula, data=orig_df, groups=orig_df['Entropy'])
model = residual_spec.fit()
print(model.summary())

                   Mixed Linear Model Regression Results
Model:                MixedLM   Dependent Variable:   Q('Entropy residual')
No. Observations:     416       Method:               REML                 
No. Groups:           104       Scale:                0.0006               
Min. group size:      4         Log-Likelihood:       671.2819             
Max. group size:      4         Converged:            Yes                  
Mean group size:      4.0                                                  
---------------------------------------------------------------------------
                                Coef.  Std.Err.    z    P>|z| [0.025 0.975]
---------------------------------------------------------------------------
Q('Distribution')[long_range]   -0.058    0.031  -1.874 0.061 -0.118  0.003
Q('Distribution')[manual]       -1.077    0.075 -14.452 0.000 -1.223 -0.931
Q('Distribution')[normal]        0.189    0.031   6.148 0.000  0.129  0.249
Q('Distribution')[uneven]      