In [8]:
import os
import math
import json

import pandas as pd
from sklearn.preprocessing import OrdinalEncoder
import statsmodels.api as sm

In [9]:
def data(json_file):
    """Load one results JSON file and annotate it with settings parsed from its name.

    The file's base name (with '.json' removed) is split on '_' into tokens:

    * token 0 -> ``'dist'``
    * token 1 -> ``'uni_or_bi'``
    * token 2 -> ``'vocab_size'`` (converted with ``int``)
    * token 3 -> ``'softmax'`` (``True`` only when the token is exactly 'softmax')

    Any token equal to 'lstm' or 'ffnn' selects ``'model_type'`` (otherwise
    'trf'), and a token equal to 'embd256' selects ``'embd_size'`` 256
    (otherwise 64).

    The nested ``'train_losses'`` entry is flattened one level into
    ``'losses'`` and then removed from the dict.

    Args:
        json_file: Path to a JSON file whose top-level value is a dict that
            contains a ``'train_losses'`` sequence of sequences.

    Returns:
        The parsed dict, extended with the keys described above plus
        ``'settings'`` (the full token list).

    Raises:
        IndexError: If the file name has fewer than four '_'-separated tokens.
        ValueError: If token 2 is not an integer literal.
        KeyError: If the JSON has no ``'train_losses'`` entry.
    """
    # Use a context manager so the handle is closed deterministically instead
    # of being left for the garbage collector (one leaked handle per file).
    with open(json_file) as f:
        j = json.load(f)

    settings = os.path.basename(json_file).replace('.json', '').split('_')
    j['dist'] = settings[0]
    j['uni_or_bi'] = settings[1]
    j['vocab_size'] = int(settings[2])
    j['softmax'] = settings[3] == 'softmax'
    j['settings'] = settings

    # Flatten the nested training losses by one level, preserving order.
    j['losses'] = [loss for chunk in j['train_losses'] for loss in chunk]

    if 'lstm' in settings:
        j['model_type'] = 'lstm'
    elif 'ffnn' in settings:
        j['model_type'] = 'ffnn'
    else:
        j['model_type'] = 'trf'

    j['embd_size'] = 256 if 'embd256' in settings else 64

    del j['train_losses']

    # Echo the path of any file that carries a 'val_losses' entry.
    # NOTE(review): looks like a leftover diagnostic — confirm it is still wanted.
    if 'val_losses' in j:
        print(json_file)
    return j

# Collect every '.json' file in the 'results' directory.
# os.listdir returns entries in arbitrary order, so the paths are sorted to
# keep the row order of everything built from them stable between runs.
json_files = sorted(
    os.path.join('results', x) for x in os.listdir('results') if x.endswith('.json')
)

# Parse each results file into an annotated dict (see data()).
jsons = [data(json_file) for json_file in json_files]

# Build one column per quantity of interest, one entry per loaded run.

# Lowest test-set perplexity recorded for each run.
test_set_perplexity = [min(run['test_set_perplexities']) for run in jsons]

# Entropy is halved for runs whose 'uni_or_bi' setting is 'bigrams'.
# NOTE(review): presumably this converts a per-bigram value to per-token — confirm.
entropy = [
    run['entropy'] / 2 if run['uni_or_bi'] == 'bigrams' else run['entropy']
    for run in jsons
]

# Settings recovered from each file name.
dist = [run['dist'] for run in jsons]
uni_or_bi = [run['uni_or_bi'] for run in jsons]
vocab_size = [run['vocab_size'] for run in jsons]
softmax = [run['softmax'] for run in jsons]
model_type = [run['model_type'] for run in jsons]
embd_size = [run['embd_size'] for run in jsons]

# Natural log of the perplexity.
test_set_avg_cross_entropy = [math.log(ppl) for ppl in test_set_perplexity]

columns = {
    'Test set perplexity': test_set_perplexity,
    'Test set average cross-entropy': test_set_avg_cross_entropy,
    'Entropy': entropy,
    'Distribution': dist,
    'Uni- or bigram': uni_or_bi,
    'Vocab size': vocab_size,
    'Softmax': softmax,
    'Model type': model_type,
    'Embedding size': embd_size,
}
df = pd.DataFrame(columns)

In [10]:
# Replace each categorical column with numeric codes so it can enter the
# regression. The same encoder object is re-fitted for every column, so after
# this cell `enc` only remembers the categories of the last column.
# NOTE(review): ordinal codes impose an ordering on these categories; confirm
# that is acceptable for nominal columns such as 'Model type'.
enc = OrdinalEncoder()
for column in ('Distribution', 'Uni- or bigram', 'Softmax', 'Model type'):
    df[column] = enc.fit_transform(df[[column]])

In [11]:
# Ordinary least squares: explain the test-set cross-entropy with every other
# column except the perplexity it was derived from, plus an intercept term.
ols_target = df['Test set average cross-entropy']
ols_predictors = df.drop(
    columns=['Test set average cross-entropy', 'Test set perplexity']
)

reg = sm.OLS(ols_target, sm.add_constant(ols_predictors))
res = reg.fit()
print(res.summary())

                                  OLS Regression Results                                  
Dep. Variable:     Test set average cross-entropy   R-squared:                       0.999
Model:                                        OLS   Adj. R-squared:                  0.999
Method:                             Least Squares   F-statistic:                 3.657e+04
Date:                            Tue, 10 Dec 2024   Prob (F-statistic):               0.00
Time:                                    21:36:42   Log-Likelihood:                 394.74
No. Observations:                             357   AIC:                            -773.5
Df Residuals:                                 349   BIC:                            -742.5
Df Model:                                       7                                         
Covariance Type:                        nonrobust                                         
                     coef    std err          t      P>|t|      [0.025      0.975]
-------