In [3]:
import os
import math
import json
import glob

import pandas as pd
from sklearn.preprocessing import OrdinalEncoder
import statsmodels.api as sm

In [4]:
def combine_attrs(list1, list2):
    """Pair up the entries of two sequences as ``"<a>, <b>"`` strings.

    Element ``i`` of the result is ``str(list1[i])`` and ``str(list2[i])``
    joined by ``', '``. The result has one entry per element of ``list1``.
    ``list2`` is indexed by position, so an ``IndexError`` is raised when it
    is shorter than ``list1``; surplus trailing entries of ``list2`` are
    ignored.
    """
    combined = []
    for position, first in enumerate(list1):
        combined.append(', '.join((str(first), str(list2[position]))))
    return combined

def data(json_file):
    """Load one result file and annotate it with settings parsed from its name.

    The base name of ``json_file`` (with ``.json`` removed) is split on ``_``.
    The fields are read positionally as ``<dist>_<uni_or_bi>_<vocab_size>_
    <softmax flag>_...``. A name starting with ``long_`` has its first two
    fields merged into the single distribution name ``'long_range'``.

    Args:
        json_file: Path to a JSON file whose top-level value is a dict
            containing at least the keys ``'train_losses'`` and ``'table'``.

    Returns:
        The parsed dict with ``'train_losses'`` and ``'table'`` removed and
        these keys added: ``'dist'``, ``'uni_or_bi'``, ``'vocab_size'`` (int),
        ``'softmax'`` (bool), ``'settings'`` (the list of name fields),
        ``'model_type'`` (``'lstm'``, ``'ffnn'`` or ``'trf'``) and
        ``'embd_size'`` (256 or 64).

    Raises:
        KeyError: If ``'train_losses'`` or ``'table'`` is missing.
        IndexError: If the file name has fewer than four ``_``-separated fields.
        ValueError: If the vocabulary-size field is not an integer.
    """
    # Use a context manager so the file handle is closed deterministically;
    # the previous `json.load(open(...))` leaked one handle per result file.
    with open(json_file) as handle:
        j = json.load(handle)

    # Drop the bulky per-run payloads that the analysis does not use.
    del j['train_losses']
    del j['table']

    settings = os.path.basename(json_file).replace('.json', '').split('_')
    if settings[0] == 'long':
        # 'long_range_...' was split into two fields; merge them back into one.
        del settings[0]
        settings[0] = 'long_range'

    j['dist'] = settings[0]
    j['uni_or_bi'] = settings[1]
    j['vocab_size'] = int(settings[2])
    j['softmax'] = settings[3] == 'softmax'
    j['settings'] = settings

    # Any name without an explicit 'lstm' or 'ffnn' field is labelled 'trf'.
    if 'lstm' in settings:
        j['model_type'] = 'lstm'
    elif 'ffnn' in settings:
        j['model_type'] = 'ffnn'
    else:
        j['model_type'] = 'trf'

    # NOTE(review): '256' is matched against every field, so a vocab-size
    # field of '256' would also satisfy the first test — presumably 'embd'
    # only appears in names of 256-dimensional runs; confirm.
    if '256' in settings and 'embd' in settings:
        j['embd_size'] = 256
    else:
        j['embd_size'] = 64

    # Report files that carry validation losses.
    if 'val_losses' in j:
        print(json_file)
    return j

# Collect every result file whose name starts with 'u', 'm', 'n' or 'long'.
json_files = paths = (
    glob.glob('results/u*.json')
    + glob.glob('results/m*.json')
    + glob.glob('results/n*.json')
    + glob.glob('results/long*.json')
)

# Parse each file on a best-effort basis: a file that cannot be loaded is
# reported and skipped so that one bad file does not abort the whole run.
jsons = []
for json_file in json_files:
    try:
        d = data(json_file)
        jsons.append(d)
    except Exception as exc:
        # `except Exception` (rather than a bare `except:`) lets
        # KeyboardInterrupt/SystemExit propagate. The message includes the
        # reason so it can be told apart from the plain file name that
        # `data()` prints for files containing 'val_losses'.
        print(f'skipped {json_file}: {type(exc).__name__}: {exc}')

# One list per column, each with one entry per loaded result dict in `jsons`
# (same order as `jsons`).

# Best (lowest) test-set perplexity recorded for each run.
test_set_perplexity = [min(run['test_set_perplexities']) for run in jsons]

# Entropy is halved for the runs whose 'uni_or_bi' setting is 'bigrams'.
entropy = [
    run['entropy'] / 2 if run['uni_or_bi'] == 'bigrams' else run['entropy']
    for run in jsons
]

# Settings copied over verbatim.
dist = [run['dist'] for run in jsons]
uni_or_bi = [run['uni_or_bi'] for run in jsons]
vocab_size = [run['vocab_size'] for run in jsons]
softmax = [run['softmax'] for run in jsons]
model_type = [run['model_type'] for run in jsons]
embd_size = [run['embd_size'] for run in jsons]

# Natural log of the perplexity.
test_set_avg_cross_entropy = [
    math.log(perplexity) for perplexity in test_set_perplexity
]

# Column name -> values; the key order fixes the column order of the table.
df = {
    'Test set perplexity': test_set_perplexity,
    'Test set average cross-entropy': test_set_avg_cross_entropy,
    'Entropy': entropy,
    'Distribution': dist,
    'Uni- or bigram': uni_or_bi,
    'Vocab size': vocab_size,
    'Softmax': softmax,
    'Model type': model_type,
    'Embedding size': embd_size,
}

In [5]:
# Turn the column-name -> values mapping into a table.
df = pd.DataFrame(data=df)

In [12]:
# Rebuild the column mapping from the collected lists. The encoding and
# z-scaling cells only reassign DataFrame columns, never these lists, so the
# values here are the unencoded, unscaled ones.
df = dict([
    ('Test set perplexity', test_set_perplexity),
    ('Test set average cross-entropy', test_set_avg_cross_entropy),
    ('Entropy', entropy),
    ('Distribution', dist),
    ('Uni- or bigram', uni_or_bi),
    ('Vocab size', vocab_size),
    ('Softmax', softmax),
    ('Model type', model_type),
    ('Embedding size', embd_size),
])

In [13]:
# Turn the rebuilt column mapping into a table.
df = pd.DataFrame(data=df)

In [15]:
# Save the table as JSON. In file order this cell comes before the encoding
# and z-scaling cells, right after `df` is rebuilt from the collected lists.
df.to_json(path_or_buf='results/data_for_modeling.json')

In [6]:
# Replace each categorical column by its ordinal codes. The single encoder is
# refit for every column, so afterwards `enc` holds only the fit for the last
# column ('Model type').
enc = OrdinalEncoder()
for column in ['Distribution', 'Uni- or bigram', 'Softmax', 'Model type']:
    df[column] = enc.fit_transform(df[[column]])

In [7]:
# z-scale numerical columns
def _zscale_unless_categorical(column):
    """Return ``column`` z-scaled, or unchanged if it is a categorical column.

    Every column whose name is not in the list below is standardised with its
    own mean and standard deviation. That includes the two test-set columns.
    """
    if column.name in ['Distribution', 'Uni- or bigram', 'Softmax', 'Model type']:
        return column
    return (column - column.mean()) / column.std()


df = df.apply(_zscale_unless_categorical)

In [8]:
# Ordinary least squares: regress the average cross-entropy on every other
# column except the perplexity, plus an intercept term.
target = 'Test set average cross-entropy'
predictors = df.drop(columns=[target, 'Test set perplexity'])
reg = sm.OLS(df[target], sm.add_constant(predictors))
res = reg.fit()
print(res.summary())

                                  OLS Regression Results                                  
Dep. Variable:     Test set average cross-entropy   R-squared:                       0.990
Model:                                        OLS   Adj. R-squared:                  0.990
Method:                             Least Squares   F-statistic:                     5950.
Date:                            Tue, 04 Mar 2025   Prob (F-statistic):               0.00
Time:                                    14:18:47   Log-Likelihood:                 374.40
No. Observations:                             416   AIC:                            -732.8
Df Residuals:                                 408   BIC:                            -700.6
Df Model:                                       7                                         
Covariance Type:                        nonrobust                                         
                     coef    std err          t      P>|t|      [0.025      0.975]
-------

In [9]:
# Show each coefficient's p-value in scientific notation with 30 decimals.
p_values = res.pvalues
print(p_values.apply(lambda p: format(p, '.30e')))


const             7.865134578826737610492472323491e-11
Entropy           0.000000000000000000000000000000e+00
Distribution      4.102618449911355731440236761405e-24
Uni- or bigram    1.752839494882939304539502605677e-11
Vocab size        8.952711003910896366715297742457e-15
Softmax           1.448775993863391750885935560511e-07
Model type        5.233806389292354754871894328971e-01
Embedding size    9.678307428845354110080734244548e-01
dtype: object


In [10]:
# Save the table as JSON. In file order this cell comes after the encoding
# and z-scaling cells.
df.to_json(path_or_buf='results/data_for_paper.json')