In [None]:
import os
import json
import glob
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from scipy.optimize import curve_fit
import statsmodels.api as sm
import statsmodels.formula.api as smf

In [44]:
# Colours of matplotlib's active property cycle. plot_run indexes into this
# list (colors[i]) so a given sequence length keeps the same colour per panel.
prop_cycle = plt.rcParams['axes.prop_cycle']
colors = [style['color'] for style in prop_cycle]

In [45]:
def find_run_dirs(model_name, root='experiments'):
    """Return every ``<root>/<run_id>/<model_name>`` path, ordered by run id.

    Parameters
    ----------
    model_name : str
        Name of the per-run sub-directory to look for (e.g. ``'trf'``).
    root : str
        Directory that contains one sub-directory per run.

    Returns
    -------
    list of str
        Matching paths sorted numerically by ``run_id`` (so 2 comes before 10).

    Raises
    ------
    ValueError
        If a matching run directory name is not an integer.
    """
    def run_id(path):
        # The run id is the name of the directory holding the model directory.
        # Reading it from the parent (rather than a fixed path index) keeps
        # this correct however many components `root` has.
        return int(os.path.basename(os.path.dirname(path)))

    return sorted(glob.glob(os.path.join(root, '*', model_name)), key=run_id)


trf_locations = find_run_dirs('trf')
lstm_locations = find_run_dirs('lstm')

In [46]:
# One matplotlib marker per plotted sequence length: plot_run draws the curve
# for seq_len == i + 1 with markers[i], so the order here is significant.
markers = [
    'o',  # circle
    '*',  # star
    'x',  # x
    '+',  # plus
    's',  # square
    'D',  # diamond
    'p',  # pentagon
    'h',  # hexagon
    'v',  # triangle down
    '^',  # triangle up
]

In [47]:
def plot_run(locations, vocab_size):
    """Plot the per-sequence-length `rho` curves (labelled SLAC@k) over training.

    Only runs whose ``hparams.json`` has ``grammar_type == 'pfsa'``,
    ``grammar_seed == 0`` and ``grammar_num_symbols == vocab_size`` are shown,
    one subplot row per run, restricted to training steps <= 10,000.

    Parameters
    ----------
    locations : iterable of str
        Run directories, each containing ``hparams.json`` and a tab-separated
        ``length_wise_metrics.tsv`` with ``step``, ``seq_len`` and ``rho``
        columns.
    vocab_size : int
        Value of ``grammar_num_symbols`` to select.

    Raises
    ------
    ValueError
        If no run in `locations` matches the selection criteria.
    """
    def load_hparams(location):
        # Context manager so the file handle is closed deterministically.
        with open(os.path.join(location, 'hparams.json'), 'r', encoding='utf-8') as fh:
            return json.load(fh)

    # Keep the parsed hparams next to the location so each file is read once.
    selected = []
    for location in locations:
        hparams = load_hparams(location)
        if (
            hparams['grammar_num_symbols'] == vocab_size
            and hparams['grammar_type'] == 'pfsa'
            and hparams['grammar_seed'] == 0
        ):
            selected.append((location, hparams))

    if not selected:
        raise ValueError(
            f'no PFSA run with grammar_seed=0 and grammar_num_symbols={vocab_size} '
            'found in the given locations'
        )

    fig, axs = plt.subplots(
        nrows=len(selected),
        ncols=1,
        sharex='all',
        # squeeze=False always returns a 2-D array, so a single matching run
        # (nrows == 1) can be indexed the same way as several.
        squeeze=False,
        figsize=(10, 3 * len(selected)),  # 3 inches of height per subplot
    )
    axs = axs[:, 0]

    for ax, (location, hparams) in zip(axs, selected):
        title = f"num_states={hparams['grammar_formalism_arg']}"

        length_wise_metrics = pd.read_csv(
            os.path.join(location, 'length_wise_metrics.tsv'), sep='\t'
        )
        length_wise_metrics = length_wise_metrics[length_wise_metrics['step'] <= 10_000]

        # One curve per sequence length 1..len(markers).
        for i, marker in enumerate(markers):
            subset = length_wise_metrics[length_wise_metrics['seq_len'] == (i + 1)]
            ax.plot(
                subset['step'],
                subset['rho'],
                # Wrap around so a colour cycle shorter than the marker list
                # does not raise IndexError.
                c=colors[i % len(colors)],
                label=f'SLAC@{i+1}',
                marker=marker,
                alpha=0.5
            )
        ax.grid()
        ax.legend(loc='upper right')
        # The per-panel y label carries the run's state count; the shared
        # y-axis description is added once as figure-level text below.
        ax.set_ylabel(title)

    font = {
        'size': 14,
    }

    fig.suptitle(f'SLAC@k over training steps PFSA(seed=0, num_symbols={vocab_size})', y=0.98)
    fig.text(0.5, 0.03, 'Training steps', ha='center', fontdict=font)
    fig.text(0.04, 0.5, 'SLAC@k over evaluation set', va='center', rotation='vertical', fontdict=font)
    # Leave room on the left and bottom for the shared figure-level labels.
    fig.tight_layout(rect=[0.06, 0.04, 1, 0.99])
    # plt.savefig(f'figs/runs_{vocab_size}.pdf')
    plt.show()

In [48]:
# plot_run(trf_locations, 1000)

In [49]:
# plot_run(trf_locations, 5000)

In [50]:
# Per-run summary columns, filled in lock-step: entry i of every list describes
# the same run. They are assembled into `df` at the end of the cell.
lstm_or_trf = []
seeds = []
formalisms = []
entropy = []
train_data_ee = []
val_data_ee = []
num_symbols = []
num_states = []
max_rhos = []
max_rho_num_steps = []

# Only sequence lengths 1..k contribute to the averaged rho.
k = 10

# Transformer runs first, then LSTM runs, so the row order matches the order of
# the two location lists.
for model_name, locations in (('trf', trf_locations), ('lstm', lstm_locations)):
    for loc in locations:
        with open(os.path.join(loc, 'hparams.json'), 'r', encoding='utf-8') as fh:
            data = json.load(fh)
        length_wise_metrics = pd.read_csv(
            os.path.join(loc, 'length_wise_metrics.tsv'), sep='\t'
        )
        subset = length_wise_metrics[length_wise_metrics['seq_len'] <= k]

        # Mean rho over the selected sequence lengths, indexed by training step.
        mean_rho_by_step = subset.groupby('step')['rho'].mean()
        max_rho_at_k = mean_rho_by_step.max()
        max_rhos.append(max_rho_at_k)
        # idxmax returns the first step reaching the maximum, so a tie between
        # several steps no longer raises (as `.item()` on a multi-row selection did).
        max_rho_num_steps.append(mean_rho_by_step.idxmax())

        lstm_or_trf.append(model_name)
        entropy.append(data['grammar_actual_entropy'])
        train_data_ee.append(data['train_data_ee'])
        val_data_ee.append(data['val_data_ee'])
        num_symbols.append(data['grammar_num_symbols'])
        num_states.append(data['grammar_formalism_arg'])
        seeds.append(data['grammar_seed'])
        formalisms.append(data['grammar_type'])

df = pd.DataFrame({
    'lstm_or_trf': lstm_or_trf,
    'formalism': formalisms,
    'seed': seeds,
    'entropy': entropy,
    'train_data_ee': train_data_ee,
    'val_data_ee': val_data_ee,
    'num_symbols': num_symbols,
    'num_states': num_states,
    'max_rho': max_rhos,
    'max_rho_num_steps': max_rho_num_steps
})

In [51]:
# Keep the untransformed state count the first time this cell runs, so that
# re-running the cell does not apply log2 to an already log-scaled column.
if 'num_states_raw' not in df.columns:
    df['num_states_raw'] = df['num_states']
# num_states is stored as an integer log2 (astype(int) truncates toward zero).
df['num_states'] = np.log2(df['num_states_raw']).astype(int)
# Seeds are labels, not quantities; as strings they act as a grouping key.
df['seed'] = df['seed'].astype(str)
# Single excess-entropy value per run: mean of the train and validation values.
df['ee'] = (df['train_data_ee'] + df['val_data_ee']) / 2

In [52]:
# Plain-text dump of the per-run summary table as a quick sanity check
# (pandas abbreviates long frames, so only the first and last rows are shown).
print(df)

   lstm_or_trf formalism seed    entropy  train_data_ee  val_data_ee  \
0          trf      pfsa    0   7.506837       5.984616     5.880957   
1          trf      pfsa    0   8.016413       8.557952     8.557813   
2          trf      pfsa    0   8.602941      10.356739    10.350471   
3          trf      pfsa    0   9.240372      11.740965    11.751851   
4          trf      pfsa    0   9.905296      12.876701    12.874202   
..         ...       ...  ...        ...            ...          ...   
67        lstm      pfsa    2  10.215924      10.223233    10.190331   
68        lstm      pfsa    2  10.849574      11.860232    11.839460   
69        lstm      pfsa    2  11.513406      13.071409    13.071351   
70        lstm      pfsa    2  12.191690      13.900462    13.896470   
71        lstm      pfsa    2  10.581832      13.807510    13.802970   

    num_symbols  num_states   max_rho  max_rho_num_steps         ee  
0          1000           1  0.783309               5000   5.9327

In [53]:
# Best step-averaged rho (`max_rho`, labelled SLAC@(1,10)) against grammar
# entropy, one point per run. Marker symbol encodes the architecture, marker
# size the number of symbols and colour the (log2-scaled) number of states.
# NOTE(review): `px` is not imported in the visible import cell — presumably
# `import plotly.express as px`; confirm it is imported before this cell runs.
fig = px.scatter(
    df,
    x='entropy',
    y='max_rho',
    symbol='lstm_or_trf',
    size='num_symbols',
    title='SLAC@(1,10) vs. Entropy (size indicates # symbols)',
    color='num_states',
    labels={
        'entropy': 'Entropy (nats)',
        'max_rho': 'SLAC@(1,10)',
        'lstm_or_trf': 'LSTM/TRF',
        'num_symbols': '# Symbols',
        'num_states': 'log_2(# States)'
    },
    opacity=0.5
).update_layout(
    # Legend sits to the right of the plot area; the widened right margin
    # below makes room for it.
    legend=dict(
        orientation='v',
        yanchor='top',
        y=0.8,
        xanchor='left',
        x=1.2,
        bgcolor='rgba(255, 255, 255, 0.8)',
        bordercolor='rgba(0, 0, 0, 0.2)',
        borderwidth=1
    ),
    margin=dict(r=200),
    hovermode='closest',
    plot_bgcolor='rgba(240, 240, 240, 0.5)',
    font=dict(size=20)
)
fig.show()
# Make sure the output directory exists so the export also works from a fresh
# checkout where `figs/` has not been created yet.
os.makedirs('figs', exist_ok=True)
fig.write_image('figs/slac.pdf', format='pdf', width=1000)

In [54]:
def f(x, a, b, c, d):
    """Shifted, scaled logarithm ``a * log(b * x + c) + d`` used as the fit model.

    `x` may be a scalar or a NumPy array; the natural logarithm is applied
    element-wise. Where ``b * x + c`` is negative, NumPy yields NaN and emits
    an "invalid value encountered in log" warning.
    """
    log_argument = c + b * x
    return d + a * np.log(log_argument)

# Excess entropy (`ee`) against grammar entropy, one point per run; marker size
# encodes the number of symbols and colour the (log2-scaled) number of states.
# NOTE(review): `px` and `go` are not imported in the visible import cell —
# presumably `plotly.express` and `plotly.graph_objects`; confirm.
fig = px.scatter(
    df,
    x='entropy',
    y='ee',
    size='num_symbols',
    title='Excess entropy vs. Entropy',
    color='num_states',
    labels={
        'ee': 'Excess entropy',
        'entropy': 'Entropy',
        'max_rho': 'SLAC@(1,10)',
        'num_symbols': '# Symbols',
        'num_states': 'log2(# States)'
    },
    opacity=0.2
).update_layout(
    # Legend sits to the right of the plot area; the widened right margin
    # below makes room for it.
    legend=dict(
        orientation='v',
        yanchor='top',
        y=0.8,
        xanchor='left',
        x=1.2,
        bgcolor='rgba(255, 255, 255, 0.8)',
        bordercolor='rgba(0, 0, 0, 0.2)',
        borderwidth=1
    ),
    margin=dict(r=200),
    hovermode='closest',
    plot_bgcolor='rgba(240, 240, 240, 0.5)',
    font=dict(size=20)
)

# Fit the log model `f` separately for every vocabulary size and overlay the
# fitted curve on the scatter plot.
for vocab_size in sorted(df['num_symbols'].unique()):
    # Select this vocabulary size once instead of re-filtering for every use.
    group = df[df['num_symbols'] == vocab_size]

    # NOTE(review): while searching, the optimiser can try parameters for which
    # b * x + c is negative, which is presumably the source of the "invalid
    # value encountered in log" warning in the recorded output — confirm.
    p, _ = curve_fit(
        f,
        group['entropy'],
        group['ee'],
        p0=[1, 1, 1, 1]
    )

    # Evaluate the fitted curve only over the entropy range observed for this
    # vocabulary size.
    x = np.linspace(group['entropy'].min(), group['entropy'].max(), 1000)
    fig.add_trace(
        go.Scatter(
            x=x,
            y=f(x, *p),
            mode='lines',
            name=f'Fit (vocab_size={vocab_size})',
            line=dict(width=2)
        )
    )
fig.show()
# Make sure the output directory exists so the export also works from a fresh
# checkout where `figs/` has not been created yet.
os.makedirs('figs', exist_ok=True)
fig.write_image('figs/excess_vs_entropy.pdf', format='pdf', width=1000)


invalid value encountered in log



In [55]:
# Training step at which the step-averaged rho peaks (`max_rho_num_steps`)
# against grammar entropy, one point per run. Marker symbol encodes the
# architecture, marker size the number of symbols and colour the (log2-scaled)
# number of states.
# NOTE(review): `px` is not imported in the visible import cell — presumably
# `import plotly.express as px`; confirm it is imported before this cell runs.
fig = px.scatter(
    df,
    x='entropy',
    y='max_rho_num_steps',
    symbol='lstm_or_trf',
    size='num_symbols',
    title='Training steps to max SLAC@(1,10) vs. Entropy (size indicates # symbols)',
    color='num_states',
    labels={
        'entropy': 'Entropy (nats)',
        'max_rho_num_steps': 'Training steps to max SLAC@(1,10)',
        'lstm_or_trf': 'LSTM/TRF',
        'num_symbols': '# Symbols',
        'num_states': 'log_2(# States)'
    },
    opacity=0.5
).update_layout(
    # Legend sits to the right of the plot area; the widened right margin
    # below makes room for it.
    legend=dict(
        orientation='v',
        yanchor='top',
        y=0.8,
        xanchor='left',
        x=1.2,
        bgcolor='rgba(255, 255, 255, 0.8)',
        bordercolor='rgba(0, 0, 0, 0.2)',
        borderwidth=1
    ),
    margin=dict(r=200),
    hovermode='closest',
    plot_bgcolor='rgba(240, 240, 240, 0.5)',
    font=dict(size=20)
)
fig.show()
# Make sure the output directory exists so the export also works from a fresh
# checkout where `figs/` has not been created yet.
os.makedirs('figs', exist_ok=True)
fig.write_image('figs/slac_steps.pdf', format='pdf', width=1000)

In [56]:
# Linear mixed model for the step at which the averaged rho peaks: fixed
# effects for vocabulary size, (log2) state count, architecture and entropy,
# with runs grouped by grammar seed.
# NOTE(review): the recorded output reports convergence / non-positive-definite
# Hessian warnings and only 3 groups; the group variance is presumably poorly
# identified with so few seeds — confirm before interpreting the coefficients.
df['num_states'] = df['num_states'].astype(int)

steps_formula = 'max_rho_num_steps ~ num_symbols + num_states + lstm_or_trf + entropy'
model = smf.mixedlm(formula=steps_formula, data=df, groups=df['seed'])

result = model.fit()
print(result.summary())

                  Mixed Linear Model Regression Results
Model:                MixedLM    Dependent Variable:    max_rho_num_steps
No. Observations:     72         Method:                REML             
No. Groups:           3          Scale:                 2765546.9616     
Min. group size:      24         Log-Likelihood:        -607.2377        
Max. group size:      24         Converged:             Yes              
Mean group size:      24.0                                               
-------------------------------------------------------------------------
                     Coef.     Std.Err.   z    P>|z|   [0.025     0.975] 
-------------------------------------------------------------------------
Intercept          -18111.423 23145.757 -0.782 0.434 -63476.273 27253.426
lstm_or_trf[T.trf]   1705.556   391.971  4.351 0.000    937.306  2473.805
num_symbols            -1.354     1.451 -0.933 0.351     -4.198     1.491
num_states          -1572.643  2226.919 -0.706 0.480  -5


Maximum Likelihood optimization failed to converge. Check mle_retvals


Retrying MixedLM optimization with lbfgs


The Hessian matrix at the estimated parameter values is not positive definite.

