In [1]:
import os
import sys
import time
import copy

import numpy as np
import math
import random
import matplotlib.pyplot as plt

import functools
import argparse

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.utils.data as data_utils

import pickle

import datasets, models
import importlib
importlib.reload(models)
import init, measures

In [14]:
from dataclasses import dataclass
@dataclass
class config:

    device = 'cuda' if torch.cuda.is_available() else 'mps' if torch.backends.mps.is_available() else 'cpu'

    dataset = 'rhm'
    mode = 'masked'
    num_features = 6
    num_classes = 6
    num_synonyms = 9
    tuple_size = 3
    num_layers = 4
    seed_rules = 2362346
    num_tokens = 81 # context length, tuple_size**num_layers for the full input

    train_size = 131072
    batch_size = 128
    accumulation = 1
    test_size = 32768
    seed_sample = 34534
    replacement = False

    input_format = 'onehot'
    whitening = 1

    model = 'hcnn'
    depth = 4
    width = 256
    filter_size = 3
    # model = 'transformer_mla'
    # depth = 3
    # num_heads = 16
    # embedding_dim = 256
    bias = False
    seed_model = 359

    optim = 'sgd'
    lr = 1.0
    momentum = 0.0
    scheduler = None
    scheduler_time = 1024
    max_epochs = 1024

    print_freq = 64
    save_freq = 3
    loss_threshold = 1e-3

    outname = 'test'

config.rules = True
config.bonus = dict.fromkeys(['noise', 'synonyms', 'generate', 'size'])
config.bonus['size'] = config.test_size
config.check_rules = False

config.zipf = None
config.layer = config.num_layers

config.num_data = config.num_classes*config.num_synonyms**((config.tuple_size**config.num_layers-1)//(config.tuple_size-1))
config.input_size = config.tuple_size**config.num_layers
config.num_batches = config.train_size//config.batch_size
config.max_iters = config.max_epochs*config.num_batches

scales = []
for i in range(config.num_layers):
    scales.append(config.num_classes*config.num_synonyms**(2*i+1)/(1.-config.num_synonyms/(config.num_features**config.tuple_size)))
print(config.num_data, config.num_classes*config.num_synonyms**(config.num_layers), scales)

print(config.device)

886852976486075539896499261238299785606 39366 [56.347826086956516, 4564.173913043478, 369698.0869565217, 29945545.043478258]
mps


In [15]:
train_loader, test_loader = init.init_data(config)

print(config.rules.keys())

if config.bonus:

    print(config.bonus['features'].size())

    if 'synonyms' in config.bonus:
        for k in config.bonus['synonyms']:
            print(k, config.bonus['features'][0].argmax(dim=0)+1)
            print(k, config.bonus['synonyms'][k][0].argmax(dim=0)+1)
    if 'noise' in config.bonus:
        for k in config.bonus['noise']:
            print(k, config.bonus['features'][0].argmax(dim=0)+1)
            print(k, config.bonus['noise'][k][0].argmax(dim=0)+1)

Max dataset size cannot be represented with int64! Using sampling with replacement.




dict_keys([0, 1, 2, 3])
torch.Size([32768, 6, 81])
4 tensor([6, 3, 5, 3, 4, 3, 4, 5, 2, 5, 1, 4, 4, 1, 3, 3, 3, 3, 1, 6, 1, 1, 5, 5,
        6, 1, 5, 4, 2, 1, 3, 3, 3, 6, 1, 1, 6, 1, 1, 4, 5, 5, 4, 6, 4, 6, 6, 5,
        4, 5, 2, 2, 5, 3, 5, 4, 6, 2, 5, 2, 3, 5, 3, 1, 2, 1, 2, 2, 1, 4, 5, 5,
        4, 5, 3, 6, 3, 3, 6, 3, 1])
4 tensor([4, 6, 2, 5, 2, 5, 5, 2, 5, 2, 5, 2, 3, 5, 2, 2, 5, 1, 1, 1, 2, 6, 1, 1,
        6, 6, 3, 3, 3, 3, 2, 5, 1, 4, 1, 5, 1, 5, 5, 1, 6, 2, 2, 5, 2, 2, 1, 6,
        5, 2, 5, 2, 5, 3, 2, 2, 1, 5, 1, 4, 3, 4, 2, 1, 6, 1, 2, 2, 1, 2, 5, 3,
        3, 5, 3, 6, 3, 3, 6, 1, 1], device='mps:0')
3 tensor([6, 3, 5, 3, 4, 3, 4, 5, 2, 5, 1, 4, 4, 1, 3, 3, 3, 3, 1, 6, 1, 1, 5, 5,
        6, 1, 5, 4, 2, 1, 3, 3, 3, 6, 1, 1, 6, 1, 1, 4, 5, 5, 4, 6, 4, 6, 6, 5,
        4, 5, 2, 2, 5, 3, 5, 4, 6, 2, 5, 2, 3, 5, 3, 1, 2, 1, 2, 2, 1, 4, 5, 5,
        4, 5, 3, 6, 3, 3, 6, 3, 1])
3 tensor([6, 3, 5, 3, 4, 3, 4, 5, 2, 5, 1, 4, 4, 1, 3, 3, 3, 3, 1, 6, 1, 1, 5, 5,
        6, 1, 5, 

In [16]:
for x, y in train_loader:
    print(x.size(), y.size(), x.device)

torch.Size([128, 6, 81]) torch.Size([128]) cpu
torch.Size([128, 6, 81]) torch.Size([128]) cpu
torch.Size([128, 6, 81]) torch.Size([128]) cpu
torch.Size([128, 6, 81]) torch.Size([128]) cpu
torch.Size([128, 6, 81]) torch.Size([128]) cpu
torch.Size([128, 6, 81]) torch.Size([128]) cpu
torch.Size([128, 6, 81]) torch.Size([128]) cpu
torch.Size([128, 6, 81]) torch.Size([128]) cpu
torch.Size([128, 6, 81]) torch.Size([128]) cpu
torch.Size([128, 6, 81]) torch.Size([128]) cpu
torch.Size([128, 6, 81]) torch.Size([128]) cpu
torch.Size([128, 6, 81]) torch.Size([128]) cpu
torch.Size([128, 6, 81]) torch.Size([128]) cpu
torch.Size([128, 6, 81]) torch.Size([128]) cpu
torch.Size([128, 6, 81]) torch.Size([128]) cpu
torch.Size([128, 6, 81]) torch.Size([128]) cpu
torch.Size([128, 6, 81]) torch.Size([128]) cpu
torch.Size([128, 6, 81]) torch.Size([128]) cpu
torch.Size([128, 6, 81]) torch.Size([128]) cpu
torch.Size([128, 6, 81]) torch.Size([128]) cpu
torch.Size([128, 6, 81]) torch.Size([128]) cpu
torch.Size([1

In [17]:
model = init.init_model(config)
model0 = copy.deepcopy( model)
param_count = sum([p.numel() for p in model.parameters()])
print(param_count)

595968


In [18]:
criterion, optimizer, scheduler = init.init_training( model, config)

In [19]:
%%time
dynamics, best = init.init_output(model, criterion, train_loader, test_loader, config)
print(dynamics[0])

if config.bonus:
    if 'synonyms' in config.bonus:
        for l in range(config.depth+1):
            for k in dynamics[0]['synonyms'][l].keys():
                print(f'depth {l}, level {k}, sensitivity: ', dynamics[0]['synonyms'][l][k].mean().item())
    if 'noise' in config.bonus:
        for l in range(config.depth+1):
            for k in dynamics[0]['noise'][l].keys():
                print(f'depth {l}, level {k}, sensitivity: ', dynamics[0]['noise'][l][k].mean().item())

{'t': 0, 'trainloss': 1.7918220672290772, 'trainacc': 0.1551513671875, 'testloss': 1.791839711368084, 'testacc': 0.15576171875, 'synonyms': {0: {4: tensor([0.4896, 0.4937, 0.4663, 0.4799, 0.4905, 0.4353, 0.4874, 0.4956, 0.4638,
        0.5204, 0.5201, 0.4920, 0.5007, 0.5256, 0.4779, 0.5184, 0.5226, 0.4922,
        0.5244, 0.5360, 0.4913, 0.5372, 0.5328, 0.4811, 0.5134, 0.5229, 0.4835],
       device='mps:0'), 3: tensor([1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000,
        0.3678, 0.3753, 0.3453, 0.4199, 0.4545, 0.3859, 0.3389, 0.3559, 0.3232,
        1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000],
       device='mps:0'), 2: tensor([1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000,
        1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000,
        1.0000, 1.0000, 1.0000, 0.2375, 0.2420, 0.2669, 1.0000, 1.0000, 1.0000],
       device='mps:0'), 1: tensor([1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 

In [20]:
# config.print_freq = 256
config.save_freq = 3

print_ckpts, save_ckpts = init.init_loglinckpt( config.print_freq, config.max_iters, freq=config.save_freq)
print_ckpt = next(print_ckpts)
save_ckpt = next(save_ckpts)

step = 0

## TRAINING

In [None]:
for epoch in range(config.max_epochs):

    model.train()
    optimizer.zero_grad()
    running_loss = 0.

    for batch_idx, (inputs, targets) in enumerate(train_loader):

        outputs = model(inputs.to(config.device))
        loss = criterion(outputs, targets.to(config.device))
        running_loss += loss.item()
        loss /= config.accumulation
        loss.backward()

        if ((batch_idx+1)%config.accumulation==0):
            optimizer.step()
            optimizer.zero_grad()
            scheduler.step()
            step += 1

            if step==print_ckpt:

                test_loss, test_acc = measures.test(model, test_loader, config.device)

                if test_loss<best['loss']: # update best model if loss is smaller
                    best['step'] = step
                    best['loss'] = test_loss
                    best['model'] = copy.deepcopy( model.state_dict())

                print('step : ',step, '\t train loss: {:06.4f}'.format(running_loss/(batch_idx+1)), ',test loss: {:06.4f}'.format(test_loss))
                print_ckpt = next(print_ckpts)

                if step>=save_ckpt:

                    print(f'Checkpoint at step {step}, saving data ...')

                    train_loss, train_acc = measures.test(model, train_loader, config.device)
                    save_dict = {'t': step, 'trainloss': train_loss, 'trainacc': train_acc, 'testloss': test_loss, 'testacc': test_acc}
                    if config.bonus:
                        if 'synonyms' in config.bonus:
                            save_dict['synonyms'] = measures.sensitivity( model, config.bonus['features'], config.bonus['synonyms'], config.device)
                        if 'noise' in config.bonus:
                            save_dict['noise'] = measures.sensitivity( model, config.bonus['features'], config.bonus['noise'], config.device)
                    dynamics.append(save_dict)
                    save_ckpt = next(save_ckpts)


    if (running_loss/(batch_idx+1)) <= config.loss_threshold:

        train_loss, train_acc = measures.test(model, train_loader, config.device)
        save_dict = {'t': step, 'trainloss': train_loss, 'trainacc': train_acc, 'testloss': test_loss, 'testacc': test_acc}
        if config.bonus:
            if 'synonyms' in config.bonus:
                save_dict['synonyms'] = measures.sensitivity( model, config.bonus['features'], config.bonus['synonyms'], config.device)
            if 'noise' in config.bonus:
                save_dict['noise'] = measures.sensitivity( model, config.bonus['features'], config.bonus['noise'], config.device)
        dynamics.append(save_dict)

        break

step :  1 	 train loss: 1.7910 ,test loss: 1.7918
Checkpoint at step 1, saving data ...
step :  2 	 train loss: 1.7915 ,test loss: 1.7918
Checkpoint at step 2, saving data ...
step :  3 	 train loss: 1.7916 ,test loss: 1.7917
Checkpoint at step 3, saving data ...
step :  4 	 train loss: 1.7913 ,test loss: 1.7916
Checkpoint at step 4, saving data ...
step :  5 	 train loss: 1.7914 ,test loss: 1.7916
Checkpoint at step 5, saving data ...


In [None]:
import math

fig, ax = plt.subplots(1, 2, figsize=(20,7))

ax[0].plot([d['t'] for d in dynamics], [1.-d['trainacc'] for d in dynamics], f'C0', label='training err.')
ax[0].plot([d['t'] for d in dynamics], [1.-d['testacc'] for d in dynamics], f'C1', label='test err.')

ax[1].plot([d['t'] for d in dynamics], [d['trainloss'] for d in dynamics], f'C0', label='training loss')
ax[1].plot([d['t'] for d in dynamics], [d['testloss'] for d in dynamics], f'C1', label='test loss')

L = config.num_layers
v = config.num_features
m = config.num_synonyms
s = config.tuple_size
time = [d['t'] for d in dynamics]

ngram_losses = [math.log(config.num_features), 1.4926, 0.5973, 0.2686] #v=16, m=4
# ngram_losses = [math.log(config.num_features), 2.1266, 1.0060, 0.4300, 0.2491, 0.2025, 0.1911, 0.1883, 0.1876] # v = 32, m = 8

for k in range(L+1):
    # compatible = (v-1)*((v**s-m*v)/(v**s-1-m*(v-1)))*((m*(v-1))/(v**s-1))**k + (v-1)*(m-1)/(v**s-1-m*(v-1))
    # ax[1].plot([t for t in time], [math.log(1+compatible) for t in time], 'k--')
    ax[1].plot([t for t in time], [ngram_losses[k] for t in time], 'k--')


ax[0].legend()
ax[0].set_xscale('log')
ax[0].set_yscale('log')

ax[1].legend()
ax[1].set_xscale('log')
ax[1].set_yscale('log')
# ax[1].set_ylim(1e-1,1e1)

In [None]:
if config.bonus:
    if 'synonyms' in config.bonus:
        for k in range(config.depth+1):
            for l in range(1,config.num_layers):
                print(f'depth {k}, level {l}, sensitivity: ', dynamics[0]['synonyms'][k][l].mean().item())

In [None]:
print(dynamics[0]['noise'].keys())

for key in dynamics[0]['noise'].keys():
    print(f'rep. {key+1}', dynamics[-1]['noise'][key])

In [None]:
fig, ax = plt.subplots((config.num_layers), 3, figsize=(20,14))


print(dynamics[-1]['synonyms'].keys())
print(dynamics[-1]['synonyms'][3].keys())

ax[0][0].set_title('synonyms')
ax[0][1].set_title('noise')
ax[0][2].set_title('relative')

for k in range(config.depth+1):

    l = 0
    ax[0][1].plot([d['t'] for d in dynamics], [d['noise'][k][l].to('cpu') for d in dynamics], f'C{k}' , label=f'{k+1}-th rep.')

    ax[l][0].set_ylabel(f'level {l}')
    ax[0][0].set_xscale('log')
    ax[0][1].set_xscale('log')
    ax[0][2].set_xscale('log')


    for l in range(1,config.num_layers):

        ax[l][0].plot([d['t'] for d in dynamics], [d['synonyms'][k][l].to('cpu') for d in dynamics], f'C{k}' , label=f'{k+1}-th rep.')
        ax[l][1].plot([d['t'] for d in dynamics], [d['noise'][k][l].to('cpu') for d in dynamics], f'C{k}' , label=f'{k+1}-th rep.')
        ax[l][2].plot([d['t'] for d in dynamics], [(1.-d['synonyms'][k][l].to('cpu'))/((1.-d['noise'][k][l].to('cpu'))) for d in dynamics], f'C{k}' , label=f'{k+1}-th rep.')

        ax[l][0].set_ylabel(f'level {l}')
        ax[l][0].set_xscale('log')
        ax[l][1].set_xscale('log')
        ax[l][2].set_xscale('log')

init_loss = dynamics[0]['testloss']
for l in range(config.num_layers):

    ax[l][0].plot([d['t'] for d in dynamics], [d['testloss']/init_loss for d in dynamics], f'k--')
    ax[l][1].plot([d['t'] for d in dynamics], [d['testloss']/init_loss for d in dynamics], f'k--')
    ax[l][2].plot([d['t'] for d in dynamics], [d['testloss']/init_loss for d in dynamics], f'k--')

ax[0][1].legend()

In [None]:
fig, ax = plt.subplots(1, 3, figsize=(12,5))

m = config.num_synonyms

l = 1
k = 3
ax[0].plot([d['t'] for d in dynamics], [(1.-d['synonyms'][k][l].to('cpu'))/((1.-d['noise'][k][l].to('cpu'))) for d in dynamics], f'C{l-1}' , label=f'level {l}, {k}-th rep.')
ax[1].plot([d['t']/(m**(l+2)) for d in dynamics], [(1.-d['synonyms'][k][l].to('cpu'))/((1.-d['noise'][k][l].to('cpu'))) for d in dynamics], f'C{l-1}' , label=f'level {l}, {k}-th rep.')
ax[2].plot([d['t']/(m**(2*(l+1)-1)) for d in dynamics], [(1.-d['synonyms'][k][l].to('cpu'))/((1.-d['noise'][k][l].to('cpu'))) for d in dynamics], f'C{l-1}' , label=f'level {l}, {k}-th rep.')


l = 2
k = 3
ax[0].plot([d['t'] for d in dynamics], [(1.-d['synonyms'][k][l].to('cpu'))/((1.-d['noise'][k][l].to('cpu'))) for d in dynamics], f'C{l-1}' , label=f'level {l}, {k}-th rep.')
ax[1].plot([d['t']/(m**(l+2)) for d in dynamics], [(1.-d['synonyms'][k][l].to('cpu'))/((1.-d['noise'][k][l].to('cpu'))) for d in dynamics], f'C{l-1}' , label=f'level {l}, {k}-th rep.')
ax[2].plot([d['t']/(m**(2*(l+1)-1)) for d in dynamics], [(1.-d['synonyms'][k][l].to('cpu'))/((1.-d['noise'][k][l].to('cpu'))) for d in dynamics], f'C{l-1}' , label=f'level {l}, {k}-th rep.')


ax[0].set_xscale('log')
ax[0].set_yscale('log')

ax[1].set_xscale('log')
ax[1].set_yscale('log')

ax[2].set_xscale('log')
ax[2].set_yscale('log')

ax[0].legend()

In [None]:
fig, ax = plt.subplots(1, 3, figsize=(12,5))

m = config.num_synonyms

l = 1
k = l
ax[0].plot([d['t'] for d in dynamics], [(1.-d['synonyms'][k][l][-1].to('cpu'))/((1.-d['noise'][k][l][-1].to('cpu'))) for d in dynamics], f'C{l-1}' , label=f'level {l}, {k}-th rep.')
ax[1].plot([d['t']/(m**(l+2)) for d in dynamics], [(1.-d['synonyms'][k][l][-1].to('cpu'))/((1.-d['noise'][k][l][-1].to('cpu'))) for d in dynamics], f'C{l-1}' , label=f'level {l}, {k}-th rep.')
ax[2].plot([d['t']/(m**(2*(l+1)-1)) for d in dynamics], [(1.-d['synonyms'][k][l][-1].to('cpu'))/((1.-d['noise'][k][l][-1].to('cpu'))) for d in dynamics], f'C{l-1}' , label=f'level {l}, {k}-th rep.')


l = 2
k = l
ax[0].plot([d['t'] for d in dynamics], [(1.-d['synonyms'][k][l].to('cpu'))/((1.-d['noise'][k][l].to('cpu'))) for d in dynamics], f'C{l-1}' , label=f'level {l}, {k}-th rep.')
ax[1].plot([d['t']/(m**(l+2)) for d in dynamics], [(1.-d['synonyms'][k][l].to('cpu'))/((1.-d['noise'][k][l].to('cpu'))) for d in dynamics], f'C{l-1}' , label=f'level {l}, {k}-th rep.')
ax[2].plot([d['t']/(m**(2*(l+1)-1)) for d in dynamics], [(1.-d['synonyms'][k][l].to('cpu'))/((1.-d['noise'][k][l].to('cpu'))) for d in dynamics], f'C{l-1}' , label=f'level {l}, {k}-th rep.')


ax[0].set_xscale('log')
ax[0].set_yscale('log')

ax[1].set_xscale('log')
ax[1].set_yscale('log')

ax[2].set_xscale('log')
ax[2].set_yscale('log')

ax[0].legend()