# Disjoint-domain network

### Ethan Blackwood
### October 23, 2020

**Goal**: Train and analyze the network in Rogers/McClelland 2008 with 4 disjoint domains (Figures R3-R5), which learns to extract the feature of being more or less similar to other items in the same domain, across the 4 domains which have no items, contexts or attributes in common.

In [1]:
import numpy as np
from datetime import datetime as dt
import importlib

import disjoint_domain as dd

try:
    importlib.reload(ddnet)
except NameError:
    import ddnet

Common training procedure:

In [2]:
def train_n_dd_nets(n=36, run_type='', net_params=None, train_params=None):

    # get some defaults
    (ctx_per_domain, n_domains, n_items, n_ctx, attrs_per_context,
     attrs_set_per_item) = dd.get_net_dims(attrs_per_context=60)
    device, torchfp = dd.init_torch()
    
    net_defaults = {
        'ctx_per_domain': ctx_per_domain,
        'attrs_per_context': attrs_per_context,
        'attrs_set_per_item': attrs_set_per_item,
        'n_domains': n_domains,
        'device': device,
        'torchfp': torchfp,
        'param_init_scale': 0.01,
        'cluster_info': '4-2-2',
        'repeat_attrs_over_domains': False
    }
    if net_params is None:
        net_params = {}
    net_params = {**net_defaults, **net_params}
    if net_params['device'].type == 'cuda':
        print('Using CUDA')
    else:
        print('Using CPU')
    
    train_defaults = {
        'lr': 0.01,
        'scheduler': None,
        'num_epochs': 3001,
        'batch_size': 16,
        'report_freq': 50,
        'snap_freq': 50,
        'snap_freq_scale': 'lin',
        'holdout_testing': 'none',
        'test_thresh': 0.97,
        'test_max_epochs': 10000,
        'reports_per_test': 4,
        'do_combo_testing': False
    }
    if train_params is None:
        train_params = {}
    train_params = {**train_defaults, **train_params}
    
    snaps_all = []
    reports_all = []
    parameters_all = []
    ys_all = []
    
    for i in range(n):
        print(f'Training Iteration {i+1}')
        print('---------------------')
        
        net = ddnet.DisjointDomainNet(**net_params)
        res = net.do_training(**train_params)
        
        snaps_all.append(res['snaps'])
        reports_all.append(res['reports'])
        if 'params' in res:
            parameters_all.append(res['params'])
            
        ys_all.append(net.y.cpu().numpy())

        print('')

    snaps = {}
    for snap_type in snaps_all[0].keys():
        snaps[snap_type] = np.stack([snaps_one[snap_type] for snaps_one in snaps_all])
        
    reports = {}
    for report_type in reports_all[0].keys():
        reports[report_type] = np.stack([reports_one[report_type] for reports_one in reports_all])
        
    if len(parameters_all) > 0:
        parameters = {}
        for param_type in parameters_all[0].keys():
            parameters[param_type] = np.stack([params_one[param_type] for params_one in parameters_all])
    else:
        parameters = None
        
    ys = np.stack(ys_all)
    
    if run_type != '':
        run_type += '_'

    save_name = f'data/{run_type}dd_res_{dt.now():%Y-%m-%d_%H-%M-%S}.npz'
    np.savez(save_name, snapshots=snaps, reports=reports, ys=ys, net_params=net_params,
             train_params=train_params, parameters=parameters)
    
    return save_name, net

In [3]:
def dho_net_params(cluster_info, last_domain_cluster_info=None):
    return {'cluster_info': cluster_info, 'last_domain_cluster_info': last_domain_cluster_info}
dho_train_params = {'holdout_testing': 'domain', 'test_thresh': 0.85}

Base network: test on one combination of item/context for each domain.

In [None]:
# Make sure still get cross-domain similarity when remaking attr vectors for each domain
train_n_dd_nets(run_type='44_dho', net_params=dho_net_params('4-4'), train_params=dho_train_params)
train_n_dd_nets(run_type='422_dho', net_params=dho_net_params('4-2-2'), train_params=dho_train_params)

Network with merged representation layer (items & contexts all go to all rep units)

In [None]:
train_n_dd_nets(run_type='merged_repr', net_params={'merged_repr': True})

In [None]:
# Try things a bit more systematically
scaling_params = {
    '': {},
    '_half_repr': {'item_repr_units': 8, 'ctx_repr_units': 8},
    '_half_hidden': {'hidden_units': 16},
    '_half_both': {'item_repr_units': 8, 'ctx_repr_units': 8, 'hidden_units': 8}
}

train_params = {'num_epochs': 8001}

for tag, scaling in scaling_params.items():
    train_n_dd_nets(run_type=f'normal{tag}', net_params=scaling, train_params=train_params)
    train_n_dd_nets(run_type=f'merged_repr{tag}', net_params={**scaling, 'merged_repr': True}, train_params=train_params)
    train_n_dd_nets(run_type=f'no_item_repr{tag}', net_params={**scaling, 'use_item_repr': False}, train_params=train_params)        
    train_n_dd_nets(run_type=f'no_ctx_repr{tag}', net_params={**scaling, 'use_ctx_repr': False}, train_params=train_params)
    
    if tag not in ['_half_repr', '_half_both']: # these would be pointless
        train_n_dd_nets(run_type=f'no_repr{tag}', net_params={**scaling, 'use_item_repr': False, 'use_ctx_repr': False}, train_params=train_params)

In [None]:
# Redo just the beginning of training while saving parameters to look more closely
train_params = {'num_epochs': 500, 'report_freq': 10, 'snap_freq': 10, 'param_snapshots': True}
tag = 'short_save_params'

train_n_dd_nets(run_type=tag, train_params=train_params)
train_n_dd_nets(run_type=tag + '_merged_repr', net_params={'merged_repr': True}, train_params=train_params)
train_n_dd_nets(run_type=tag + '_no_item_repr', net_params={'use_item_repr': False}, train_params=train_params)
train_n_dd_nets(run_type=tag + '_no_ctx_repr', net_params={'use_ctx_repr': False}, train_params=train_params)
train_n_dd_nets(run_type=tag + '_no_repr', net_params={'use_item_repr': False, 'use_ctx_repr': False}, train_params=train_params)

Do no-representation run without reducing # of units; also, save both hidden and rep layer snapshots

In [None]:
train_params = {'num_epochs': 5001}

train_n_dd_nets(train_params=train_params)
train_n_dd_nets(run_type='no_repr_reallocate',
                net_params={'use_item_repr': False, 'use_ctx_repr': False, 'hidden_units': 64},
                train_params=train_params)

train_n_dd_nets(run_type='merged_repr', net_params={'merged_repr': True},
                train_params=train_params)
train_n_dd_nets(run_type='no_item_repr_reallocate', train_params=train_params,
                net_params={'use_item_repr': False, 'hidden_units': 48})
train_n_dd_nets(run_type='no_ctx_repr_reallocate', train_params=train_params,
                net_params={'use_ctx_repr': False, 'hidden_units': 48})

Try just one domain to see if generalization still succeeds

In [None]:
train_n_dd_nets(run_type='1domain', net_params={'n_domains': 1}, train_params={'num_epochs': 5001})

Mess with LR a little since I realized the loss was weird

In [None]:
long_train_params = {'lr': 0.05 / 124, 'snap_freq': 500, 'num_epochs': 30001}
train_n_dd_nets(n=4, run_type='4domain_long', train_params=long_train_params)
train_n_dd_nets(n=4, run_type='1domain_long', net_params={'n_domains': 1}, train_params=long_train_params)

Actually try changing weight init and batch size instead to try to replicate paper

In [None]:
small_init_train_params = {'lr': 0.05, 'snap_freq': 500, 'num_epochs': 5001, 'batch_size': 1}
# train_n_dd_nets(n=1, run_type='4domain_small_init', train_params=small_init_train_params,
#                 net_params={'param_init_scale': 0.0005})
# train_n_dd_nets(n=1, run_type='1domain_small_init', train_params=small_init_train_params,
#                 net_params={'param_init_scale': 0.0005, 'n_domains': 1})

halfbatch_params = {'snap_freq': 500, 'num_epochs': 30001, 'batch_size': 62}
train_n_dd_nets(n=1, run_type='4domain_halfbatch', train_params=halfbatch_params)
train_n_dd_nets(n=1, run_type='1domain_halfbatch', train_params=halfbatch_params,
                net_params={'n_domains': 1})

Try a couple of larger batch sizes to see if we can get away with it (compare to "original")

In [None]:
train_n_dd_nets(n=18, run_type='batchsize/batch64', train_params={'batch_size': 64})
train_n_dd_nets(n=18, run_type='batchsize/batch32', train_params={'batch_size': 32})
train_n_dd_nets(n=18, run_type='batchsize/batch16', train_params={'batch_size': 16})

# Try 31 since it seems like there's something specifically weird about 32
train_n_dd_nets(n=18, run_type='batchsize/batch31', train_params={'batch_size': 31})

for sz in range(5, 66, 5):
    train_n_dd_nets(n=18, run_type=f'batchsize/batch{sz}', train_params={'batch_size': sz})

Hold out test on last domain - normal vs. equidistant

In [None]:
train_n_dd_nets(run_type='domain_ho_eqdist', train_params=dho_train_params,
                net_params=dho_net_params('4-2-2', '8'))
train_n_dd_nets(run_type='domain_ho_orig', train_params=dho_train_params,
                net_params=dho_net_params('4-2-2', '4-2-2'))
train_n_dd_nets(run_type='domain_ho_332', train_params=dho_train_params,
                net_params=dho_net_params('4-2-2', '3-3-2'))

In [None]:
train_n_dd_nets(run_type='domain_ho_eqdist_from_332', train_params=dho_train_params,
                net_params=dho_net_params('3-3-2', '8'))
train_n_dd_nets(run_type='domain_ho_orig_from_332', train_params=dho_train_params,
                net_params=dho_net_params('3-3-2', '4-2-2'))
train_n_dd_nets(run_type='domain_ho_332_from_332', train_params=dho_train_params,
                net_params=dho_net_params('3-3-2', '3-3-2'))

In [None]:
train_n_dd_nets(run_type='domain_ho_eqdist_from_eqdist', train_params=dho_train_params,
                net_params=dho_net_params('8', '8'))
train_n_dd_nets(run_type='domain_ho_orig_from_eqdist', train_params=dho_train_params,
                net_params=dho_net_params('8', '4-2-2'))
train_n_dd_nets(run_type='domain_ho_332_from_eqdist', train_params=dho_train_params,
                net_params=dho_net_params('8', '3-3-2'))

Results are still confusing, so go the other direction and add 5-1-2

In [None]:
train_n_dd_nets(run_type='domain_ho_512_from_512', train_params=dho_train_params,
                net_params=dho_net_params('5-1-2', '5-1-2'))

train_n_dd_nets(run_type='domain_ho_512_from_orig', train_params=dho_train_params,
                net_params=dho_net_params('4-2-2', '5-1-2'))
train_n_dd_nets(run_type='domain_ho_512_from_332', train_params=dho_train_params,
                net_params=dho_net_params('3-3-2', '5-1-2'))
train_n_dd_nets(run_type='domain_ho_512_from_eqdist', train_params=dho_train_params,
                net_params=dho_net_params('8', '5-1-2'))

train_n_dd_nets(run_type='domain_ho_orig_from_512', train_params=dho_train_params,
                net_params=dho_net_params('5-1-2', '4-2-2'))
train_n_dd_nets(run_type='domain_ho_332_from_512', train_params=dho_train_params,
                net_params=dho_net_params('5-1-2', '3-3-2'))
train_n_dd_nets(run_type='domain_ho_eqdist_from_512', train_params=dho_train_params,
                net_params=dho_net_params('5-1-2', '8'))

In [None]:
# Try 4-4 again since it's been a while. Want to see whether I can use this as a simpler version.
train_n_dd_nets(run_type='4-4', train_params={**dho_train_params, 'holdout_testing': 'none'},
                net_params=dho_net_params('4-4'))

In [None]:
train_n_dd_nets(run_type='4-4_rolled', train_params={**dho_train_params, 'holdout_testing': 'none'},
                net_params={**dho_net_params('4-4'), 'roll_attrs': True})

In [None]:
train_n_dd_nets(run_type='4-4_no_reuse', train_params={**dho_train_params, 'holdout_testing': 'none'},
                net_params={**dho_net_params('4-4_no_reuse')})

In [None]:
train_n_dd_nets(run_type='original_no_reuse', train_params={**dho_train_params, 'holdout_testing': 'none'},
                net_params={**dho_net_params('4-2-2_no_reuse')})

In [None]:
train_n_dd_nets(run_type='4-4_unequal', train_params={**dho_train_params, 'holdout_testing': 'none'},
                net_params={**dho_net_params('4-4_unequal')})

OK, use 4-4 scheme with unequal intragroup distance (chosen separately for each domain) and try changing intergroup distance. 

In [None]:
intergroup_dists = {'near': 24, 'far': 48}
for idist_name, idist in intergroup_dists.items():
    for jdist_name, jdist in intergroup_dists.items():
        train_n_dd_nets(n=18, run_type=f'intergroup/{idist_name}_to_{jdist_name}',
                        train_params=dho_train_params, net_params=dho_net_params(
                            {'clusters': '4-4', 'intergroup_dist': idist},
                            {'clusters': '4-4', 'intergroup_dist': jdist}
                        ))

In [None]:
# I have an idea, try with biases fixed
train_n_dd_nets(n=10, run_type='original_new')
train_n_dd_nets(n=10, run_type='fixed_biases', net_params={'fix_biases': True})

In [6]:
# Try to replicate figure R6
# train_n_dd_nets(n=1, run_type='original_single_long', train_params={'num_epochs': 30001})
# train_n_dd_nets(n=1, run_type='repeated_attrs_long', net_params={'repeat_attrs_over_domains': True},
#                train_params={'num_epochs': 30001})

train_n_dd_nets(n=10, run_type='original_longer', train_params={'num_epochs': 10000})
# train_n_dd_nets(n=10, run_type='repeated_attrs', net_params={'repeat_attrs_over_domains': True})
train_n_dd_nets(n=10, run_type='repeated_attrs_longer', net_params={'repeat_attrs_over_domains': True},
               train_params={'num_epochs': 10000})

Using CUDA
Training Iteration 1
---------------------
Epoch    0 end: loss = 234.109, weighted acc = 0.321
Epoch   50 end: loss = 112.410, weighted acc = 0.498
Epoch  100 end: loss = 111.689, weighted acc = 0.499
Epoch  150 end: loss = 111.097, weighted acc = 0.500
Epoch  200 end: loss = 110.768, weighted acc = 0.500
Epoch  250 end: loss = 110.715, weighted acc = 0.500
Epoch  300 end: loss = 110.589, weighted acc = 0.500
Epoch  350 end: loss = 110.523, weighted acc = 0.500
Epoch  400 end: loss = 110.426, weighted acc = 0.500
Epoch  450 end: loss = 110.387, weighted acc = 0.500
Epoch  500 end: loss = 110.226, weighted acc = 0.500
Epoch  550 end: loss = 106.438, weighted acc = 0.500
Epoch  600 end: loss =  88.318, weighted acc = 0.476
Epoch  650 end: loss =  73.273, weighted acc = 0.460
Epoch  700 end: loss =  62.012, weighted acc = 0.462
Epoch  750 end: loss =  51.569, weighted acc = 0.468
Epoch  800 end: loss =  42.631, weighted acc = 0.474
Epoch  850 end: loss =  36.166, weighted acc 

('data/repeated_attrs_longer_dd_res_2021-07-08_12-08-40.npz',
 DisjointDomainNet(
   (item_to_rep): Linear(in_features=32, out_features=16, bias=False)
   (ctx_to_rep): Linear(in_features=16, out_features=16, bias=False)
   (rep_to_hidden): Linear(in_features=32, out_features=32, bias=False)
   (hidden_to_attr): Linear(in_features=32, out_features=960, bias=False)
   (criterion): BCELoss()
 ))

In [46]:
# Kind of sanity check: with no biases and all internal layer merged, there should be
# no cross-domain structure using Spearman's correlation.

train_n_dd_nets(run_type='all_merged_fixed_biases',
                net_params={'use_item_repr': False, 'use_ctx_repr': False, 'hidden_units': 64, 'fix_biases': True})
train_n_dd_nets(run_type='all_merged_fixed_biases_44',
                net_params={'use_item_repr': False, 'use_ctx_repr': False, 'hidden_units': 64, 'fix_biases': True,
                            'cluster_info': '4-4'})

Using CUDA
Training Iteration 1
---------------------
Epoch    0 end: loss = 165.737, weighted acc = 0.036
Epoch   50 end: loss = 111.175, weighted acc = 0.500
Epoch  100 end: loss = 110.898, weighted acc = 0.500
Epoch  150 end: loss = 110.646, weighted acc = 0.500
Epoch  200 end: loss = 110.231, weighted acc = 0.500
Epoch  250 end: loss = 108.545, weighted acc = 0.500
Epoch  300 end: loss = 101.803, weighted acc = 0.498
Epoch  350 end: loss =  88.044, weighted acc = 0.488
Epoch  400 end: loss =  66.370, weighted acc = 0.481
Epoch  450 end: loss =  41.998, weighted acc = 0.491
Epoch  500 end: loss =  36.207, weighted acc = 0.519
Epoch  550 end: loss =  33.965, weighted acc = 0.529
Epoch  600 end: loss =  33.064, weighted acc = 0.529
Epoch  650 end: loss =  31.901, weighted acc = 0.530
Epoch  700 end: loss =  30.739, weighted acc = 0.533
Epoch  750 end: loss =  28.596, weighted acc = 0.535
Epoch  800 end: loss =  25.742, weighted acc = 0.540
Epoch  850 end: loss =  23.040, weighted acc 

DisjointDomainNet(
  (item_to_rep): Identity()
  (ctx_to_rep): Identity()
  (rep_to_hidden): Linear(in_features=48, out_features=64, bias=False)
  (hidden_to_attr): Linear(in_features=64, out_features=960, bias=False)
  (criterion): BCELoss()
)

In [209]:
# Do some more simple runs to test the attribute frequency idea

train_params = {'snap_freq': 20, 'report_freq': 20, 'num_epochs': 2001}
all_merged = {'use_item_repr': False, 'use_ctx_repr': False, 'hidden_units': 64}

# All merged
train_n_dd_nets(n=10, run_type='all_merged_2group',
                net_params={**all_merged, 'cluster_info': {'clusters': '4-4', 'intragroup_dists': [4, 12]}},
                train_params=train_params)

# Try other 3-group settings
train_n_dd_nets(n=10, run_type='all_merged_332',
                net_params={**all_merged, 'cluster_info': {'clusters': '3-3-2'}},
                train_params=train_params)
train_n_dd_nets(n=10, run_type='all_merged_512',
                net_params={**all_merged, 'cluster_info': {'clusters': '5-1-2'}},
                train_params=train_params)

Using CUDA
Training Iteration 1
---------------------
Epoch    0 end: loss = 205.063, weighted acc = 0.370
Epoch   20 end: loss = 115.076, weighted acc = 0.495
Epoch   40 end: loss = 113.161, weighted acc = 0.498
Epoch   60 end: loss = 112.261, weighted acc = 0.499
Epoch   80 end: loss = 111.905, weighted acc = 0.500
Epoch  100 end: loss = 112.021, weighted acc = 0.500
Epoch  120 end: loss = 112.010, weighted acc = 0.500
Epoch  140 end: loss = 111.682, weighted acc = 0.500
Epoch  160 end: loss = 111.512, weighted acc = 0.500
Epoch  180 end: loss = 111.567, weighted acc = 0.500
Epoch  200 end: loss = 111.380, weighted acc = 0.500
Epoch  220 end: loss = 111.076, weighted acc = 0.500
Epoch  240 end: loss = 110.978, weighted acc = 0.500
Epoch  260 end: loss = 110.611, weighted acc = 0.500
Epoch  280 end: loss = 109.351, weighted acc = 0.500
Epoch  300 end: loss = 107.287, weighted acc = 0.498
Epoch  320 end: loss = 103.276, weighted acc = 0.496
Epoch  340 end: loss =  96.730, weighted acc 

DisjointDomainNet(
  (item_to_rep): Identity()
  (ctx_to_rep): Identity()
  (rep_to_hidden): Linear(in_features=48, out_features=64, bias=False)
  (hidden_to_attr): Linear(in_features=64, out_features=960, bias=False)
  (criterion): BCELoss()
)

### Double dissociation - attribute frequency vs. hierarchical structure

In [4]:
dissoc_net_params = {'attrs_per_context': 224, 'attrs_set_per_item': 112}
dissoc_train_params = {'snap_freq': 20, 'report_freq': 20, 'num_epochs': 2001}

# structured with unequal frequencies
train_n_dd_nets(run_type='structured_uneq_freq', train_params=dissoc_train_params,
                net_params={**dissoc_net_params, 'cluster_info': {
                    'clusters': '4-2-2',
                    'intergroup_dist': 24,
                    'intragroup_dists': [16, 6, 8, 20]
                }})

# structured with equal frequencies
train_n_dd_nets(run_type='structured_eq_freq', train_params=dissoc_train_params,
                net_params={**dissoc_net_params, 'cluster_info': '4-2-2_eq-freq'})

# unstructured with equal frequencies
train_n_dd_nets(run_type='unstructured_eq_freq', train_params=train_params,
                net_params={**dissoc_net_params, 'cluster_info': {
                    'clusters': '8',
                    'intragroup_dists': [32]
                }})

#TODO: unstructured with unequal frequencies

Using CUDA
Training Iteration 1
---------------------
Epoch    0 end: loss = 911.038, weighted acc = 0.313
Epoch   20 end: loss = 454.470, weighted acc = 0.484
Epoch   40 end: loss = 452.606, weighted acc = 0.481
Epoch   60 end: loss = 457.312, weighted acc = 0.473
Epoch   80 end: loss = 452.310, weighted acc = 0.481
Epoch  100 end: loss = 453.930, weighted acc = 0.475
Epoch  120 end: loss = 452.433, weighted acc = 0.480
Epoch  140 end: loss = 452.669, weighted acc = 0.482
Epoch  160 end: loss = 450.909, weighted acc = 0.484
Epoch  180 end: loss = 452.322, weighted acc = 0.476
Epoch  200 end: loss = 450.005, weighted acc = 0.477
Epoch  220 end: loss = 456.360, weighted acc = 0.477
Epoch  240 end: loss = 452.075, weighted acc = 0.485
Epoch  260 end: loss = 453.995, weighted acc = 0.478
Epoch  280 end: loss = 449.180, weighted acc = 0.481
Epoch  300 end: loss = 451.529, weighted acc = 0.482
Epoch  320 end: loss = 451.517, weighted acc = 0.480
Epoch  340 end: loss = 454.211, weighted acc 

NameError: name 'train_params' is not defined

## Old stuff

Try smaller item representation to see if it puts off using domain information for longer

In [None]:
train_n_dd_nets(run_type='small_item_repr', net_params={'item_repr_ratio': 0.25})

In [None]:
train_n_dd_nets(run_type='all_ratios_0.5', 
                net_params={'item_repr_ratio': 0.5, 'ctx_repr_ratio': 0.5, 'hidden_ratio': 0.5},
                train_params={'num_epochs': 4001})

Train for longer to try to look at potential re-emergence of separation by type

In [None]:
train_n_dd_nets(run_type='longer', train_params={'num_epochs': 6001, 'snap_freq': 100, 'report_freq': 100})

In [None]:
train_n_dd_nets(run_type='half_hidden_longer', net_params={'hidden_ratio': 0.5},
                train_params={'num_epochs': 6001, 'snap_freq': 100, 'report_freq': 100})

Troubleshooting, try with no testing

In [None]:
train_n_dd_nets(run_type='no_test', train_params={'do_combo_testing': False,
                                                  'holdout_testing': 'none'})

Base network with hold-out testing

In [None]:
train_n_dd_nets(run_type='ho_both', train_params={'holdout_testing': 'all'})

Try holding out only item or context at a time

In [None]:
train_n_dd_nets(run_type='ho_item', train_params={'holdout_testing': 'item'})

In [None]:
train_n_dd_nets(run_type='ho_context', train_params={'holdout_testing': 'context'})

Try simple item tree, to see if separation b/w item classes still happens when they're all equally "typical"

Result: actually creates a really different pattern - no similarity b/w subgroups of different domains that exceeds similarity within a domain.

In [None]:
train_n_dd_nets(run_type='simplified_no_holdout', net_params={'item_clusters': '4-4'},
                train_params={'holdout_testing': 'none'})