In [None]:
import copy
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.ticker as mticker
import seaborn as sns
import torchvision
from torchvision.transforms import ToTensor, Normalize, Compose

In [None]:
from scripts.architecture import MLP, MLPManual
from scripts.train import *
from scripts.plot_utils import plot_loss_accuracy, plotValAccuracy, fillSubplot

In [None]:
import matplotlib.pylab as pylab

# Notebook-wide matplotlib defaults: 16x8 figures and 'large' fonts for the
# legend, axis labels, titles and tick labels.
params = {'legend.fontsize': 'large', 'figure.figsize': (16, 8)}
params.update(dict.fromkeys(
    ['axes.labelsize', 'axes.titlesize', 'xtick.labelsize', 'ytick.labelsize'],
    'large',
))
pylab.rcParams.update(params)

In [None]:
# Record the library versions this notebook runs with (torch first, then numpy).
# NOTE(review): `torch` is not imported explicitly in the import cells; presumably
# it is re-exported by `from scripts.train import *` — confirm.
for version in (torch.__version__, np.__version__):
    print(version)

## Create Parity Data Iterator

In [None]:
# Per-image preprocessing: convert to a tensor, then normalise with
# mean 0.1307 and std 0.3081.
transforms = Compose([ToTensor(), Normalize((0.1307,), (0.3081,))])

# Arguments shared by both splits. No transformation is performed here;
# it only happens once the data is actually loaded.
mnist_kwargs = dict(root='data', download=True, transform=transforms)
trainset = torchvision.datasets.MNIST(train=True, **mnist_kwargs)
testset = torchvision.datasets.MNIST(train=False, **mnist_kwargs)

In [None]:
# Shared hyperparameters and settings read by the experiment cells below.
# NOTE: several later cells rebind learn_rate / num_epochs / optim, so the
# values in effect depend on which cells have already run.
learn_rate = 0.05
num_epochs = 20
batch_size = 128
loss_type = "Binary Cross Entropy"
loss_fn = torch.nn.BCELoss()
# Initialisation name for B, passed to MLPManual in the "DFA" runs.
B_initialization = "uniform"
# Optimizer name passed to MLPManual (the PyTorch runs build their optimizer explicitly).
optim = "SGD"
momentum, nesterov_momentum = False, False
weight_decay = 1e-3
# Passed to MLPManual and train_model_manually; the "Alignment" section uses a
# separate flag set to True.
measure_alignment = False

# Use the first CUDA device when available, otherwise fall back to the CPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print(device)

### For k = 1

In [None]:
# k = 1, PyTorch MLP trained with Adadelta.
k = 1
model = MLP(k, "ReLU", loss_type)
optimizer = torch.optim.Adadelta(
    model.parameters(),
    lr=learn_rate,
    weight_decay=weight_decay,
)

ada1_history = train_model(
    model, k, trainset, testset, loss_type, loss_fn, optimizer, num_epochs, batch_size,
    validate_model=True,
    performance=accuracy,
    device=device,
    lr_scheduler=None,
    updateWManually=False,
)
trainLostList_Ada1, trainAccList_Ada1, valLossList_Ada1, valAccList_Ada1 = ada1_history

# Loss and accuracy curves (train vs. validation) for this run.
plot_loss_accuracy(
    trainLostList_Ada1,
    valLossList_Ada1,
    trainAccList_Ada1,
    valAccList_Ada1,
    num_epochs,
)

In [None]:
# Same k as the previous cell, PyTorch MLP trained with SGD.
model2 = MLP(k, "ReLU", loss_type)
optimizer = torch.optim.SGD(
    model2.parameters(),
    lr=learn_rate,
    weight_decay=weight_decay,
)

sgd1_history = train_model(
    model2, k, trainset, testset, loss_type, loss_fn, optimizer, num_epochs, batch_size,
    validate_model=True,
    performance=accuracy,
    device=device,
    lr=learn_rate,
    lr_scheduler=None,
    updateWManually=False,
)
trainLostList_sgd1, trainAccList_sgd1, valLossList_sgd1, valAccList_sgd1 = sgd1_history

In [None]:
# k = 1, manual MLP trained with "BP" (no B initialisation is passed).
k = 1
modelManual = MLPManual(k, learn_rate, loss_type, "BP", None, optim, device, measure_alignment, False)

sgd1_scratch_history = train_model_manually(
    modelManual, k, trainset, testset, loss_type, loss_fn, num_epochs, batch_size,
    momentum, nesterov_momentum, weight_decay, measure_alignment,
    validate_model=True,
    device=device,
)
# The last three returned values are not needed in this cell.
(trainLostList_sgd1_scratch, trainAccList_sgd1_scratch,
 valLossList_sgd1_scratch, valAccList_sgd1_scratch, _, _, _) = sgd1_scratch_history

In [None]:
# Same k, manual MLP trained with "DFA"; B_initialization is passed as the
# initialisation argument.
modelManualDFA = MLPManual(k, learn_rate, loss_type, "DFA", B_initialization, optim, device, measure_alignment, False)

sgd1_dfa_history = train_model_manually(
    modelManualDFA, k, trainset, testset, loss_type, loss_fn, num_epochs, batch_size,
    momentum, nesterov_momentum, weight_decay, measure_alignment,
    validate_model=True,
    device=device,
)
# The last three returned values are not needed in this cell.
(trainLostList_sgd1_dfa, trainAccList_sgd1_dfa,
 valLossList_sgd1_dfa, valAccList_sgd1_dfa, _, _, _) = sgd1_dfa_history


### For k = 3

In [None]:
# k = 3, PyTorch MLP trained with Adadelta.
k = 3
model3 = MLP(k, "ReLU", loss_type)
optimizer = torch.optim.Adadelta(
    model3.parameters(),
    lr=learn_rate,
    weight_decay=weight_decay,
)

ada3_history = train_model(
    model3, k, trainset, testset, loss_type, loss_fn, optimizer, num_epochs, batch_size,
    validate_model=True,
    performance=accuracy,
    device=device,
    lr_scheduler=None,
)
trainLostList_Ada3, trainAccList_Ada3, valLossList_Ada3, valAccList_Ada3 = ada3_history

# Loss and accuracy curves (train vs. validation) for this run.
plot_loss_accuracy(
    trainLostList_Ada3,
    valLossList_Ada3,
    trainAccList_Ada3,
    valAccList_Ada3,
    num_epochs,
)

In [None]:
# Same k as the previous cell, PyTorch MLP trained with SGD.
model4 = MLP(k, "ReLU", loss_type)
optimizer = torch.optim.SGD(
    model4.parameters(),
    lr=learn_rate,
    weight_decay=weight_decay,
)

sgd3_history = train_model(
    model4, k, trainset, testset, loss_type, loss_fn, optimizer, num_epochs, batch_size,
    validate_model=True,
    performance=accuracy,
    device=device,
    lr_scheduler=None,
)
trainLostList_sgd3, trainAccList_sgd3, valLossList_sgd3, valAccList_sgd3 = sgd3_history

In [None]:
# Same k, manual MLP trained with "BP" (no B initialisation is passed).
modelManual3 = MLPManual(k, learn_rate, loss_type, "BP", None, optim, device, measure_alignment, False)

sgd3_scratch_history = train_model_manually(
    modelManual3, k, trainset, testset, loss_type, loss_fn, num_epochs, batch_size,
    momentum, nesterov_momentum, weight_decay, measure_alignment,
    validate_model=True,
    device=device,
)
# The last three returned values are not needed in this cell.
(trainLostList_sgd3_scratch, trainAccList_sgd3_scratch,
 valLossList_sgd3_scratch, valAccList_sgd3_scratch, _, _, _) = sgd3_scratch_history

In [None]:
# One of the best learning rates found for uniform B; training did not perform
# well with 0.05.
# NOTE(review): this rebinds the notebook-level `learn_rate`, so the later
# "same weights" cells that read `learn_rate` also run with 0.02 — confirm
# that is intended.
learn_rate = 0.02
modelManual3DFA = MLPManual(k, learn_rate, loss_type, "DFA", B_initialization, optim, device, measure_alignment, False)

sgd3_dfa_history = train_model_manually(
    modelManual3DFA, k, trainset, testset, loss_type, loss_fn, num_epochs, batch_size,
    momentum, nesterov_momentum, weight_decay, measure_alignment,
    validate_model=True,
    device=device,
)
# The last three returned values are not needed in this cell.
(trainLostList_sgd3_dfa, trainAccList_sgd3_dfa,
 valLossList_sgd3_dfa, valAccList_sgd3_dfa, _, _, _) = sgd3_dfa_history

In [None]:
# Validation accuracy per epoch for the four k=1 runs (left) and the four
# k=3 runs (right).
fig, (ax1, ax3) = plt.subplots(1, 2, figsize=(16,8))

# Derive the x axis from num_epochs so the plot stays correct if the number of
# training epochs changes (it was hard-coded to 1..20 before).
epoch_axis = range(1, num_epochs + 1)

# Colour and legend label per curve; the order matches the curve lists below.
curve_styles = [
    ("blue", "SGD BP Pytorch"),
    ("green", "Adadelta BP Pytorch"),
    ("orange", "SGD BP Dogan"),
    ("red", "SGD DFA Dogan"),
]
panels = [
    (ax1, "Test Accuracy k=1",
     [valAccList_sgd1, valAccList_Ada1, valAccList_sgd1_scratch, valAccList_sgd1_dfa]),
    (ax3, "Test Accuracy k=3",
     [valAccList_sgd3, valAccList_Ada3, valAccList_sgd3_scratch, valAccList_sgd3_dfa]),
]

for panel_ax, panel_title, panel_curves in panels:
    for (curve_color, curve_label), curve_values in zip(curve_styles, panel_curves):
        panel_ax.plot(epoch_axis, curve_values, color=curve_color, label=curve_label)
    panel_ax.set_ylim(0.40, 1.05)
    panel_ax.set_title(panel_title)
    panel_ax.set_xlabel("Iteration")
    panel_ax.set_ylabel("Accuracy")
    panel_ax.set_xticks(epoch_axis)
    panel_ax.legend()
    panel_ax.grid(True)

fig.savefig("plots/k13_SGD_Ada_BP_DFA.png")

### Try with the same weights (SGD BP Pytorch vs SGD BP Dogan)

In [None]:
k = 3
modelx = MLP(k, "ReLU", loss_type).to(device)

# Copy the initial layer weights before training so the manual model in the
# next cell can be built from them.
initial_state = modelx.state_dict()
w1 = copy.deepcopy(initial_state["layer1.weight"]).to(device)
w2 = copy.deepcopy(initial_state["layer2.weight"]).to(device)

# NOTE(review): no weight_decay is given here, while the manual run in the next
# cell is called with weight_decay — confirm the two runs are meant to differ.
optimizer = torch.optim.SGD(modelx.parameters(), lr=learn_rate)

sgd3_w_history = train_model(
    modelx, k, trainset, testset, loss_type, loss_fn, optimizer, num_epochs, batch_size,
    validate_model=True,
    performance=accuracy,
    device=device,
    lr=learn_rate,
    lr_scheduler=None,
    updateWManually=True,
)
trainLostList_sgd3_w, trainAccList_sgd3_w, valLossList_sgd3_w, valAccList_sgd3_w = sgd3_w_history

In [None]:
# Manual BP model; the transposed PyTorch weights w1 and w2 are passed as its
# last argument.
shared_weights = (w1.t(), w2.t())
modelManualx = MLPManual(k, learn_rate, loss_type, "BP", None, optim, device, measure_alignment, shared_weights)

sgd3_scratch_w_history = train_model_manually(
    modelManualx, k, trainset, testset, loss_type, loss_fn, num_epochs, batch_size,
    momentum, nesterov_momentum, weight_decay, measure_alignment,
    validate_model=True,
    device=device,
)
# The last three returned values are not needed in this cell.
(trainLostList_sgd3_scratch_w, trainAccList_sgd3_scratch_w,
 valLossList_sgd3_scratch_w, valAccList_sgd3_scratch_w, _, _, _) = sgd3_scratch_w_history

In [None]:
# Validation accuracy of the PyTorch and the manual BP run built from the same
# initial weights.
# The x axis follows num_epochs instead of a hard-coded 1..20.
epoch_axis = range(1, num_epochs + 1)

plt.figure(figsize=(15,8))
plt.plot(epoch_axis, valAccList_sgd3_w, color = "blue", label = "BP SGD Pytorch")
plt.plot(epoch_axis, valAccList_sgd3_scratch_w, color = "green", label = "BP SGD Dogan")

plt.ylim(0.4,1.05)
plt.title("Test Accuracy k=3")
plt.xlabel("Iteration")
plt.ylabel("Accuracy")
# One major tick per epoch.
plt.gca().xaxis.set_major_locator(mticker.MultipleLocator(1))
plt.legend()
plt.grid(True)

plt.savefig("plots/k3_SGD_BP_sameWeights.png")

plt.show();

# The two curves will differ because the data is recreated every epoch,
# but the pattern should be very similar.

### DFA Experiments

In [None]:
# Run DFA for 100 epochs to see whether it reaches a result similar to BP.
# Alignment measurement is switched on for this run; the similarity series it
# returns are plotted in the "Alignment" section.
k=3
# Dedicated names keep this run's settings from overwriting the notebook-level
# learn_rate, and keep the epoch count in one place.
learn_rate_dfa_long = 0.01
num_epochs_dfa_long = 100
measure_alignment_DFA = True
modelManual4 = MLPManual(k, learn_rate_dfa_long, loss_type, "DFA", B_initialization, optim, device, measure_alignment_DFA, False)
# NOTE(review): "sgad4" in the first name looks like a typo for "sgd4"; it is kept
# in case other cells refer to it.
trainLostList_sgad4_scratch, trainAccList_sgd4_scratch, \
valLossList_sgd4_scratch, valAccList_sgd4_scratch, \
similarity_w2B, similarity_w1_grads, similarity_w2_grads  = train_model_manually(modelManual4, k, trainset, testset, loss_type, loss_fn, num_epochs_dfa_long, batch_size, momentum,
                                                                         nesterov_momentum, weight_decay, measure_alignment_DFA, validate_model = True, device=device)
plt.figure(figsize=(16,8))
# Pass the same epoch count and k that were used for training.
plotValAccuracy(valAccList_sgd4_scratch, num_epochs_dfa_long, "DFA Validation", k)

### Alignment

In [None]:
# One panel per similarity series returned by the long DFA run.
fig, (ax1,  ax3, ax4) = plt.subplots(1, 3, figsize=(16,8))

alignment_panels = [
    (ax1, similarity_w2B, "w2.T() and B Cosine Similarity"),
    (ax3, similarity_w1_grads, "W1 Grad Cosine Similarity"),
    (ax4, similarity_w2_grads, "W2 Grad Similarity"),
]
for panel_ax, panel_values, panel_title in alignment_panels:
    panel_ax.plot(panel_values)
    panel_ax.set_title(panel_title)
    panel_ax.set_xlabel("Epoch")
    panel_ax.set_ylabel("Similarity")
    panel_ax.grid(True)

### Hyperparameter Tuning

In [None]:
def tuneLearningRate_Torch(lr_array : np.array, optim: str, k:int, loss_type):
    """Pick the learning rate whose PyTorch run has the best late validation accuracy.

    For every candidate a fresh ``MLP`` is trained with ``train_model`` using the
    notebook-level ``trainset``, ``testset``, ``loss_fn``, ``num_epochs``,
    ``batch_size``, ``weight_decay`` and ``device``. A candidate's score is the
    mean of its last (up to) 10 validation accuracies.

    Args:
        lr_array: candidate learning rates.
        optim: "SGD" selects ``torch.optim.SGD``; any other value selects
            ``torch.optim.Adadelta``.
        k: forwarded to ``MLP`` and ``train_model``.
        loss_type: forwarded to ``MLP`` and ``train_model``.

    Returns:
        The element of ``lr_array`` with the highest score.

    Raises:
        ValueError: if ``lr_array`` is empty.
    """
    if len(lr_array) == 0:
        raise ValueError("lr_array must contain at least one learning rate")

    listofValAcc = []
    for learning_rate in lr_array:
        # Print the candidate being trained, not the notebook-level `learn_rate`.
        print(f"Learning rate: {learning_rate}")
        # NOTE(review): "BP" is passed where other cells pass an activation name
        # such as "ReLU" — confirm MLP handles it as intended.
        model = MLP(k, "BP", loss_type)
        if optim == "SGD":
            optimizer = torch.optim.SGD(model.parameters(), lr = learning_rate, weight_decay=weight_decay)
        else:
            optimizer = torch.optim.Adadelta(model.parameters(), lr=learning_rate, weight_decay = weight_decay)

        # Only the validation-accuracy history is needed for scoring.
        _, _, _, valAccListLoc = train_model(model, k, trainset, testset, loss_type, loss_fn, optimizer, num_epochs,
                                             batch_size, validate_model = True, performance=accuracy, device=device,
                                             lr_scheduler=None, updateWManually=False)

        last10 = valAccListLoc[-10:]
        listofValAcc.append(sum(last10) / len(last10))

    # Select once, after all candidates have been scored.
    best_idx = int(np.argsort(np.array(listofValAcc))[-1])
    best_lr = lr_array[best_idx]

    print("Best learning rate is: ", best_lr)
    return best_lr

In [None]:
# k = 3
# torch_bp_sgd_lr_array_ = np.linspace(0.05, 0.15, 6)
# torch_bp_ada_lr_array = np.linspace(0.10, 0.20, 6)

# best_torch_bp_sgd = tuneLearningRate_Torch(torch_bp_sgd_lr_array_, "SGD", k, loss_type)
# best_torch_bp_ada = tuneLearningRate_Torch(torch_bp_ada_lr_array, "Adadelta", k, loss_type)

In [None]:
# Statically stored learning rates for the PyTorch BP runs (SGD and Adadelta);
# the tuning calls that assign these names are commented out in the cell above.
best_torch_bp_sgd = 0.11
best_torch_bp_ada = 0.18

In [None]:
def tuneLearningRate_Manual(lr_array : np.array, training_method: str, init_B: str, optim: str, k:int, loss_type):
    """Pick the learning rate whose manual run has the best late validation accuracy.

    For every candidate a fresh ``MLPManual`` is trained with
    ``train_model_manually`` using the notebook-level ``trainset``, ``testset``,
    ``loss_fn``, ``num_epochs``, ``batch_size``, ``momentum``,
    ``nesterov_momentum``, ``weight_decay``, ``measure_alignment`` and ``device``.
    A candidate's score is the mean of its last (up to) 10 validation accuracies.

    Args:
        lr_array: candidate learning rates.
        training_method: forwarded to ``MLPManual`` (the notebook uses "BP" and "DFA").
        init_B: forwarded to ``MLPManual`` (``None`` is used for "BP" runs).
        optim: optimizer name forwarded to ``MLPManual``.
        k: forwarded to ``MLPManual`` and ``train_model_manually``.
        loss_type: forwarded to ``MLPManual`` and ``train_model_manually``.

    Returns:
        The element of ``lr_array`` with the highest score.

    Raises:
        ValueError: if ``lr_array`` is empty.
    """
    if len(lr_array) == 0:
        raise ValueError("lr_array must contain at least one learning rate")

    listofValAcc = []
    for learning_rate in lr_array:
        # Print the candidate being trained, not the notebook-level `learn_rate`.
        print(f"Learning rate: {learning_rate}")
        model = MLPManual(k, learning_rate, loss_type, training_method, init_B, optim, device, measure_alignment, False)
        # Only the validation-accuracy history is needed for scoring.
        _, _, _, valAccListLoc, _, _, _ = train_model_manually(model, k, trainset, testset, loss_type, loss_fn, num_epochs,
                                                               batch_size, momentum, nesterov_momentum, weight_decay, measure_alignment,
                                                               validate_model = True, device=device)
        last10 = valAccListLoc[-10:]
        listofValAcc.append(sum(last10) / len(last10))

    # Select once, after all candidates have been scored.
    best_idx = int(np.argsort(np.array(listofValAcc))[-1])
    best_lr = lr_array[best_idx]

    print("Best learning rate is: ", best_lr)
    return best_lr


In [None]:
# manual_bp_sgd_lr_array = np.linspace(0.01, 0.07, 6)
# manual_bp_adagrad_lr_array = np.linspace(0.005, 0.01, 6)
# manual_bp_adadelta_lr_array = np.linspace(0.1, 1, 6)
# manual_bp_rmsprop_lr_array = np.linspace(0.0001, 0.0005, 6)
# manual_bp_adam_lr_array = np.linspace(0.001, 0.005, 6)


# best_manual_bp_sgd = tuneLearningRate_Manual(manual_bp_sgd_lr_array, "BP", None, "SGD", k, loss_type)
# best_manual_bp_adagrad = tuneLearningRate_Manual(manual_bp_adagrad_lr_array, "BP", None, "Adagrad", k, loss_type)
# best_manual_bp_adadelta = tuneLearningRate_Manual(manual_bp_adadelta_lr_array, "BP", None, "Adadelta", k, loss_type)
# best_manual_bp_rmsprop = tuneLearningRate_Manual(manual_bp_rmsprop_lr_array, "BP", None, "RMSProp", k, loss_type)
# best_manual_bp_adam = tuneLearningRate_Manual(manual_bp_adam_lr_array, "BP", None, "Adam", k, loss_type)

In [None]:
# Statically stored learning rates for the manual BP runs, one per optimizer;
# the tuning calls that assign these names are commented out in the cell above.
# NOTE(review): 0.08 lies outside the commented-out SGD grid
# np.linspace(0.01, 0.07, 6) — confirm where this value came from.
best_manual_bp_sgd = 0.08
best_manual_bp_adagrad = 0.007
best_manual_bp_adadelta = 1
best_manual_bp_rmsprop = 0.00026
best_manual_bp_adam = 0.0042

In [None]:
# Tune their learning rates to get best one, it is done by checking the last 10 val Accuracy
# k=3
# lr_array_uni = np.linspace(0.01, 0.025, 6)
# lr_array_std_uni = np.linspace(0.0015, 0.0035, 6)
# lr_array_gauss = np.linspace(0.01, 0.02, 6)
# lr_array_std_gauss = np.linspace(0.0005, 0.001, 6)

# best_manual_dfa_sgd_uni = tuneLearningRate_Manual(lr_array_uni, "DFA", "uniform", optim, k, loss_type)
# best_manual_dfa_sgd_std_uni = tuneLearningRate_Manual(lr_array_std_uni, "DFA", "standard uniform", optim, k, loss_type)
# best_manual_dfa_sgd_gaussian = tuneLearningRate_Manual(lr_array_gauss, "DFA", "gaussian", optim, k, loss_type)
# best_manual_dfa_sgd_std_gaussian = tuneLearningRate_Manual(lr_array_std_gauss, "DFA", "standard gaussian", optim, k, loss_type)

In [None]:
# Tuning these parameters takes too much time, so it is run once and the
# results are stored statically (one learning rate per B initialisation, SGD + DFA).
# NOTE(review): 0.02 is not a point of the commented-out grid
# np.linspace(0.01, 0.025, 6) — possibly rounded; confirm.
best_manual_dfa_sgd_uni = 0.02
best_manual_dfa_sgd_std_uni = 0.0031
best_manual_dfa_sgd_gaussian = 0.012
best_manual_dfa_sgd_std_gaussian = 0.001

In [None]:
# Tune their learning rates to get best one, it is done by checking the last 10 val Accuracy
# k=3

# lr_array_adagrad = np.linspace(0.001, 0.005, 6)
# lr_array_adadelta = np.linspace(0.1, 1, 6)
# lr_array_rmsprop = np.linspace(0.0004, 0.0006, 6)
# lr_array_adam = np.linspace(0.001, 0.003, 6)

# best_manual_dfa_adagrad_uni = tuneLearningRate_Manual(lr_array_adagrad, "DFA", B_initialization, "Adagrad", k, loss_type)
# best_manual_dfa_adadelta_uni = tuneLearningRate_Manual(lr_array_adadelta, "DFA", B_initialization, "Adadelta", k, loss_type)    
# best_manual_dfa_rmsprop_uni = tuneLearningRate_Manual(lr_array_rmsprop, "DFA", B_initialization, "RMSProp", k, loss_type)
# best_manual_dfa_adam_uni = tuneLearningRate_Manual(lr_array_adam, "DFA", B_initialization, "Adam", k, loss_type)

In [None]:
# Statically stored learning rates for the manual DFA runs with the adaptive
# optimizers (B_initialization was used for B in the commented-out tuning calls).
# NOTE(review): 0.004 and 0.0023 are not points of the commented-out grids
# np.linspace(0.001, 0.005, 6) and np.linspace(0.001, 0.003, 6) — possibly
# rounded; confirm.
best_manual_dfa_adagrad_uni = 0.004
best_manual_dfa_adadelta_uni = 1
best_manual_dfa_rmsprop_uni = 0.0004
best_manual_dfa_adam_uni = 0.0023

### Random Matrix Experiments

In [None]:
# One 20-epoch run per B initialisation, each at its stored best learning rate,
# plus a BP run for reference; all validation curves go into one figure.
initializations = ["standard uniform", "uniform", "standard gaussian", "gaussian", "BP"]
num_epochs=20
K=3
optim = "SGD"

# Stored best learning rate per DFA initialisation; any other entry of
# `initializations` is trained with BP.
dfa_lr_by_init = {
    "uniform": best_manual_dfa_sgd_uni,
    "standard uniform": best_manual_dfa_sgd_std_uni,
    "gaussian": best_manual_dfa_sgd_gaussian,
    "standard gaussian": best_manual_dfa_sgd_std_gaussian,
}

fig = plt.figure(figsize=(15,9))
for ini in initializations:
    if ini in dfa_lr_by_init:
        modelManualx = MLPManual(K, dfa_lr_by_init[ini], loss_type, "DFA", ini, optim, device, measure_alignment, False)
    else:
        modelManualx = MLPManual(K, best_manual_bp_sgd, loss_type, "BP", None, optim, device, measure_alignment, False)

    run_history = train_model_manually(
        modelManualx, K, trainset, testset, loss_type, loss_fn, num_epochs, batch_size,
        momentum, nesterov_momentum, weight_decay, measure_alignment,
        validate_model=True,
        device=device,
    )
    trainLostList, trainAccList, valLossList, valAccList, _, _, _ = run_history
    plotValAccuracy(valAccList, num_epochs, ini, K)

fig.tight_layout()
plt.savefig("plots/k3_SGD_DFA_best_randomBInit.png")

In [None]:
# One panel per B initialisation, showing DFA validation accuracy for three
# learning rates each (a different lr_array for each random matrix).
num_epochs = 20
K = 3
fig, (ax1, ax2, ax3, ax4) = plt.subplots(1, 4, figsize=(16,8))
lr_array_uni = np.linspace(0.01, 0.03, 3)
lr_array_std_uni = np.linspace(0.0015, 0.0035, 3)
lr_array_gauss = np.linspace(0.01, 0.02, 3)
lr_array_std_gauss = np.linspace(0.0005, 0.0015, 3)

# (initialisation, candidate learning rates, target axis), in run order.
# A single loop replaces four copy-pasted branches that differed only in these
# three values.
init_sweeps = [
    ("standard uniform", lr_array_std_uni, ax1),
    ("uniform", lr_array_uni, ax2),
    ("standard gaussian", lr_array_std_gauss, ax3),
    ("gaussian", lr_array_gauss, ax4),
]

for init, lr_candidates, sweep_ax in init_sweeps:
    for lr in lr_candidates:
        print(f"Learning rate: {lr}")
        modelManualx = MLPManual(K, lr, loss_type, "DFA", init, optim, device, measure_alignment, False)
        trainLostList, trainAccList, \
        valLossList, valAccList ,_,_,_  = train_model_manually(modelManualx, K, trainset, testset, loss_type, loss_fn, num_epochs,
                                                        batch_size, momentum, nesterov_momentum, weight_decay, measure_alignment,
                                                        validate_model = True, device=device)
        # The curve is labelled with the learning rate rounded to 4 decimals.
        fillSubplot(valAccList, num_epochs, str(round(lr,4)), sweep_ax, init)

fig.tight_layout()
plt.savefig("plots/k3_SGD_DFA_BInitsWDifferentlrs.png")

### Optimizers Experiments

In [None]:
# One 20-epoch DFA run per optimizer, each at its stored best learning rate
# (B initialised via B_initialization); all validation curves go into one figure.
optims = ["SGD", "Adagrad", "Adadelta", "RMSProp", "Adam"]
num_epochs=20
K=3

# Stored best DFA learning rate per optimizer. An optimizer missing from this
# table raises a KeyError instead of silently reusing the previous model.
dfa_lr_by_optim = {
    "SGD": best_manual_dfa_sgd_uni,
    "Adagrad": best_manual_dfa_adagrad_uni,
    "Adadelta": best_manual_dfa_adadelta_uni,
    "RMSProp": best_manual_dfa_rmsprop_uni,
    "Adam": best_manual_dfa_adam_uni,
}

fig = plt.figure(figsize=(15,9))
# The loop variable is deliberately not called `optim`, so the notebook-level
# `optim` setting is left untouched.
for optim_name in optims:
    modelManualx = MLPManual(K, dfa_lr_by_optim[optim_name], loss_type, "DFA", B_initialization, optim_name, device, measure_alignment, False)

    trainLostList, trainAccList, \
    valLossList, valAccList ,_,_,_ = train_model_manually(modelManualx, K, trainset, testset, loss_type, loss_fn, num_epochs,
                                                            batch_size, momentum, nesterov_momentum, weight_decay, measure_alignment,
                                                            validate_model = True, device=device)
    plotValAccuracy(valAccList, num_epochs, optim_name, K)

fig.tight_layout()
plt.savefig("plots/k3_All_DFA_bestoptims.png")

In [None]:
# One panel per optimizer, showing DFA validation accuracy for three learning
# rates each (a different lr_array for each optimizer).
num_epochs = 20
K = 3
fig, (ax1, ax2, ax3, ax4, ax5) = plt.subplots(1, 5, figsize=(16,8))
lr_array_sgd = np.linspace(0.015, 0.035, 3)
lr_array_adagrad = np.linspace(0.001, 0.005, 3)
lr_array_adadelta = np.linspace(0.8, 1, 3)
lr_array_rmsprop = np.linspace(0.0001, 0.0005, 3)
lr_array_adam = np.linspace(0.001, 0.004, 3)

# (optimizer name, candidate learning rates, target axis), in run order.
# A single loop replaces five copy-pasted branches that differed only in these
# three values.
optimizer_sweeps = [
    ("SGD", lr_array_sgd, ax1),
    ("Adagrad", lr_array_adagrad, ax2),
    ("Adadelta", lr_array_adadelta, ax3),
    ("RMSProp", lr_array_rmsprop, ax4),
    ("Adam", lr_array_adam, ax5),
]

# The loop variable is deliberately not called `optim`, so the notebook-level
# `optim` setting is left untouched.
for optim_name, lr_candidates, sweep_ax in optimizer_sweeps:
    for lr in lr_candidates:
        print(f"Learning rate: {lr}")
        modelManualx = MLPManual(K, lr, loss_type, "DFA", B_initialization, optim_name, device, measure_alignment, False)
        trainLostList, trainAccList, \
        valLossList, valAccList ,_,_,_  = train_model_manually(modelManualx, K, trainset, testset, loss_type, loss_fn, num_epochs,
                                                        batch_size, momentum, nesterov_momentum, weight_decay, measure_alignment,
                                                        validate_model = True, device=device)
        # The curve is labelled with the learning rate rounded to 4 decimals.
        fillSubplot(valAccList, num_epochs, str(round(lr,4)), sweep_ax, optim_name)

fig.tight_layout()
plt.savefig("plots/k3_All_DFA_optimsWDifferentlrs.png")

### Main Experiment

In [None]:
# Reproduce the experiments from Learning Parities with Neural Networks:
# for k = 1 and k = 3, train one PyTorch MLP per configuration and plot the
# validation accuracy of each, one panel per k.
learn_rate = 0.05
num_epochs = 20
loss_type = "Binary Cross Entropy"
fig, (ax1, ax3) = plt.subplots(1, 2, figsize=(16,8))
# Panel that receives the curves for each value of K.
axis_for_k = {1: ax1, 3: ax3}
for K in [1,3]:
    for activation in ["Adadelta", "NTK", "Gaussian features", "ReLU features", "SGD"]:
        model = MLP(K, activation, loss_type)
        if "features" in activation:
            # deactivate the first layer: only layer2's parameters are optimised
            optimizer = torch.optim.Adadelta(model.layer2.parameters(), lr = learn_rate, weight_decay=weight_decay)
        elif "NTK" in activation:
            # optimise exactly the parameters of layer1 and layer2
            paramsToUpdate = list(model.layer1.parameters()) + list(model.layer2.parameters())
            optimizer = torch.optim.Adadelta(paramsToUpdate, lr = learn_rate, weight_decay=weight_decay)
        elif "SGD" in activation:
            optimizer = torch.optim.SGD(model.parameters(), lr = best_torch_bp_sgd, weight_decay=weight_decay)
        else:
            optimizer = torch.optim.Adadelta(model.parameters(), lr = best_torch_bp_ada, weight_decay=weight_decay)

        print("Activation:",activation)

        # Train on the device selected at the top of the notebook; a hard-coded
        # "cuda:0" breaks on machines without a GPU.
        trainLostList, trainAccList, valLossList, valAccList  = train_model(model, K, trainset, testset, loss_type, loss_fn, optimizer, num_epochs,
                                                                            batch_size, validate_model = True, performance=accuracy,
                                                                            device=device, lr_scheduler=None)

        fillSubplot(valAccList, num_epochs, activation, axis_for_k[K], "k = " + str(K))

fig.tight_layout()
fig.supxlabel('Epoch')
fig.supylabel('Test Accuracy')
fig.savefig("plots/k13_SGD_ada_BP_reproduced.svg")
plt.show()

In [None]:
# Manual BP vs. DFA with SGD and Adadelta at their stored best learning rates,
# all validation curves in one figure.
K = 3
num_epochs = 20
loss_type = "Binary Cross Entropy"

# Curve label -> (learning rate, training method, B initialisation, optimizer name).
manual_run_args = {
    "BP SGD": (best_manual_bp_sgd, "BP", None, "SGD"),
    "DFA SGD": (best_manual_dfa_sgd_uni, "DFA", "uniform", "SGD"),
    "Adadelta BP": (best_manual_bp_adadelta, "BP", None, "Adadelta"),
    "Adadelta DFA": (best_manual_dfa_adadelta_uni, "DFA", "uniform", "Adadelta"),
}

fig = plt.figure(figsize=(15,9))
for activation, (run_lr, run_method, run_init, run_optim) in manual_run_args.items():
    modelManual3 = MLPManual(K, run_lr, loss_type, run_method, run_init, run_optim, device, measure_alignment, False)

    run_history = train_model_manually(
        modelManual3, K, trainset, testset, loss_type, loss_fn, num_epochs, batch_size,
        momentum, nesterov_momentum, weight_decay, measure_alignment,
        validate_model=True,
        device=device,
    )
    trainLostList, trainAccList, valLossList, valAccList, _, _, _ = run_history

    plotValAccuracy(valAccList, num_epochs, activation, K)

fig.tight_layout()
fig.savefig("plots/k3_best_SGD_delta_BP_DFA.png")
plt.show()

# Build a parity dataset from the training set and plot random data from it.
dataset = MNISTParity(trainset, K, 128)
dataset.plotRandomData()

### Compare BP and DFA with Adaptive methods

In [None]:
def getMeanStd(model, n_runs=3):
    """Train ``model`` repeatedly and collect each run's final validation accuracy.

    Despite its name, this function does not compute a mean or a standard
    deviation; it returns the raw per-run values so the caller can aggregate
    them (e.g. via the error bars of a seaborn bar plot).

    The training settings (``k``, ``trainset``, ``testset``, ``loss_type``,
    ``loss_fn``, ``num_epochs``, ``batch_size``, ``momentum``,
    ``nesterov_momentum``, ``weight_decay``, ``measure_alignment`` and
    ``device``) are read from the notebook's global namespace at call time.

    NOTE(review): the same ``model`` object is handed to every run. Unless
    ``train_model_manually`` re-initialises the weights, runs after the first
    continue from the previous run's weights and are not independent — confirm.

    Args:
        model: model passed to ``train_model_manually``.
        n_runs: number of training repetitions. Defaults to 3, the value that
            used to be hard-coded.

    Returns:
        list: the last entry of the validation-accuracy history of every run,
        in run order (empty when ``n_runs`` is 0).
    """
    final_val_accuracies = []
    for _ in range(n_runs):
        # Only the validation-accuracy history (4th return value) is needed.
        _, _, _, val_acc_history, _, _, _ = train_model_manually(
            model, k, trainset, testset, loss_type, loss_fn, num_epochs,
            batch_size, momentum, nesterov_momentum, weight_decay, measure_alignment,
            validate_model=True, device=device)
        final_val_accuracies.append(val_acc_history[-1])

    return final_val_accuracies

In [None]:
# Main experiment: for every (training method, optimizer) pair, train the k = 3
# model several times and record the final validation accuracy of each run.
k=3
model_bp_sgd = MLPManual(k, best_manual_bp_sgd, loss_type, "BP", None, "SGD", device, measure_alignment)
model_bp_adagrad = MLPManual(k, best_manual_bp_adagrad, loss_type, "BP", None, "Adagrad", device, measure_alignment)
model_bp_rmsprop = MLPManual(k, best_manual_bp_rmsprop, loss_type, "BP", None, "RMSProp", device, measure_alignment)
model_bp_adadelta = MLPManual(k, best_manual_bp_adadelta, loss_type, "BP", None, "Adadelta", device, measure_alignment)

model_dfa_sgd = MLPManual(k, best_manual_dfa_sgd_uni, loss_type, "DFA", B_initialization, "SGD", device, measure_alignment)
model_dfa_adagrad = MLPManual(k, best_manual_dfa_adagrad_uni, loss_type, "DFA", B_initialization, "Adagrad", device, measure_alignment)
model_dfa_rmsprop = MLPManual(k, best_manual_dfa_rmsprop_uni, loss_type, "DFA", B_initialization, "RMSProp", device, measure_alignment)
model_dfa_adadelta = MLPManual(k, best_manual_dfa_adadelta_uni, loss_type, "DFA", B_initialization, "Adadelta", device, measure_alignment)


# Collect one record per (model, run) and build the frame once. Growing a
# DataFrame row by row with .loc is slow and leaves object-dtype columns.
# The loop also no longer rebinds the notebook-level `optim` setting.
records = []
for model in [model_bp_sgd, model_bp_adagrad, model_bp_rmsprop, model_bp_adadelta, model_dfa_sgd, model_dfa_adagrad, model_dfa_rmsprop, model_dfa_adadelta]:
    for final_val_acc in getMeanStd(model):
        records.append({
            "Train_Method": model.train_method,
            "Optimizer": model.optim,
            # float() also unwraps 0-dim tensors / numpy scalars so the column is numeric
            "Results": float(final_val_acc),
        })

df = pd.DataFrame(records, columns=["Train_Method", "Optimizer", "Results"])
df["Error"] = 1 - df["Results"]
df.to_csv("run.csv")

In [None]:
# Bar chart of the test error per optimizer, one bar per training method;
# ci="sd" draws the standard deviation over the repeated runs as error bars.
catplot_options = dict(
    kind="bar",
    x="Optimizer", y="Error", hue="Train_Method",
    ci="sd", palette="dark", alpha=.6,
)
g = sns.catplot(data=df, **catplot_options)

# Cosmetics: labels, title, legend and size.
g.despine(left=True)
g.set_axis_labels("", "Test Error")
g.legend.set_title("")
g.fig.suptitle("BP vs DFA with Adaptive Methods")
g.fig.set_size_inches(12,8)
plt.grid()

# Re-fit the layout after resizing, then export.
g.tight_layout()
g.savefig("plots/mainExperiment.svg")

### t-SNE Experiment

In [None]:
from sklearn.manifold import TSNE

In [None]:
# Train a BP + SGD network on the k = 1 task; the cells below embed its
# hidden representation with t-SNE.
k = 1
optim = "SGD"
learn_rate = best_manual_bp_sgd
modelManual3 = MLPManual(k, learn_rate, loss_type, "BP", None, optim, device, measure_alignment, False)

# The last three return values are not used here.
(
    trainLostList_sgd3_scratch,
    trainAccList_sgd3_scratch,
    valLossList_sgd3_scratch,
    valAccList_sgd3_scratch,
    _, _, _,
) = train_model_manually(
    modelManual3, k, trainset, testset, loss_type, loss_fn, num_epochs, batch_size,
    momentum, nesterov_momentum, weight_decay, measure_alignment,
    validate_model=True, device=device,
)

In [None]:
# Build the parity dataset for the current k and pass its first 5000 samples
# through the trained network using its learned weights w1 / w2.
trainData = MNISTParity(trainset, k, batch_size)
# The call returns three values; the third one (bound to h1) is what the
# following t-SNE cell embeds.
y_hat, a1, h1 =  modelManual3(trainData.data[:5000].to(device), modelManual3.w1, modelManual3.w2)
# Displayed as the cell output to check the shape of h1.
h1.shape

In [None]:
# 2-D t-SNE embedding of h1 (fixed random_state for repeatable plots).
tsne = TSNE(n_components=2, random_state=123, verbose=1)
hidden_activations = h1.detach().cpu().numpy()
z = tsne.fit_transform(hidden_activations)

In [None]:
# Scatter plot of the k = 1 embedding, coloured by the dataset's (binary) targets.
df = pd.DataFrame(z[:, :2], columns=["comp-1", "comp-2"], copy=True)
df.insert(0, "y", trainData.targets[:5000].numpy())

plt.axis("off")
two_class_palette = sns.color_palette("hls", 2)
scatter_ax = sns.scatterplot(data=df, x="comp-1", y="comp-2",
                             hue=df.y.tolist(), palette=two_class_palette)
scatter_ax.set(title="Hidden representation k=1");

In [None]:
# Same k = 1 embedding, this time coloured by the original digit label.
df = pd.DataFrame(z[:, :2], columns=["comp-1", "comp-2"], copy=True)
df.insert(0, "y", trainData.original_target[:5000].numpy())

plt.axis("off")
ten_class_palette = sns.color_palette("hls", 10)
scatter_ax = sns.scatterplot(data=df, x="comp-1", y="comp-2",
                             hue=df.y.tolist(), palette=ten_class_palette)
scatter_ax.set(title="Hidden representation k=1 with Digits");

#### k=3

In [None]:
# Repeat the t-SNE experiment for k = 3: train a BP network with the optimizer
# name currently stored in `optim` and a fixed learning rate of 0.02.
k = 3
learn_rate = 0.02
modelManual3 = MLPManual(k, learn_rate, loss_type, "BP", None, optim, device, measure_alignment, False)

# The last three return values are not used here.
(
    trainLostList_sgd3_scratch,
    trainAccList_sgd3_scratch,
    valLossList_sgd3_scratch,
    valAccList_sgd3_scratch,
    _, _, _,
) = train_model_manually(
    modelManual3, k, trainset, testset, loss_type, loss_fn, num_epochs, batch_size,
    momentum, nesterov_momentum, weight_decay, measure_alignment,
    validate_model=True, device=device,
)

In [None]:
# Build the parity dataset for k = 3 and pass its first 5000 samples through
# the trained network using its learned weights w1 / w2.
trainData = MNISTParity(trainset, k, batch_size)
# The third returned value (bound to h1) is what the following t-SNE cell embeds.
y_hat, a1, h1 = modelManual3(trainData.data[:5000].to(device), modelManual3.w1, modelManual3.w2)

In [None]:
# 2-D t-SNE embedding of h1 for the k = 3 model (fixed random_state).
tsne = TSNE(n_components=2, random_state=123, verbose=1)
hidden_activations = h1.detach().cpu().numpy()
z = tsne.fit_transform(hidden_activations)

In [None]:
# Scatter plot of the k = 3 embedding, coloured by the dataset's (binary) targets.
df = pd.DataFrame(z[:, :2], columns=["comp-1", "comp-2"], copy=True)
df.insert(0, "y", trainData.targets[:5000].numpy())

plt.axis("off")
two_class_palette = sns.color_palette("hls", 2)
scatter_ax = sns.scatterplot(data=df, x="comp-1", y="comp-2",
                             hue=df.y.tolist(), palette=two_class_palette)
scatter_ax.set(title="Hidden representation k=3");

In [None]:
# Keep a handle on the trained model's w1 (presumably the input-to-hidden
# weights — confirm in MLPManual); the cells below slice it per sub-image.
W1 = modelManual3.w1
# Displayed as the cell output to check the shape.
W1.shape

In [None]:
# get the corresponding weights for single images (consider the case k=3)
def getW1ForImage(k, w, image_size=28, num_images=3):
    """Return the rows of weight matrix ``w`` that belong to sub-image ``k``.

    The rows of ``w`` are treated as ``image_size`` consecutive groups of
    ``num_images * image_size`` rows (one group per pixel row of the combined
    input). Within every group, rows ``k * image_size`` to
    ``(k + 1) * image_size - 1`` belong to sub-image ``k``. These runs are
    gathered and stacked in order, giving ``image_size * image_size`` rows.

    Args:
        k: index of the sub-image, ``0 <= k < num_images`` (0 is the left image).
        w: 2-D weight tensor, indexed as ``w[rows, :]``.
        image_size: side length of one square sub-image (default 28).
        num_images: number of sub-images placed side by side (default 3).

    Returns:
        Tensor made of the selected rows of ``w``, stacked with ``torch.vstack``.

    Raises:
        ValueError: if ``k`` is outside ``[0, num_images)``. Previously any
            value other than 0 or 1 was silently treated as the third image.
    """
    if not 0 <= k < num_images:
        raise ValueError(f"image index k must be in [0, {num_images}), got {k}")

    row_stride = image_size * num_images   # weight rows per pixel row of the combined input
    first_row = k * image_size             # offset of sub-image k inside each group
    stop = first_row + image_size * row_stride

    row_blocks = [w[start:start + image_size, :] for start in range(first_row, stop, row_stride)]
    return torch.vstack(row_blocks)

In [None]:
# Per-image slices of W1, moved to the CPU:
# w01 -> left image, w11 -> middle image, w21 -> right image.
w01, w11, w21 = (getW1ForImage(image_idx, W1).cpu() for image_idx in range(3))

In [None]:
# Get the hidden vector of every sub-image on its own: each 28x28 sub-image is
# flattened row by row and multiplied with its slice of W1.
#
# Rows of `data` are ordered block-wise (all left images, then all middle
# images, then all right images). This matches labels built by concatenating
# left_target, middle_target and right_target, as the plotting cells do.
# The former per-sample loop interleaved the rows (left/middle/right of
# sample 0, then of sample 1, ...), so those labels were attached to the wrong
# points.
images = trainData.data[:5000]

hidList = []
for first_col, w_slice in ((0, w01), (28, w11), (56, w21)):
    # (N, 28, 28) sub-images -> (N, 784), same row-major flattening as before
    sub_images = images[:, :, first_col:first_col + 28].flatten(start_dim=1)
    hidList.append(torch.matmul(sub_images, w_slice))

data = torch.vstack(hidList)

In [None]:
# 2-D t-SNE embedding of the per-image hidden vectors (fixed random_state).
tsne = TSNE(n_components=2, random_state=123, verbose=1)
per_image_hidden = data.numpy()
z = tsne.fit_transform(per_image_hidden)

In [None]:
# Scatter plot of the per-image embedding, coloured by digit label.
df = pd.DataFrame(z[:, :2], columns=["comp-1", "comp-2"], copy=True)

# Labels are concatenated as: all left targets, all middle targets, all right targets.
digit_labels = torch.hstack([
    trainData.left_target[:5000],
    trainData.middle_target[:5000],
    trainData.right_target[:5000],
])
df["y"] = digit_labels.numpy()

plt.axis('off')
ten_class_palette = sns.color_palette("hls", 10)
scatter_ax = sns.scatterplot(data=df, x="comp-1", y="comp-2",
                             hue=df.y.tolist(), palette=ten_class_palette)
scatter_ax.set(title="Hidden representation k=3 with Digits");

In [None]:
# Re-colour the same embedding by parity: every digit label is reduced modulo 2.
parity_labels = torch.hstack([
    trainData.left_target[:5000],
    trainData.middle_target[:5000],
    trainData.right_target[:5000],
]) % 2
df["y"] = parity_labels.numpy()

plt.axis('off')
two_class_palette = sns.color_palette("hls", 2)
scatter_ax = sns.scatterplot(data=df, x="comp-1", y="comp-2",
                             hue=df.y.tolist(), palette=two_class_palette)
scatter_ax.set(title="Hidden representation k=3 with Parity");

In [None]:
# faster but out of memory :(
# Imported under its own name: aliasing it to `TSNE` would shadow the sklearn
# TSNE used by the earlier cells and silently change them when re-run.
from tsne_torch import TorchTSNE
X_emb = TorchTSNE(n_components=2, perplexity=30, n_iter=1000, verbose=True).fit_transform(data)