In [1]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import random
from uuid import uuid4
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import getopt
import sys
import os
import math
import time
import argparse
from visdom import Visdom
from tqdm import tqdm
import plotly.express as px
import plotly.graph_objects as go
import pandas as pd

sys.path.insert(0, os.path.join('..', '..'))

import torch as T
from torch.autograd import Variable as var
import torch.nn.functional as F
import torch.optim as optim

from torch.nn.utils import clip_grad_norm_

from dnc.dnc import DNC
from dnc.sdnc import SDNC
from dnc.sam import SAM
from dnc.util import *

from dnc.lib import exp_loss, InputStorage, mse, criterion, CELoss, L1loss, ENDSYM, tensor2string, LEARNABLEOBJECTIVES, LEARNTHISOBJECTIVES, RETURNOTHEROBJ

T.autograd.set_detect_anomaly(True)

<torch.autograd.anomaly_mode.set_detect_anomaly at 0x76a020ba1ab0>

# TODO
- Convert `generate_data` and `InputStorage` so that they work with Torch objects.

In [2]:
# Visdom client handle; the connection check below is intentionally disabled.
viz = Visdom()
# assert viz.check_connection()


# Number base of the addition task: digits are drawn from [0, dataoutputformat) and
# sums are carried modulo this value (see generate_data / calcsum).
dataoutputformat = 2 # binary / mod 2

# When True, one extra class (index 0) is reserved for time steps that carry no digit.
NoneClassOutput = False

# Width of the one-hot target vectors: one class per digit, plus the optional "none" class.
outputformat = dataoutputformat
if NoneClassOutput:
  outputformat = dataoutputformat +1


def llprint(message):
  """Write `message` to standard output without a trailing newline and flush right away."""
  stream = sys.stdout
  stream.write(message)
  stream.flush()



# Shared sample registry (InputStorage comes from dnc.lib). The data generators below query it
# with isSaved(..., flag="testData") so that held-out sequences never end up in training data.
st = InputStorage()

def genSeq(sizeTpl, min=None, max=None, default=True):
  """Draw a random integer array of shape `sizeTpl`.

  With `default=True` the entries are fair coin flips (0 or 1). Otherwise they are drawn
  uniformly from the half-open range [min, max), and both bounds must be given.
  """
  if not default:
    assert min is not None and max is not None
    return np.random.randint(min, max, sizeTpl)
  return np.random.binomial(1, 0.5, sizeTpl)

def calcsum(sequenceA, sequenceB, maxval=dataoutputformat, batch_size=100, length=6):
    """Add two batches of digit sequences with carry, in base `maxval`.

    Each sequence is indexed as seq[sample][position][0], with the least significant digit
    at the last position. The result has shape (batch_size, length + 1): column 0 holds the
    final carry and column p + 1 holds the sum digit for input position p.
    """
    sumsequence = np.zeros((batch_size, length +1))
    assert len(sequenceA) == len(sequenceB)
    for row in range(len(sequenceA)):
      digitsA = sequenceA[row]
      digitsB = sequenceB[row]
      carry = 0 # carry bit
      position = len(digitsA)
      # walk from the least significant digit (last position) to the most significant one
      while position > 0:
          position -= 1
          total = digitsA[position][0] + digitsB[position][0] + carry
          sumsequence[row][position+1] = total % maxval
          carry = total // maxval
      sumsequence[row][0] = carry
    return sumsequence

def generate_data(batch_size, length, maxlength, testoccurance=True, transposeInput=False):
  """Generate a random batch for the addition task.

  Input layout per sample (a maxlength x maxlength matrix, rows are time steps):
    rows 0 .. length-1,           column 0: digits of the first operand
    row  length,                  column 1: ENDSYM
    rows length+1 .. 2*length,    column 2: digits of the second operand
    row  2*length+1,              column 3: ENDSYM
  The target is one-hot over `outputformat` classes and holds the length+1 sum digits
  (carry first) in the last length+1 time steps.

  Args:
    batch_size: number of samples.
    length: number of digits per operand.
    maxlength: number of time steps and feature width of the input matrix.
    testoccurance: if True, operands are redrawn until the sample is not registered in `st`
      under flag "testData".
    transposeInput: if True, every input matrix is transposed.

  Returns:
    (input_data, target_output) as float32 arrays.
  """
  low, high = 0, dataoutputformat # 2= binary, 10=decimal
  second_start = length + 1
  second_stop = length*2 + 1

  input_data = np.zeros((batch_size, maxlength, maxlength), dtype=np.float32)
  target_output = np.zeros((batch_size, maxlength, outputformat), dtype=np.float32)
  sequence1 = genSeq((batch_size, length, 1), min=low, max=high, default=False)
  sequence2 = genSeq((batch_size, length, 1), min=low, max=high, default=False)

  if testoccurance: # make sure no generated sample collides with the held-out test data
    for sample in range(batch_size):
      candidate = np.zeros((1, maxlength, maxlength), dtype=np.float32)
      candidate[0, 0:length, 0:1] = sequence1[sample] # first sequence
      candidate[0, length, 1] = ENDSYM # pause
      candidate[0, second_start:second_stop, 2:3] = sequence2[sample] # second sequence
      candidate[0, second_stop, 3] = ENDSYM # pause
      while st.isSaved(candidate[0], flag="testData"):
        redraw_first = np.random.binomial(1, 0.5, 1) == 1
        if redraw_first: # replace first sequence
          sequence1[sample] = genSeq((length, 1), min=low, max=high, default=False)
          candidate[0, 0:length, 0:1] = sequence1[sample]
        else: # replace second sequence
          sequence2[sample] = genSeq((length, 1), min=low, max=high, default=False)
          candidate[0, second_start:second_stop, 2:3] = sequence2[sample]

  input_data[:, 0:length, 0:1] = sequence1 # first sequence
  input_data[:, length, 1] = ENDSYM # pause
  input_data[:, second_start:second_stop, 2:3] = sequence2 # second sequence
  input_data[:, second_stop, 3] = ENDSYM # pause
  if transposeInput:
    for sample in range(batch_size):
      input_data[sample] = np.transpose(input_data[sample]).copy()

  cs = calcsum(sequence1, sequence2, maxval=dataoutputformat, batch_size=batch_size, length=length)
  offset = 1 if NoneClassOutput else 0 # class 0 is reserved for "no digit" when enabled
  n_digits = cs.shape[1]

  for sample in range(batch_size):
    # right-align the sum digits at the end of the target sequence
    for back in range(n_digits, 0, -1):
      target_output[sample, -back, offset+int(cs[sample,-back])] = 1

    if NoneClassOutput:
      # every time step without a digit is labelled with the "none" class
      for step in range(target_output.shape[1]):
        if np.sum(target_output[sample, step]) == 0:
          target_output[sample, step, 0] = 1

  return input_data, target_output




def combLoss(prediction, target):
  """Mean over the batch of `CELoss` between each sample's predictions and the
  arg-max class labels of its one-hot target."""
  n_samples = prediction.shape[0]
  per_sample = (
    CELoss(prediction[k], target[k].argmax(dim=1))
    for k in range(n_samples)
  )
  return sum(per_sample)/n_samples

def incrementCurriculum(trainError, epoch, sequence_length, maxsequence_length, curriculum_fre):
  """Tell whether the curriculum should advance at this epoch.

  True only for a non-zero epoch that is a multiple of `curriculum_fre` while
  `sequence_length` is still below `maxsequence_length`. `trainError` is not consulted.
  """
  if not (epoch != 0):
    return False
  if not (sequence_length < maxsequence_length):
    return False
  return epoch % curriculum_fre == 0

def calcAccuracy(prediction, target, total=False):
  """Fraction of time steps whose predicted class matches the target class.

  Leading time steps whose target is all-zero across the whole batch are ignored; scoring
  starts at the first time step that carries any target mass.

  Args:
    prediction: tensor indexed as (batch, time, class).
    target: one-hot tensor with the same layout.
    total: if True, return 1 when every scored step is correct and 0 otherwise.

  Returns:
    The accuracy as a Python float, or 0/1 when `total` is True.

  Raises:
    TypeError: if `prediction` or `target` is not a torch tensor.
  """
  if not isinstance(prediction, T.Tensor):
    raise TypeError("prediction is not a tensor")
  if not isinstance(target, T.Tensor):
    raise TypeError("target is not a tensor")

  # index of the first time step with a non-zero target; 0 when the target is empty throughout
  firstone = next((i for i in range(target.shape[1]) if target[:, i].sum() != 0), 0)

  hits = prediction[:, firstone:].argmax(dim=2) == target[:, firstone:].argmax(dim=2)
  accuracy = hits.float().mean().item()

  if total:
    return 1 if accuracy == 1 else 0
  return accuracy

#calcAccuracy(T.tensor([[[0,1],[0,0]]]), T.tensor([[[0,1],[0,0]]]))
  

Setting up a new session...


In [3]:
#d = generate_data(1, 3, 9, testoccurance=False)
#print(d)

In [4]:
import copy
from dnc.lib import STEPBYSTEPOBJ
import pickle

import os

# --- Training schedule -------------------------------------------------------------------
# Samples per epoch; the training loop splits them into smaller buckets per optimizer step.
batch_size = 100#int(1360*(1-0.15))-1
# Upper bound (exclusive) of the operand lengths requested from getTrainingData.
sequence_length = 8
# Longest operand the model is sized for; also bounds the generated train/test data.
sequence_max_length = 8
iterations = int(2*10**3) #200000
# Validation loss/accuracy are recomputed every `summarize_freq` epochs.
summarize_freq = int(iterations/100)
# Checkpoint interval in epochs.
check_freq = int(iterations/10)
# Curriculum interval in epochs (printed here; its use is outside this part of the notebook).
curriculum_freq = int(iterations/10)

print("batch_size", batch_size)
print("sequence_length", sequence_length)
print("iterations", iterations)
print("summarize_freq", summarize_freq)
print("check_freq", check_freq)
print("curriculum_freq", curriculum_freq)


  # input_size = output_size = args.input_size
# --- Model / memory dimensions -----------------------------------------------------------
# Passed to DNC as nr_cells.
mem_slot = 40#48 #112
# Passed to DNC as cell_size.
mem_size = 1
read_heads = 1
curriculum_increment = 1
# Used both as the number of time steps and as the feature width of every input matrix.
input_size = 3*sequence_max_length + 2
# Base width of the controller; the DNC is built with hidden_size = output_size*3.
output_size = 64

replaceWithWrong = True

# Passed to DNC as num_layers; the loss function also groups interface rows in chunks of this size.
num_layers = 3 #5 

print("input_size", input_size)
print("output_size", output_size)
print("mem_slot", mem_slot)
print("mem_size", mem_size)
print("read_heads", read_heads)
print("num_layers", num_layers)


# mem operations = input_size*num_layers

batch_size 100
sequence_length 8
iterations 2000
summarize_freq 20
check_freq 200
curriculum_freq 200
input_size 26
output_size 64
mem_slot 40
mem_size 1
read_heads 1
num_layers 3


In [5]:
def create_directory_if_not_exists(directory_path):
    """Create `directory_path` (including missing parents) unless it already exists,
    and report on stdout which of the two happened."""
    if os.path.exists(directory_path):
        print("Directory already exists.")
        return
    os.makedirs(directory_path)
    print("Directory created successfully!")

# Run identifier: "add_" plus the first three hex characters of a random UUID. It doubles as
# the output directory for the log files written by the later cells.
name = 'add_' + str(uuid4().hex)[:3] + ''

lastcp = None

create_directory_if_not_exists(name)

# One record per epoch is appended to this list by the training loop.
datas = []

print(name)

# Path of a checkpoint to resume from, or False to start from freshly initialised weights.
loadcp = False#'add_868/checkpoint_500.pth'#'add_dc0/checkpoint_1500.pth'#False#'add_e9a/checkpoint_1000.pth'#'add_587/checkpoint_1000.pth' #= 'checkpoint_add_46_242000.pth

print(input_size, output_size)
print(name)

Directory created successfully!
add_e26
26 64
add_e26


In [6]:
def lossfunction(x, y):
  # Single switch point for the distance used by all auxiliary (memory-interface) losses in
  # lossfnwithReturnOther; currently delegates to `mse` imported from dnc.lib.
  return mse(x, y)

def lossfnwithReturnOther(output, target, otherReturn, OneAndZero=False, writegateThreshold=0.0, printlosses=False, returnTuple=False, learnthisobjective=LEARNTHISOBJECTIVES, alwaysone=True, epoch=1):
  """Combine the task loss with auxiliary losses on the model's memory-interface tensors.

  Args:
    output: model predictions; only the last `sequence_max_length + 1` time steps are scored.
    target: one-hot targets aligned with `output`.
    otherReturn: dict of interface tensors returned by the model. Keys read here:
      "allocation_weights", "allocation_gate", "write_gate", "read_modes",
      "write_weights" and "usage_vector". Entries that are not tensors are skipped.
    OneAndZero: if True the allocation gate is pushed towards either 0 or 1 instead of
      towards 1.
    writegateThreshold: accepted for backward compatibility; not used.
    printlosses: print every partial loss to stdout.
    returnTuple: return the partial losses as a tuple instead of their sum.
    learnthisobjective: dict of booleans switching each partial loss on or off (keys:
      "general_loss" plus the interface keys listed above).
    alwaysone: accepted for backward compatibility; not used.
    epoch: current epoch; only controls how often debug dumps are appended to
      `{name}/output_lossfn.txt`.

  Returns:
    With `returnTuple=True` the tuple
    (base, allocation_weights, usage_vector, allocation_gate, write_gate, read_modes,
    write_weights); otherwise the sum of these seven terms. Disabled terms are the int 0.
  """
  logpath = f'{name}/output_lossfn.txt'

  def active(key):
    # an objective counts only if it is switched on AND the model returned a tensor for it
    return learnthisobjective[key] and isinstance(otherReturn[key], T.Tensor)

  def dump_chunks(tensor, lossfile, *label):
    # first batch element only, `num_layers` consecutive rows per input step
    for step in range(tensor.shape[1] // num_layers):
      print(*label, step, file=lossfile)
      print(tensor[0, step*num_layers:(step+1)*num_layers], file=lossfile)

  def log_target(tensor, title, every):
    if epoch % every == 0:
      with open(logpath, 'a') as lossfile:
        print(f"{title} (target): ", tensor.shape, file=lossfile)
        dump_chunks(tensor, lossfile, f"{title} (target) Input ")

  def slot_targets(reference):
    # Row i must address memory slot (i // num_layers) % nr_slots. The rule does not depend
    # on the batch index, so the whole batch is filled with one assignment per row.
    tgt = T.zeros_like(reference)
    nr_slots = tgt.shape[3]
    for i in range(tgt.shape[1]):
      tgt[:, i, :, (i // num_layers) % nr_slots] = 1
    return tgt

  if epoch % 50 == 0:
    with open(logpath, 'a') as lossfile:
      print("EPOCH: ", epoch, file=lossfile)
      for key in otherReturn.keys():
        if isinstance(otherReturn[key], T.Tensor):
          print(key, otherReturn[key].shape, file=lossfile)
          dump_chunks(otherReturn[key], lossfile, key, " Input ")

  sf = 100 # scale factor for softmax
  base = 0
  if learnthisobjective["general_loss"]:
    smoutput = output[:, -(sequence_max_length+1):, :]
    smtarget = target[:, -(sequence_max_length+1):, :]
    # summed (not averaged) over the batch
    for sample in range(smoutput.shape[0]):
      base += CELoss(smoutput[sample], smtarget[sample].argmax(dim=1))

  allocation_weight_loss = 0
  if active("allocation_weights"):
    alloc_target = slot_targets(otherReturn["allocation_weights"])
    log_target(alloc_target, "Allocation Weights", every=50)
    allocation_weight_loss = lossfunction(otherReturn["allocation_weights"], alloc_target)

  allocation_gate_loss = 0
  if active("allocation_gate"):
    gate = otherReturn["allocation_gate"]
    if OneAndZero:
      # ((g - 0.5) * 2) ** 2 equals 1 exactly at g = 0 and g = 1
      allocation_gate_loss = lossfunction(T.pow(((gate-0.5)*2),2), T.ones_like(gate))
    else: # no writing by content similarity
      allocation_gate_loss = lossfunction(gate, T.ones_like(gate))

  write_gate_loss = 0
  if active("write_gate"):
    write_gate_loss = lossfunction(otherReturn["write_gate"], T.ones_like(otherReturn["write_gate"]))

  read_modes_loss = 0
  if active("read_modes"):
    # target is a sharpened, detached copy of the current read modes
    sharpened = T.nn.functional.softmax(otherReturn["read_modes"].clone().detach()*sf, 3)
    read_modes_loss = lossfunction(otherReturn["read_modes"], sharpened)

  write_weights_loss = 0
  if active("write_weights"):
    write_target = slot_targets(otherReturn["write_weights"])
    write_weights_loss = lossfunction(otherReturn["write_weights"], write_target)
    log_target(write_target, "Write Weights", every=10)

  usage_vector_loss = 0
  if active("usage_vector"):
    uv = T.zeros_like(otherReturn["usage_vector"])
    # the slot addressed at row i counts as used from row i+1 onwards
    for i in range(uv.shape[1]-1):
      uv[:, i+1:, (i // num_layers) % uv.shape[2]] = 1
    usage_vector_loss = lossfunction(otherReturn["usage_vector"], uv)
    log_target(uv, "Usage Vector", every=10)

  if printlosses:
    print("losses: ", base, "\n Allocation Weight ", allocation_weight_loss,  "\n allocation gate", allocation_gate_loss, "\n write gate", write_gate_loss, "\n read modes", read_modes_loss,"\n write weights", write_weights_loss, "\n usage vector", usage_vector_loss)

  if returnTuple:
    return base, allocation_weight_loss, usage_vector_loss, allocation_gate_loss, write_gate_loss, read_modes_loss, write_weights_loss
  return base + allocation_weight_loss + allocation_gate_loss + write_gate_loss + read_modes_loss + write_weights_loss + usage_vector_loss



otherkey:  allocation_weights
torch.Size([100, 80, 1, 48])
otherkey:  allocation_gate
torch.Size([100, 80, 1])
otherkey:  write_gate
torch.Size([100, 80, 1])
otherkey:  write_weights
torch.Size([100, 80, 1, 48])
otherkey:  read_modes
torch.Size([100, 80, 1, 48])
otherkey:  read_weights
torch.Size([100, 80, 1, 48])
otherkey:  free_gates
torch.Size([100, 80, 1])
otherkey:  erase_vector
torch.Size([100, 80, 1, 1])
otherkey:  write_vector
torch.Size([100, 80, 1, 1])
otherkey:  usage_vector
torch.Size([100, 80, 48])

In [7]:
class CaclulateFactors:
  """Keeps a per-epoch history of the partial losses and derives one weight per loss term.

  Calling the instance records the current partial losses and returns a tensor of
  `nofactors` weights. With `justOnes=True` the weights are constant ones; otherwise they are
  adapted from the recent loss history. In both cases the result is post-processed by the
  optional softmax, size adjustment and the fixed `factadjust` multipliers.
  """

  def __init__(self, iterations, nofactors=7, justOnes=False, softmax=False, softmaxTemp=1, minimum=0.01, maximum=100, resetepoch=50, sizeadjust=False, factadjust=None):
    """
    Args:
      iterations: number of epochs; the loss history has iterations + 1 rows.
      nofactors: number of partial losses / weights.
      justOnes: always use weights of one (before post-processing).
      softmax: apply a softmax with temperature factor `softmaxTemp` to the weights.
      softmaxTemp: multiplier applied to the weights before the softmax.
      minimum: lower clamp for adapted weights.
      maximum: adapted weights are rescaled once any of them exceeds this value.
      resetepoch: accepted for backward compatibility; not used.
      sizeadjust: rescale each weight by the ratio of total to per-term historic loss.
      factadjust: optional tensor of fixed multipliers applied last; ignored unless it is
        a tensor.
    """
    self.losses = T.zeros((iterations+1, nofactors))
    self.factors = T.ones(nofactors)
    self.noFactors = nofactors
    self.justOnes = justOnes
    self.softmax = softmax
    self.softmaxTemp = softmaxTemp
    self.minimum = minimum
    self.maximum = maximum
    self.sizeAdjust = T.ones(nofactors)
    self.performsizeadjust = sizeadjust
    self.adjusttwice = isinstance(factadjust, T.Tensor)
    # always defined so the attribute can be inspected even when no multipliers are given
    self.factadjust = factadjust if self.adjusttwice else None
    if self.adjusttwice:
      print("Adjusting twice")
      print(factadjust)

  def setFactors(self, factors):
    self.factors = factors
  
  def setJustOnes(self, justOnes):
    self.justOnes = justOnes

  def reset(self):
    self.factors = T.ones(self.noFactors)

  def _finalize(self, ret):
    # Shared post-processing of every return path: softmax, size adjustment, fixed multipliers.
    if self.softmax:
      ret = T.nn.functional.softmax(ret*self.softmaxTemp, 0)
    if self.performsizeadjust:
      ret = ret * self.sizeAdjust
    if self.adjusttwice:
      ret = ret * self.factadjust
    return ret

  # NOTE: the default of `resetinterval` is evaluated once, when this class is defined, from
  # the notebook-level `iterations`; re-run this cell after changing `iterations`.
  def __call__(self, epoch, currentlosses, lookback=5, resetinterval=int(iterations//50), rescaleby=2):
    """Record `currentlosses` for `epoch` and return the weights to apply to them.

    Args:
      epoch: row of the loss history to write.
      currentlosses: sequence of partial losses; entries that are not tensors are ignored.
      lookback: number of previous epochs the adaptation compares against.
      resetinterval: adapted weights are reset to one every `resetinterval` epochs;
        0 disables the periodic reset.
      rescaleby: divisor applied to all weights once one of them exceeds `maximum`.
    """
    if self.performsizeadjust and epoch > 1:
      # NOTE(review): for epoch > lookback this window spans rows lookback..epoch-1 (everything
      # since row `lookback`), not the last `lookback` rows — confirm this is intended.
      lb = max(1, epoch-lookback)
      sumlosses = T.sum(self.losses[epoch-lb:epoch]) / lb
      devisors = T.sum(self.losses[epoch-lb:epoch], dim=0)/lb
      devisors = T.where(devisors < 10**-2, 10**-2, devisors)
      self.sizeAdjust = T.nan_to_num(sumlosses / devisors, nan=1, posinf=1, neginf=1)
      self.sizeAdjust = T.where(self.sizeAdjust > 10**2, 10**2, self.sizeAdjust)
      self.sizeAdjust = T.where(self.sizeAdjust < 10**-2, 10**-2, self.sizeAdjust)
      self.sizeAdjust = T.where(self.sizeAdjust == 0, 1, self.sizeAdjust)

    for i, value in enumerate(currentlosses):
      if isinstance(value, T.Tensor):
        self.losses[epoch,i] = value.detach().item()

    if self.justOnes:
      return self._finalize(T.ones(self.noFactors))

    # Reset to uniform weights when at most one loss is still above 1, during the first
    # epochs, or at every `resetinterval`-th epoch (a zero interval disables the latter
    # instead of dividing by zero).
    few_large_losses = int((self.losses[epoch] > 1).sum()) <= 1
    if few_large_losses or epoch <= 1 or (resetinterval != 0 and (epoch % resetinterval) == 0):
      self.factors = T.ones(self.noFactors)
      return self._finalize(self.factors.clone())

    oldlosses = self.losses[max(0,epoch-lookback):epoch]
    # history entries that are exactly zero are replaced by the current loss of that term
    oldlosses = T.where(oldlosses == 0, self.losses[epoch].unsqueeze(1).T.expand(oldlosses.shape), oldlosses)
    omeandist = T.mean(oldlosses, dim=0)-self.losses[epoch]
    # order of magnitude of the historic loss, used to normalise the improvement
    firstsignificant = T.where(omeandist > 1.2, 0, T.floor(T.nan_to_num(T.log10(T.mean(T.abs(oldlosses), dim=0)), nan=0, posinf=0, neginf=0)))

    omeandist = omeandist / 10**firstsignificant
    meandist = T.where(omeandist > 1.2, 2.01, omeandist)
    meandist = T.where(omeandist < 0, 0.01, meandist)
    # large improvement -> multiplier just below 1, no improvement -> multiplier close to 3
    meandist = (2-meandist)+1

    newfactors = self.factors*meandist
    if T.any((newfactors > self.maximum)):
      newfactors = newfactors / rescaleby
    newfactors[self.losses[epoch] < 10**-4] = 1 # if loss is zero, keep factor at 1
    newfactors[newfactors < self.minimum] = self.minimum

    self.factors = newfactors
    return self._finalize(self.factors.clone())



In [8]:
def CalcLossonValidationData(st, rnn, mhx, batch_size, input_size):
  """Evaluate `rnn` on all samples stored in `st` under flag "testData".

  The test set is processed in chunks of `batch_size`. A final partial chunk is padded with
  randomly re-drawn test samples, so those samples are slightly over-weighted in the mean.

  Args:
    st: storage object providing getDataByFlag("testData"); each entry is a mapping with
      keys "input" and "output".
    rnn: the model to evaluate.
    mhx: memory state handed to the model (None for a fresh memory).
    batch_size: number of samples per forward pass.
    input_size: number of time steps and feature width of every input matrix.

  Returns:
    (mean loss, mean accuracy) over all chunks.

  Raises:
    ValueError: if no test data is stored.
  """
  testset = st.getDataByFlag("testData") # get test data
  if len(testset) == 0:
    raise ValueError("No test data available")

  # Ceiling division: exactly as many chunks as needed. (The previous
  # int(len / batch_size) + 1 added a chunk made purely of random duplicates whenever the
  # test set size was an exact multiple of batch_size.)
  n_chunks = (len(testset) + batch_size - 1) // batch_size

  testlosses = []
  testaccuracy = []
  with T.no_grad(): # evaluation only, no autograd graph required
    for k in range(n_chunks):
      input_TEST_data = np.zeros((batch_size, input_size, input_size))
      target_TEST_output = np.zeros((batch_size, input_size, outputformat))
      for i in range(batch_size):
        position = k*batch_size + i
        # pad the last chunk with random test samples once the real ones are used up
        sample = testset[position] if position < len(testset) else random.choice(testset)
        input_TEST_data[i] = sample["input"]
        target_TEST_output[i] = sample["output"]

      input_TEST_data = T.from_numpy(input_TEST_data).type(T.float32)
      target_TEST_output = T.from_numpy(target_TEST_output).type(T.float32)
      # the model returns 4 values in debug mode and 3 otherwise; the prediction comes first
      TEST_output = rnn(input_TEST_data, (None, mhx, None), reset_experience=True, pass_through_memory=True, retOther=True)[0]

      testlosses.append(combLoss(TEST_output, target_TEST_output).item())
      testaccuracy.append(calcAccuracy(TEST_output, target_TEST_output))

  Testloss = np.mean(testlosses) # calculate test loss mean
  Testaccuracy = np.mean(testaccuracy) # calculate test accuracy mean
  return Testloss, Testaccuracy
    

In [9]:
def generateTrainingData(sequenceMaxLen, st):
    """Enumerate addition samples for every operand length in [1, sequenceMaxLen).

    Samples that `st` already holds under flag "testData" are left out.

    Args:
        sequenceMaxLen: exclusive upper bound of the operand length.
        st: storage object providing isSaved(input, flag=...).

    Returns:
        dict mapping operand length -> list of {"input": array of shape
        (1, input_size, input_size), "target": array of shape (1, input_size, outputformat)}.
    """
    def _bits(value, width):
        # binary digits of `value`, least significant first, shaped (1, width, 1)
        return np.array([(value >> k) & 1 for k in range(width)], dtype=np.float64).reshape(1, width, 1)

    Traindata = {}
    offset = 1 if NoneClassOutput else 0 # class 0 is reserved for "no digit" when enabled

    for seqLen in tqdm(range(1, sequenceMaxLen)):
        Traindata[seqLen] = []
        # NOTE(review): only the low `seqLen` bits of i and j are used below, so with
        # outputformat > 1 every operand pair is produced outputformat**2 times — confirm
        # whether range(2**seqLen) was intended.
        inputspace = outputformat*2**seqLen
        for i in range(inputspace):
            sequenceA = _bits(i, seqLen) # does not depend on j, built once per i
            for j in range(inputspace):
                sequenceB = _bits(j, seqLen)
                # digits are added in base `dataoutputformat`, exactly as in generate_data
                sumsequence = calcsum(sequenceA, sequenceB, maxval=dataoutputformat, batch_size=1, length=seqLen)
                input_data = np.zeros((1, input_size, input_size))
                target_output = np.zeros((1, input_size, outputformat))
                input_data[0, 0:seqLen, 0:1] = sequenceA
                input_data[0, seqLen, 1] = ENDSYM
                input_data[0, seqLen+1:seqLen*2+1, 2:3] = sequenceB
                input_data[0, seqLen*2+1, 3] = ENDSYM

                if st.isSaved(input_data[0], flag="testData"):
                    continue

                # right-align the sum digits (carry first) at the end of the target sequence
                for back in range(sumsequence.shape[1], 0, -1):
                    target_output[0, -back, offset+int(sumsequence[0,-back])] = 1
                if NoneClassOutput:
                    for step in range(target_output.shape[1]):
                        if np.sum(target_output[0, step]) == 0:
                            target_output[0, step, 0] = 1
                Traindata[seqLen].append({"input": input_data, "target": target_output})
    return Traindata


def getTrainingData(storedTrainingData, sequenceMinLen, sequenceLen, shuffle=True, batchsize=100, outputbatchsize=100, factors=None):
    """Assemble one training batch from the pre-generated samples.

    Args:
        storedTrainingData: dict mapping operand length -> list of {"input", "target"}
            samples. The per-length lists are shuffled in place.
        sequenceMinLen: smallest operand length to use (inclusive).
        sequenceLen: largest operand length to use (exclusive).
        shuffle: shuffle the pooled samples before filling the batch.
        batchsize: number of rows to fill; None means "as many as there are pooled samples".
        outputbatchsize: number of rows of the returned arrays; rows that are not filled
            stay zero.
        factors: 1-D tensor with one value in [0, 1] per operand length giving the fraction
            of that length's samples to pool. None, or a tensor of the wrong size or range,
            falls back to 1 for every length.

    Returns:
        (input_data, target_output) as float64 arrays of shape
        (outputbatchsize, input_size, input_size) and (outputbatchsize, input_size, outputformat).

    Raises:
        ValueError: if rows have to be filled but no sample was pooled.
    """
    n_lengths = max(0, sequenceLen - sequenceMinLen)
    factors_valid = (
        isinstance(factors, T.Tensor)
        and factors.numel() == n_lengths
        and not bool((factors < 0).any())
        and not bool((factors > 1).any())
    )
    if not factors_valid:
        if factors is not None:
            print("Invalid factors, using default factors")
        factors = T.ones(n_lengths)
    factors = factors.reshape(-1)

    allTrainData = []
    for position, seqLen in enumerate(range(sequenceMinLen, sequenceLen)):
        random.shuffle(storedTrainingData[seqLen])
        choosesize = int(len(storedTrainingData[seqLen]) * factors[position].item())
        print("Sequence Length: ", seqLen, " Choosesize: ", choosesize)
        allTrainData.extend(storedTrainingData[seqLen][0:choosesize])

    print("Possible Training Data: ", len(allTrainData))
    if shuffle:
        random.shuffle(allTrainData)
    if batchsize is None:
        batchsize = len(allTrainData)

    input_data = np.zeros((outputbatchsize, input_size, input_size))
    target_output = np.zeros((outputbatchsize, input_size, outputformat))
    rows_to_fill = min(batchsize, outputbatchsize)
    if rows_to_fill > 0 and len(allTrainData) == 0:
        raise ValueError("No training data selected for the requested sequence lengths")
    for i in range(rows_to_fill):
        # once the pooled samples are exhausted, re-draw random ones to fill the batch
        source = i if i < len(allTrainData) else random.randint(0, len(allTrainData)-1)
        input_data[i] = allTrainData[source]["input"]
        target_output[i] = allTrainData[source]["target"]
    return input_data, target_output

def zeroslike(arr):
    """Return a zero-filled array of the same shape and dtype as `arr`.

    Works for torch tensors and numpy arrays and returns the matching container type.

    Raises:
        TypeError: for any other input type (previously this silently returned None).
    """
    if isinstance(arr, T.Tensor):
        return T.zeros_like(arr)
    if isinstance(arr, np.ndarray):
        return np.zeros_like(arr)
    raise TypeError(f"zeroslike expects a torch.Tensor or numpy.ndarray, got {type(arr).__name__}")
    
def shuffleNPArrays(arr1, arr2):
    """Shuffle two arrays along their first axis with the same random permutation.

    Row i of `arr1` and row i of `arr2` end up at the same position in the results, so
    input/target pairs stay aligned. Accepts numpy arrays and torch tensors; the inputs are
    not modified.

    Returns:
        (shuffled arr1, shuffled arr2) as new arrays of the same type, shape and dtype.

    Raises:
        ValueError: if the two arrays differ in their first dimension.
    """
    size0 = arr1.shape[0]
    if arr2.shape[0] != size0:
        raise ValueError("arr1 and arr2 must have the same number of rows")

    permutation = list(range(size0))
    random.shuffle(permutation)

    def _take(arr):
        # explicit index arrays also behave for an empty permutation
        if isinstance(arr, T.Tensor):
            return arr[T.tensor(permutation, dtype=T.long)]
        return arr[np.asarray(permutation, dtype=np.intp)]

    return _take(arr1), _take(arr2)






In [10]:
def testshufflenparrays():
    """Manual smoke check: print two small paired arrays before and after shuffleNPArrays."""
    first = np.array([[1,2,3],[4,5,6],[7,8,9]])
    second = np.array([[10,20,30],[40,50,60],[70,80,90]])
    for arr in (first, second):
        print(arr)
    first, second = shuffleNPArrays(first, second)
    for arr in (first, second):
        print(arr)

#testshufflenparrays()

In [None]:
# Experiment switches.
#   address_every_slot: forwarded unchanged to the DNC constructor.
#   factadjust: fixed multiplier per partial loss, in the order returned by
#               lossfnwithReturnOther(returnTuple=True); the first entry is the task loss.
settings = { 
  "address_every_slot": False,
  "factadjust": T.Tensor([1, 0, 0, 0, 0, 0, 0]),
}
modifcations = True
if modifcations:
  settings["address_every_slot"] = True
  settings["factadjust"] = T.Tensor([5,1,1,1,1,1,1])

import datetime
import select


def _build_dnc():
  """Build a DNC with this notebook's hyper-parameters.

  The training model and its evaluation twin must have identical architectures so that
  weights can be copied between them with load_state_dict; building both through this
  single function keeps them in sync.
  """
  return DNC(
    input_size=input_size,
    hidden_size=output_size*3, #new *2
    output_size=outputformat, #new binary: 3 -> none, 0, 1
    rnn_type='lstm',
    num_layers=num_layers,
    num_hidden_layers=2, #1
    dropout=0,
    nr_cells=mem_slot,
    cell_size=mem_size,
    read_heads=read_heads,
    gpu_id=-1,
    # NOTE(review): a non-empty string, hence truthy wherever `rnn.debug` is tested; it looks
    # like a leftover argparse action name — confirm that True was meant.
    debug='store_true',
    batch_first=True,
    independent_linears=True,
    nonlinearity='celu', #tanh
    address_every_slot=settings["address_every_slot"]
  )


# model that is trained
rnn = _build_dnc()

# separate instance used for validation; it receives a copy of rnn's weights before each evaluation
testrnn = _build_dnc()

with open(f'{name}/output.txt', 'a') as f:
  print(name)
  print(name, file=f)
  
  
  
  if loadcp != False:
    rnn.load_state_dict(T.load(loadcp, weights_only=True))
    rnn.eval()
    rnn.train()
  
  print(rnn)
  print(rnn, file=f)

  last_save_losses = []

  optimizer = optim.Adam(rnn.parameters(), lr=0.001, eps=1e-9, betas=[0.9, 0.98]) # 0.0001
 
  for i in range(2, sequence_max_length,1): # generate test data
    inputdataspace = 2**i*dataoutputformat # 2 i bit sequences
    testdatasize = int(inputdataspace*0.05)+1 #5%
    input_data, target_output = generate_data(testdatasize, i, input_size)
    for i in range(testdatasize):
      st.saveInput(input_data[i], output=target_output[i], withoutIncrement=True, flag="testData") #saveData

  storedTrainingData = generateTrainingData(sequence_max_length, st) # generate training data


  (chx, mhx, rv) = (None, None, None)
  Testloss = 0 # loss of test data
  
  learnthisobjective = copy.deepcopy(LEARNTHISOBJECTIVES)
  for key in learnthisobjective.keys():
    learnthisobjective[key] = True #all False


  learnthisobjective["read_modes"] = False



  factors = CaclulateFactors(iterations, nofactors=7, justOnes=True, 
                             softmax=False, softmaxTemp=1*10**-3, sizeadjust=False, 
                             factadjust=settings["factadjust"]
                             )

  lastobjectivechange = 0
  optimizerdict = optimizer.state_dict()


  learnthisobjextivecounter = 0

  start_time = datetime.datetime.now()

  NPinput_data, NPtarget_output = getTrainingData(storedTrainingData, 
                                                  1, sequence_length, 
                                                  shuffle=True, batchsize=None, 
                                                  outputbatchsize=batch_size, 
                                                  factors=T.Tensor([1,1,1,1,1,0.5,0.25])
                                                  )
  
  for epoch in tqdm(range(iterations + 1)):
    elapsed = datetime.datetime.now() - start_time
    rate = (elapsed.total_seconds() / (epoch + 1)) if epoch > 0 else 0
    remaining_time = datetime.timedelta(seconds=rate * (iterations + 1 - epoch - 1))
    finish_time = start_time + elapsed + remaining_time
    tqdm.write(f"Epoch {epoch}/{iterations+1} | ETA: {finish_time.strftime('%Y-%m-%d %H:%M:%S')}")



    summarize = (epoch % summarize_freq == 0)
    take_checkpoint = (epoch != 0) and (epoch % check_freq == 0)


    input_data, target_output = shuffleNPArrays(NPinput_data, NPtarget_output)
    input_data = var(T.from_numpy(input_data)).type(T.float32)
    target_output = var(T.from_numpy(target_output)).type(T.float32)


    bucket = 5
    smallbatchsize = int(batch_size/bucket)

    loss = 0
    accuracy = 0
    currentlosses = (0,0,0,0,0,0,0)
    base = 0
    allocation_weight_loss = 0
    cwl = 0
    allocation_gate_loss = 0
    write_gate_loss = 0
    read_modes_loss = 0
    write_content_weights_loss = 0
    Rloss = 0
    partiallosses = T.zeros(7)
    currfactors = T.zeros(7)

    for i in range(bucket):
      optimizer.zero_grad()
      currentinput = input_data[i*smallbatchsize:(i+1)*smallbatchsize]
      currenttarget = target_output[i*smallbatchsize:(i+1)*smallbatchsize]
      if rnn.debug:
        currentoutput, (chx, mhx, rv), v, otherReturn = rnn(currentinput, (None, mhx, None), reset_experience=True, pass_through_memory=True, retOther=True)
      else:
        currentoutput, (chx, mhx, rv), otherReturn = rnn(currentinput, (None, mhx, None), reset_experience=True, pass_through_memory=True, retOther=True)
      currentloss = combLoss((currentoutput), currenttarget)
      currentaccuracy = calcAccuracy(currentoutput, currenttarget)
      loss += currentloss.item()
      accuracy += currentaccuracy
      currentlosses = lossfnwithReturnOther(currentoutput, currenttarget, otherReturn, printlosses=epoch % 13 == 0, returnTuple=True, learnthisobjective=learnthisobjective, epoch=epoch)
      curbase, curallocation_weight_loss, curcwl, curallocation_gate_loss, curwrite_gate_loss, curread_modes_loss, curwrite_content_weights_loss = currentlosses
      base += curbase.detach() if isinstance(curbase, T.Tensor) else 0
      allocation_weight_loss += curallocation_weight_loss.detach() if isinstance(curallocation_weight_loss, T.Tensor) else 0
      cwl += curcwl.detach() if isinstance(curcwl, T.Tensor) else 0
      allocation_gate_loss += curallocation_gate_loss.detach() if isinstance(curallocation_gate_loss, T.Tensor) else 0
      write_gate_loss += curwrite_gate_loss.detach() if isinstance(curwrite_gate_loss, T.Tensor) else 0
      read_modes_loss += curread_modes_loss.detach() if isinstance(curread_modes_loss, T.Tensor) else 0
      write_content_weights_loss += curwrite_content_weights_loss.detach() if isinstance(curwrite_content_weights_loss, T.Tensor) else 0
      curcurrfactors = factors(epoch, currentlosses)
      curRloss = 0
      curpartiallosses = T.zeros(len(currentlosses))
      
      for i in range(curcurrfactors.shape[0]):
        if T.isclose(curcurrfactors[i],T.tensor(0.0)):
          continue
        curpartiallosses[i] = curcurrfactors[i] * currentlosses[i]
        curRloss += curcurrfactors[i] * currentlosses[i]

      currfactors += curcurrfactors
      Rloss += curRloss
      Rloss = Rloss.detach()
      partiallosses += curpartiallosses.detach()

      if np.isnan(curRloss.item()) or np.isinf(curRloss.item()) or np.isclose(curRloss.item(), 0):
        print("Loss is nan or inf or close to zero")
        continue
      print(curRloss.item())
      curRloss.backward()
      T.nn.utils.clip_grad_norm_(rnn.parameters(), 30)
      optimizer.step()
      curRloss = curRloss.detach()
      mhx = { k : (v.detach() if isinstance(v, var) else v) for k, v in mhx.items() }


    loss /= bucket
    accuracy /= bucket
    base /= bucket
    allocation_weight_loss /= bucket
    cwl /= bucket
    allocation_gate_loss /= bucket
    write_gate_loss /= bucket
    read_modes_loss /= bucket
    write_content_weights_loss /= bucket
    Rloss /= bucket
    partiallosses /= bucket
    currfactors /= bucket


    if epoch % summarize_freq == 0:
      currentweights= rnn.state_dict()
      testrnn.load_state_dict(currentweights)
      Testloss, Testaccuracy = CalcLossonValidationData(st, testrnn, None, batch_size, input_size) # mhx -> None
     
   
    

    if epoch == 10:
      for mybool in RETURNOTHEROBJ["bools"]:
        otherkey = [kc[1] for kc in RETURNOTHEROBJ["keycombs"] if kc[0] == mybool]
        #print("otherkey: ", otherkey)
        if len(otherkey) == 0:
          continue
        otherkey = otherkey[0]
        print("otherkey: ", otherkey)
        if otherReturn[otherkey] is not None and isinstance(otherReturn[otherkey], T.Tensor):
          print(otherReturn[otherkey].shape)


    
    
    partiallosses = partiallosses.detach().tolist()
    print(f"Epoch: {epoch}, Accuracy: {accuracy}, Loss: {loss}, weighted Loss: {Rloss.item()}, factors: {currfactors}, losses: {currentlosses}",file=f)
    if epoch < 100 or epoch % 7 == 0:
      print(f"""Epoch: {epoch}, \n 
            Accuracy: {accuracy}, \n  
            Loss: {loss}, \n 
            weighted Loss: {Rloss.item()}, \n
            factors: {currfactors}, \n
            losses: {currentlosses} \n
            factored losses: {partiallosses} \n
""")
    if summarize:
      print("REAL Loss: ", Rloss.item())
      print("Factors: ", currfactors)
      print("losses: ", currentlosses)


    datas.append({
      "epoch": epoch, 
      "loss": loss, 
      "testloss": Testloss, 
      "sequencelength": sequence_length, 
      "accuracy": accuracy,
      "testaccuracy": Testaccuracy,
      "loss_base": base.item() if isinstance(base, T.Tensor) else 0,
      "loss_allocation_weight": allocation_weight_loss.item() if isinstance(allocation_weight_loss, T.Tensor) else 0,
      "loss_allocation_gate": allocation_gate_loss.item() if isinstance(allocation_gate_loss, T.Tensor) else 0,
      "loss_write_gate": write_gate_loss.item() if isinstance(write_gate_loss, T.Tensor) else 0,
      "loss_read_modes": read_modes_loss.item() if isinstance(read_modes_loss, T.Tensor) else 0,
      "loss_write_weights": write_content_weights_loss.item() if isinstance(write_content_weights_loss, T.Tensor) else 0,
      "loss_usage_vector": cwl.item() if isinstance(cwl, T.Tensor) else 0,
      "factor_base": currfactors[0].item(),
      "factor_allocation_weight": currfactors[1].item(),
      "factor_allocation_gate": currfactors[2].item(),
      "factor_write_gate": currfactors[3].item(),
      "factor_read_modes": currfactors[4].item(),
      "factor_write_weights": currfactors[5].item(),
      "factor_usage_vector": currfactors[6].item(),
      "factored_loss_base": partiallosses[0],
      "factored_loss_allocation_weight": partiallosses[1],
      "factored_loss_allocation_gate": partiallosses[2],
      "factored_loss_write_gate": partiallosses[3],
      "factored_loss_read_modes": partiallosses[4],
      "factored_loss_write_weights": partiallosses[5],
      "factored_loss_usage_vector": partiallosses[6],
      "factors": currfactors,
      "weighted_loss": Rloss.item()
      }) #append to the datas df

    


    #objectivesorder = ["general_loss"]
    #objectivesorder = ["allocation_weights", "allocation_gate", "write_gate", "write_weights", "usage_vector", "general_loss"]
    # objectivesorder.append("read_modes")
    # if epoch > (int(iterations//(len(objectivesorder)*2)) + lastobjectivechange) and learnthisobjextivecounter < len(objectivesorder) : #int(iterations//12)
    #   startlearningthis = objectivesorder[learnthisobjextivecounter % len(objectivesorder)]
    #   learnthisobjextivecounter += 1
    #   while learnthisobjective[startlearningthis] and learnthisobjextivecounter < len(objectivesorder):
    #     startlearningthis = objectivesorder[learnthisobjextivecounter % len(objectivesorder)]
    #     learnthisobjextivecounter += 1
    #   learnthisobjective[startlearningthis] = True
    #   print(f"Learning {startlearningthis} at {epoch}", file=f)
    #   print(f"Learning {startlearningthis} at {epoch}")
    #   lastobjectivechange = epoch
    #   optimizer.load_state_dict(optimizerdict)
    #   optimizer.zero_grad()


    


   
    loss_value = loss

    mhx = { k : (v.detach() if isinstance(v, var) else v) for k, v in mhx.items() }

    last_save_losses.append(loss_value)
    loss = np.mean(last_save_losses)

    if summarize:
      llprint("\n\tAvg. Loss: %.4f\n" % (loss))
      llprint("\n\t Accuracy: %.4f\n" % (accuracy))
      llprint("\n\tAvg. Test Loss: %.4f\n" % (Testloss))
      llprint("\n\t Test Accuracy: %.4f\n" % (Testaccuracy))
      if np.isnan(loss):
        continue
        #raise Exception('nan Loss')
      print("\n")

    if summarize and rnn.debug:
      last_save_losses = []

      viz.heatmap(
            v['memory'],
            opts=dict(
                xtickstep=10,
                ytickstep=2,
                title= name + 'Memory, t: ' + str(epoch) + ', loss: ' + str(loss),
                ylabel='layer * time',
                xlabel='mem_slot * mem_size'
            )
        )

      viz.heatmap(
            v['link_matrix'][-1].reshape(mem_slot, mem_slot),
            opts=dict(
                xtickstep=10,
                ytickstep=2,
                title=name + 'Link Matrix, t: ' + str(epoch) + ', loss: ' + str(loss),
                ylabel='mem_slot',
                xlabel='mem_slot'
            )
      )
     
      viz.heatmap(
            v['precedence'],
            opts=dict(
                xtickstep=10,
                ytickstep=2,
                title=name + 'Precedence, t: ' + str(epoch) + ', loss: ' + str(loss),
                ylabel='layer * time',
                xlabel='mem_slot'
            )
      )

    if incrementCurriculum(loss, epoch, sequence_length, sequence_max_length, curriculum_freq):
      #sequence_length = sequence_length + curriculum_increment
      #print("Increasing max length to " + str(sequence_length))
      pass

    if take_checkpoint:
      cur_weights = rnn.state_dict()
      T.save(cur_weights, f'{name}/checkpoint_{epoch}.pth')
      lastcp = f'{name}/checkpoint_{epoch}.pth'
      df = pd.DataFrame(datas)
      pickle.dump(df, open(f"{name}/df_{epoch}.pkl", "wb"))


  




add_e26

----------------------------------------
DNC(26, 192, num_layers=3, nr_cells=40, read_heads=1, cell_size=1, nonlinearity=celu, independent_linears=True, debug=store_true)
DNC(
  (lstm_layer_0): LSTM(27, 192, num_layers=2, batch_first=True)
  (lstm_layer_1): LSTM(193, 192, num_layers=2, batch_first=True)
  (lstm_layer_2): LSTM(193, 192, num_layers=2, batch_first=True)
  (rnn_layer_memory_shared): Memory(
    (read_keys_transform): Linear(in_features=192, out_features=1, bias=True)
    (read_strengths_transform): Linear(in_features=192, out_features=1, bias=True)
    (write_key_transform): Linear(in_features=192, out_features=1, bias=True)
    (write_strength_transform): Linear(in_features=192, out_features=1, bias=True)
    (erase_vector_transform): Linear(in_features=192, out_features=1, bias=True)
    (write_vector_transform): Linear(in_features=192, out_features=1, bias=True)
    (free_gates_transform): Linear(in_features=192, out_features=1, bias=True)
    (allocation_gate_

100%|██████████| 7/7 [01:50<00:00, 15.77s/it]


Adjusting twice
tensor([1., 0., 0., 0., 0., 0., 0.])
Sequence Length:  1  Choosesize:  16
Sequence Length:  2  Choosesize:  64
Sequence Length:  3  Choosesize:  256
Sequence Length:  4  Choosesize:  1024
Sequence Length:  5  Choosesize:  4096
Sequence Length:  6  Choosesize:  8192
Sequence Length:  7  Choosesize:  16384
Possible Training Data:  30032


  0%|          | 0/2001 [00:00<?, ?it/s]

Epoch 0/2001 | ETA: 2025-02-27 09:59:47
losses:  tensor(13.7403, grad_fn=<AddBackward0>) 
 Allocation Weight  tensor(0.0377, grad_fn=<AddBackward0>) 
 allocation gate tensor(0.2496, grad_fn=<AddBackward0>) 
 write gate tensor(0.2355, grad_fn=<AddBackward0>) 
 read modes 0 
 write weights tensor(0.0252, grad_fn=<AddBackward0>) 
 usage vector tensor(0.1914, grad_fn=<AddBackward0>)
13.740313529968262
losses:  tensor(13.5675, grad_fn=<AddBackward0>) 
 Allocation Weight  tensor(0.0376, grad_fn=<AddBackward0>) 
 allocation gate tensor(0.2503, grad_fn=<AddBackward0>) 
 write gate tensor(0.2352, grad_fn=<AddBackward0>) 
 read modes 0 
 write weights tensor(0.0252, grad_fn=<AddBackward0>) 
 usage vector tensor(0.1923, grad_fn=<AddBackward0>)
13.567450523376465
losses:  tensor(13.3289, grad_fn=<AddBackward0>) 
 Allocation Weight  tensor(0.0376, grad_fn=<AddBackward0>) 
 allocation gate tensor(0.2511, grad_fn=<AddBackward0>) 
 write gate tensor(0.2347, grad_fn=<AddBackward0>) 
 read modes 0 
 wri

  0%|          | 1/2001 [00:46<25:56:00, 46.68s/it]



Epoch 1/2001 | ETA: 2025-02-27 22:59:22
13.313023567199707
13.553184509277344
13.260293960571289
13.468916893005371
13.693408966064453


  0%|          | 2/2001 [01:22<22:16:11, 40.11s/it]

Epoch: 1, 
 
            Accuracy: 0.550000011920929, 
  
            Loss: 0.5842223644256592, 
 
            weighted Loss: 13.45776653289795, 

            factors: tensor([1., 0., 0., 0., 0., 0., 0.]), 

            losses: (tensor(13.6934, grad_fn=<AddBackward0>), tensor(0.0375, grad_fn=<AddBackward0>), tensor(0.1916, grad_fn=<AddBackward0>), tensor(0.2506, grad_fn=<AddBackward0>), tensor(0.2356, grad_fn=<AddBackward0>), 0, tensor(0.0252, grad_fn=<AddBackward0>)) 

            factored losses: [13.45776653289795, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0] 


Epoch 2/2001 | ETA: 2025-02-28 01:14:11
13.15988826751709
13.54710578918457
13.657340049743652
13.507205963134766
13.505313873291016


  0%|          | 3/2001 [01:58<21:21:02, 38.47s/it]

Epoch: 2, 
 
            Accuracy: 0.55, 
  
            Loss: 0.5949205040931702, 
 
            weighted Loss: 13.475370407104492, 

            factors: tensor([1., 0., 0., 0., 0., 0., 0.]), 

            losses: (tensor(13.5053, grad_fn=<AddBackward0>), tensor(0.0375, grad_fn=<AddBackward0>), tensor(0.1915, grad_fn=<AddBackward0>), tensor(0.2511, grad_fn=<AddBackward0>), tensor(0.2366, grad_fn=<AddBackward0>), 0, tensor(0.0252, grad_fn=<AddBackward0>)) 

            factored losses: [13.475370407104492, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0] 


Epoch 3/2001 | ETA: 2025-02-28 02:30:06
13.144377708435059
13.715059280395508
13.722031593322754
13.238771438598633
13.502870559692383


  0%|          | 4/2001 [02:34<20:49:42, 37.55s/it]

Epoch: 3, 
 
            Accuracy: 0.55, 
  
            Loss: 0.5816901445388794, 
 
            weighted Loss: 13.464620590209961, 

            factors: tensor([1., 0., 0., 0., 0., 0., 0.]), 

            losses: (tensor(13.5029, grad_fn=<AddBackward0>), tensor(0.0375, grad_fn=<AddBackward0>), tensor(0.1917, grad_fn=<AddBackward0>), tensor(0.2519, grad_fn=<AddBackward0>), tensor(0.2374, grad_fn=<AddBackward0>), 0, tensor(0.0252, grad_fn=<AddBackward0>)) 

            factored losses: [13.464620590209961, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0] 


Epoch 4/2001 | ETA: 2025-02-28 03:13:03
13.5504789352417
13.316232681274414


  0%|          | 4/2001 [02:55<24:18:02, 43.81s/it]


KeyboardInterrupt: 

In [None]:
# Export the collected per-epoch training history and render the diagnostic
# plots. Every figure is shown inline and also written to `{name}/<file>.png`.
df = pd.DataFrame(datas)
# `with` closes the pickle file deterministically instead of leaving the open
# handle to the garbage collector.
with open(f"{name}/df_total.pkl", "wb") as history_file:
  pickle.dump(df, history_file)
print(df.columns)

# (column suffix, legend label) of every loss component, in plotting order.
# The columns `loss_<suffix>`, `factor_<suffix>` and `factored_loss_<suffix>`
# are derived from the suffix.
LOSS_COMPONENTS = [
  ("base", "Base"),
  ("allocation_weight", "Allocation Weight"),
  ("allocation_gate", "Allocation Gate"),
  ("write_gate", "Write Gate"),
  ("read_modes", "Read Modes"),
  ("write_weights", "Write Weights"),
  ("usage_vector", "Usage Vector"),
]


def _plot_history(columns, title, yaxis_title, filename, log_y=False):
  """Plot columns of `df` against `df["epoch"]`, show the figure and save it.

  columns     -- list of (column name, legend label) pairs; one line per pair
  title       -- figure title
  yaxis_title -- y-axis label (the x-axis is always labelled 'Epoch')
  filename    -- name of the image file written inside the `name` directory
  log_y       -- draw the y-axis on a logarithmic scale when True

  Returns the plotly figure.
  """
  figure = go.Figure()
  for column, label in columns:
    figure.add_trace(go.Scatter(x=df["epoch"], y=df[column], mode='lines', name=label))
  layout = dict(title=title, xaxis_title='Epoch', yaxis_title=yaxis_title)
  if log_y:
    layout["yaxis_type"] = "log"
  figure.update_layout(**layout)
  figure.show()
  figure.write_image(f"{name}/{filename}")
  return figure


# Train vs. test curves.
fig = _plot_history([("loss", "Train Data"), ("testloss", "Test Data")],
                    'Losses', 'Loss', "losses.png")
fig = _plot_history([("accuracy", "Train Data"), ("testaccuracy", "Test Data")],
                    'Accuracy', 'Accuracy', "accuracy.png")

# Weighted (factor * loss) contribution of every component; only present when
# the history contains the factored columns.
if "factored_loss_base" in df.columns:
  fig = _plot_history(
    [(f"factored_loss_{suffix}", f"{label} Loss") for suffix, label in LOSS_COMPONENTS],
    'Factored Losses', 'Loss', "factored_losses.png", log_y=True)

fig = _plot_history([("weighted_loss", "Weighted Loss")],
                    'Weighted Loss', 'Loss', "weighted_loss.png", log_y=True)

fig = _plot_history([("sequencelength", "Sequence Length")],
                    'Sequence Length', 'Length', "sequencelength.png")

# All factors in one figure, then all raw partial losses in one figure.
fig = _plot_history(
  [(f"factor_{suffix}", f"{label} Factor") for suffix, label in LOSS_COMPONENTS],
  'Factors', 'Factor', "factors.png")

fig = _plot_history(
  [(f"loss_{suffix}", f"{label} Loss") for suffix, label in LOSS_COMPONENTS],
  'Losses', 'Loss', "partial_losses.png", log_y=True)

# One figure per component: raw loss, then its factor, then the factored loss.
# A missing factor column also skips the factored plot for that component.
# The loop variable is deliberately not called `loss`, so the training cell's
# `loss` value is not overwritten in the kernel namespace.
for component_suffix, _ in LOSS_COMPONENTS:
  loss_column = f"loss_{component_suffix}"
  fig = _plot_history([(loss_column, loss_column)],
                      loss_column, 'Loss', f"loss_{loss_column}.png", log_y=True)

  factor_column = f"factor_{component_suffix}"
  if factor_column not in df.columns:
    continue
  fig = _plot_history([(factor_column, factor_column)],
                      factor_column, 'Factor', f"{factor_column}.png")

  factored_column = "factored_" + loss_column
  if factored_column not in df.columns:
    continue
  fig = _plot_history([(factored_column, "Factored " + loss_column)],
                      "Factored " + loss_column, 'Loss', f"factored_{loss_column}.png", log_y=True)



Index(['epoch', 'loss', 'testloss', 'sequencelength', 'accuracy',
       'testaccuracy', 'loss_base', 'loss_allocation_weight',
       'loss_allocation_gate', 'loss_write_gate', 'loss_read_modes',
       'loss_write_weights', 'loss_usage_vector', 'factor_base',
       'factor_allocation_weight', 'factor_allocation_gate',
       'factor_write_gate', 'factor_read_modes', 'factor_write_weights',
       'factor_usage_vector', 'factored_loss_base',
       'factored_loss_allocation_weight', 'factored_loss_allocation_gate',
       'factored_loss_write_gate', 'factored_loss_read_modes',
       'factored_loss_write_weights', 'factored_loss_usage_vector', 'factors',
       'weighted_loss'],
      dtype='object')


In [None]:
# Single-example evaluation of the last checkpoint with step-by-step tracing.
#
# The commented-out block below rebuilds `rnn` and supplies fallback values for
# `name` / `lastcp`; it is kept for running this cell without the training cell.

# #from dnc.dnc import DNC

# if 'rnn' in locals() or 'rnn' in globals():
#   del rnn

# rnn = DNC(
#         input_size=input_size,
#         hidden_size=output_size,
#         rnn_type='rnn',
#         #rnn_type='lstm',
#         num_layers=num_layers,
#         num_hidden_layers=1,
#         dropout=0,
#         nr_cells=mem_slot,
#         cell_size=mem_size,
#         read_heads=read_heads,
#         gpu_id=-1,
#         debug='store_true',
#         batch_first=True,
#         independent_linears=True,
#         nonlinearity='tanh',
#     )

# if not 'name' in locals() or not 'name' in globals():
#   name = 'add_9b2'
# if not 'lastcp' in locals() or not 'lastcp' in globals():
#   lastcp = f'{name}/checkpoint_1000.pth'

print(name)

# Persist the dataset arrays next to the checkpoints. `with` closes each file
# deterministically instead of leaking the handle.
with open(f"{name}/NPinput_data.pkl", "wb") as input_file:
  pickle.dump(NPinput_data, input_file)
with open(f"{name}/NPtarget_output.pkl", "wb") as target_file:
  pickle.dump(NPtarget_output, target_file)

# Shuffle and keep only the first example.
input_data, target_output = shuffleNPArrays(NPinput_data, NPtarget_output)
input_data = input_data[0:1]
target_output = target_output[0:1]

with open(f"{name}/output_2.txt", "w") as f:
  batch_size=1
  rnn.load_state_dict(T.load(lastcp, weights_only=True))
  rnn.eval()

  # Fresh trace container that the forward pass receives via `stepByStep=`.
  stepByStep = copy.deepcopy(STEPBYSTEPOBJ)

  i=0
  llprint("\nIteration %d/%d" % (i, iterations))
  # We test now the learned generalization using sequence_max_length examples
  #random_length = np.random.randint(2, sequence_length  + 1)
  #input_data, target_output = generate_data(1, random_length, input_size)

  input_data = var(T.from_numpy(input_data)).type(T.float32)
  target_output = var(T.from_numpy(target_output)).type(T.float32)

  labels = target_output.argmax(dim=2)

  # Metadata stored alongside the trace.
  stepByStep["CurrI"] = i
  stepByStep["currentObj"] = copy.deepcopy(stepByStep["defObj"])
  stepByStep["currentObj"]["i"] = i
  stepByStep["input"] = input_data.detach().numpy()
  stepByStep["target"] = target_output.detach().numpy()
  stepByStep["MEMORYCOLUMNS"] = mem_slot
  stepByStep["INPUTSIZE"] = input_size
  stepByStep["OUTPUTSIZE"] = output_size # dataoutputsize?
  stepByStep["read_heads"] = read_heads

  stepByStep["INTERMEDIATEOUTPUT"] = output_size
  stepByStep["DNCOUPTPUT"] = output_size

  # In debug mode the model returns an extra value `v`.
  if rnn.debug:
    output, (chx, mhx, rv), v = rnn(input_data, (None, None, None), reset_experience=True, pass_through_memory=True, stepByStep=stepByStep)
  else:
    output, (chx, mhx, rv) = rnn(input_data, (None, None, None), reset_experience=True, pass_through_memory=True, stepByStep=stepByStep)

  # Convert once; every metric below works on the float32 view of the output.
  output_f32 = output.type(T.float32)

  print("input_data: ", input_data)
  print("output: ", output)
  print("target_output: ", target_output)
  print("labels: ", labels)
  print("accuracy: ", calcAccuracy(output_f32, target_output))

  # NOTE(review): assumes combLoss is a pure function of its arguments, so it is
  # evaluated once and reused — confirm against its definition.
  comb_loss_value = combLoss(output_f32, target_output).item()

  stepByStep["output"] = output
  stepByStep["objects"].append(copy.deepcopy(stepByStep["currentObj"]))
  stepByStep['loss'] = str(comb_loss_value)
  stepByStep['accuracy'] = (output_f32.argmax(dim=2) == labels).int().to(T.float32).mean().item()
  #output = output[:, -1, :].sum().data.cpu().numpy()
  #target_output = target_output.sum().data.cpu().numpy()
  print("loss", comb_loss_value)
  print(stepByStep["input"].shape)
  print(stepByStep["output"].shape)
  print(stepByStep["target"].shape)
  #raise Exception("STOP")

  print(stepByStep)

  with open(f"{name}/stepByStep.pkl", "wb") as trace_file:
    pickle.dump(stepByStep, trace_file)

  # Per-example report, followed by the same metrics over the whole batch,
  # written to output_2.txt.
  # NOTE(review): the "CE Loss" label is printed for the value returned by
  # `mse` — confirm which of the two is intended.
  print("\n\n")
  print("Input: ", tensor2string(input_data[0]), file=f)
  print("Output: ", tensor2string(output[0]), file=f)
  print("Target: ", tensor2string(target_output[0]), file=f)
  print("CE Loss: ", str(mse(output_f32[0], target_output[0]).item()), file=f)
  print("Log Loss: ", str(criterion(output_f32[0], target_output[0]).item()), file=f)
  print("Exp Loss: ", str(exp_loss(output_f32[0], target_output[0]).item()), file=f)
  print("\n\n")
  print("CE Loss: ", str(mse(output_f32, target_output).item()), file=f)
  print("Log Loss: ", str(criterion(output_f32, target_output).item()), file=f)
  print("Exp Loss: ", str(exp_loss(output_f32, target_output).item()), file=f)
  print("\n\n")

  # Best-effort scalar summary: int() only succeeds on one-element tensors.
  # A failure is reported instead of being silently swallowed, and never raises.
  try:
    print("\nReal value: ", ' = ' + str(int(target_output[0])))
    print("Predicted:  ", ' = ' + str(int(output // 1)) + " [" + str(output) + "]")
  except Exception as e:
    print(f"Scalar summary skipped: {e}")

  

add_1f0

Iteration 0/2000

input_data:  tensor([[[ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
           0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
         [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
           0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
         [ 1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
           0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
         [ 1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
           0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
         [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
           0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
         [ 1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
           0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
         [ 0., -9.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
 