In [2]:
from pyexpat import model
import importlib
import torch
import torch.nn.functional as F
import numpy as np
from time import time
import torch.nn as nn
import itertools

from PIL.ImageChops import offset
from numpy import character
from scipy._lib.array_api_compat import device
from torch.utils.data import DataLoader, TensorDataset
import src.helpers as helpers
import src.fredkin_layers as fredkins
from src.helpers import create_permutation_lookup_table, print_fredkin_layer
import math

importlib.reload(fredkins)
importlib.reload(helpers)

<module 'src.helpers' from '/Users/erk/Library/Mobile Documents/com~apple~CloudDocs/Persönlich/Studium_Data Science/Masterarbeit/src/helpers.py'>

In [38]:
#Create model and run training for basic boolean functions
NUM_EPOCHS = 100
LEARNING_RATE = 0.2
BATCH_SIZE = 4
BOOL_FCT = "biimplication"
SEED=None
Z = 0.0
C = 'a'

class FredkinNet(nn.Module):
    def __init__(self,din:int,dout:int,seed:int=SEED,verb=False):
        super().__init__()
        if seed is not None:
            seed = [seed+i for i in range(3)]
        else:
            seed = [seed for i in range(3)]
        self.verb=verb
        self.fred1 = fredkins.FredkinXLayer(din,4,seed=seed[0],random_connections=False,offset=0,wgts_initial='evenly',z=Z,verb=False)
        self.fred2 = fredkins.FredkinXLayer(4,4,seed=seed[1],random_connections=False,offset=1,wgts_initial='evenly',z=Z,verb=False)
        self.fred3 = fredkins.FredkinXLayer(4,2,seed=seed[1],random_connections=False,offset=1,wgts_initial='evenly',z=Z,verb=False)
        #self.fred3 = fredkins.Fredkin3plus6Layer(9,6,seed=seed,offset=1,z=Z,verb=False)
        #self.fred4 = fredkins.Fredkin3plus6Layer(6,3,seed=seed,offset=1,z=Z,verb=False)
        #self.fred5 = fredkins.Fredkin24Layer(6,dout,seed=seed,offset=1,z=Z,verb=False)
        
        
    def forward(self,x):
        out = self.fred1(x)
        out = self.fred2(out)
        out = self.fred3(out)
        if self.verb: print(f"output-vector: {out} with shape {out.shape}")
        #out = out[:,[0,2,4]]
        #out = self.fred3(out)
        #out = self.fred4(out)
        #out = self.fred5(out)
        #out = out.mean(dim=1)
        #print(out.shape)
        out = out[:,1]
        if self.verb: print(f"interpretable output: {out}")
        return out

net = FredkinNet(2,3,verb=False)

x_train,labels_train = helpers.generate_bool_fct_data(name=BOOL_FCT,return_triple=False,c=C,verb=True)
x_test,labels_test = helpers.generate_bool_fct_data(name=BOOL_FCT,return_triple=False,c=C)
train_data = TensorDataset(x_train,labels_train)
val_data = TensorDataset(x_test,labels_test)

train_loader = DataLoader(train_data,batch_size=BATCH_SIZE,shuffle=False)
val_loader = DataLoader(val_data,batch_size=BATCH_SIZE,shuffle=False)

wgts_params = []
selector_params = []
for name, p in net.named_parameters():
    if not p.requires_grad:
        continue
    if 'wgts' in name:
        wgts_params.append(p)
    elif 'selector' in name:
        selector_params.append(p)

optim = torch.optim.Adam([
    {'params': selector_params, 'lr': LEARNING_RATE**4},
    {'params': wgts_params, 'lr': LEARNING_RATE},
],weight_decay=1e-5)


#optim = torch.optim.Adam(net.parameters(),lr=LEARNING_RATE,weight_decay=1e-5)
criterion = torch.nn.BCELoss()
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

for epoch in range(1,NUM_EPOCHS+1):
    #TRAIN
    net.train()
    running_loss =0.0
    running_samples = 0
    
    for x, y in train_loader:
        #print("x,y: ",x,y)
        x = x.to(device, non_blocking=True)
        y = y.to(device, non_blocking=True)
        
        logits = net(x)
        logits = logits - torch.sign(logits)*(1/100000)*torch.abs(logits)
        #print(f"logits: {logits}")
        print(logits)
        loss = criterion(logits,y)
        
        optim.zero_grad()
        loss.backward()
        print("\n--- Full Gradients ---")
        for name, param in net.named_parameters():
            #break
            print(f"{name}:\n{param.grad}\n")
        print("-----------------------\n")
        #print("x,y: ",x,y)
        optim.step()
        
        bs = x.size(0)
        running_loss += loss.item()*bs
        running_samples += bs
    
    epoch_loss = running_loss/running_samples
    #VALIDATE
    net.eval()
    val_loss = 0.0
    val_samples = 0
    correct =0
    
    with torch.no_grad():
        for x, y in train_loader:
            x = x.to(device, non_blocking=True)
            y = y.to(device, non_blocking=True)
            logits = net(x)
            #print(f"logits eval: {logits}")
            
            loss = criterion(logits,y)
            
            bs = x.size(0)
            val_loss += loss.item()*bs
            val_samples += bs
            
            #probs = torch.sigmoid(logits)
            probs = logits
            preds = (probs >= 0.5).float()
            #print(f"probs: {probs}, preds: {preds}, expected: {y}")
            correct += (preds == y).float().sum().item()
    val_loss = val_loss/val_samples
    val_acc = correct / val_samples
    print(f"Epoch {epoch:02d} train_loss = {epoch_loss:.4f} val_loss={val_loss:.4f} val_acc={val_acc:.4f}")
    if val_acc >= 0.99 and epoch>=1:
        NUM_EPOCHS = epoch
        break
 
lookup_table = create_permutation_lookup_table(inputs = [0,1,2,3])
print("\n--DETAILED PARAMS--\n")
for name, module in net.named_modules():
    helpers.print_fredkin_layer(module,lookup_table=lookup_table)
print("\n--PORT ASSIGNMENTS--\n")
for name, module in net.named_modules():
    if module._get_name()=='Fredkin3plus6Layer': helpers.print_fredkin_layer3plus6(module,lookup_table=lookup_table)
print("\nEPOCHS RUN:", NUM_EPOCHS)


Boolean Function:  tensor([[0., 0.],
        [0., 1.],
        [1., 0.],
        [1., 1.]])  Labels:  tensor([1., 0., 0., 1.])
tensor([0.3518, 0.5741, 0.4259, 0.6481], grad_fn=<SubBackward0>)

--- Full Gradients ---
fred1.wgts:
tensor([[-0.0031,  0.0015, -0.0067, -0.0007,  0.0015,  0.0075],
        [-0.0031,  0.0015, -0.0067, -0.0007,  0.0015,  0.0075],
        [-0.0092,  0.0044, -0.0201, -0.0020,  0.0044,  0.0225],
        [-0.0010,  0.0005, -0.0022, -0.0002,  0.0005,  0.0025]])

fred2.wgts:
tensor([[-0.0185,  0.0037, -0.0025,  0.0037,  0.0037,  0.0100],
        [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
        [-0.0062,  0.0012, -0.0008,  0.0012,  0.0012,  0.0033]])

fred3.wgts:
tensor([[ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
        [-0.0247,  0.0049,  0.0011,  0.0049,  0.0049,  0.0088]])

-----------------------

Epoch 01 train_loss = 0.7217 val_loss=25.0000 val_acc=0.7500
tensor([0.4009, 

In [3]:
# %% [markdown]
# # Custom 2-Layer Neuron Gradient Calculator

# %% 
import sympy as sp

# --- Define symbols ---
w1, w2, w3 = sp.symbols('w1 w2 w3')   # Learnable weights
a, b = sp.symbols('a b')               # Inputs
t = sp.symbols('t')                    # Target for BCELoss

# --- Softmax ---
exp_w1 = sp.exp(w1)
exp_w2 = sp.exp(w2)
exp_w3 = sp.exp(w3)
sum_exp = exp_w1 + exp_w2 + exp_w3

p1 = exp_w1 / sum_exp
p2 = exp_w2 / sum_exp
p3 = exp_w3 / sum_exp

# --- Define helper functions for output coefficients ---
f1 = a*1 + (1-a)*0
f2 = a*a + (1-a)*b
f3 = b*a + (1-b)*0

g1 = (1-a)*1 + a*0
g2 = (1-a)*a + a*b
g3 = (1-b)*a + b*0

# --- Layer 1 outputs ---
out1_1 = p1*f1 + p2*f2 + p3*f3
out2_1 = p1*g1 + p2*g2 + p3*g3

# --- Layer 2 outputs (take layer1 outputs as inputs) ---
a2, b2 = out1_1, out2_1  # inputs for layer2

f1_2 = a2*1 + (1-a2)*0
f2_2 = a2*a2 + (1-a2)*b2
f3_2 = b2*a2 + (1-b2)*0

g1_2 = (1-a2)*1 + a2*0
g2_2 = (1-a2)*a2 + a2*b2
g3_2 = (1-b2)*a2 + b2*0

out1_2 = p1*f1_2 + p2*f2_2 + p3*f3_2
out2_2 = p1*g1_2 + p2*g2_2 + p3*g3_2

# --- BCELoss ---
loss = - (t * sp.log(out2_2) + (1-t) * sp.log(1 - out2_2))

# --- Compute gradients wrt weights ---
#grad_w1 = sp.simplify(sp.diff(loss, w1))
#grad_w2 = sp.simplify(sp.diff(loss, w2))
#grad_w3 = sp.simplify(sp.diff(loss, w3))

grad_w1 = sp.diff(loss, w1)
grad_w2 = sp.diff(loss, w2)
grad_w3 = sp.diff(loss, w3)

# --- Display results ---
print("Loss:")
sp.pprint(loss)
print("\nGradients:")
print("dL/dw1 =")
sp.pprint(grad_w1)
print("\ndL/dw2 =")
sp.pprint(grad_w2)
print("\ndL/dw3 =")
sp.pprint(grad_w3)


Loss:
       ⎛⎛⎛         w₃               w₁        ⎛ 2            ⎞  w₂⎞ ⎛          ↪
       ⎜⎜⎜    a⋅b⋅ℯ              a⋅ℯ          ⎝a  + b⋅(1 - a)⎠⋅ℯ  ⎟ ⎜ a⋅(1 - b ↪
       ⎜⎜⎜─────────────── + ─────────────── + ────────────────────⎟⋅⎜───────── ↪
       ⎜⎜⎜ w₁    w₂    w₃    w₁    w₂    w₃      w₁    w₂    w₃   ⎟ ⎜ w₁    w₂ ↪
       ⎜⎝⎝ℯ   + ℯ   + ℯ     ℯ   + ℯ   + ℯ       ℯ   + ℯ   + ℯ     ⎠ ⎝ℯ   + ℯ   ↪
- t⋅log⎜────────────────────────────────────────────────────────────────────── ↪
       ⎜                                                                       ↪
       ⎝                                                                       ↪

↪    w₃               w₁                        w₂⎞   ⎛         w₃             ↪
↪ )⋅ℯ        (1 - a)⋅ℯ       (a⋅b + a⋅(1 - a))⋅ℯ  ⎟   ⎜    a⋅b⋅ℯ               ↪
↪ ────── + ─────────────── + ─────────────────────⎟ + ⎜─────────────── + ───── ↪
↪     w₃    w₁    w₂    w₃       w₁    w₂    w₃   ⎟   ⎜ w₁    w₂    w₃    w₁   ↪
↪  + ℯ     ℯ   + ℯ   

In [12]:
#CREATE MAJORITY BIT DATASET
#Generate some data
BATCH_SIZE = 10
x_train,labels_train = helpers.create_majority_dataset(samplesize=1000,n_bits=11)
x_test,labels_test = helpers.create_majority_dataset(samplesize=1000,n_bits=11)
print(x_train)
print(labels_train)
train_data = TensorDataset(x_train,labels_train)
val_data = TensorDataset(x_test,labels_test)
train_loader = DataLoader(train_data,batch_size=BATCH_SIZE,shuffle=False)
val_loader = DataLoader(val_data,batch_size=BATCH_SIZE,shuffle=False)

tensor([[0., 0., 1.,  ..., 1., 1., 0.],
        [1., 1., 0.,  ..., 1., 0., 0.],
        [0., 0., 1.,  ..., 1., 0., 1.],
        ...,
        [0., 0., 1.,  ..., 1., 1., 0.],
        [0., 0., 0.,  ..., 0., 1., 1.],
        [0., 1., 1.,  ..., 1., 0., 0.]])
tensor([1., 1., 1., 0., 0., 1., 1., 1., 0., 0., 0., 0., 1., 0., 0., 1., 0., 1.,
        1., 1., 1., 0., 0., 0., 1., 1., 1., 0., 0., 1., 1., 0., 1., 1., 0., 1.,
        1., 1., 0., 0., 0., 1., 1., 0., 1., 1., 1., 1., 0., 1., 0., 1., 0., 0.,
        0., 0., 1., 0., 1., 0., 0., 1., 0., 1., 0., 0., 0., 1., 1., 0., 0., 0.,
        1., 0., 0., 0., 1., 0., 0., 0., 0., 1., 0., 0., 0., 1., 1., 0., 1., 1.,
        1., 0., 1., 0., 0., 0., 0., 0., 1., 1., 1., 0., 0., 0., 0., 1., 0., 1.,
        0., 1., 0., 0., 1., 1., 1., 0., 1., 1., 1., 1., 1., 1., 0., 1., 1., 1.,
        1., 0., 0., 0., 1., 0., 0., 1., 1., 0., 0., 0., 0., 0., 0., 1., 1., 1.,
        1., 0., 0., 1., 1., 1., 1., 0., 1., 0., 1., 0., 0., 0., 1., 1., 0., 1.,
        1., 1., 1., 0., 0.

In [4]:
#Define a Fredkin net with n layers, each with same input and output size
class FredkinNet01(nn.Module):
    def __init__(self,din:int,dout:int,n_layers:int,seed:int=None,random_connections=True,fredkin_type:chr='Fredkin3plus6Layer',verb=False):
        super().__init__()
        self.din=din
        self.dout=dout
        self.verb = verb
        self.type = fredkin_type
        self.seed=seed
        self.ran_conn=random_connections
        if self.type == 'Fredkin3plus6Layer':
            self.input_layer = fredkins.Fredkin3plus6Layer(din,dout,seed=self.seed,random_connections=self.ran_conn,offset=0,verb=verb)
            self.layers = nn.ModuleList([
                fredkins.Fredkin3plus6Layer(self.dout,self.dout,seed=self.seed+i+1 if seed is not None else None,random_connections=self.ran_conn,offset=1,verb=self.verb) for i in range(n_layers-1)
            ])
            #self.out_layer = fredkins.Fredkin3plus6Layer(din,din,seed=seed,random_connections=self.ran_conn,offset=1,verb=self.verb)
        elif self.type =='Fredkin6Layer':
            self.input_layer = fredkins.Fredkin6Layer(din,dout,seed=self.seed,random_connections=self.ran_conn,offset=0,verb=verb)
            self.layers = nn.ModuleList([
                fredkins.Fredkin6Layer(self.dout,self.dout,seed=self.seed+i+1 if seed is not None else None,random_connections=self.ran_conn,offset=1,verb=self.verb) for i in range(n_layers-1)
            ])
        elif self.type=='Fredkin24Layer':
            self.input_layer = fredkins.Fredkin24Layer(din,dout,seed=self.seed,random_connections=self.ran_conn,offset=0,verb=verb)
            self.layers = nn.ModuleList([
                fredkins.Fredkin24Layer(self.dout,self.dout,seed=self.seed+i+1 if seed is not None else None,random_connections=self.ran_conn,offset=1,verb=self.verb) for i in range(n_layers-1)
            ])
        elif self.type=='Fredkin60Layer':
            self.input_layer = fredkins.Fredkin60Layer(din,dout,seed=self.seed,random_connections=self.ran_conn,offset=0,verb=verb)
            self.layers = nn.ModuleList([
                fredkins.Fredkin60Layer(self.dout,self.dout,seed=self.seed+i+1 if seed is not None else None,random_connections=self.ran_conn,offset=1,verb=self.verb) for i in range(n_layers-1)
            ])
        elif self.type=='FredkinXLayer':
            self.input_layer = fredkins.FredkinXLayer(din,dout,seed=self.seed,random_connections=self.ran_conn,offset=0,verb=verb)
            self.layers = nn.ModuleList([
                fredkins.FredkinXLayer(self.dout,self.dout,seed=self.seed+i+1 if seed is not None else None,random_connections=self.ran_conn,offset=1,verb=self.verb) for i in range(n_layers-1)
            ])
        elif self.type=='FredkinDepXLayer':
            self.input_layer = fredkins.FredkinDepXLayer(din,dout,seed=self.seed,random_connections=self.ran_conn,offset=0,verb=verb)
            self.layers = nn.ModuleList([
                fredkins.FredkinDepXLayer(self.dout,self.dout,seed=self.seed+i+1 if seed is not None else None,random_connections=self.ran_conn,offset=1,verb=self.verb) for i in range(n_layers-1)
            ])
        else: raise NotImplemented("Fredkin Layer type not implemented")
    
    def forward(self, x):
        x = self.input_layer(x)
        for layer in self.layers:
            x = layer(x)
        
        out1 = x[:,:self.dout//2+self.dout%2]
        out2 = x[:,self.dout//2+self.dout%2:]
        out1 = out1.sum(dim=1,keepdim=True)
        out2 = out2.sum(dim=1,keepdim=True)
        score = torch.cat([out1,out2],dim=1)
        probs = F.softmax(score,dim=1)
        
        #out = x[:,self.dout//2+self.dout%2]
        return probs
        
        
        

In [94]:
#Define a Fredkin net with n layers, each with same input and output size
class FredkinNet02(nn.Module):
    def __init__(self, din: int, dout: int, n_layers: int, seed: int = None, random_connections=True,
                 fredkin_type: chr = 'FredkinXLayer', verb=False):
        super().__init__()
        self.din = din
        self.dout = dout
        self.verb = verb
        self.type = fredkin_type
        self.seed = seed
        self.ran_conn = random_connections
        if self.type=='FredkinXLayer':
            self.input_layer = fredkins.FredkinXLayer(din,dout,seed=self.seed,random_connections=self.ran_conn,offset=0,verb=verb)
            layer_dout = self.dout
            n_layers = int(math.log2(self.dout))
            self.layers = nn.ModuleList([])
            for i in range(n_layers):
                self.layers.append(fredkins.FredkinXLayer(layer_dout,layer_dout//2,seed=self.seed+i+1 if seed is not None else None,random_connections=self.ran_conn,offset=1,verb=self.verb))
                layer_dout = layer_dout//2
                
        else: raise NotImplemented("Fredkin Layer type not implemented")
        
    def forward(self, x):
        x = self.input_layer(x)
        for layer in self.layers:
            x = layer(x)
        
        #out1 = x[:,:self.dout//2+self.dout%2]
        #out2 = x[:,self.dout//2+self.dout%2:]
        #out1 = out1.sum(dim=1,keepdim=True)
        #out2 = out2.sum(dim=1,keepdim=True)
        #score = torch.cat([out1,out2],dim=1)
        #probs = F.softmax(score,dim=1)
        return x
        
        #out = x[:,self.dout//2+self.dout%2]
        #return probs

In [20]:
#Create model and run training for bit majority classification
NUM_EPOCHS = 150
LEARNING_RATE = 0.02
#BATCH_SIZE = 1
SEED=None
Z = 1.0
C = 0.0


#Define Fredkin Net
class FredkinNet(nn.Module):
    def __init__(self,din:int,dout:int,seed:int=SEED,verb=False):
        super().__init__()
        self.verb=verb
        self.fred1 = fredkins.Fredkin3plus6Layer(din,18,seed=seed,random_connections=False,offset=0,z=Z,verb=False)
        self.fred2 = fredkins.Fredkin3plus6Layer(18,18,seed=seed,random_connections=False,offset=1,z=Z,verb=False)
        self.fred3 = fredkins.Fredkin3plus6Layer(18,18,seed=seed,random_connections=False,offset=1,z=Z,verb=False)
        self.fred4 = fredkins.Fredkin3plus6Layer(18,18,seed=seed,random_connections=False,offset=1,z=Z,verb=False)
        self.fred5 = fredkins.Fredkin3plus6Layer(18,18,seed=seed,random_connections=False,offset=1,z=Z,verb=False)
        #self.fred6 = 
        
        
    def forward(self,x):
        out = self.fred1(x)
        out = self.fred2(out)
        if self.verb: print(f"output-vector: {out} with shape {out.shape}")
        #out = out[:,[0,2,4]]
        out = self.fred3(out)
        out = self.fred4(out)
        out = self.fred5(out)
        
        first_half = out[:,:9]
        second_half = out[:,9:]
        first_half = first_half.sum(dim=1,keepdim=True)
        second_half = second_half.sum(dim=1,keepdim=True)
        score = torch.cat([first_half,second_half],dim=1)
        probs = F.softmax(score,dim=1)
                                      
        
        #out=out[:,::3]
        #out = out.mean(dim=1)
        
        #print(out.shape)
        #out = out[:,2]
        if self.verb: print(f"interpretable output: {out}")
        return probs

#net = FredkinNet(5,3,verb=False)
net = FredkinNet01(din=11,dout=110,n_layers=4,fredkin_type='FredkinXLayer',seed=SEED,random_connections=False,verb=False)

#Prepare training
wgts_params = []
selector_params = []
for name, p in net.named_parameters():
    if not p.requires_grad:
        continue
    if 'wgts' in name:
        wgts_params.append(p)
    elif 'selector' in name:
        selector_params.append(p)

optim = torch.optim.Adam([
    {'params': selector_params, 'lr': LEARNING_RATE**4,'weight_decay':5e-5},
    {'params': wgts_params, 'lr': LEARNING_RATE, 'weight_decay':0.0},
])


#optim = torch.optim.Adam(net.parameters(),lr=LEARNING_RATE,weight_decay=1e-5)
#criterion = torch.nn.BCEWithLogitsLoss()
criterion = nn.CrossEntropyLoss()
#criterion = nn.BCELoss()
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

for epoch in range(1,NUM_EPOCHS+1):
    #TRAIN
    net.train()
    running_loss =0.0
    running_samples = 0
    
    for x, y in train_loader:
        x = x.to(device, non_blocking=True)
        y = y.to(device, non_blocking=True)
        
        logits = net(x)
        #print(logits)
        #print(f"logits: {logits}")
        loss = criterion(logits,y.long())
        
        optim.zero_grad()
        loss.backward()
        optim.step()
        
        bs = x.size(0)
        running_loss += loss.item()*bs
        running_samples += bs
    
    epoch_loss = running_loss/running_samples
    #VALIDATE
    net.eval()
    val_loss = 0.0
    val_samples = 0
    correct =0
    
    with torch.no_grad():
        for x, y in val_loader:
            x = x.to(device, non_blocking=True)
            y = y.to(device, non_blocking=True)
            logits = net(x)
            #print(f"logits eval: {logits}")
            loss = criterion(logits,y.long())
            
            bs = x.size(0)
            val_loss += loss.item()*bs
            val_samples += bs
            
            #probs = torch.sigmoid(logits)
            probs = logits
            #preds = (probs >= 0.5).float()
            preds = torch.argmax(probs,dim=1)
            #print(f"Epoch {epoch} predictions. probs: {probs}, preds: {preds}, actual: {y}")
            correct += (preds == y).float().sum().item()
    val_loss = val_loss/val_samples
    val_acc = correct / val_samples
    print(f"Epoch {epoch:02d} train_loss = {epoch_loss:.4f} val_loss={val_loss:.4f} val_acc={val_acc:.4f}")
    if val_acc >= 0.99999:
        NUM_EPOCHS = epoch
        break
 
lookup_table = create_permutation_lookup_table(inputs = [0,1,2,3])
print("\n--PARAMS--\n")
for name, module in net.named_modules():
    helpers.print_fredkin_layer(module,lookup_table=lookup_table)
print("\n--PORT ASSIGNMENTS--\n")
for name, module in net.named_modules():
    if module._get_name()=='Fredkin3plus6Layer': helpers.print_fredkin_layer3plus6(module,lookup_table=lookup_table)
print("EPOCHS RUN:", NUM_EPOCHS)



Epoch 01 train_loss = 0.4443 val_loss=0.4214 val_acc=0.8920
Epoch 02 train_loss = 0.3375 val_loss=0.4463 val_acc=0.8670
Epoch 03 train_loss = 0.3259 val_loss=0.3888 val_acc=0.9250
Epoch 04 train_loss = 0.3221 val_loss=0.3888 val_acc=0.9250
Epoch 05 train_loss = 0.3202 val_loss=0.3888 val_acc=0.9250
Epoch 06 train_loss = 0.3191 val_loss=0.3149 val_acc=1.0000
Epoch 07 train_loss = 0.3183 val_loss=0.3149 val_acc=1.0000
Epoch 08 train_loss = 0.3178 val_loss=0.3149 val_acc=1.0000
Epoch 09 train_loss = 0.3174 val_loss=0.3868 val_acc=0.9270
Epoch 10 train_loss = 0.3171 val_loss=0.3868 val_acc=0.9270
Epoch 11 train_loss = 0.3169 val_loss=0.3868 val_acc=0.9270
Epoch 12 train_loss = 0.3167 val_loss=0.3868 val_acc=0.9270
Epoch 13 train_loss = 0.3165 val_loss=0.3868 val_acc=0.9270
Epoch 14 train_loss = 0.3164 val_loss=0.3868 val_acc=0.9270
Epoch 15 train_loss = 0.3163 val_loss=0.4480 val_acc=0.8650
Epoch 16 train_loss = 0.3162 val_loss=0.4480 val_acc=0.8650
Epoch 17 train_loss = 0.3161 val_loss=0.

In [90]:
#VALIDATE
net.eval()
val_loss = 0.0
val_samples = 0
correct =0

with torch.no_grad():
    for x, y in val_loader:
        x = x.to(device, non_blocking=True)
        y = y.to(device, non_blocking=True)
        logits = net(x)
        #print(f"logits eval: {logits}")
        loss = criterion(logits,y.long())
        
        bs = x.size(0)
        val_loss += loss.item()*bs
        val_samples += bs
        
        #probs = torch.sigmoid(logits)
        probs = logits
        #preds = (probs >= 0.5).float()
        preds = torch.argmax(probs,dim=1)
        #print(f"Epoch {epoch} predictions. probs: {probs}, preds: {preds}, actual: {y}")
        correct += (preds == y).float().sum().item()
val_loss = val_loss/val_samples
val_acc = correct / val_samples
print(f"Epoch {epoch:02d} train_loss = {epoch_loss:.4f} val_loss={val_loss:.4f} val_acc={val_acc:.4f}")

ValueError: Using a target size (torch.Size([4])) that is different to the input size (torch.Size([4, 2])) is deprecated. Please ensure they have the same size.

In [66]:
#Create model and run training
NUM_EPOCHS = 20
LEARNING_RATE = 10
BATCH_SIZE = 2

class FredkinNet(torch.nn.Module):
    def __init__(self,din,dout):
        super().__init__()
        self.fred1 = Fredkin24Layer(din,3,seed=42)
        self.fred2 = Fredkin24Layer(9,3,seed=43)
        self.fred3 = Fredkin24Layer(9,3,seed=44)
        self.fred4 = Fredkin24Layer(9,3,seed=45)
        self.fred5 = Fredkin24Layer(9,3,seed=46)
        self.fred6 = Fredkin24Layer(9,3,seed=47)
        self.fred7 = Fredkin24Layer(9,3,seed=48)
        self.fred8 = Fredkin24Layer(9,dout,seed=49)
        
    def forward(self,x):
        out = self.fred1(x)
        out = self.fred2(out)
        out = self.fred3(out)
        out = self.fred4(out)
        out = self.fred5(out)
        out = self.fred6(out)
        out = self.fred7(out)
        out = self.fred8(out)
        #print(out)
        #a_primes = out[:,1::3]     #(batch, dout)
        out_summmed = out.sum(dim=1) #(batch,)
        return out_summmed
    
train_loader = DataLoader(train_data,batch_size=BATCH_SIZE,shuffle=True)
val_loader = DataLoader(val_data,batch_size=BATCH_SIZE,shuffle=False)

net = FredkinNet(9,1)
#Diagnosis
param = net.fred1.wgts  # example
print("Before backward:", param.grad)

optim = torch.optim.Adam(net.parameters(),lr=LEARNING_RATE)
criterion = torch.nn.BCEWithLogitsLoss()
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

for epoch in range(1,NUM_EPOCHS+1):
    #TRAIN
    net.train()
    running_loss =0.0
    running_samples = 0
    
    for x, y in train_loader:
        x = x.to(device, non_blocking=True)
        y = y.to(device, non_blocking=True)
        
        logits = net(x)
        loss = criterion(logits,y)
        
        optim.zero_grad()
        loss.backward()
        
        print("After backward:", param.grad)
        print("Grad stats:", param.grad.mean(), param.grad.std())

        
        optim.step()
        
        bs = x.size(0)
        running_loss += loss.item()*bs
        running_samples += bs
    
    epoch_loss = running_loss/running_samples
    #VALIDATE
    net.eval()
    val_loss = 0.0
    val_samples = 0
    correct =0
    
    with torch.no_grad():
        for x, y in val_loader:
            x = x.to(device, non_blocking=True)
            y = y.to(device, non_blocking=True)
            logits = net(x)
            loss = criterion(logits,y)
            
            bs = x.size(0)
            val_loss += loss.item()*bs
            val_samples += bs
            
            probs = torch.sigmoid(logits)
            preds = (probs >= 0.5).float()
            correct += (preds == y).float().sum().item()
    val_loss = val_loss/val_samples
    val_acc = correct / val_samples
    print(f"Epoch {epoch:02d} train_loss = {epoch_loss:.4f} val_loss={val_loss:.4f} val_acc={val_acc:.4f}")
        
        

Before backward: None
After backward: tensor([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]])
Grad stats: tensor(0.) tensor(0.)
After backward: tensor([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]])
Grad stats: tensor(0.) tensor(0.)
After backward: tensor([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0.,