In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import TensorDataset
from torch.utils.data import DataLoader
import numpy as np

In [2]:
import requests
import inspect
import re

github_raw_url = 'https://raw.githubusercontent.com/dallingordon/LABNET/noisekd/NoiseKD.py'  # Replace with the actual URL of the Python file
response = requests.get(github_raw_url)

if response.status_code == 200:
    with open('NoiseKD.py', 'wb') as f:
        f.write(response.content)
else:
    print(f"Failed to download file from {github_raw_url}")

In [3]:
from google.colab import drive
drive.mount('/content/drive')




Mounted at /content/drive


In [4]:
from NoiseKD import Teacher,ToyTransformer, count_parameters

In [5]:
##This is the code that lops off the softmax
#use the same pretrained weights, that works

#this takes the old code and replaces the softmax layer with that commented bit
input_class = inspect.getsource(ToyTransformer)
print(input_class)
new_class_name = "ToyTransformerNoSoft"
new_code = re.sub("ToyTransformer", new_class_name,input_class,)
new_code = re.sub("x = F\.softmax\(x, dim=1\)", "# removed softmax",new_code,)
print(new_code)

#this executes the class def, then you can treat it as if it was imported.
try:
    exec(new_code)

except Exception as e:
    print(f"An error occurred: {e}")

#this instantiates them
#x = NewTransformer(vocab_size, embedding_dim, num_heads, hidden_dim, num_layers, dropout,sequence_length)
#y = ToyTransformer(vocab_size, embedding_dim, num_heads, hidden_dim, num_layers, dropout,sequence_length)

#this was juse me making sure the batching stuff works.
#x.load_state_dict(torch.load('good_toy_5.pth')) ##doesn't have softmax
#y.load_state_dict(torch.load('good_toy_5.pth')) ##does
#x.eval()
#y.eval() ##eval it is..

#x(teacher_toy.val_inputs[0:10])[0], F.softmax(x(teacher_toy.val_inputs[0:4])[0]), y(teacher_toy.val_inputs[0:10])[0]



class ToyTransformer(nn.Module):
    def __init__(self, vocab_size, embedding_dim, num_heads, hidden_dim, num_layers, dropout,sequence_length):
        super(ToyTransformer, self).__init__()

        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.transformer_encoder = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(embedding_dim, num_heads, hidden_dim, dropout, batch_first=True),
            num_layers
        )
        self.fc1 = nn.Linear(embedding_dim * sequence_length, vocab_size)  # Intermediate linear layer
        self.fc2 = nn.Linear(vocab_size, vocab_size)  # Final linear layer

    def forward(self, x):
        x = self.embedding(x)
        x = self.transformer_encoder(x)
        x = torch.flatten(x, start_dim=1)  # Flatten the output
        x = F.relu(self.fc1(x))  # Apply the intermediate linear layer with ReLU activation
        x = self.fc2(x)  # Apply the final linear layer
        x = F.softmax(x, dim=1)  # Apply softmax activation


So, i want both the student and the teacher to do logits.  i want loss (mse) for the logits, but then i want accuracy post softmax.  so, lets do it

In [6]:
embedding_dim = 16
num_heads = 8
hidden_dim  = 11
num_layers = 2
dropout = 0.1
vocab_size = 80
class_num = vocab_size
batch_size = 50
sequence_length = 160

In [7]:
TT = ToyTransformerNoSoft(vocab_size, embedding_dim, num_heads, hidden_dim, num_layers, dropout,sequence_length)

In [8]:
count_parameters(TT)

215702

In [9]:
teacher_toy = Teacher(TT,(sequence_length,)) #don't specify batch!!

In [10]:
##some of these configs made for more diverse outputs in teachers:
config_args = {"dist_type" : "ints" ##worked well
                      , "gen_m" : vocab_size
                      , "gen_n" : 100
                      , "gen_epochs" : 100
                      , "gen_lr" : 0.001
                      , "random_shuffle" : 0.1
                      , "out_type" : "one-hot" }

config_args_high_epochs = {"dist_type" : "ints" ##okay, but not as well as config_args.
                      , "gen_m" : vocab_size
                      , "gen_n" : 1000
                      , "gen_epochs" : 100
                      , "gen_lr" : 0.001
                      , "random_shuffle" : 0.1
                      , "out_type" : "one-hot" }

config_args_higher_lr = {"dist_type" : "ints" ##lower was worse.  raise it. 0.003 looks great.  this is the best.
                      , "gen_m" : vocab_size
                      , "gen_n" : 2000
                      , "gen_epochs" : 50
                      , "gen_lr" :  0.003 ##0.003
                      , "random_shuffle" : 0.8
                      , "out_type" : "one-hot" }

config_args_less_data = {"dist_type" : "ints" ##worse
                      , "gen_m" : vocab_size
                      , "gen_n" : 500
                      , "gen_epochs" : 100
                      , "gen_lr" : 0.003
                      , "random_shuffle" : 0.1
                      , "out_type" : "one-hot" }

config_args_high_shuffle = {"dist_type" : "ints" ##one bar..
                      , "gen_m" : vocab_size
                      , "gen_n" : 10_000
                      , "gen_epochs" : 10
                      , "gen_lr" : 0.05
                      , "random_shuffle" : 0.9
                      , "out_type" : "one-hot" }

config_args_small_batch = {"dist_type" : "ints" ##this one was the first to do well.  not just one bar and the rest nearly zero.
                      , "gen_m" : vocab_size
                      , "gen_n" : 10_000
                      , "gen_epochs" : 10
                      , "gen_lr" : 0.05
                      , "random_shuffle" : 0.5
                      , "batch_size" : 10
                      , "out_type" : "one-hot" }

config_ab = {"dist_type" : "ints" ##worked well
                      , "gen_m" : vocab_size
                      , "gen_n" : 5000
                      , "gen_epochs" : 200
                      , "gen_lr" : 0.005
                      , "random_shuffle" : 0.0
                      , "out_type" : "one-hot"
                      , "dist_type" : 'hetero'
                      , "alpha" : 1
                      , "beta" : 4} #maybe increase epochs?
config_debug = {"dist_type" : "ints" ##worked well
                      , "gen_m" : vocab_size
                      , "gen_n" : 5
                      , "gen_epochs" : 2
                      , "gen_lr" : 0.005
                      , "random_shuffle" : 0.0
                      , "out_type" : "one-hot"
                      , "dist_type" : 'hetero'
                      , "alpha" : 1
                      , "beta" : 4} #maybe increase epochs?

In [11]:
#teacher_toy.configure(**config_ab) #this is dying.  might be time for colab!!
teacher_toy.load_state_dict('/content/drive/MyDrive/Research/good_toy_4.pth')

Expected tensor for argument #1 'indices' to have one of the following scalar types: Long, Int; but got torch.FloatTensor instead (while checking arguments for embedding)
lets try ints!


In [12]:

args = { 'val_train' : "val"
                      , 'n' : 10_000
                      , 'dist_type' : 'ints'
                      , 'm' : vocab_size
                      , 'std': 1.0
                      , 'batch_size' : 50
                      , 'store_outputs': True
        }

teacher_toy.generate_data(**args)

Generating val data :: 100%|██████████| 200/200 [00:09<00:00, 22.18it/s]


In [None]:
#teacher_toy.val_targets[0:10] ##logits!!

In [13]:
from torch.optim.lr_scheduler import LambdaLR
import math

warmup_steps = 10_000  # Adjust this as needed steps is total items passed through
total_steps = 20_000  # Adjust this as needed i'd like to calculate this....


# Define a learning rate scheduler with warmup
def lr_lambda(current_step):
    if current_step < warmup_steps:
        # During warmup, increase learning rate linearly
        return float(current_step) / float(max(1, warmup_steps))
    else:
        # After warmup, decrease learning rate using some schedule
        # You can use any LR schedule you prefer here
        # For example, you can use a learning rate schedule like CosineAnnealing
        return 0.5 * (1 + math.cos(math.pi * (current_step - warmup_steps) / (total_steps - warmup_steps)))



""" Training loop
for step in range(total_steps):
    optimizer.zero_grad()
    # Compute your loss and backpropagation here
    loss.backward()
    optimizer.step()
    scheduler.step()
"""

' Training loop\nfor step in range(total_steps):\n    optimizer.zero_grad()\n    # Compute your loss and backpropagation here\n    loss.backward()\n    optimizer.step()\n    scheduler.step()\n'

In [14]:
from torch.optim import Optimizer

class CosineAnnealingLRWithPeriod(Optimizer):
    def __init__(self, optimizer, high_lr, low_lr, total_epochs, period_epochs):
        defaults = dict(lr=high_lr)
        params = optimizer.param_groups
        self.optimizer = optimizer
        self.high_lr = high_lr
        self.low_lr = low_lr
        self.total_epochs = total_epochs
        self.period_epochs = period_epochs
        self.current_epoch = 0

        self.lr_schedule = []

        super(CosineAnnealingLRWithPeriod, self).__init__(params,defaults)

    def step(self, epoch=None):
        if epoch is None:
            epoch = self.current_epoch
        else:
            self.current_epoch = epoch

        if self.current_epoch <= self.total_epochs:
            self.current_epoch += 1
            lr = self.low_lr + 0.5 * (self.high_lr - self.low_lr) * (
                1 + math.cos((self.current_epoch / self.period_epochs) * math.pi)
            )
            for param_group in self.param_groups:
                param_group['lr'] = lr
            self.lr_schedule.append(lr)
            self.optimizer.step()
        else:
          print(self.current_epoch, epoch)
          raise ValueError('Epoch out of range.')

# optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
# scheduler = CosineAnnealingLRWithPeriod(optimizer, high_lr=0.1, low_lr=0.01, total_epochs=100, period_epochs=20)

# During training loop:
# for epoch in range(num_epochs):
#     scheduler.step(epoch)
#     train_one_epoch()

In [15]:
class UniformRandomLRScheduler:
    def __init__(self, optimizer, low_lr, high_lr):
        self.optimizer = optimizer
        self.low_lr = low_lr
        self.high_lr = high_lr


    def step(self):


        new_lr = np.random.uniform(self.low_lr, self.high_lr)
        for param_group in self.optimizer.param_groups:
            param_group['lr'] = new_lr
        #print(f"Epoch {epoch + 1}: Learning Rate = {new_lr}")

# Example usage:
import torch.optim as optim
""" This is what i used to get to 85%.
low_lr = 0.00001
high_lr = 0.01
"""
low_lr = 0.000001
high_lr = 0.0001


# Create the scheduler and pass in the optimizer


In [16]:
class NormalRandomLRScheduler:
    def __init__(self, optimizer, mean_lr, std_lr, low_lr, high_lr):
        self.optimizer = optimizer
        self.mean_lr = mean_lr
        self.std_lr = std_lr
        self.low_lr = low_lr
        self.high_lr = high_lr

    def step(self, epoch):
        new_lr = max(min(np.random.normal(self.mean_lr, self.std_lr), self.high_lr), self.low_lr)
        for param_group in self.optimizer.param_groups:
            param_group['lr'] = new_lr
        print(f"Epoch {epoch + 1}: Learning Rate = {new_lr:.6f}")

# Example usage:
import torch.optim as optim

mean_lr = 0.005
std_lr = 0.001
low_lr = 0.001
high_lr = 0.01

# Create your optimizer
#optimizer = optim.SGD(model.parameters(), lr=mean_lr)

# Create the scheduler and pass in the optimizer and other parameters
#scheduler = NormalRandomLRScheduler(optimizer, mean_lr, std_lr, low_lr, high_lr)


In [17]:
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
student = ToyTransformerNoSoft(vocab_size, embedding_dim, num_heads, hidden_dim, num_layers, dropout,sequence_length)
load_path =  "/content/drive/MyDrive/KD/toymodel.pth"
load_path =  "/content/drive/MyDrive/KD/toymodel_1.pth" #fine tuned with lr 0.001, loss is still decreasing...86.83 val acc
student.load_state_dict(torch.load(load_path))
student = student.to(device)

In [None]:
#############non-repeating train
learning_rate =  0.001 #0.001 is working great with this toy model.
momentum = 0.95
val_batch_size = 10

file_path = "/content/drive/MyDrive/KD/toymodel_2.pth" #toymodel.pth was made with teacher with goodtoy4.pth, 85% after 500 epochs
num_epochs = 100
batches_per_epoch = 1000 #more data is better yo.  i cranked it up again and i like this.  i think reduce lr.
batch_size = 100
data_per_batch = batch_size

criterion =  nn.MSELoss() #nn.CrossEntropyLoss() # #nn.KLDivLoss() #nn.CrossEntropyLoss()  #i think stick to mse for now.  this probs just needs lots of time to start learning.  like s4 lol.
optimizer = optim.Adam(student.parameters(), lr=learning_rate)
#optimizer = optim.SGD(student.parameters(), lr=learning_rate, momentum=momentum)
#optimizer = optim.Adagrad(student.parameters(), lr=learning_rate)

#scheduler = LambdaLR(optimizer, lr_lambda=lr_lambda)
#scheduler = CosineAnnealingLRWithPeriod(optimizer, high_lr=0.01, low_lr=0.00001, total_epochs=num_epochs*batches_per_epoch, period_epochs=20)
#scheduler = UniformRandomLRScheduler(optimizer, low_lr, high_lr)


#no train data loader here.
val_data = list(zip(teacher_toy.val_inputs, teacher_toy.val_targets))
val_input_tensors = torch.stack([torch.Tensor(x[0]) for x in val_data])
val_target_tensors = torch.stack([torch.Tensor(x[1]) for x in val_data])
val_dataset = TensorDataset(val_input_tensors, val_target_tensors)
val_dataloader = DataLoader(val_dataset, batch_size=val_batch_size, shuffle=True)
accuracy_threshold = 0.5
print_every = 1 #its working, its just lots of data mama.
validation_every = 5
save_every = 10

gen_args  = { 'val_train' : "train"
              , 'n' : data_per_batch
              , 'dist_type' : 'ints' #this is generating ints for inputs, the outputs are logits.  hmmmm
              , 'm' : vocab_size
              , 'std': 1.0
              , 'display_progress' : False
              , 'store_outputs' : True
        }

losses = []  # List to store losses
accuracies = []



student = student.to(device)
for epoch in range(num_epochs):
    student.train()
    teacher_toy.model.eval()
    total_loss = 0
    correct_predictions = 0
    total_samples = 0
    lr_list = []
    for i in range(batches_per_epoch):

        teacher_toy.generate_data(**gen_args)
        data_e = list(zip(teacher_toy.train_inputs, teacher_toy.train_targets))
        input_tensors_e = torch.stack([torch.Tensor(x[0]) for x in data_e])
        target_tensors_e = torch.stack([torch.Tensor(x[1]) for x in data_e])
        dataset_e = TensorDataset(input_tensors_e, target_tensors_e)
        dataloader_e = DataLoader(dataset_e, batch_size=batch_size, shuffle=True)

        for input_batch_e, target_batch_e in dataloader_e:
            optimizer.zero_grad()  # Zero the gradients
            input_batch_e = input_batch_e.to(device)
            target_batch_e = target_batch_e.to(device)
            output = student(input_batch_e)  # Forward pass
            loss = criterion(output, target_batch_e)  # Compute the loss
            loss.backward()  # Backpropagation
            optimizer.step()  # Update the weights

            current_lr = optimizer.param_groups[0]['lr']
            lr_list.append(current_lr)

            #scheduler.step()

            total_loss += loss.item()
            #add early stopping...
            #and validation at each step.
            # Calculate accuracy
            soft_output = torch.argmax(F.softmax(output, dim=1),axis=1) #F.softmax(
            soft_targets = torch.argmax(F.softmax(target_batch_e,dim=1),axis=1)

            correct_predictions += (soft_output == soft_targets).sum().item()
            total_samples += input_batch_e.size(0)


    # Print the average loss for this epoch
    avg_loss = total_loss / ( batches_per_epoch * data_per_batch)
    losses.append(avg_loss)

    accuracy = correct_predictions / total_samples
    accuracies.append(accuracy)

    if (epoch + 1) % print_every == 0:
        print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {total_loss}, Train Accuracy: {accuracy:.4f}')

        formatted_lr_list = [f'{i:.8f}' for i in lr_list]
        formatted_lr_str = ', '.join(formatted_lr_list)
        print(f'LRs used: {formatted_lr_str}')

        #current_lr = optimizer.param_groups[0]['lr']
        #print(current_lr) ##this is misleading.  it has used several different lrs.  write to a list me thinks.
    if (epoch + 1) % save_every == 0:
      torch.save(student.state_dict(), file_path)
    if (epoch + 1) % validation_every == 0:

        student.eval()

        total_val_samples = 0
        correct_val_predictions = 0

        with torch.no_grad():
            for val_input_batch, val_target_batch in val_dataloader:
              val_input_batch = val_input_batch.to(device)
              val_target_batch = val_target_batch.to(device)
              val_output = student(val_input_batch)

              soft_output_val = torch.argmax(F.softmax(val_output, dim=1),axis=1) #F.softmax(
              soft_targets_val = torch.argmax(F.softmax(val_target_batch,dim=1),axis=1)

              correct_val_predictions += (soft_output_val == soft_targets_val).sum().item()
              total_val_samples += val_input_batch.size(0)

        # Calculate validation accuracy
        val_accuracy = correct_val_predictions / total_val_samples

        # Print the validation accuracy for this epoch
        print(f'\t\tValidation Accuracy: {val_accuracy:.4f}')

        # Set the model back to training mode
        student.train()

Epoch [1/100], Loss: 76983.12385177612, Train Accuracy: 0.8587
LRs used: 0.00100000, 0.00100000, 0.00100000, 0.00100000, 0.00100000, 0.00100000, 0.00100000, 0.00100000, 0.00100000, 0.00100000, 0.00100000, 0.00100000, 0.00100000, 0.00100000, 0.00100000, 0.00100000, 0.00100000, 0.00100000, 0.00100000, 0.00100000, 0.00100000, 0.00100000, 0.00100000, 0.00100000, 0.00100000, 0.00100000, 0.00100000, 0.00100000, 0.00100000, 0.00100000, 0.00100000, 0.00100000, 0.00100000, 0.00100000, 0.00100000, 0.00100000, 0.00100000, 0.00100000, 0.00100000, 0.00100000, 0.00100000, 0.00100000, 0.00100000, 0.00100000, 0.00100000, 0.00100000, 0.00100000, 0.00100000, 0.00100000, 0.00100000, 0.00100000, 0.00100000, 0.00100000, 0.00100000, 0.00100000, 0.00100000, 0.00100000, 0.00100000, 0.00100000, 0.00100000, 0.00100000, 0.00100000, 0.00100000, 0.00100000, 0.00100000, 0.00100000, 0.00100000, 0.00100000, 0.00100000, 0.00100000, 0.00100000, 0.00100000, 0.00100000, 0.00100000, 0.00100000, 0.00100000, 0.00100000, 0.0

In [None]:
student.load_state_dict(torch.load(file_path))

<All keys matched successfully>

In [None]:
student.eval()
i = teacher_toy.train_inputs[0:5]
i = i.to(device)
teacher_toy.model(i)[0:1] > 0.5, teacher_toy.train_targets[0:1] > 0.5,student(i)[0:1] > 0.5
F.softmax(teacher_toy.model(i)[0:1]) > 0.5 , F.softmax(teacher_toy.train_targets[0:1]) > 0.5,F.softmax(student(i)[0:1]) > 0.5
#teacher_toy.model(i)[0:3],teacher_toy.train_targets[0:3] ,student(i)[0:3]
(F.softmax(teacher_toy.model(i)[0:1])>0.5) == (F.softmax(student(i)[0:1]) > 0.5)
#yeah, my accuracy is just bad..

  F.softmax(teacher_toy.model(i)[0:1]) > 0.5 , F.softmax(teacher_toy.train_targets[0:1]) > 0.5,F.softmax(student(i)[0:1]) > 0.5
  torch.all((F.softmax(teacher_toy.model(i)[0:1])>0.5) == (F.softmax(student(i)[0:1]) > 0.5))


tensor(True, device='cuda:0')

In [None]:
total_val_samples = 0
correct_val_predictions = 0
for val_input_batch, val_target_batch in val_dataloader:
  val_input_batch = val_input_batch.to(device)
  val_target_batch = val_target_batch.to(device)
  val_output = student(val_input_batch)

  soft_output_val = torch.argmax(F.softmax(val_output, dim=1),axis=1) #F.softmax(
  soft_targets_val = torch.argmax(F.softmax(val_target_batch,dim=1),axis=1)

  correct_val_predictions += (soft_output_val == soft_targets_val).sum().item()
  total_val_samples += val_input_batch.size(0)

val_accuracy = correct_val_predictions / total_val_samples
print(val_accuracy)

0.7647


In [None]:
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
i = 30
m = 50
teacher_toy.model(teacher_toy.val_inputs[:m][i:i+1].to(device))
teacher_toy.val_targets[:m][i:1+2] #they match

tensor([], device='cuda:0', size=(0, 80))

In [None]:
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
i = 30
m = 50
teacher_toy.model(teacher_toy.val_inputs[:m][i:i+1].to(device)) ,teacher_toy.val_targets[:m][i] #they match

(tensor([[ 2.0586e+02,  3.4992e+01,  8.5352e+01, -1.1358e+02,  1.5227e+02,
          -5.4810e+01, -3.7290e+01, -9.5843e+01, -1.6989e+02,  3.7378e+01,
           2.2404e+01, -5.8327e+00, -2.6117e+02, -7.3296e+01,  4.3739e+01,
           1.0842e+02,  1.0811e+02, -2.5372e+02, -1.4633e+01, -1.6201e+02,
          -3.2133e+01, -2.0622e+01, -8.5095e+01, -3.6187e+01, -1.1476e+02,
           1.5680e+01, -1.2444e+02,  5.8280e+01, -3.9752e+01, -1.6273e+01,
          -1.7848e+02, -4.9088e+00, -7.4937e-03, -9.0707e+01,  1.1688e+02,
          -6.3936e+01,  7.5031e+01,  8.1228e+01,  8.8059e+01,  5.1724e+01,
          -8.0736e+01, -6.4757e+01,  1.2481e+02,  1.0759e+02,  1.3555e+01,
           7.9394e+01, -3.8735e+01, -1.6845e+02, -3.3377e+01, -6.8988e+01,
          -1.0492e+02, -7.4577e+01, -1.0604e+01,  1.0371e+02, -1.1510e+02,
          -2.3550e+02,  9.1118e+01, -1.7592e+02, -1.5868e+02, -2.2437e+02,
           1.6668e+02,  2.8419e+01,  1.3112e+02, -3.9683e+00, -6.1722e+01,
           2.1326e+02,  2