In [5]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset
import torch.optim as optim
from NoiseKD import Teacher, SimpleLanguageModel, slm_init_config, slm_model_config, count_parameters

In [6]:
SLM = SimpleLanguageModel(**slm_init_config) #use the slm_init_config to configure the model, then pass to Teacher.  

In [7]:
slm_init_config

{'embedding_dim': 16,
 'num_heads': 8,
 'hidden_dim': 11,
 'num_layers': 2,
 'dropout': 0.1,
 'vocab_size': 80,
 'class_num': 80,
 'sequence_length': 160}

In [8]:
teacher_slm = Teacher(SLM,(slm_init_config['sequence_length'],))

In [9]:
teacher_slm.configure(**slm_model_config) #this is the gen config.  

Expected tensor for argument #1 'indices' to have one of the following scalar types: Long, Int; but got torch.FloatTensor instead (while checking arguments for embedding)
lets try ints!


Configuring Teacher:: 100%|█████████████████████| 50/50 [01:37<00:00,  1.94s/it]

Teacher Configured





In [10]:
#we want to have a configured teacher, then be able to duplicate it with the embedding layer converted to something
#that can take floats.
train_args = { 'val_train' : "train"
                      , 'n' : 100_000
                      , 'dist_type' : 'ints'
                      , 'm' : slm_init_config['vocab_size']
                      , 'std': 1.0
        }
teacher_slm.generate_data(**train_args)

val_args = { 'val_train' : "val"
                      , 'n' : 100_000
                      , 'dist_type' : 'ints'
                      , 'm' : slm_init_config['vocab_size']
                      , 'std': 1.0
        }
teacher_slm.generate_data(**val_args)

Generating train data :: 100%|██████████████| 2000/2000 [00:49<00:00, 40.09it/s]
Generating val data :: 100%|████████████████| 2000/2000 [00:50<00:00, 39.50it/s]


In [11]:
embedding_weight_tensor = teacher_slm.model.embedding.weight.detach() #.numpy().shape

In [12]:
embedding_layer = teacher_slm.model.embedding

In [13]:
embedding_weight_tensor

tensor([[ 0.3093,  1.3306,  0.2426,  ..., -0.8217, -2.0095,  1.4835],
        [-1.9836,  0.9089,  0.5292,  ...,  0.1355,  0.4559,  2.3000],
        [-2.7316,  1.0899,  0.4864,  ...,  1.8981,  1.0435, -0.2640],
        ...,
        [-1.7901, -0.0087,  1.5593,  ..., -0.5925,  0.0217,  0.4165],
        [ 1.1177, -0.7155,  0.0531,  ..., -0.7303,  0.4064, -0.6738],
        [ 1.3045, -0.6034,  0.1966,  ...,  1.1825,  0.0454,  0.0603]])

In [14]:
test_data = teacher_slm.train_inputs[0:10]#this is a batch of 10 inputs, indexes.  
test_data.shape #this is batch of 10, seq_len of 160, each corresponding to an index.  
#so, if we do onehot, it should be b x 160 x vocab_size which is 80 in this case.  

torch.Size([10, 160])

In [15]:
embedding_layer(test_data).shape#works.  slm_init_config['embedding_dim'] is where the 16 is coming from.

torch.Size([10, 160, 16])

In [16]:
embedding_weight_tensor.shape #with onehot, it should be b x 160 x 80 * 80 x 16 = b x 160 x 16.  just linear! 1d convolution

torch.Size([80, 16])

In [17]:
def batch_one_hot(input_sequences, vocab_size):
    batch_size = input_sequences.size(0)
    max_seq_length = input_sequences.size(1)
    
    # Create a tensor to store the one-hot encodings
    one_hot_input = torch.zeros(batch_size, max_seq_length, vocab_size)
    
    # Use scatter_ to set the appropriate elements to 1 in each batch
    one_hot_input.scatter_(2, input_sequences.unsqueeze(2), 1)
    
    return one_hot_input

oh = batch_one_hot(test_data, vocab_size = 80)
##test_data == torch.argmax(oh, dim=2)

In [18]:
def Linearize_Embedding(embedding_layer):
    embedding_weight_tensor = embedding_layer.weight.detach() 
    shape = embedding_weight_tensor.shape
    vocab_size = shape[0]
    embedding_dim = shape[1]
    lin = nn.Linear(vocab_size,embedding_dim, bias = False)
    #print(lin.weight.shape)
    #print(embedding_weight_tensor.shape)
    lin.weight = nn.Parameter(embedding_weight_tensor.T) #not sure about this transpose
    return lin
L = Linearize_Embedding(embedding_layer)


In [19]:
##also for future reference, it sort of looks like linear layers just operate across the last layer. i may not need to stack.

In [20]:
oh.shape #this is 10 batches, 160 seq_len, and 80 embedding dim.  


torch.Size([10, 160, 80])

In [21]:
embedding_layer(test_data).shape,L(oh).shape
#normal takes the test data (ints), linearized takes onehotted data.

(torch.Size([10, 160, 16]), torch.Size([10, 160, 16]))

In [22]:
torch.sum((L(oh) - embedding_layer(test_data))**2) #ha.  it was the bias.  

tensor(0., grad_fn=<SumBackward0>)

In [23]:
nn.Parameter(embedding_weight_tensor) == L.weight.T #init works. not sure about having a bias in linear layer tho...

tensor([[True, True, True,  ..., True, True, True],
        [True, True, True,  ..., True, True, True],
        [True, True, True,  ..., True, True, True],
        ...,
        [True, True, True,  ..., True, True, True],
        [True, True, True,  ..., True, True, True],
        [True, True, True,  ..., True, True, True]])

In [24]:
import copy
copy_SLM = copy.deepcopy(SLM)
copy_SLM.embedding = L #swap out the embedding

In [25]:
SLM(test_data)

tensor([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 

In [47]:
oh = batch_one_hot(test_data, vocab_size = 80) #i do this above, this is just so i can remeber what it is doing
copy_SLM(oh) #now it takes one hot, not indexes.

tensor([[0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
         0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
         0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
         0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
         0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
         0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
         0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
         0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
         0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
         0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
         1.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
         0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
         0.0000e+00, 0.0000e

In [54]:
torch.sum((SLM(test_data) - copy_SLM(oh))**2) #asdflkajsdglfonsdg;aogh so the parameter layer is working, but the final bit isnt

tensor(16., grad_fn=<SumBackward0>)

In [1]:
x = None

In [3]:
x = "hi"

In [4]:
if x == "hi":
    print('dfdf')

dfdf


In [26]:
teacher_slm.train_targets

tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]])