In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset
import torch.optim as optim
from NoiseKD import Teacher, SimpleLanguageModel, slm_init_config, slm_model_config, count_parameters

In [2]:
SLM = SimpleLanguageModel(**slm_init_config) #use the slm_init_config to configure the model, then pass to Teacher.  

In [43]:
slm_init_config

{'embedding_dim': 16,
 'num_heads': 8,
 'hidden_dim': 11,
 'num_layers': 2,
 'dropout': 0.1,
 'vocab_size': 80,
 'class_num': 80,
 'sequence_length': 160}

In [3]:
teacher_slm = Teacher(SLM,(slm_init_config['sequence_length'],))

In [8]:
teacher_slm.configure(**slm_model_config) #this is the gen config.  

Expected tensor for argument #1 'indices' to have one of the following scalar types: Long, Int; but got torch.FloatTensor instead (while checking arguments for embedding)
lets try ints!


Configuring Teacher:: 100%|█████████████████████| 50/50 [01:39<00:00,  1.99s/it]

Teacher Configured





In [11]:
#we want to have a configured teacher, then be able to duplicate it with the embedding layer converted to something
#that can take floats.
train_args = { 'val_train' : "train"
                      , 'n' : 100_000
                      , 'dist_type' : 'ints'
                      , 'm' : slm_init_config['vocab_size']
                      , 'std': 1.0
        }
teacher_slm.generate_data(**train_args)

val_args = { 'val_train' : "val"
                      , 'n' : 100_000
                      , 'dist_type' : 'ints'
                      , 'm' : slm_init_config['vocab_size']
                      , 'std': 1.0
        }
teacher_slm.generate_data(**val_args)

Configuring Teacher:: 100%|█████████████████| 2000/2000 [00:52<00:00, 38.35it/s]
Configuring Teacher:: 100%|█████████████████| 2000/2000 [00:49<00:00, 40.14it/s]


In [123]:
embedding_weight_tenso = teacher_slm.model.embedding.weight.detach() #.numpy().shape

In [124]:
embedding_layer = teacher_slm.model.embedding

In [125]:
test_data = teacher_slm.train_inputs[0:10]#this is a batch of 10 inputs, indexes.  
test_data.shape #this is batch of 10, seq_len of 160, each corresponding to an index.  
#so, if we do onehot, it should be b x 160 x vocab_size which is 80 in this case.  

torch.Size([10, 160])

In [126]:
embedding_layer(test_data).shape#works.  slm_init_config['embedding_dim'] is where the 16 is coming from.

torch.Size([10, 160, 16])

In [127]:
embedding_weight_tensor.shape #with onehot, it should be b x 160 x 80 * 80 * 16 = b x 160 x 16.  just linear! 1d convolution

torch.Size([80, 16])

In [128]:
def batch_one_hot(input_sequences, vocab_size):
    batch_size = input_sequences.size(0)
    max_seq_length = input_sequences.size(1)
    
    # Create a tensor to store the one-hot encodings
    one_hot_input = torch.zeros(batch_size, max_seq_length, vocab_size)
    
    # Use scatter_ to set the appropriate elements to 1 in each batch
    one_hot_input.scatter_(2, input_sequences.unsqueeze(2), 1)
    
    return one_hot_input

oh = batch_one_hot(test_data, vocab_size = 80)
##test_data == torch.argmax(oh, dim=2)

In [139]:
def Linearize_Embedding(embedding_layer):
    embedding_weight_tensor = embedding_layer.weight.detach() 
    shape = embedding_weight_tensor.shape
    vocab_size = shape[0]
    embedding_dim = shape[1]
    print(vocab_size,embedding_dim)
    lin = nn.Linear(vocab_size,embedding_dim)
    lin.weight = nn.Parameter(embedding_weight_tensor.T) #not sure about this transpose
    return lin
L = Linearize_Embedding(embedding_layer)


80 16


In [130]:
L.weight.shape

torch.Size([16, 80])

In [132]:
embedding_weight_tenso.T.shape

torch.Size([16, 80])

In [133]:
oh.shape

torch.Size([10, 160, 80])

In [143]:
embedding_layer(test_data) #this isn't changing.   good.

tensor([[[-0.2922,  0.9590, -0.3747,  ..., -0.8251, -1.4937,  0.8997],
         [-0.6123, -0.4356,  0.2401,  ..., -1.1940, -1.5489,  0.5022],
         [-1.3747,  0.5595,  2.5854,  ..., -0.8407,  1.8977, -0.2874],
         ...,
         [ 0.7573, -0.0439, -0.9874,  ..., -0.6674,  1.1896, -0.7050],
         [ 1.1906, -0.9558,  0.1148,  ..., -1.9944,  0.2102, -1.5978],
         [ 0.8662, -0.5946,  0.4465,  ...,  0.6806,  0.0759, -0.7071]],

        [[ 0.4890, -0.8238,  0.0754,  ...,  0.6244, -1.7895, -2.2388],
         [ 0.7802,  1.0969,  0.5488,  ..., -1.4447, -0.0410,  1.5863],
         [ 1.1503,  1.0996,  1.1508,  ..., -0.1844,  0.1494,  0.9812],
         ...,
         [ 2.1915, -1.4852,  1.7001,  ..., -0.1682,  0.2967,  0.9008],
         [-0.0962,  0.5417, -0.1710,  ...,  1.5816, -0.8062,  0.7614],
         [ 1.1021,  0.3538, -0.7704,  ..., -1.1417, -0.0236,  0.5138]],

        [[ 0.1323,  0.4150, -0.6024,  ..., -1.2214, -0.7204, -0.0747],
         [-1.3448, -0.3103, -1.0887,  ...,  0

In [147]:

L(oh) #not changing now....

tensor([[[-0.3137,  0.9066, -0.3850,  ..., -0.7290, -1.3897,  0.8688],
         [-0.6337, -0.4881,  0.2298,  ..., -1.0980, -1.4449,  0.4713],
         [-1.3961,  0.5070,  2.5751,  ..., -0.7447,  2.0017, -0.3183],
         ...,
         [ 0.7359, -0.0963, -0.9977,  ..., -0.5714,  1.2936, -0.7359],
         [ 1.1691, -1.0083,  0.1044,  ..., -1.8983,  0.3142, -1.6288],
         [ 0.8448, -0.6471,  0.4362,  ...,  0.7767,  0.1799, -0.7381]],

        [[ 0.4676, -0.8763,  0.0651,  ...,  0.7204, -1.6855, -2.2698],
         [ 0.7587,  1.0445,  0.5385,  ..., -1.3487,  0.0630,  1.5553],
         [ 1.1289,  1.0472,  1.1405,  ..., -0.0883,  0.2534,  0.9503],
         ...,
         [ 2.1701, -1.5377,  1.6898,  ..., -0.0721,  0.4007,  0.8699],
         [-0.1177,  0.4893, -0.1813,  ...,  1.6777, -0.7023,  0.7304],
         [ 1.0806,  0.3013, -0.7807,  ..., -1.0456,  0.0804,  0.4828]],

        [[ 0.1109,  0.3625, -0.6128,  ..., -1.1254, -0.6165, -0.1056],
         [-1.3662, -0.3628, -1.0990,  ...,  0

In [60]:
nn.Parameter(embedding_weight_tensor) == L.weight #init works. not sure about having a bias in linear layer tho...

tensor([[True, True, True,  ..., True, True, True],
        [True, True, True,  ..., True, True, True],
        [True, True, True,  ..., True, True, True],
        ...,
        [True, True, True,  ..., True, True, True],
        [True, True, True,  ..., True, True, True],
        [True, True, True,  ..., True, True, True]])

In [5]:
#i think i just need to convolve over the sequence lenght mama.  
#not too hard.


In [6]:
SLM

SimpleLanguageModel(
  (embedding): Embedding(80, 16)
  (transformer_encoder): TransformerEncoder(
    (transformer): TransformerEncoder(
      (layers): ModuleList(
        (0): TransformerEncoderLayer(
          (self_attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=16, out_features=16, bias=True)
          )
          (linear1): Linear(in_features=16, out_features=11, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
          (linear2): Linear(in_features=11, out_features=16, bias=True)
          (norm1): LayerNorm((16,), eps=1e-05, elementwise_affine=True)
          (norm2): LayerNorm((16,), eps=1e-05, elementwise_affine=True)
          (dropout1): Dropout(p=0.1, inplace=False)
          (dropout2): Dropout(p=0.1, inplace=False)
        )
        (1): TransformerEncoderLayer(
          (self_attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=16, out_features=16, bias=True)
       