In [1]:
import torch
from torch.nn import functional as F
from collections import OrderedDict
import pandas as pd

In [2]:
#Create random array of 5 elements between 1 and 9

test_array = torch.tensor(torch.randint(1, 10, (5,)), dtype=torch.float32)
print(test_array)

#sOFtmax function
test_array_softmax = F.softmax(test_array, dim=0)

print(test_array_softmax)

print(sum(test_array_softmax))

#Multiply softmax by exponential mask
wm_mask = [0.3, 0.5, 0.7, 0.9, 1.0]

test_array_softmax_masked = test_array_softmax * torch.tensor(wm_mask, dtype=torch.float32)

print(test_array_softmax_masked)
print(sum(test_array_softmax_masked))

softmaxing = F.softmax(test_array_softmax_masked, dim=0)

print(softmaxing)
print(sum(softmaxing))

tensor([6., 2., 7., 7., 5.])
tensor([0.1466, 0.0027, 0.3984, 0.3984, 0.0539])
tensor(1.)
tensor([0.0440, 0.0013, 0.2789, 0.3586, 0.0539])
tensor(0.7367)
tensor([0.1785, 0.1710, 0.2257, 0.2445, 0.1803])
tensor(1.)


  test_array = torch.tensor(torch.randint(1, 10, (5,)), dtype=torch.float32)


In [22]:
def params(n_embd, block_size, vocab_size, n_layer, n_head, hs_dim = 1, ffw_size = 1):
    """ estimates the number of parameters in the model"""
    """ estimates the number of parameters in the model"""

    if hs_dim == 1:
        hs_dim = n_embd
    out = OrderedDict()

    # token and position embeddings
    out['emebedding/position'] = n_embd * block_size
    out['embedding/token'] = n_embd * vocab_size
    out['embedding'] = out['emebedding/position'] + out['embedding/token']

    # attention blocks
    out['attention/ln'] = n_embd # note, bias=False in our LN
    out['attention/kqv'] = n_embd * 3*hs_dim   #n_embd * 3*n_embd
    out['attention/proj'] = hs_dim*n_embd #n_embd**2
    out['attention'] = out['attention/ln'] + out['attention/kqv'] + out['attention/proj']

    # MLP blocks
    if ffw_size == 1:
        ffw_size = 4*n_embd # feed forward size
    #ffw_size = 4*n_embd # feed forward size
    out['mlp/ln'] = n_embd
    out['mlp/ffw'] = n_embd * ffw_size
    out['mlp/proj'] = ffw_size * n_embd
    out['mlp'] = out['mlp/ln'] + out['mlp/ffw'] + out['mlp/proj']

    # the transformer and the rest of it
    out['block'] = out['attention'] + out['mlp']
    out['transformer'] = n_layer * out['block']
    out['ln_f'] = n_embd # final layernorm
    out['dense'] = 0 # 0 because of parameter sharing. This layer uses the weights from the embedding layer

    # total
    out['total'] = out['embedding'] + out['transformer'] + out['ln_f'] + out['dense']

    return out

# compare our param count to that reported by PyTorch



In [12]:
results = []
settings = [
    {"block_size" : 5, "vocab_size" : 8000, "n_layer" : 2, "n_head" : 12, "n_embd" : 384, "hs_dim": 1},
    {"block_size" : 5, "vocab_size" : 8000, "n_layer" : 4, "n_head" : 12, "n_embd" : 384, "hs_dim": 1},
    {"block_size" : 5, "vocab_size" : 8000, "n_layer" : 6, "n_head" : 12, "n_embd" : 384, "hs_dim":1},
    {"block_size" : 5, "vocab_size" : 8000, "n_layer" : 12, "n_head" : 12, "n_embd" : 384, "hs_dim":1},
    {"block_size" : 5, "vocab_size" : 8000, "n_layer" : 16, "n_head" : 12, "n_embd" : 384, "hs_dim": 1},
    {"block_size" : 5, "vocab_size" : 8000, "n_layer" : 24, "n_head" : 12, "n_embd" : 384},
    #{"block_size" : 1024, "vocab_size" : 50257, "n_layer" : 12, "n_head" : 12, "n_embd" : 768},  # 124M params
    #{"block_size" : 1024, "vocab_size" : 50257, "n_layer" : 24, "n_head" : 16, "n_embd" : 1024}, # 350M params
    #{"block_size" : 1024, "vocab_size" : 50257, "n_layer" : 36, "n_head" : 20, "n_embd" : 1280},
    #{"block_size" : 1024, "vocab_size" : 50257, "n_layer" : 48, "n_head" : 25, "n_embd" : 1600},

]

#p = params(**settings[0])
#params_total = p['total']

#print(f"we see: {params_total/1e6:.2f}M parameters")
# create a header
#print(f"{'name':20s} {'params':10s} {'ratio (%)':10s}")
#for k,v in p.items():
#    print(f"{k:20s} {v:10d} {v/params_total*100:10.4f}")

for i in range(len(settings)):
    settings_tmp = {}#settings[i]

    p = params(**settings[i])
    params_total = p['total']
    settings_tmp["params"] = params_total
    settings_tmp["params_m"] = round(params_total/1e6, 2)
    for k,v in p.items():
        #if k in ["attention", "mlp", "transformer"]:
        settings_tmp[k] = v
    settings_tmp["ID"] = str(settings[i]["n_layer"])+"x"+str(settings[i]["n_head"])
    settings_tmp["n_layer"] = settings[i]["n_layer"]
    #results.append(p)
    #print(f"we see: {params_total/1e6:.2f}M parameters")
    #print(f"{'name':20s} {'params':10s} {'ratio (%)':10s}")
    #for k,v in p.items():
    #    print(f"{k:20s} {v:10d} {v/params_total*100:10.4f}")

    results.append(settings_tmp)

#transpose_columns = [k for k in results[0].keys() if k not in settings[0].keys()]

df = pd.DataFrame(results)
df.set_index("ID", inplace=True)
#df = df.transpose()
#df["attention_total"] = df["attention"] * df["n_layer"]
#df["mlp_total"] = df["mlp"] * df["n_layer"]
#df["param_minus_embedding"] = df["params"] - df["embedding"]
#df["tot_sum"] = df["attention_total"] + df["mlp_total"]



df.T

ID,2x12,4x12,6x12,12x12,16x12,24x12
params,6614784.0,10155264.0,13695744.0,24317184.0,31398144.0,45560064.0
params_m,6.61,10.16,13.7,24.32,31.4,45.56
emebedding/position,1920.0,1920.0,1920.0,1920.0,1920.0,1920.0
embedding/token,3072000.0,3072000.0,3072000.0,3072000.0,3072000.0,3072000.0
embedding,3073920.0,3073920.0,3073920.0,3073920.0,3073920.0,3073920.0
attention/ln,384.0,384.0,384.0,384.0,384.0,384.0
attention/kqv,442368.0,442368.0,442368.0,442368.0,442368.0,442368.0
attention/proj,147456.0,147456.0,147456.0,147456.0,147456.0,147456.0
attention,590208.0,590208.0,590208.0,590208.0,590208.0,590208.0
mlp/ln,384.0,384.0,384.0,384.0,384.0,384.0


In [5]:
results = []
settings = [
    {"block_size" : 5, "vocab_size" : 8000, "n_layer" : 2, "n_head" : 12, "n_embd" : 384, "hs_dim": 6144},
    {"block_size" : 5, "vocab_size" : 8000, "n_layer" : 4, "n_head" : 12, "n_embd" : 384, "hs_dim": 2688},
    {"block_size" : 5, "vocab_size" : 8000, "n_layer" : 6, "n_head" : 12, "n_embd" : 384, "hs_dim": 1536},
    {"block_size" : 5, "vocab_size" : 8000, "n_layer" : 12, "n_head" : 12, "n_embd" : 384, "hs_dim":384},
    {"block_size" : 5, "vocab_size" : 8000, "n_layer" : 16, "n_head" : 12, "n_embd" : 384, "hs_dim": 96},
    {"block_size" : 5, "vocab_size" : 8000, "n_layer" : 24, "n_head" : 12, "n_embd" : 384, "hs_dim": 1},
]

#p = params(**settings[0])
#params_total = p['total']

#print(f"we see: {params_total/1e6:.2f}M parameters")
# create a header
#print(f"{'name':20s} {'params':10s} {'ratio (%)':10s}")
#for k,v in p.items():
#    print(f"{k:20s} {v:10d} {v/params_total*100:10.4f}")

for i in range(len(settings)):
    settings_tmp = {}#settings[i]

    p = params(**settings[i])
    params_total = p['total']
    #settings_tmp["params"] = params_total
    settings_tmp["params_m"] = round(params_total/1e6, 2)
    for k,v in p.items():
        if k in ["attention", "mlp", "transformer"]:
            settings_tmp[k] = v
    settings_tmp["ID"] = str(settings[i]["n_layer"])+"x"+str(settings[i]["n_head"])
    settings_tmp["n_layer"] = settings[i]["n_layer"]
    #results.append(p)
    #print(f"we see: {params_total/1e6:.2f}M parameters")
    #print(f"{'name':20s} {'params':10s} {'ratio (%)':10s}")
    #for k,v in p.items():
    #    print(f"{k:20s} {v:10d} {v/params_total*100:10.4f}")

    results.append(settings_tmp)

#transpose_columns = [k for k in results[0].keys() if k not in settings[0].keys()]

df = pd.DataFrame(results)
df.set_index("ID", inplace=True)
#df = df.transpose()
df["attention_total"] = df["attention"] * df["n_layer"]
df["mlp_total"] = df["mlp"] * df["n_layer"]
#df["param_minus_embedding"] = df["params"] - df["embedding"]
df["tot_sum"] = df["attention_total"] + df["mlp_total"]



df

Unnamed: 0_level_0,params_m,attention,mlp,transformer,n_layer,attention_total,mlp_total,tot_sum
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2x12,24.31,9437568,1180032,21235200,2,18875136,2360064,21235200
4x12,24.31,4129152,1180032,21236736,4,16516608,4720128,21236736
6x12,24.31,2359680,1180032,21238272,6,14158080,7080192,21238272
12x12,24.32,590208,1180032,21242880,12,7082496,14160384,21242880
16x12,24.32,147840,1180032,21245952,16,2365440,18880512,21245952
24x12,45.56,590208,1180032,42485760,24,14164992,28320768,42485760


In [38]:
results = []
settings = [
    {"block_size" : 5, "vocab_size" : 8000, "n_layer" : 2, "n_head" : 12, "n_embd" : 384, "hs_dim": 384, "ffw_size" : 13056},
    {"block_size" : 5, "vocab_size" : 8000, "n_layer" : 4, "n_head" : 12, "n_embd" : 384, "hs_dim": 384, "ffw_size" : 6144},
    {"block_size" : 5, "vocab_size" : 8000, "n_layer" : 6, "n_head" : 12, "n_embd" : 384, "hs_dim": 384, "ffw_size" : 3840},
    {"block_size" : 5, "vocab_size" : 8000, "n_layer" : 12, "n_head" : 12, "n_embd" : 384, "hs_dim":384, "ffw_size" : 1},
    {"block_size" : 5, "vocab_size" : 8000, "n_layer" : 16, "n_head" : 12, "n_embd" : 384, "hs_dim": 384, "ffw_size" : 960},
    {"block_size" : 5, "vocab_size" : 8000, "n_layer" : 24, "n_head" : 12, "n_embd" : 384, "hs_dim": 1, "ffw_size" : 384},
]

#p = params(**settings[0])
#params_total = p['total']

#print(f"we see: {params_total/1e6:.2f}M parameters")
# create a header
#print(f"{'name':20s} {'params':10s} {'ratio (%)':10s}")
#for k,v in p.items():
#    print(f"{k:20s} {v:10d} {v/params_total*100:10.4f}")

for i in range(len(settings)):
    settings_tmp = {}#settings[i]

    p = params(**settings[i])
    params_total = p['total']
    settings_tmp["params"] = params_total
    settings_tmp["params_m"] = round(params_total/1e6, 2)
    for k,v in p.items():
        if k in ["attention", "mlp", "transformer", "block"]:
            settings_tmp[k] = v
    settings_tmp["ID"] = str(settings[i]["n_layer"])+"x"+str(settings[i]["n_head"])
    settings_tmp["n_layer"] = settings[i]["n_layer"]
    #results.append(p)
    #print(f"we see: {params_total/1e6:.2f}M parameters")
    #print(f"{'name':20s} {'params':10s} {'ratio (%)':10s}")
    #for k,v in p.items():
    #    print(f"{k:20s} {v:10d} {v/params_total*100:10.4f}")

    results.append(settings_tmp)

#transpose_columns = [k for k in results[0].keys() if k not in settings[0].keys()]

df = pd.DataFrame(results)
df.set_index("ID", inplace=True)
#df = df.transpose()
df["attention_total"] = df["attention"] * df["n_layer"]
df["mlp_total"] = df["mlp"] * df["n_layer"]
#df["param_minus_embedding"] = df["params"] - df["embedding"]
df["tot_sum"] = df["attention_total"] + df["mlp_total"]

df["A_totminusffb"] = df["block"] - df["mlp"]

df

Unnamed: 0_level_0,params,params_m,attention,mlp,block,transformer,n_layer,attention_total,mlp_total,tot_sum,A_totminusffb
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2x12,24309504,24.31,590208,10027392,10617600,21235200,2,1180416,20054784,21235200,590208
4x12,24311040,24.31,590208,4718976,5309184,21236736,4,2360832,18875904,21236736,590208
6x12,24312576,24.31,590208,2949504,3539712,21238272,6,3541248,17697024,21238272,590208
12x12,24317184,24.32,590208,1180032,1770240,21242880,12,7082496,14160384,21242880,590208
16x12,24320256,24.32,590208,737664,1327872,21245952,16,9443328,11802624,21245952,590208
24x12,24326400,24.33,590208,295296,885504,21252096,24,14164992,7087104,21252096,590208


In [238]:
#Select the row with id "params"

df_temp = df.loc[["transformer", "block"]].T
df_temp["calc"] = df_temp["transformer"] / df_temp["block"]
df_temp

Unnamed: 0_level_0,transformer,block,calc
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2x12,3540480.0,1770240.0,2.0
4x12,7080960.0,1770240.0,4.0
6x12,10621440.0,1770240.0,6.0
12x12,84953090.0,7079424.0,12.0
24x16,302039000.0,12584960.0,24.0
36x20,707881000.0,19663360.0,36.0
48x25,1474714000.0,30723200.0,48.0


In [39]:
n_embd = 384
dff0 = 4*384 #Default value is 4*n_embd

#Since for a single layer parameter calculation is
# M = 2*n_embd*d_ff (mlp part) + 4*n_embd*n_embd (attention part, technically n_embd*d_attn but d_attn = n_embd)
# Can be written as M = B*d_ff + A
# Where A = 4*n_embd*n_embd and B = 2*n_embd

print(f"Parameters for a single layer with n_embd = {n_embd} is {2*n_embd*dff0 + 4*n_embd*n_embd}")

A = 4*384*384
B = 2*384
nl0 = 12 #number of layers for which you want to bring parameters count to
for nl1 in [2,4,6,16,24]:
    k = nl1-nl0
    wk = ((1- (nl0/(nl0+k)) ) * (dff0 + (A/B)))
    print(f"nl1 = {nl1}")
    print(f"wk value for k = {k} is {wk}")
    print(f"dff value for k = {k} is {dff0-wk}")

Parameters for a single layer with n_embd = 384 is 1769472
nl1 = 2
wk value for k = -10 is -11520.0
dff value for k = -10 is 13056.0
nl1 = 4
wk value for k = -8 is -4608.0
dff value for k = -8 is 6144.0
nl1 = 6
wk value for k = -6 is -2304.0
dff value for k = -6 is 3840.0
nl1 = 16
wk value for k = 4 is 576.0
dff value for k = 4 is 960.0
nl1 = 24
wk value for k = 12 is 1152.0
dff value for k = 12 is 384.0


ModuleNotFoundError: No module named 'model'