In [1]:
import torch
from transformers import BertTokenizer, BertModel
import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm


This notebook is a test run of creating an embedding for a single artist's lyrics using BERT.

In [2]:
# Load the lyric dataset and take the "lyrics" value from the row labelled 0.
df = pd.read_csv("lyricdata.csv")
lyrics = df.loc[0,"lyrics"]
# Display the selected lyrics text as the cell output.
lyrics

'Pity the poor little darling now Oh, she never had a chance Death came quick for the girl Victim of homicide Cut and beaten, brutally raped The five year old, she didn\'t escape Her mom and friend did her in Cigarette burns on the skin From her mom and her friend He had a whip, Do you wanna party  It\'s party time We gotta party  It\'s party time She never had a chance Her grandmother was sent to jail, her older sister Never loved, not for her time All night she sleeps, party time!Do you wanna party  It\'s party time We gotta party  It\'s party timeWhen the moon arises on the western sky And the vampires are out on the sly At times they have trouble finding necks to bite That\'s why I cook up in my roon tonight...My riboflavin flavored, non-carbonated, polyunsaturated blood Drac came to me one night at 12 And said the blood bank had no blood on the shelf I exclaimed "don\'t fret!" i\'ll give you on fix Of my newly regenerated mix I gave him riboflavin flavored, non-carbonated, polyuns

In [3]:
# Load the tokenizer that belongs to the "bert-base-uncased" checkpoint.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

In [4]:
# Tokenize all of the artist's lyrics; this returns a list of tokens.
list_of_tokens = tokenizer.tokenize(lyrics)
# Show how many tokens were produced.
len(list_of_tokens)

2003

In [5]:
# Cut the full token list into consecutive chunks of at most 510 tokens, which
# leaves room for the two special tokens that are added to every chunk later.

lyrics_split_list = [
    list_of_tokens[start:start + 510]
    for start in range(0, len(list_of_tokens), 510)
] or [[]]  # an empty token list still yields a single (empty) chunk

# report the size of every chunk
for chunk in lyrics_split_list:
    print(len(chunk))

# for this artist the result is 4 chunks of tokens

510
510
510
473


In [6]:

# Wrap every chunk in BERT's special tokens ([CLS] in front, [SEP] at the end)
# and map the resulting tokens to their vocabulary indices.
#
# The chunks in lyrics_split_list are modified in place, and the segment-id cell
# relies on their lengths including the special tokens. Because of that in-place
# change, the guards below make sure the markers are added only once: re-running
# this cell must not grow a chunk past the 512 tokens BERT can handle.
indexed_tokens_list = []
for chunk in lyrics_split_list:
    if not chunk or chunk[0] != "[CLS]":
        chunk.insert(0, "[CLS]")
    if chunk[-1] != "[SEP]":
        chunk.append("[SEP]")
    indexed_tokens_list.append(tokenizer.convert_tokens_to_ids(chunk))






In [7]:
# Build one list of 1s per chunk, as long as the chunk itself (the chunks
# already contain their special tokens at this point).
# NOTE(review): these lists are later passed to the model as its second
# positional argument; in Hugging Face's BertModel.forward that parameter is
# attention_mask, so the values must stay 1 unless that call is changed - confirm intent.
segment_ids = [[1] * len(chunk) for chunk in lyrics_split_list]



In [8]:
# Turn every id list and its matching segment-id list into a tensor. The extra
# pair of brackets gives each tensor a leading dimension of size 1.
tokens_tensor_list = [torch.tensor([ids]) for ids in indexed_tokens_list]
segment_tensor_list = [
    torch.tensor([segment_ids[pos]]) for pos in range(len(indexed_tokens_list))
]

# print the dimensions of each tensor pair as a sanity check
for token_tensor, segment_tensor in zip(tokens_tensor_list, segment_tensor_list):
    print(f"token-tensor dims: {token_tensor.size()}\nsegment-tensor dims: {segment_tensor.size()}\n")

token-tensor dims: torch.Size([1, 512])
segment-tensor dims: torch.Size([1, 512])

token-tensor dims: torch.Size([1, 512])
segment-tensor dims: torch.Size([1, 512])

token-tensor dims: torch.Size([1, 512])
segment-tensor dims: torch.Size([1, 512])

token-tensor dims: torch.Size([1, 475])
segment-tensor dims: torch.Size([1, 475])



In [9]:
# Load the pretrained BERT model; output_hidden_states=True asks it to also
# return the hidden states of its layers, which the later cells read.

model = BertModel.from_pretrained('bert-base-uncased', output_hidden_states=True)
# Switch the module to evaluation mode, since it is only used for inference here.
model.eval()

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
          

In [10]:
# Run every chunk through BERT and keep the hidden states of all layers.
# Gradients are not needed for inference, so they are switched off.
with torch.no_grad():

    hidden_state_list = []

    for chunk_idx, token_ids in enumerate(tokens_tensor_list):
        # Pass the tensors by keyword. The second positional parameter of
        # BertModel.forward is attention_mask (not token_type_ids), so the
        # all-ones tensor is named as what it actually acts as: an attention
        # mask that lets every token be attended to. token_type_ids is left at
        # its default.
        output = model(
            input_ids=token_ids,
            attention_mask=segment_tensor_list[chunk_idx],
        )
        # Index 2 of the output holds the per-layer hidden states that were
        # requested with output_hidden_states=True when the model was loaded.
        hidden_state_list.append(output[2])

hidden_state_list

[(tensor([[[ 0.1686, -0.2858, -0.3261,  ..., -0.0276,  0.0383,  0.1640],
           [ 0.9758,  0.6009,  1.0942,  ...,  0.2942,  0.9149, -0.4457],
           [-0.7790,  0.4781,  0.0859,  ..., -0.2489,  0.5078, -0.5689],
           ...,
           [ 0.2995,  0.2276,  0.5906,  ...,  0.1345, -0.1416,  0.0193],
           [ 0.6109,  0.7766,  0.7759,  ..., -1.0444, -0.7319, -0.0908],
           [-0.2301, -0.4165,  0.3172,  ..., -0.2481,  0.5677, -1.6841]]]),
  tensor([[[-0.0300,  0.0268, -0.3461,  ...,  0.2453, -0.0136,  0.0344],
           [ 1.3086,  0.5746,  0.8405,  ...,  0.8477,  1.1813, -0.6246],
           [-0.7688, -0.0869,  0.0370,  ..., -0.0141,  0.8146, -0.6577],
           ...,
           [ 0.1688, -0.0984,  0.5960,  ...,  0.0912,  0.2156,  0.0024],
           [-0.0207,  0.6920,  0.9662,  ..., -0.9666, -0.4290, -0.0360],
           [-0.2062, -0.0881,  0.0865,  ..., -0.1541,  0.6643, -1.3124]]]),
  tensor([[[-0.1448, -0.1177, -0.5335,  ...,  0.3517,  0.1266, -0.0023],
           [ 

In [42]:
# Rearrange each chunk's tuple of per-layer hidden states into one tensor:
# stack the layers along a new first dimension, drop the size-1 dimension at
# index 1, then swap the first two dimensions so tokens come before layers.
list_of_embeddings = []

for layer_states in hidden_state_list:
    per_token = torch.stack(layer_states, dim=0).squeeze(dim=1).permute(1, 0, 2)
    list_of_embeddings.append(per_token)
    print(per_token.size())

torch.Size([512, 13, 768])
torch.Size([512, 13, 768])
torch.Size([512, 13, 768])
torch.Size([475, 13, 768])


In [12]:
def create_sentence_embedding(hidden_state):
    """Average the token vectors of the second-to-last layer into one vector.

    Args:
        hidden_state: Sequence of per-layer tensors. The second-to-last entry
            is used, and its first element along the leading dimension is
            taken before averaging.

    Returns:
        The mean of that element over its dim 0 (one value per remaining
        feature position).
    """
    second_to_last_layer = hidden_state[-2]
    first_entry = second_to_last_layer[0]
    return first_entry.mean(dim=0)

In [13]:
# One averaged embedding per chunk of lyrics.
sentence_embeddings = [
    create_sentence_embedding(layer_states) for layer_states in hidden_state_list
]

# show the size of each embedding
for embedding in sentence_embeddings:
    print(embedding.size())

torch.Size([768])
torch.Size([768])
torch.Size([768])
torch.Size([768])
