In [3]:
import torch
from transformers import BertTokenizer, BertModel
import pandas as pd
import json

import concurrent.futures

In [4]:
# Read the lyrics table from disk.
data = pd.read_csv("final_data.csv")

# Embedding every row took too long, so pick 1454 rows at random
# (fixed seed, so the same rows are picked on every run) and discard them.
rows_to_drop = data.sample(n=1454, random_state=42).index
data = data.drop(index=rows_to_drop)

# Show what is left.
data

Unnamed: 0,artist,genre,numsongs,lyrics
1,Abrasive Wheels,punk,10,Got my marching orders in the morning post Whe...
4,The Adverts,punk,10,"Life's short, don't make a mess of it To the e..."
8,Alternative TV,punk,10,"Action, time and vision Action, time and visio..."
13,Anti-Pasti,punk,10,See how they run With their backs Against the ...
14,Au Pairs,punk,10,Spending time nowadays Nowadays it’s nice It’s...
...,...,...,...,...
1925,The Four Horsemen,hard rock,10,I'm keeping a heater I'm doing the best I can ...
1936,Blackfoot,country,10,You took his land and you ate his corn And on ...
1938,Atomic Opera,hard rock,10,I want a Virgin Mary nightlamp Bible hero lunc...
1950,Asterix,hard rock,10,⒯⋆̶⋆'⋆̶⋆⒧⋆̶⋆⒜⋆̶⋆⒣⋆̶⋆ ⒣⋆̶⋆⒜⋆̶⋆⒝⋆̶⋆⒤⋆̶⋆⒮⋆̶⋆ ⒦⋆̶⋆...


In [5]:
# Tokenizer for the pretrained checkpoint.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Matching BERT encoder, configured to return the hidden states of every layer.
model = BertModel.from_pretrained(
    "bert-base-uncased",
    output_hidden_states=True,
)

# Switch to evaluation mode (the call's return value is displayed as the cell output).
model.eval()

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
  

In [6]:
def create_embeddings(start_index, end_index, df, max_chunk_tokens=510):
    """Compute one averaged BERT embedding per artist for a slice of ``df``.

    For every row in ``df.iloc[start_index:end_index]`` the lyrics are
    tokenized with the notebook-level ``tokenizer``, cut into chunks of at most
    ``max_chunk_tokens`` tokens, wrapped in ``[CLS]`` / ``[SEP]`` and run
    through the notebook-level ``model``. Each chunk is summarised by the mean
    of its second-to-last hidden layer over all of its tokens (special tokens
    included); the artist embedding is the unweighted mean of the chunk vectors.

    Parameters
    ----------
    start_index, end_index : int or None
        Positional (``iloc``) bounds of the rows to process; ``None`` for
        ``end_index`` means "through the last row".
    df : pandas.DataFrame
        Must provide the columns ``"artist"`` and ``"lyrics"``.
    max_chunk_tokens : int, default 510
        Maximum number of lyric tokens per chunk, before the two special
        tokens are added.

    Returns
    -------
    dict
        Maps artist name to its embedding as a plain list of floats (so the
        result is JSON serializable). Genre is not part of the result. If the
        slice contains several rows for the same artist, the last one wins.

    Raises
    ------
    ValueError
        If ``max_chunk_tokens`` is smaller than 1.
    """
    if max_chunk_tokens < 1:
        raise ValueError("max_chunk_tokens must be at least 1")

    artist_embedding_dict = {}

    for _, row in df.iloc[start_index:end_index].iterrows():
        artist = row["artist"]
        tokens = tokenizer.tokenize(row["lyrics"])

        # Lyrics that tokenize to nothing still produce one (empty) chunk,
        # i.e. a bare "[CLS] [SEP]" input, so the artist always gets a vector.
        chunks = [
            tokens[offset:offset + max_chunk_tokens]
            for offset in range(0, len(tokens), max_chunk_tokens)
        ] or [[]]

        chunk_embeddings = []
        with torch.no_grad():
            for chunk in chunks:
                wrapped = ["[CLS]"] + chunk + ["[SEP]"]
                token_ids = torch.tensor([tokenizer.convert_tokens_to_ids(wrapped)])

                # Passed by keyword on purpose: the second positional parameter
                # of BertModel.forward is attention_mask, not token_type_ids.
                # An all-ones mask attends to every token; token_type_ids is
                # left at the model's default.
                output = model(
                    input_ids=token_ids,
                    attention_mask=torch.ones_like(token_ids),
                )

                # output[2] holds the per-layer hidden states; take the
                # second-to-last layer of the single sequence in the batch and
                # reduce it right away so the remaining layers can be freed.
                token_vectors = output[2][-2][0]
                chunk_embeddings.append(torch.mean(token_vectors, dim=0))

        artist_embedding = torch.mean(torch.stack(chunk_embeddings), dim=0)

        # Convert to a python list so it is JSON serializable.
        artist_embedding_dict[artist] = artist_embedding.tolist()

    return artist_embedding_dict


In [7]:

def process_dataframe(df, num_threads):
    """Run ``create_embeddings`` over ``df`` split into ``num_threads`` chunks.

    The rows are divided into ``num_threads`` contiguous positional ranges of
    ``len(df) // num_threads`` rows each; the last range is open-ended so it
    also picks up the remainder. Each range is handed to ``create_embeddings``
    on a thread pool that is limited to ``num_threads`` workers.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame passed through unchanged to ``create_embeddings``.
    num_threads : int
        Number of chunks and the maximum number of worker threads.

    Returns
    -------
    list
        One ``create_embeddings`` result per chunk, in chunk order.

    Raises
    ------
    ValueError
        If ``num_threads`` is smaller than 1.
    """
    if num_threads < 1:
        raise ValueError("num_threads must be at least 1")

    chunk_size = len(df) // num_threads

    # (start, end) bounds per chunk; the final end is None so that rows left
    # over by the integer division are not dropped.
    bounds = [
        (i * chunk_size, (i + 1) * chunk_size if i < num_threads - 1 else None)
        for i in range(num_threads)
    ]

    # max_workers must be given explicitly, otherwise the pool size is the
    # library default and num_threads only controls the chunk count.
    with concurrent.futures.ThreadPoolExecutor(max_workers=num_threads) as executor:
        futures = [
            executor.submit(create_embeddings, start_index, end_index, df)
            for start_index, end_index in bounds
        ]
        # Collect in submission order so the result list is deterministic.
        return [future.result() for future in futures]



# Number of chunks / worker threads for the trimmed run
num_threads = 4

# Embed the trimmed dataframe; this yields one artist -> embedding dict per chunk
processed_data = process_dataframe(data, num_threads)

# Flatten the per-chunk dicts into a single mapping (later chunks win on duplicate keys)
total_dict = {
    artist: embedding
    for chunk_result in processed_data
    for artist, embedding in chunk_result.items()
}

# Persist the merged mapping as JSON
with open("500_artist_embeddings.json", "w+") as f:
    json.dump(total_dict, f)


In [11]:
#larger trial: run over the whole file instead of the trimmed sample

# Re-read the lyrics table, this time without dropping any rows.
data = pd.read_csv("final_data.csv")

# Number of rows handed to process_dataframe at a time.
batch_size = 20

# Collects the artist -> embedding entries from every batch.
large_total_dict = dict()

def batch_iterator(dataframe, batch_size):
    """Yield consecutive positional slices of ``dataframe``.

    Every slice holds ``batch_size`` rows, except the last one, which holds
    whatever rows remain. An empty frame yields nothing.
    """
    yield from (
        dataframe.iloc[first_row:first_row + batch_size]
        for first_row in range(0, len(dataframe), batch_size)
    )

# Embed the full dataframe batch by batch, merging each batch's per-chunk
# results into large_total_dict as they come back.
for count, batch_df in enumerate(batch_iterator(data, batch_size)):
    print(f"Starting Batch {count}")
    temp_results = process_dataframe(batch_df, 4)

    for r in temp_results:
        large_total_dict.update(r)

# Save the results of THIS run. Dumping total_dict here would write the
# earlier trimmed-run embeddings into the full-run file.
with open("2000_artist_embeddings.json", "w+") as f:
    json.dump(large_total_dict, f)

Starting Batch 0
Starting Batch 1
Starting Batch 2
Starting Batch 3
Starting Batch 4
Starting Batch 5
Starting Batch 6
Starting Batch 7
Starting Batch 8
Starting Batch 9
Starting Batch 10
Starting Batch 11
Starting Batch 12
Starting Batch 13
Starting Batch 14
Starting Batch 15
Starting Batch 16
Starting Batch 17
Starting Batch 18
Starting Batch 19
Starting Batch 20
Starting Batch 21
Starting Batch 22
Starting Batch 23
Starting Batch 24
