In [1]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.16.2-py3-none-any.whl (3.5 MB)
[K     |████████████████████████████████| 3.5 MB 5.3 MB/s 
Collecting tokenizers!=0.11.3,>=0.10.1
  Downloading tokenizers-0.11.6-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.5 MB)
[K     |████████████████████████████████| 6.5 MB 36.5 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.4.0-py3-none-any.whl (67 kB)
[K     |████████████████████████████████| 67 kB 2.1 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 38.6 MB/s 
[?25hCollecting sacremoses
  Downloading sacremoses-0.0.47-py2.py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 21.5 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Foun

In [2]:
from transformers import AutoTokenizer, AutoModelForPreTraining, AutoModelForTokenClassification
import torch
import numpy as np
from os import path
import pandas as pd

In [3]:
# Mount Google Drive at /content/gdrive; the CSV paths used by later cells live under it.
from google.colab import drive
drive.mount('/content/gdrive')


Mounted at /content/gdrive


In [4]:
# Reference for the embedding code below: https://mccormickml.com/2019/05/14/BERT-word-embeddings-tutorial/

def get_sentence_embedding(text):

    tokenizer = AutoTokenizer.from_pretrained("classla/bcms-bertic")
    model = AutoModelForPreTraining.from_pretrained("classla/bcms-bertic",output_hidden_states = True, # Whether the model returns all hidden-states.
                                      )
    device = "cuda:0" if torch.cuda.is_available() else "cpu"
    model     = model.to(device)
    # Put the model in "evaluation" mode, meaning feed-forward operation.
    #model.eval()

          
    # Add the special tokens.
    marked_text = "[CLS] " + text + " [SEP]"

    # Split the sentence into tokens.
    tokenized_text = tokenizer.tokenize(marked_text)

    # Map the token strings to their vocabulary indeces.
    indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)

    # Mark each of the 22 tokens as belonging to sentence "1".
    segments_ids = [1] * len(tokenized_text)

    #check length of tokens:
    if len(indexed_tokens)>512: 
      indexed_tokens = indexed_tokens[0:511]
      segments_ids = segments_ids[0:511]

    # Convert inputs to PyTorch tensors
    tokens_tensor = torch.tensor([indexed_tokens]).to(device)
    segments_tensors = torch.tensor([segments_ids]).to(device)

    # Run the text through BERT, and collect all of the hidden states produced
    # from all 12 layers. 
    with torch.no_grad():

        outputs = model(tokens_tensor, segments_tensors)

        # Evaluating the model will return a different number of objects based on 
        # how it's  configured in the `from_pretrained` call earlier. In this case, 
        # becase we set `output_hidden_states = True`, the third item will be the 
        # hidden states from all layers. See the documentation for more details:
        # https://huggingface.co/transformers/model_doc/bert.html#bertmodel
        hidden_states = outputs[1]


    # `hidden_states` has shape [13 x 1 x 22 x 768]

    # `token_vecs` is a tensor with shape [22 x 768]
    token_vecs = hidden_states[-2][0]

    # Calculate the average of all 22 token vectors.
    sentence_embedding = torch.mean(token_vecs, dim=0)

    return sentence_embedding.to("cpu")

In [5]:
# Scraping metadata; its `artist_name` column drives the per-artist embedding loop.
df_info = pd.read_csv("/content/gdrive/My Drive/CroLyrics_data/info_for_scraping.csv")

In [6]:

# For every artist with a scraped lyrics file and no embedded file yet, embed each
# lyric and write the augmented table next to the original.
for artist_name in df_info.artist_name:
    file_path = f"/content/gdrive/My Drive/CroLyrics_data/{artist_name}_final_lyrics.csv"
    embedded_path = f"/content/gdrive/My Drive/CroLyrics_data/{artist_name}_final_lyrics_embedded.csv"

    # Nothing to do when the input is missing or the output already exists.
    if not path.exists(file_path) or path.exists(embedded_path):
        continue

    print(f"Calculating embedded lyrics for {artist_name}")

    df_lyrics = pd.read_csv(file_path)
    embedded_lyrics = [get_sentence_embedding(lyrics) for lyrics in df_lyrics.Lyrics.values]

    # NOTE(review): the column holds tensor objects, so the CSV stores their string
    # form rather than raw numbers — confirm that downstream readers expect this.
    df_lyrics["embedded_lyrics"] = embedded_lyrics
    df_lyrics.to_csv(embedded_path, index=False)
    print(f"Embedded lyrics saved for {artist_name}")



Calculating embedded lyrics for Goran Karan


Downloading:   0%|          | 0.00/83.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/467 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/225k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/422M [00:00<?, ?B/s]

Embedded lyrics saved for Goran Karan
Calculating embedded lyrics for Ivan Zak


Token indices sequence length is longer than the specified maximum sequence length for this model (640 > 512). Running this sequence through the model will result in indexing errors


Embedded lyrics saved for Ivan Zak
Calculating embedded lyrics for Divlje Jagode
Embedded lyrics saved for Divlje Jagode
Calculating embedded lyrics for Gazde
Embedded lyrics saved for Gazde
Calculating embedded lyrics for Dalmatino
Embedded lyrics saved for Dalmatino
Calculating embedded lyrics for Krunoslav Kićo Slabinac


Token indices sequence length is longer than the specified maximum sequence length for this model (592 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (709 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (516 > 512). Running this sequence through the model will result in indexing errors


Embedded lyrics saved for Krunoslav Kićo Slabinac
