In [1]:
!pip install classla

Collecting classla
  Downloading classla-1.1.0-py3-none-any.whl (262 kB)
[?25l[K     |█▎                              | 10 kB 27.0 MB/s eta 0:00:01[K     |██▌                             | 20 kB 25.0 MB/s eta 0:00:01[K     |███▊                            | 30 kB 13.1 MB/s eta 0:00:01[K     |█████                           | 40 kB 10.3 MB/s eta 0:00:01[K     |██████▎                         | 51 kB 7.7 MB/s eta 0:00:01[K     |███████▌                        | 61 kB 9.0 MB/s eta 0:00:01[K     |████████▊                       | 71 kB 9.3 MB/s eta 0:00:01[K     |██████████                      | 81 kB 7.9 MB/s eta 0:00:01[K     |███████████▏                    | 92 kB 8.8 MB/s eta 0:00:01[K     |████████████▌                   | 102 kB 8.6 MB/s eta 0:00:01[K     |█████████████▊                  | 112 kB 8.6 MB/s eta 0:00:01[K     |███████████████                 | 122 kB 8.6 MB/s eta 0:00:01[K     |████████████████▏               | 133 kB 8.6 MB/s eta 0:00:01[

In [2]:
import classla
import pandas as pd
import numpy as np
import re
import string
from os import path

In [3]:
# Mount Google Drive under /content/gdrive; the CSV files read and written
# further down in this notebook live below that mount point.
from google.colab import drive
drive.mount('/content/gdrive')


Mounted at /content/gdrive


In [4]:
def is_float(element):
    """Return True if ``element`` can be converted with ``float()``, else False.

    Used to detect lyrics lines that consist of a single number.

    Values that ``float()`` rejects with ``TypeError`` (``None``, lists, ...)
    are reported as not-a-float instead of raising.

    Note: ``float()`` also accepts strings such as "nan" and "inf", so those
    count as floats here.
    """
    try:
        float(element)
    except (TypeError, ValueError):
        return False
    return True

In [5]:
# Fetch the Croatian ('hr') models in their 'nonstandard' variant; this has to
# have been run before the pipeline for the same language/type is constructed.
classla.download('hr', type='nonstandard')

Downloading https://raw.githubusercontent.com/clarinsi/classla-resources/main/resources_1.0.1.json: 10.3kB [00:00, 4.73MB/s]                   
2022-03-02 16:36:06 INFO: Downloading these customized packages for language: hr (Croatian)...
| Processor | Package     |
---------------------------
| tokenize  | nonstandard |
| pos       | nonstandard |
| lemma     | nonstandard |
| depparse  | standard    |
| ner       | nonstandard |
| pretrain  | standard    |

Downloading https://www.clarin.si/repository/xmlui/bitstream/handle/11356/1331/hr_nstd: 100%|██████████| 92.6M/92.6M [00:44<00:00, 2.10MB/s]
Downloading https://www.clarin.si/repository/xmlui/bitstream/handle/11356/1352/hr_all_hrLex_lemmatizer.pt: 100%|██████████| 94.3M/94.3M [00:42<00:00, 2.20MB/s]
Downloading https://www.clarin.si/repository/xmlui/bitstream/handle/11356/1259/hr500k_ud: 100%|██████████| 103M/103M [00:52<00:00, 1.95MB/s]
Downloading https://www.clarin.si/repository/xmlui/bitstream/handle/11356/1340/hr_nstd: 100%|█

In [6]:
# Croatian nonstandard-language pipeline; get_nlp_df calls it on each song's lyrics.
nlp = classla.Pipeline('hr', type='nonstandard') # run classla.download('hr', type='nonstandard') beforehand if necessary

2022-03-02 16:39:52 INFO: Loading these models for language: hr (Croatian):
| Processor | Package     |
---------------------------
| tokenize  | nonstandard |
| pos       | nonstandard |
| lemma     | nonstandard |
| depparse  | standard    |
| ner       | nonstandard |

2022-03-02 16:39:52 INFO: Use device: gpu
2022-03-02 16:39:52 INFO: Loading: tokenize
2022-03-02 16:39:52 INFO: Loading: pos
2022-03-02 16:40:03 INFO: Loading: lemma
2022-03-02 16:40:09 INFO: Loading: depparse
2022-03-02 16:40:10 INFO: Loading: ner
2022-03-02 16:40:11 INFO: Done loading processors!


In [7]:
# Label inventories kept as plain lists of strings.

# Part-of-speech tags.
pos_tags = [
    "ADJ", "ADP", "ADV", "AUX", "CCONJ", "DET", "INTJ", "NOUN", "NUM",
    "PART", "PRON", "PROPN", "PUNCT", "SCONJ", "SYM", "VERB", "X",
]

# Dependency relation labels.
relations = [
    "acl", "advcl", "advmod", "advmod:emph", "amod", "appos", "aux", "case",
    "cc", "ccomp", "compound", "conj", "cop", "csubj", "dep", "det",
    "det:numgov", "discourse", "dislocated", "expl", "fixed", "flat",
    "flat:foreign", "iobj", "list", "mark", "nmod", "nsubj", "nummod", "obj",
    "obl", "orphan", "parataxis", "punct", "root", "vocative", "xcomp",
]

# Morphological feature names.
word_features = [
    "Animacy", "Case", "Definite", "Degree", "Foreign", "Gender",
    "Gender[psor]", "Mood", "Number", "Number[psor]", "NumType", "Person",
    "Polarity", "Poss", "PronType", "Reflex", "Tense", "VerbForm", "Voice",
]

In [8]:
# Regular expressions that strip chorus markers ("refren", "refrain", "pripjev",
# "ref") from lyrics lines. All of them are case-insensitive.
# Raw strings are used so that the regex escapes (\), \], \.) are not parsed as
# (invalid) Python string escapes.

# Full marker word directly followed by one of  : ) ] .
refren = re.compile(r'refren[:\)\]\.]', re.IGNORECASE)
refrain = re.compile(r'refrain[:\)\]\.]', re.IGNORECASE)
pripjev = re.compile(r'pripjev[:\)\]\.]', re.IGNORECASE)

ref = re.compile(r'ref[\.,:\ )\]]', re.IGNORECASE)  # "ref" followed by any one of the characters in the brackets
ref_last = re.compile(r'ref$', re.IGNORECASE)  # "ref" at the very end of the string


In [9]:
# Character class matching any single ASCII punctuation character. The
# punctuation is escaped so that the backslash it contains is matched literally
# instead of escaping the "]" that follows it.
_PUNCT_CLASS = f"[{re.escape(string.punctuation)}]"


def _clean_line(line):
    """Remove chorus markers, repeat counts and empty brackets from one lyrics line."""
    # Order matters: the long markers must go before the short "ref" patterns.
    for marker in (refren, refrain, ref, ref_last, pripjev):
        line = marker.sub("", line)
    # Capitalised marker words that were not followed by one of the marker characters.
    for word in ("Refren", "Refrain", "Pripjev"):
        line = line.replace(word, "")

    line = re.sub(r"\dx", "", line)  # "<digit>x" (usually left over after a marker)
    line = re.sub(r"x\d", "", line)  # "x<digit>"

    # Brackets whose content has been removed above.
    line = line.replace("()", "").replace("[]", "")
    return line.strip()


def _tidy_punctuation(text):
    """Normalise the punctuation of a whole song that was joined into one string."""
    text = re.sub(f" {_PUNCT_CLASS} ", " ", text)  # lone punctuation between two spaces
    text = re.sub(f" {_PUNCT_CLASS}", " ", text)   # punctuation preceded by a space

    text = re.sub(",,", ",", text)                      # two commas -> one
    text = re.sub(r"\.,", ".", text)                    # dot comma -> dot
    text = re.sub(r",\.", ",", text)                    # comma dot -> comma
    text = re.sub(r"(?<!\.)\.\.(?!\.)", ".", text)      # exactly two dots -> one dot
    text = re.sub(r"\.{4,}", "...", text)               # four or more dots -> ellipsis
    return text


def correct_lyrics(lyrics_df):
    """Clean raw lyrics and return one single-line string per song.

    For every song the text is split into lines; chorus markers, repeat counts
    ("2x", "x2") and empty brackets are removed; lines that are empty, are a
    single number or contain "http" are dropped. The remaining lines are joined
    with ". " before a line starting with an uppercase letter and ", "
    otherwise, and the punctuation of the joined text is normalised.

    Parameters
    ----------
    lyrics_df : iterable of str
        Raw lyrics, one entry per song. Entries that are not strings (e.g. NaN
        for a song without lyrics) yield an empty string so that the result
        stays aligned with the input.

    Returns
    -------
    list of str
        Cleaned lyrics, same length and order as ``lyrics_df``.
    """
    corrected_lyrics = []
    for song_lyrics in lyrics_df:
        if not isinstance(song_lyrics, str):
            # Missing lyrics: keep a placeholder instead of failing on .splitlines().
            corrected_lyrics.append("")
            continue

        cleaned = (_clean_line(raw_line) for raw_line in song_lyrics.splitlines())
        kept = [
            line for line in cleaned
            if line and not is_float(line) and "http" not in line
        ]

        # Every line, including the first, gets a separator prefix; the prefix
        # of the first line takes part in the punctuation clean-up and is cut
        # off afterwards.
        joined = "".join(
            (". " if line[0].isupper() else ", ") + line for line in kept
        ).strip()
        joined = _tidy_punctuation(joined)

        # Drop the two-character separator left in front of the first line.
        corrected_lyrics.append(joined[2:].strip())

    return corrected_lyrics



In [10]:
def get_dataframes(df):
    """Resolve one lyrics text and one song title per row of ``df``.

    Sources are consulted in the order lyricstranslate, cuspajz, tekstovinet:
    a value missing from an earlier source is filled in from the next one.

    Returns a ``(lyrics, titles)`` tuple.
    """
    lyrics_df = df.Lyrics_lyricstranslate
    song_df = df.Song_lyricstranslate

    fallbacks = (
        (df.Lyrics_cuspajz, df.Song_cuspajz),
        (df.Lyrics_tekstovinet, df.Song_tekstovinet),
    )
    for fallback_lyrics, fallback_titles in fallbacks:
        lyrics_df = lyrics_df.fillna(fallback_lyrics)
        song_df = song_df.fillna(fallback_titles)

    return lyrics_df, song_df

In [11]:
def get_nlp_df(corrected_lyrics, song_df, df):
    """Run the ``nlp`` pipeline on every song and return one row per token.

    Parameters
    ----------
    corrected_lyrics : iterable of str
        Cleaned lyrics, one string per song.
    song_df : iterable
        Song titles, aligned with ``corrected_lyrics``.
    df : pandas.DataFrame
        Frame whose ``Song_ID`` column is aligned with ``corrected_lyrics``.

    Returns
    -------
    pandas.DataFrame
        The token dictionaries produced by the pipeline, one row per token,
        plus the columns ``Song`` and ``Song_ID``; the index is a fresh
        0..n-1 range. Songs with empty lyrics, or for which the pipeline
        yields no tokens, contribute no rows. An empty frame is returned when
        there are no rows at all.
    """
    song_frames = []

    for cl, song, song_id in zip(corrected_lyrics, song_df, df["Song_ID"].values):
        # Nothing to analyse; also avoids indexing into an empty document.
        if not isinstance(cl, str) or not cl.strip():
            continue

        doc_dict = nlp(cl).to_dict()
        # Each entry of doc_dict holds the sentence's token dicts at position 0.
        tokens = [token for sentence in doc_dict for token in sentence[0]]
        if not tokens:
            continue

        df_song = pd.DataFrame(tokens)
        df_song["Song"] = song
        df_song["Song_ID"] = song_id
        song_frames.append(df_song)

    if not song_frames:
        return pd.DataFrame()

    # Concatenate once instead of growing the result inside the loop.
    return pd.concat(song_frames, ignore_index=True)

In [12]:
# Table that lists the artists to process; its artist_name column drives the loop below.
df_info = pd.read_csv("/content/gdrive/My Drive/CroLyrics_data/info_for_scraping.csv")

In [13]:
# Folder on the mounted Drive that holds the per-artist CSV files.
DATA_DIR = "/content/gdrive/MyDrive/CroLyrics_data"

# For every artist: clean the lyrics, run the NLP pipeline, and write
# <artist>_final_lyrics.csv and <artist>_nlp.csv next to <artist>_final.csv.
for artist_name in df_info.artist_name:
    final_path = f"{DATA_DIR}/{artist_name}_final.csv"
    lyrics_path = f"{DATA_DIR}/{artist_name}_final_lyrics.csv"
    nlp_path = f"{DATA_DIR}/{artist_name}_nlp.csv"

    # Only a missing input file is reported as "not found". The original
    # `except: FileNotFoundError: ...` was a bare except that swallowed every
    # error (including KeyboardInterrupt) under that message.
    try:
        df = pd.read_csv(final_path)
    except FileNotFoundError:
        print(f"Couldn't find 'final' file for artist {artist_name}")
        continue

    # Both outputs already exist -> nothing to do for this artist.
    if path.exists(lyrics_path) and path.exists(nlp_path):
        continue

    try:
        lyrics_df, song_df = get_dataframes(df)
        corrected_lyrics = correct_lyrics(lyrics_df)
        final_lyrics_df = pd.DataFrame(
            [df.Song_ID.values, song_df.values, corrected_lyrics]
        ).transpose()
        final_lyrics_df.columns = ["Song_ID", "Song", "Lyrics"]
        final_lyrics_df.to_csv(lyrics_path)
        print(f"Successfully saved final lyrics for artist {artist_name}")

        df_nlp = get_nlp_df(corrected_lyrics, song_df, df)

        if "feats" in df_nlp.columns:
            # Expand "Key=Value|Key=Value" strings into one column per key.
            # Rows without features (NaN) become empty records, i.e. all-NaN rows.
            feature_rows = [
                dict(feat.split("=", 1) for feat in feats.split("|") if "=" in feat)
                if isinstance(feats, str) else {}
                for feats in df_nlp["feats"]
            ]
            features_df = pd.DataFrame(feature_rows, index=df_nlp.index)
            df_nlp = df_nlp.join(features_df)

        df_nlp.to_csv(nlp_path, index=False)
        print(f"Successfully saved nlp analysis data for artist {artist_name}")
    except Exception as exc:
        # Keep going with the next artist, but report what actually went wrong.
        print(f"Failed to process artist {artist_name}: {exc!r}")



Successfully saved final lyrics for artist Goran Karan
Successfully saved nlp analysis data for artist Goran Karan
Successfully saved final lyrics for artist Ivan Zak
Successfully saved nlp analysis data for artist Ivan Zak
Successfully saved final lyrics for artist Divlje Jagode
Successfully saved nlp analysis data for artist Divlje Jagode
Successfully saved final lyrics for artist Gazde
Successfully saved nlp analysis data for artist Gazde
Successfully saved final lyrics for artist Dalmatino
Successfully saved nlp analysis data for artist Dalmatino
Successfully saved final lyrics for artist Krunoslav Kićo Slabinac
Successfully saved nlp analysis data for artist Krunoslav Kićo Slabinac
