In [1]:
from pathlib import Path

import pandas as pd
import spacy
from sklearn.model_selection import train_test_split
from spacy.tokens import DocBin

# Single seed reused by the shuffle and by both train/dev/test splits below.
RNG_SEED = 69

In [2]:
# !python -m spacy download en_core_web_sm

In [3]:
# !python -m pip install spacy-transformers

In [4]:
# Load the song dataset and print a summary of its columns, non-null counts
# and dtypes.
df = pd.read_csv("withoutDuplicates.csv", encoding="utf-8")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16597 entries, 0 to 16596
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   title       16597 non-null  object
 1   genre       16597 non-null  object
 2   urlSpotify  16597 non-null  object
 3   spotifyID   16597 non-null  object
 4   lyrics      16597 non-null  object
dtypes: object(5)
memory usage: 648.4+ KB


In [5]:
# Preview the first rows of the raw data.
df.head()

Unnamed: 0,title,genre,urlSpotify,spotifyID,lyrics
0,It's Complicated,"[""Pop punk""]",https://play.spotify.com/track/069deq5woglvUnW...,069deq5woglvUnWok4MqTe,"Here we go again, another night of being bumpe..."
1,All Signs Point To Lauderdale,"[""Pop punk""]",https://play.spotify.com/track/15qf0H31MChhqKr...,15qf0H31MChhqKr2zvejla,"I hate this town, it's so washed up\nAnd all m..."
2,Right Back At It Again,"[""Pop punk"",""Post-hardcore""]",https://play.spotify.com/track/0OQmxZOINqWsbKy...,0OQmxZOINqWsbKysfiPSTO,We're coming out swinging\n♪\nStarted out in t...
3,Almost Lover,"[""Adult album alternative""]",https://play.spotify.com/track/71ehTADpxs85ULr...,71ehTADpxs85ULrZgSEKCy,Your fingertips across my skin\nThe palm trees...
4,Blow Away,"[""Adult album alternative"",""Pop music""]",https://play.spotify.com/track/2mzgLMAfvGd7pN6...,2mzgLMAfvGd7pN6JRlNfo3,One of us is gonna be here and\nOne of us is g...


In [6]:
# Lower-case the genre strings so the keyword matching that follows is
# case-insensitive. The vectorised ``.str`` accessor replaces the per-row
# lambda and passes missing values through instead of raising.
df["genre"] = df["genre"].str.lower()
df.head()

Unnamed: 0,title,genre,urlSpotify,spotifyID,lyrics
0,It's Complicated,"[""pop punk""]",https://play.spotify.com/track/069deq5woglvUnW...,069deq5woglvUnWok4MqTe,"Here we go again, another night of being bumpe..."
1,All Signs Point To Lauderdale,"[""pop punk""]",https://play.spotify.com/track/15qf0H31MChhqKr...,15qf0H31MChhqKr2zvejla,"I hate this town, it's so washed up\nAnd all m..."
2,Right Back At It Again,"[""pop punk"",""post-hardcore""]",https://play.spotify.com/track/0OQmxZOINqWsbKy...,0OQmxZOINqWsbKysfiPSTO,We're coming out swinging\n♪\nStarted out in t...
3,Almost Lover,"[""adult album alternative""]",https://play.spotify.com/track/71ehTADpxs85ULr...,71ehTADpxs85ULrZgSEKCy,Your fingertips across my skin\nThe palm trees...
4,Blow Away,"[""adult album alternative"",""pop music""]",https://play.spotify.com/track/2mzgLMAfvGd7pN6...,2mzgLMAfvGd7pN6JRlNfo3,One of us is gonna be here and\nOne of us is g...


In [7]:
# Coarse genre buckets: keyword searched for inside the raw genre string,
# mapped to the label assigned when the keyword is found.
genre_mappings = {
    "pop": "pop",
    "rock": "rock",
    "country": "country",
    "folk": "folk",
    "jazz": "jazz",
    "hip hop": "hip hop",
    "rap": "rap",
    "soul": "soul",
    "blues": "blues",
    "metal": "metal",
}


def _bucket_genre(raw_genre):
    """Fold ``raw_genre`` through every keyword of ``genre_mappings`` in order.

    Whenever the current value contains a keyword it is replaced by that
    keyword's label; a value containing no keyword is returned unchanged.
    """
    bucket = raw_genre
    for keyword, label in genre_mappings.items():
        if keyword in bucket:
            bucket = label
    return bucket


df["genre"] = df["genre"].apply(_bucket_genre)

df.head()

Unnamed: 0,title,genre,urlSpotify,spotifyID,lyrics
0,It's Complicated,pop,https://play.spotify.com/track/069deq5woglvUnW...,069deq5woglvUnWok4MqTe,"Here we go again, another night of being bumpe..."
1,All Signs Point To Lauderdale,pop,https://play.spotify.com/track/15qf0H31MChhqKr...,15qf0H31MChhqKr2zvejla,"I hate this town, it's so washed up\nAnd all m..."
2,Right Back At It Again,pop,https://play.spotify.com/track/0OQmxZOINqWsbKy...,0OQmxZOINqWsbKysfiPSTO,We're coming out swinging\n♪\nStarted out in t...
3,Almost Lover,"[""adult album alternative""]",https://play.spotify.com/track/71ehTADpxs85ULr...,71ehTADpxs85ULrZgSEKCy,Your fingertips across my skin\nThe palm trees...
4,Blow Away,pop,https://play.spotify.com/track/2mzgLMAfvGd7pN6...,2mzgLMAfvGd7pN6JRlNfo3,One of us is gonna be here and\nOne of us is g...


In [8]:
# Keep only the songs whose genre was collapsed into one of the coarse labels.
is_known_genre = df["genre"].isin(genre_mappings.values())
df_simplified = df[is_known_genre]
df_simplified.head()

Unnamed: 0,title,genre,urlSpotify,spotifyID,lyrics
0,It's Complicated,pop,https://play.spotify.com/track/069deq5woglvUnW...,069deq5woglvUnWok4MqTe,"Here we go again, another night of being bumpe..."
1,All Signs Point To Lauderdale,pop,https://play.spotify.com/track/15qf0H31MChhqKr...,15qf0H31MChhqKr2zvejla,"I hate this town, it's so washed up\nAnd all m..."
2,Right Back At It Again,pop,https://play.spotify.com/track/0OQmxZOINqWsbKy...,0OQmxZOINqWsbKysfiPSTO,We're coming out swinging\n♪\nStarted out in t...
4,Blow Away,pop,https://play.spotify.com/track/2mzgLMAfvGd7pN6...,2mzgLMAfvGd7pN6JRlNfo3,One of us is gonna be here and\nOne of us is g...
6,Space Age Love Song,pop,https://play.spotify.com/track/4FMsajq8hKiOmcJ...,4FMsajq8hKiOmcJ7UzOhag,I saw your eyes\nAnd you made me smile\nFor a ...


In [9]:
# Shuffle all rows (frac=1) with the shared seed.
df_shuffled = df_simplified.sample(frac=1, random_state=RNG_SEED)
# Debug alternative: use a 100-song subsample instead of the full data.
# df_shuffled = df_simplified.sample(n=100, random_state=RNG_SEED)

In [10]:
# 60% train, 20% dev, 20% test
# Hold out 20% for testing first, then take a quarter of the remaining 80%
# (i.e. 20% of the total) as the dev set.
df_train_dev, df_test = train_test_split(
    df_shuffled,
    test_size=0.2,
    random_state=RNG_SEED,
)
df_train, df_dev = train_test_split(
    df_train_dev,
    test_size=0.25,
    random_state=RNG_SEED,
)

In [11]:
# Report how many songs ended up in each split.
for split_name, split_df in (
    ("df_train", df_train),
    ("df_dev", df_dev),
    ("df_test", df_test),
):
    print(f"len({split_name}) = {len(split_df)}")

len(df_train) = 9015
len(df_dev) = 3006
len(df_test) = 3006


In [12]:
# Preview the training split.
df_train.head()

Unnamed: 0,title,genre,urlSpotify,spotifyID,lyrics
2092,Ma Baker,pop,https://play.spotify.com/track/4SMVEIfMusETeLh...,4SMVEIfMusETeLhl7ptFUh,"Freeze, I'm Ma Baker, put your hands in the ai..."
11335,Ridin' The Storm Out,rock,https://play.spotify.com/track/0SkmxNE5ydxiEN0...,0SkmxNE5ydxiEN0hzJbMac,"Ridin' the storm out, waitin' for the thaw out..."
15676,As Good As I Once Was,country,https://play.spotify.com/track/5KjhrsPiaDtefxG...,5KjhrsPiaDtefxGFfTUmaO,She said I seen you in here before\nI said I b...
2300,Little Drummer Boy,pop,https://play.spotify.com/track/2YTWQAF0A7rzCSp...,2YTWQAF0A7rzCSpfeiHZa5,"Come they told me, pa rum pum-pum-pum\nA new b..."
816,"Do Right Woman, Do Right Man",pop,https://play.spotify.com/track/6uAEJQpAPoYmM6M...,6uAEJQpAPoYmM6Mb0tJN2I,Take me to heart and I'll always love you\nAnd...


In [13]:
nlp = spacy.load("en_core_web_sm")


def save_docs(song_data: pd.DataFrame, output_file: str) -> None:
    """Serialise labelled lyrics to a spaCy ``DocBin`` file.

    Each row of ``song_data`` becomes one tokenised ``Doc`` whose ``cats``
    hold a one-hot encoding of the row's genre over every label in
    ``genre_mappings``.

    Args:
        song_data: Frame with a ``lyrics`` column and a ``genre`` column.
        output_file: Destination path of the ``.spacy`` file. Missing parent
            directories are created.
    """
    labels = list(genre_mappings.values())
    db = DocBin()

    # Iterating the two columns directly avoids building a Series per row,
    # which is what iterrows() does.
    for lyrics, genre in zip(song_data["lyrics"], song_data["genre"]):
        doc = nlp.make_doc(lyrics)
        doc.cats = {cat: int(cat == genre) for cat in labels}
        db.add(doc)

    # DocBin.to_disk opens the path as-is, so make sure the target directory
    # exists; otherwise a fresh checkout fails on the first save.
    destination = Path(output_file)
    destination.parent.mkdir(parents=True, exist_ok=True)
    db.to_disk(destination)


save_docs(df_train, "spacy/train.spacy")
save_docs(df_dev, "spacy/dev.spacy")
save_docs(df_test, "spacy/test.spacy")

In [14]:
!python -m spacy init config --pipeline textcat spacy/config-no-transformer.cfg

[38;5;4mℹ Generated config template specific for your use case[0m
- Language: en
- Pipeline: textcat
- Optimize for: efficiency
- Hardware: CPU
- Transformer: None
[38;5;2m✔ Auto-filled config with all values[0m
[38;5;2m✔ Saved config[0m
spacy/config-no-transformer.cfg
You can now add your data and train your pipeline:
python -m spacy train config-no-transformer.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


In [15]:
# Train the textcat pipeline on the train split, scoring against the dev
# split, and write the output to textcat_model/.
!python -m spacy train spacy/config-no-transformer.cfg --paths.train spacy/train.spacy  --paths.dev spacy/dev.spacy --output textcat_model

[38;5;4mℹ Saving to output directory: textcat_model[0m
[38;5;4mℹ Using CPU[0m
[1m
[2023-03-28 14:53:48,469] [INFO] Set up nlp object from config
[2023-03-28 14:53:48,489] [INFO] Pipeline: ['textcat']
[2023-03-28 14:53:48,494] [INFO] Created vocabulary
[2023-03-28 14:53:48,501] [INFO] Finished initializing nlp object
[2023-03-28 14:55:36,726] [INFO] Initialized pipeline components: ['textcat']
[38;5;2m✔ Initialized pipeline[0m
[1m
[38;5;4mℹ Pipeline: ['textcat'][0m
[38;5;4mℹ Initial learn rate: 0.001[0m
E    #       LOSS TEXTCAT  CATS_SCORE  SCORE 
---  ------  ------------  ----------  ------
  0       0          0.09        0.69    0.01
  0     200         17.03       12.02    0.12
  0     400         17.44       11.87    0.12
  0     600         18.15       14.24    0.14
  0     800         17.89       12.09    0.12
  0    1000         19.85       13.24    0.13
  0    1200         17.82       14.03    0.14
  0    1400         18.09       15.15    0.15

In [16]:
# Score the best model on the held-out test split and save the metrics to
# metrics.json.
!python -m spacy evaluate textcat_model/model-best/ --output metrics.json spacy/test.spacy

[38;5;4mℹ Using CPU[0m
[1m

TOK                 100.00
TEXTCAT (macro F)   23.63 
SPEED               302554

[1m

              P       R       F
pop       46.48   67.59   55.08
rock      51.54   40.79   45.54
country   39.71   40.89   40.29
folk      50.00   10.53   17.39
jazz      13.64   18.00   15.52
hip hop   40.00    2.13    4.04
rap        0.00    0.00    0.00
soul      21.43    8.49   12.16
blues     40.96   24.64   30.77
metal     19.18   12.96   15.47

[1m

          ROC AUC
pop          0.67
rock         0.65
country      0.83
folk         0.64
jazz         0.69
hip hop      0.17
rap          0.21
soul         0.56
blues        0.71
metal        0.66

[38;5;2m✔ Saved results to metrics.json[0m
