In [35]:
import pandas as pd
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
import re
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Load the dataset
data = pd.read_csv('all_songs_and_lyrics.csv')

# Extract lyrics and numerical features
lyrics = data['text']
numerical_features = data[['danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo']].values

# Clean and preprocess lyrics
def preprocess_lyrics(lyric):
    # Lowercase the lyric
    lyric = lyric.lower()

    # Remove punctuation
    lyric = re.sub(r'[^\w\s]', '', lyric)

    # Remove extra whitespaces
    lyric = re.sub(r'\s+', ' ', lyric)

    # Return the preprocessed lyric
    return lyric

preprocessed_lyrics = [preprocess_lyrics(lyric) for lyric in lyrics]

# Vectorize lyrics using a tokenizer
tokenizer = Tokenizer()
tokenizer.fit_on_texts(preprocessed_lyrics)
total_words = len(tokenizer.word_index) + 1
max_sequence_length = max(len(seq) for seq in tokenizer.texts_to_sequences(data['text']))
lyrics_sequences = tokenizer.texts_to_sequences(preprocessed_lyrics)

In [39]:
lyrics_sequences_array = pad_sequences(lyrics_sequences, maxlen=max_sequence_length)

In [40]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense

# Define the model architecture
model = Sequential()
model.add(Embedding(total_words, 128, input_length=max_sequence_length))
model.add(LSTM(128, return_sequences=True))
model.add(LSTM(64))
model.add(Dense(total_words, activation='softmax'))

# Compile the model
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [41]:
# Split data into training and validation sets
train_sequences = lyrics_sequences_array[:int(0.8 * len(lyrics_sequences))]
train_targets = numerical_features[:int(0.8 * len(lyrics_sequences))]
val_sequences = lyrics_sequences_array[int(0.8 * len(lyrics_sequences)):]
val_targets = numerical_features[int(0.8 * len(lyrics_sequences)):]

# Train the model
model.fit(train_sequences, train_targets, epochs=10, batch_size=64, validation_data=(val_sequences, val_targets))

Epoch 1/10


ValueError: in user code:

    File "/Users/domidanke/anaconda3/lib/python3.11/site-packages/keras/src/engine/training.py", line 1401, in train_function  *
        return step_function(self, iterator)
    File "/Users/domidanke/anaconda3/lib/python3.11/site-packages/keras/src/engine/training.py", line 1384, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "/Users/domidanke/anaconda3/lib/python3.11/site-packages/keras/src/engine/training.py", line 1373, in run_step  **
        outputs = model.train_step(data)
    File "/Users/domidanke/anaconda3/lib/python3.11/site-packages/keras/src/engine/training.py", line 1151, in train_step
        loss = self.compute_loss(x, y, y_pred, sample_weight)
    File "/Users/domidanke/anaconda3/lib/python3.11/site-packages/keras/src/engine/training.py", line 1209, in compute_loss
        return self.compiled_loss(
    File "/Users/domidanke/anaconda3/lib/python3.11/site-packages/keras/src/engine/compile_utils.py", line 277, in __call__
        loss_value = loss_obj(y_t, y_p, sample_weight=sw)
    File "/Users/domidanke/anaconda3/lib/python3.11/site-packages/keras/src/losses.py", line 143, in __call__
        losses = call_fn(y_true, y_pred)
    File "/Users/domidanke/anaconda3/lib/python3.11/site-packages/keras/src/losses.py", line 270, in call  **
        return ag_fn(y_true, y_pred, **self._fn_kwargs)
    File "/Users/domidanke/anaconda3/lib/python3.11/site-packages/keras/src/losses.py", line 2221, in categorical_crossentropy
        return backend.categorical_crossentropy(
    File "/Users/domidanke/anaconda3/lib/python3.11/site-packages/keras/src/backend.py", line 5573, in categorical_crossentropy
        target.shape.assert_is_compatible_with(output.shape)

    ValueError: Shapes (None, 11) and (None, 25396) are incompatible


In [None]:
# Evaluate the model on unseen test data
test_sequences = lyrics_sequences[int(0.9 * len(lyrics_sequences)):]
test_targets = numerical_features[int(0.9 * len(lyrics_sequences)):]
model.evaluate(test_sequences, test_targets)