In [10]:
import pandas as pd
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
import re
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense, Embedding, concatenate

# Load the dataset
data = pd.read_csv('all_songs_and_lyrics.csv')

# Extract lyrics and numerical features
lyrics = data['text']
numerical_features = data[['danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo']]

# Clean and preprocess lyrics
def preprocess_lyrics(lyric):
    # Lowercase the lyric
    lyric = lyric.lower()

    # Remove punctuation
    lyric = re.sub(r'[^\w\s]', '', lyric)

    # Remove extra whitespaces
    lyric = re.sub(r'\s+', ' ', lyric)

    # Return the preprocessed lyric
    return lyric

preprocessed_lyrics = [preprocess_lyrics(lyric) for lyric in lyrics]

In [11]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(preprocessed_lyrics)
sequences = tokenizer.texts_to_sequences(preprocessed_lyrics)
total_words = len(tokenizer.word_index) + 1
max_sequence_length = max(len(seq) for seq in tokenizer.texts_to_sequences(data['text']))

padded_sequences = pad_sequences(sequences, maxlen=max_sequence_length, padding='post', truncating='post')

# Model Architecture
text_input = Input(shape=(max_sequence_length,), name='text_input')
non_text_input = Input(shape=(11,), name='non_text_input')
embedding_layer = Embedding(input_dim=total_words, output_dim=150)(text_input)
lstm_layer = LSTM(units=256)(embedding_layer)
combined = concatenate([lstm_layer, non_text_input])
output = Dense(total_words, activation='softmax')(combined)
model = Model(inputs=[text_input, non_text_input], outputs=output)
model.compile(optimizer='adam', loss='categorical_crossentropy')



In [16]:
# Assuming you have already loaded and preprocessed your data
X_text_train, X_text_val, X_non_text_train, X_non_text_val, y_train, y_val = train_test_split(
    padded_sequences,  # Padded sequences for lyrics
    numerical_features,
    lyrics,
    test_size=0.2,
    random_state=42
)

# Scaling non-text features
scaler = StandardScaler()
X_non_text_train_scaled = scaler.fit_transform(X_non_text_train)
X_non_text_val_scaled = scaler.transform(X_non_text_val)

In [17]:
# Training
model.fit(
    {'text_input': X_text_train, 'non_text_input': X_non_text_train_scaled},
    {'output': y_train},
    epochs=5,
    batch_size=32,
    validation_data=({'text_input': X_text_val, 'non_text_input': X_non_text_val_scaled}, {'output': y_val})
)

Epoch 1/5


ValueError: in user code:

    File "/Users/domidanke/anaconda3/lib/python3.11/site-packages/keras/src/engine/training.py", line 1401, in train_function  *
        return step_function(self, iterator)
    File "/Users/domidanke/anaconda3/lib/python3.11/site-packages/keras/src/engine/training.py", line 1384, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "/Users/domidanke/anaconda3/lib/python3.11/site-packages/keras/src/engine/training.py", line 1373, in run_step  **
        outputs = model.train_step(data)
    File "/Users/domidanke/anaconda3/lib/python3.11/site-packages/keras/src/engine/training.py", line 1151, in train_step
        loss = self.compute_loss(x, y, y_pred, sample_weight)
    File "/Users/domidanke/anaconda3/lib/python3.11/site-packages/keras/src/engine/training.py", line 1209, in compute_loss
        return self.compiled_loss(
    File "/Users/domidanke/anaconda3/lib/python3.11/site-packages/keras/src/engine/compile_utils.py", line 248, in __call__
        y_true = self._conform_to_outputs(y_pred, y_true)
    File "/Users/domidanke/anaconda3/lib/python3.11/site-packages/keras/src/engine/compile_utils.py", line 63, in _conform_to_outputs
        struct = map_to_output_names(outputs, self._output_names, struct)
    File "/Users/domidanke/anaconda3/lib/python3.11/site-packages/keras/src/engine/compile_utils.py", line 819, in map_to_output_names
        raise ValueError(

    ValueError: Found unexpected losses or metrics that do not correspond to any Model output: dict_keys(['output']). Valid mode output names: ['dense_3']. Received struct is: {'output': <tf.Tensor 'IteratorGetNext:2' shape=(None, 1) dtype=string>}.
