The core of this project is based around a simple task -- performing genre analysis on the
“Multi-Lingual Lyrics for Genre Classification” dataset on Kaggle.
https://www.kaggle.com/datasets/mateibejan/multilingual-lyrics-for-genre-classification
This is an extensive dataset that is split into training and testing subsets. The testing dataset
should be used for final testing only. The training dataset should be used for all training and
validation tasks, as appropriate.
The first part of the task is to perform a number of analyses, training models from scratch to
predict genre -- initially based on song lyrics alone, and then on song lyrics and artist.

In [2]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

# Load the training and test datasets
train_df = pd.read_csv('train.csv')  # Replace with the path to your train.csv
test_df = pd.read_csv('test.csv')  # Replace with the path to your test.csv

# Show the first few rows of the train dataset to understand the structure
train_df.head()

# Convert all entries in the DataFrames to strings
# (note: missing values become the literal string 'nan')
train_df = train_df.astype(str)
test_df = test_df.astype(str)

# Lowercase all column names in the testing dataset
test_df.columns = [col.lower() for col in test_df.columns]

# Lowercase all column names in the training dataset
train_df.columns = [col.lower() for col in train_df.columns]

# Fix the genre -> label-column order once, from the full training data, so that the
# train, validation and test labels all share the same columns in the same order.
# Sorted order is what pd.get_dummies produces for string labels.
genre_classes = sorted(train_df['genre'].unique())

# Split the training dataset into train and validation sets (80% train, 20% validation)
train_df, val_df = train_test_split(train_df, test_size=0.2, random_state=42)

# Preprocess text data (convert lyrics to lowercase)
train_texts = train_df['lyrics'].str.lower().values
val_texts = val_df['lyrics'].str.lower().values
test_texts = test_df['lyrics'].str.lower().values

# Tokenize the lyrics (the vocabulary is learned from the training split only)
max_words = 10000  # maximum number of words to consider
max_sequence_length = 100  # max length of each sequence (lyrics)
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(train_texts)

train_sequences = tokenizer.texts_to_sequences(train_texts)
val_sequences = tokenizer.texts_to_sequences(val_texts)
test_sequences = tokenizer.texts_to_sequences(test_texts)

# Pad sequences to ensure uniform input size
X_train = pad_sequences(train_sequences, maxlen=max_sequence_length)
X_val = pad_sequences(val_sequences, maxlen=max_sequence_length)
X_test = pad_sequences(test_sequences, maxlen=max_sequence_length)


def _one_hot_genres(genres):
    """One-hot encode `genres` against the shared `genre_classes` column order.

    Calling pd.get_dummies independently on each split derives the columns from
    whatever genres happen to occur in that split, so a genre missing from one
    split silently shifts or drops columns. Encoding through a Categorical with
    fixed categories always yields len(genre_classes) columns in the same order.
    A genre that is not in `genre_classes` is encoded as an all-zero row.
    """
    return pd.get_dummies(pd.Categorical(genres, categories=genre_classes)).values


# Prepare genre labels (one-hot, identical column layout for every split)
y_train = _one_hot_genres(train_df['genre'])
y_val = _one_hot_genres(val_df['genre'])
y_test = _one_hot_genres(test_df['genre'])

max_artist_length = 1  # Since we are using one token for the artist

# Tokenize the artist names
# NOTE(review): the Keras Tokenizer splits on whitespace, so a multi-word artist name
# becomes several tokens, and pad_sequences(maxlen=1) with its default 'pre' truncation
# keeps only the last of them. Artists sharing a final word therefore share an id, and
# artists unseen in training map to 0. Left unchanged so the encoding stays compatible
# with models already trained on it -- confirm whether a per-artist id is wanted instead.
artist_tokenizer = Tokenizer()
artist_tokenizer.fit_on_texts(train_df['artist'].values)

train_artist_sequences = artist_tokenizer.texts_to_sequences(train_df['artist'].values)
val_artist_sequences = artist_tokenizer.texts_to_sequences(val_df['artist'].values)
test_artist_sequences = artist_tokenizer.texts_to_sequences(test_df['artist'].values)

# Pad/truncate the artist sequences to a single token each
X_train_artist = pad_sequences(train_artist_sequences, maxlen=max_artist_length)
X_val_artist = pad_sequences(val_artist_sequences, maxlen=max_artist_length)
X_test_artist = pad_sequences(test_artist_sequences, maxlen=max_artist_length)

artist_vocab_size = len(artist_tokenizer.word_index) + 1  # +1 because index 0 is reserved for padding


In [56]:
import pickle

# One-hot encode the training genres as a DataFrame so the column names are retained
y_train_encoded = pd.get_dummies(train_df['genre'])

# Persist the encoded labels and, separately, the column index (genre classes)
# so the same class order can be reused at test time
for pkl_name, payload in (
    ('y_train_encoded.pkl', y_train_encoded),
    ('genre_columns.pkl', y_train_encoded.columns),
):
    with open(pkl_name, 'wb') as f:
        pickle.dump(payload, f)

In [None]:
# Sanity-check the dimensions of the test inputs and labels
for array_name, array in (
    ("X_test", X_test),
    ("y_test", y_test),
    ("X_test_artist", X_test_artist),
):
    print(f"{array_name} shape: {array.shape}")

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense, Dropout, LSTM

def build_basic_rnn():
    """Build and compile a single-layer SimpleRNN genre classifier.

    Reads the module-level `max_words` and `max_sequence_length` for the
    embedding layer and `y_train` for the number of output classes.

    Returns:
        The compiled Sequential model.
    """
    n_classes = y_train.shape[1]
    model = Sequential([
        Embedding(input_dim=max_words, output_dim=128, input_length=max_sequence_length),
        SimpleRNN(128, return_sequences=False),  # only the final state feeds the classifier
        Dropout(0.5),  # regularisation against overfitting
        Dense(n_classes, activation='softmax'),  # one probability per class
    ])
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return model


basic_rnn_model = build_basic_rnn()
basic_rnn_model.summary()


In [None]:
# Print the basic RNN architecture again (repeats the summary call made where the model is built)
basic_rnn_model.summary()


In [None]:
def build_lstm_model():
    """Build and compile a single-layer LSTM genre classifier.

    Reads the module-level `max_words` and `max_sequence_length` for the
    embedding layer and `y_train` for the number of output classes.

    Returns:
        The compiled Sequential model.
    """
    n_classes = y_train.shape[1]
    model = Sequential([
        Embedding(input_dim=max_words, output_dim=128, input_length=max_sequence_length),
        LSTM(128, return_sequences=False),  # only the final state feeds the classifier
        Dropout(0.5),
        Dense(n_classes, activation='softmax'),  # one probability per class
    ])
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return model


lstm_model = build_lstm_model()
lstm_model.summary()


In [None]:
# Print the LSTM architecture again (repeats the summary call made where the model is built)
lstm_model.summary()


In [None]:
def build_multilayer_lstm_model():
    """Build and compile a two-layer (stacked) LSTM genre classifier.

    Reads the module-level `max_words` and `max_sequence_length` for the
    embedding layer and `y_train` for the number of output classes.

    Returns:
        The compiled Sequential model.
    """
    n_classes = y_train.shape[1]
    model = Sequential([
        Embedding(input_dim=max_words, output_dim=128, input_length=max_sequence_length),
        LSTM(128, return_sequences=True),   # emits the full sequence for the next LSTM
        LSTM(128, return_sequences=False),  # only the final state feeds the classifier
        Dropout(0.5),
        Dense(n_classes, activation='softmax'),  # one probability per class
    ])
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return model


multi_layer_lstm_model = build_multilayer_lstm_model()
multi_layer_lstm_model.summary()


In [None]:
# Print the stacked LSTM architecture again (repeats the summary call made where the model is built)
multi_layer_lstm_model.summary()


In [None]:
import numpy as np

# Pre-trained embedding text file; 100-dimensional, matching embedding_dim below
path = 'glove.6B.100d.txt'

# Parse the file into a word -> vector lookup.
# Each line holds a word followed by the float components of its vector.
embeddings = {}
with open(path, 'r', encoding = 'utf-8') as f:
    for line in f:
        values = line.split()
        embeddings[values[0]] = np.asarray(values[1:], dtype='float32')


# Build the embedding matrix for the words in our corpus.
# Row i holds the vector for the word with tokenizer index i; row 0 stays all-zero.
embedding_dim = 100
embedding_matrix = np.zeros((max_words, embedding_dim))

for word, index in tokenizer.word_index.items():
    if index >= max_words:
        # Outside the vocabulary kept by the tokenizer / embedding layer
        continue
    embedding_vector = embeddings.get(word)
    if embedding_vector is None:
        # No pre-trained vector for this word: start from small random values
        embedding_vector = np.random.uniform(-0.1, 0.1, embedding_dim)
    embedding_matrix[index] = embedding_vector

In [None]:
def build_multilayer_lstm_model_emb():
    """Build and compile a two-layer LSTM classifier sized for the pre-trained embeddings.

    Same architecture as the plain stacked LSTM, except that the embedding
    width is taken from the module-level `embedding_matrix` so that the matrix
    can be loaded into the embedding layer afterwards. Also reads the
    module-level `max_words`, `max_sequence_length` and `y_train`.

    Returns:
        The compiled Sequential model (embedding weights not yet loaded).
    """
    model = Sequential()
    # The embedding width must equal the pre-trained vector size: a hard-coded 128 here
    # makes set_weights() fail, because embedding_matrix is (max_words, embedding_dim).
    model.add(Embedding(input_dim=max_words, output_dim=embedding_matrix.shape[1], input_length=max_sequence_length))
    model.add(LSTM(128, return_sequences=True))
    model.add(LSTM(128, return_sequences=False))
    model.add(Dropout(0.5))
    model.add(Dense(y_train.shape[1], activation='softmax'))
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return model

multi_layer_lstm_model_emb = build_multilayer_lstm_model_emb()

# Debug output: name of the layer at index 2 (the second LSTM) and the matrix being loaded
hidden1 = multi_layer_lstm_model_emb.layers[2]
print(f"hidden1 layer name: {hidden1.name}")

# multi_layer_lstm_model_emb.summary()

print(f"Embedding matrix shape: {embedding_matrix.shape}")


# Loading our pre-trained embedding matrix in the Embedding layer
multi_layer_lstm_model_emb.layers[0].set_weights([embedding_matrix])
# Keep the embeddings trainable so they are fine-tuned together with the rest of the model
multi_layer_lstm_model_emb.layers[0].trainable = True

In [None]:
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, SimpleRNN, Dense, Dropout, Flatten
from tensorflow.keras.optimizers import Adam

def build_basic_rnn_model_with_artist(max_lyrics_length, max_artist_length, max_words, artist_vocab_size, embedding_dim=128, num_classes=10):
    """
    Builds a basic RNN model that takes both lyrics and artist as inputs.

    The lyrics branch is Embedding -> SimpleRNN; the artist branch is
    Embedding -> Flatten. The two branches are concatenated and passed through
    dense layers ending in a softmax over the genre classes.

    Args:
    - max_lyrics_length: The length of the (padded) lyrics sequences.
    - max_artist_length: The length of the (padded) artist sequence (typically 1 token).
    - max_words: The input dimension of the lyrics embedding (lyrics vocabulary size).
    - artist_vocab_size: The input dimension of the artist embedding (artist vocabulary size).
    - embedding_dim: The dimensionality of both embedding layers.
    - num_classes: The number of output classes, i.e. the width of the one-hot labels.
      Defaults to 10, the value that used to be hard-coded in the output layer.

    Returns:
    - model: The compiled Keras model, expecting inputs [lyrics, artist].
    """

    # Input layer for lyrics
    input_lyrics = Input(shape=(max_lyrics_length,), name='lyrics_input')
    # Embedding layer for lyrics
    embedding_lyrics = Embedding(input_dim=max_words, output_dim=embedding_dim, input_length=max_lyrics_length)(input_lyrics)
    # RNN layer for lyrics
    rnn_lyrics = SimpleRNN(128, activation='relu')(embedding_lyrics)

    # Input layer for artist
    input_artist = Input(shape=(max_artist_length,), name='artist_input')
    # Embedding layer for artist
    embedding_artist = Embedding(input_dim=artist_vocab_size, output_dim=embedding_dim)(input_artist)
    # Flatten the artist embedding to concatenate with the lyrics output
    flatten_artist = Flatten()(embedding_artist)

    # Combine both the RNN output (lyrics) and the artist embedding output
    combined = tf.keras.layers.concatenate([rnn_lyrics, flatten_artist])

    # Dense layers to predict the genre
    x = Dense(128, activation='relu')(combined)
    x = Dropout(0.5)(x)
    x = Dense(64, activation='relu')(x)
    # Output width follows the label encoding instead of a fixed 10, so the model
    # stays consistent with the labels (and with the other models in this notebook)
    output = Dense(num_classes, activation='softmax')(x)

    # Create the model
    model = Model(inputs=[input_lyrics, input_artist], outputs=output)

    # Compile the model
    model.compile(optimizer=Adam(), loss='categorical_crossentropy', metrics=['accuracy'])

    return model

# Build the model, sizing the output layer from the one-hot training labels
rnn_model_with_artist = build_basic_rnn_model_with_artist(
    max_sequence_length,
    max_artist_length,
    max_words,
    artist_vocab_size,
    num_classes=y_train.shape[1],
)

# Display the model summary
rnn_model_with_artist.summary()


In [None]:
from tensorflow.keras.layers import Conv1D, MaxPooling1D, GlobalMaxPooling1D

def build_cnn_model():
    """Build and compile a 1-D convolutional genre classifier.

    Two Conv1D + MaxPooling1D stages (kernel sizes 3 and 5) followed by global
    max pooling, dropout and a softmax layer. Reads the module-level
    `max_words` and `max_sequence_length` for the embedding layer and
    `y_train` for the number of output classes.

    Returns:
        The compiled Sequential model.
    """
    n_classes = y_train.shape[1]
    model = Sequential([
        Embedding(input_dim=max_words, output_dim=128, input_length=max_sequence_length),
        Conv1D(filters=128, kernel_size=3, activation='relu'),
        MaxPooling1D(pool_size=2),
        Conv1D(filters=128, kernel_size=5, activation='relu'),
        MaxPooling1D(pool_size=2),
        GlobalMaxPooling1D(),  # collapse the time axis to one value per filter
        Dropout(0.5),
        Dense(n_classes, activation='softmax'),  # one probability per class
    ])
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return model


cnn_model = build_cnn_model()
cnn_model.summary()


In [None]:
# Print the CNN architecture again (repeats the summary call made where the model is built)
cnn_model.summary()


In [41]:
# Initialize an empty list to store the results.
# Each training cell appends one dict with the keys 'model_name', 'test_loss',
# 'test_accuracy' and 'model_history'. Re-running this cell clears what was collected.
results = []

In [None]:
from tensorflow.keras.callbacks import EarlyStopping, TensorBoard, ModelCheckpoint
from tensorflow.keras.models import Sequential, load_model, Model

# Stop once val_loss has not improved for 3 epochs and roll back to the best weights
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

# Checkpoint callback: keep only the epoch with the lowest validation loss
checkpoint_path = 'part_1_new/best_rnn_model.keras'
checkpoint = ModelCheckpoint(
    filepath=checkpoint_path,
    monitor='val_loss',
    save_best_only=True,
    mode='min',
    verbose=1
)

# Train Basic RNN Model.
# Validate on the dedicated hold-out split (X_val, y_val). validation_split=0.2 would
# carve a further 20% out of X_train and leave the prepared validation set unused;
# using validation_data also matches how the lyrics+artist model is validated.
history_basic_rnn = basic_rnn_model.fit(
    X_train,
    y_train,
    epochs=10,
    batch_size=64,
    validation_data=(X_val, y_val),
    callbacks=[early_stopping, checkpoint],
)

basic_rnn_model.summary()

# Load the best model saved during training (same file the checkpoint callback wrote)
best_model = load_model(checkpoint_path)

# Evaluate the model on the test data (Block 1)
test_loss, test_acc = best_model.evaluate(X_test, y_test)

# Store the results in the list
results.append({
    'model_name': 'best_rnn_model',
    'test_loss': test_loss,
    'test_accuracy': test_acc,
    'model_history': history_basic_rnn
})


In [None]:
from tensorflow.keras.callbacks import EarlyStopping, TensorBoard, ModelCheckpoint
from tensorflow.keras.models import Sequential, load_model, Model

# Stop once val_loss has not improved for 3 epochs and roll back to the best weights
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

# Checkpoint callback: keep only the epoch with the lowest validation loss
checkpoint_path = 'part_1_new/best_lstm_model.keras'
checkpoint = ModelCheckpoint(
    filepath=checkpoint_path,
    monitor='val_loss',
    save_best_only=True,
    mode='min',
    verbose=1
)

# Train LSTM Model.
# Validate on the dedicated hold-out split (X_val, y_val). validation_split=0.2 would
# carve a further 20% out of X_train and leave the prepared validation set unused;
# using validation_data also matches how the lyrics+artist model is validated.
history_lstm = lstm_model.fit(
    X_train,
    y_train,
    epochs=10,
    batch_size=64,
    validation_data=(X_val, y_val),
    callbacks=[early_stopping, checkpoint],
)

lstm_model.summary()

# Load the best model saved during training (same file the checkpoint callback wrote)
best_model = load_model(checkpoint_path)

# Evaluate the model on the test data (Block 1)
test_loss, test_acc = best_model.evaluate(X_test, y_test)

# Store the results in the list
results.append({
    'model_name': 'best_lstm_model',
    'test_loss': test_loss,
    'test_accuracy': test_acc,
    'model_history': history_lstm
})


In [None]:
from tensorflow.keras.callbacks import EarlyStopping, TensorBoard, ModelCheckpoint
from tensorflow.keras.models import Sequential, load_model, Model

# Stop once val_loss has not improved for 3 epochs and roll back to the best weights
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

# Checkpoint callback: keep only the epoch with the lowest validation loss
checkpoint_path = 'part_1_new/best_multi_layer_lstm_model.keras'
checkpoint = ModelCheckpoint(
    filepath=checkpoint_path,
    monitor='val_loss',
    save_best_only=True,
    mode='min',
    verbose=1
)

# Train Multi-layer LSTM Model.
# Validate on the dedicated hold-out split (X_val, y_val). validation_split=0.2 would
# carve a further 20% out of X_train and leave the prepared validation set unused;
# using validation_data also matches how the lyrics+artist model is validated.
history_multi_lstm = multi_layer_lstm_model.fit(
    X_train,
    y_train,
    epochs=10,
    batch_size=64,
    validation_data=(X_val, y_val),
    callbacks=[early_stopping, checkpoint],
)

multi_layer_lstm_model.summary()

# Load the best model saved during training (same file the checkpoint callback wrote)
best_model = load_model(checkpoint_path)

# Evaluate the model on the test data (Block 1)
test_loss, test_acc = best_model.evaluate(X_test, y_test)

# Store the results in the list
results.append({
    'model_name': 'best_multi_layer_lstm_model',
    'test_loss': test_loss,
    'test_accuracy': test_acc,
    'model_history': history_multi_lstm
})


In [None]:
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.models import load_model

# Early stopping: give up after 3 epochs without a val_loss improvement, keeping the best weights
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

# Checkpoint: write the model to disk only when val_loss reaches a new minimum
checkpoint_path = 'part_1_new/best_rnn_model_with_artist.keras'
checkpoint = ModelCheckpoint(
    filepath=checkpoint_path, monitor='val_loss', save_best_only=True, mode='min', verbose=1
)

# Fit on the two-input (lyrics, artist) data, validating on the hold-out split
history_rnn_model_with_artist = rnn_model_with_artist.fit(
    [X_train, X_train_artist],
    y_train,
    epochs=10,
    batch_size=64,
    validation_data=([X_val, X_val_artist], y_val),
    callbacks=[early_stopping, checkpoint],
)

# Show the model structure
rnn_model_with_artist.summary()

# Reload the best checkpoint written during training
best_model = load_model('part_1_new/best_rnn_model_with_artist.keras')

# Score the reloaded model on the test data (Block 1)
test_loss, test_acc = best_model.evaluate([X_test, X_test_artist], y_test)

# Record this model's outcome in the shared results list (defined in an earlier cell)
results.append(
    dict(
        model_name='best_rnn_model_with_artist',
        test_loss=test_loss,
        test_accuracy=test_acc,
        model_history=history_rnn_model_with_artist,
    )
)


In [None]:
import numpy as np
from sklearn.metrics import confusion_matrix, classification_report
import seaborn as sns
import matplotlib.pyplot as plt

# Class names indexed by class id. The one-hot label columns produced by pd.get_dummies
# are in sorted order, whereas Series.unique() returns order of first appearance, so the
# names must be sorted or the axis/report labels end up attached to the wrong classes.
# NOTE(review): assumes y_test was encoded over this same genre set -- confirm.
genre_labels = sorted(train_df['genre'].unique())
class_indices = np.arange(len(genre_labels))

best_model = load_model('part_1_new/best_rnn_model_with_artist.keras')

# Step 1: Run predictions on the test data
predictions = best_model.predict([X_test, X_test_artist])

# Step 2: Convert predicted probabilities to class indices
predicted_classes = np.argmax(predictions, axis=1)

# Step 3: Convert the true labels (y_test) from one-hot encoded format to class indices
y_test_classes = np.argmax(y_test, axis=1)

# Step 4: Create confusion matrix.
# Passing labels explicitly keeps one row/column per class even if a class never
# occurs in the test labels or predictions, so the matrix always matches genre_labels.
cm = confusion_matrix(y_test_classes, predicted_classes, labels=class_indices)

# Step 5: Plot the confusion matrix as a heatmap for better visualization
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=genre_labels, yticklabels=genre_labels)
plt.title('Confusion Matrix')
plt.xlabel('Predicted Labels')
plt.ylabel('True Labels')
plt.show()

# Step 6: Optionally, print classification report for more detailed performance metrics
print("Classification Report:")
print(classification_report(y_test_classes, predicted_classes, labels=class_indices, target_names=genre_labels))


In [None]:
from tensorflow.keras.callbacks import EarlyStopping, TensorBoard, ModelCheckpoint
from tensorflow.keras.models import Sequential, load_model, Model

# Stop once val_loss has not improved for 3 epochs and roll back to the best weights
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

# Checkpoint callback: keep only the epoch with the lowest validation loss
checkpoint_path = 'part_1_new/best_cnn_model.keras'
checkpoint = ModelCheckpoint(
    filepath=checkpoint_path,
    monitor='val_loss',
    save_best_only=True,
    mode='min',
    verbose=1
)

# Train CNN Model.
# Validate on the dedicated hold-out split (X_val, y_val). validation_split=0.2 would
# carve a further 20% out of X_train and leave the prepared validation set unused;
# using validation_data also matches how the lyrics+artist model is validated.
history_cnn = cnn_model.fit(
    X_train,
    y_train,
    epochs=10,
    batch_size=64,
    validation_data=(X_val, y_val),
    callbacks=[early_stopping, checkpoint],
)

cnn_model.summary()

# Load the best model saved during training (same file the checkpoint callback wrote)
best_model = load_model(checkpoint_path)

# Evaluate the model on the test data (Block 1)
test_loss, test_acc = best_model.evaluate(X_test, y_test)

# Store the results in the list
results.append({
    'model_name': 'best_cnn_model',
    'test_loss': test_loss,
    'test_accuracy': test_acc,
    'model_history': history_cnn
})


In [None]:
import pickle

# Persist the collected per-model results (including the training history objects)
# NOTE(review): this pickles the history objects as stored in `results` -- confirm they
# serialise cleanly in the Keras version in use.
with open('results_v1_part1.pkl', mode='wb') as file:
    pickle.dump(results, file)
print("Results array saved to 'results_v1_part1.pkl'.")

In [None]:
import pickle
import matplotlib.pyplot as plt


# Read the previously saved results list back from disk
with open('results_v1_part1.pkl', 'rb') as file:
    loaded_results = pickle.load(file)
print("Results array loaded from 'results_v1_part1.pkl'.")

# Draw training and validation loss curves for every stored model on one figure
plt.figure(figsize=(12, 6))
for result in loaded_results:
    model_name = result['model_name']
    history = result['model_history']
    for metric_key, split_name in (('loss', 'Training'), ('val_loss', 'Validation')):
        plt.plot(history.history[metric_key], label=f'{model_name} {split_name} Loss')

plt.title('Training and Validation Loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.show()

In [None]:
import pickle
import matplotlib.pyplot as plt


# Read the previously saved results list back from disk
with open('results_v1_part1.pkl', 'rb') as file:
    loaded_results = pickle.load(file)
print("Results array loaded from 'results_v1_part1.pkl'.")


# Report the test accuracy of every stored model
for result in loaded_results:
    print(f"{result['model_name']} Test Accuracy: {result['test_accuracy']}")

# Draw training and validation accuracy curves for every stored model on one figure
plt.figure(figsize=(12, 6))
for result in loaded_results:
    model_name = result['model_name']
    history = result['model_history']
    for metric_key, split_name in (('accuracy', 'Training'), ('val_accuracy', 'Validation')):
        plt.plot(history.history[metric_key], label=f'{model_name} {split_name} Accuracy')

plt.title('Training and Validation Accuracy')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend()
plt.show()

In [None]:
import pickle
import matplotlib.pyplot as plt

# Read the previously saved results list back from disk
with open('results_v1_part1.pkl', 'rb') as file:
    loaded_results = pickle.load(file)
print("Results array loaded from 'results_v1_part1.pkl'.")

# Report test accuracy and loss of every stored model
for result in loaded_results:
    print(f"{result['model_name']} Test Accuracy: {result['test_accuracy']}, Test Loss: {result['test_loss']}")

# Column-wise views of the results, used as bar-plot data
model_names = [entry['model_name'] for entry in loaded_results]
test_accuracies = [entry['test_accuracy'] for entry in loaded_results]
test_losses = [entry['test_loss'] for entry in loaded_results]

# One bar chart per metric: test accuracy (green) first, then test loss (red)
for bar_heights, bar_color, metric_label in (
    (test_accuracies, 'green', 'Test Accuracy'),
    (test_losses, 'red', 'Test Loss'),
):
    plt.figure(figsize=(12, 6))
    plt.bar(model_names, bar_heights, color=bar_color)
    plt.title(f'{metric_label} per Model')
    plt.xlabel('Model')
    plt.ylabel(metric_label)
    plt.xticks(rotation=45, ha='right')  # tilt the model names so they stay readable
    plt.tight_layout()  # keep the rotated x-axis labels from being clipped
    plt.show()


In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score


# Location of a previously trained lyrics+artist model and the local file name to store it under
model_filename = "best_rnn_model_with_artist_2.keras"
model_url = "https://github.com/fizz3r/ml/raw/main/part_1_v2/best_rnn_model_with_artist_2.keras"

# Fetch the model file and deserialize it
model_path = tf.keras.utils.get_file(model_filename, model_url)
best_rnn_model_with_artist_2 = tf.keras.models.load_model(model_path)

# Predict class probabilities for the test set, then reduce both the predictions
# and the one-hot ground truth to class indices
predictions = best_rnn_model_with_artist_2.predict([X_test, X_test_artist])
predicted_classes = predictions.argmax(axis=1)
y_test_classes = y_test.argmax(axis=1)

# Accuracy computed directly from the predicted class indices
accuracy = accuracy_score(y_test_classes, predicted_classes)
print(f"Prediction Accuracy: {accuracy}")

# Loss and accuracy as reported by Keras' own evaluation on the same test data
test_loss, test_acc = best_rnn_model_with_artist_2.evaluate([X_test, X_test_artist], y_test)
print(f"Test Loss: {test_loss}")
print(f"Test Accuracy: {test_acc}")

In [None]:
import pickle

# Dump the prepared test inputs and labels, one pickle file per array
for array_name, array in (
    ('X_test', X_test),
    ('X_test_artist', X_test_artist),
    ('y_test', y_test),
):
    with open(f'{array_name}.pkl', 'wb') as file:
        pickle.dump(array, file)
    print(f"{array_name} saved to '{array_name}.pkl'.")