In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader

import pandas as pd
import numpy as np
import re

# Used for tokenization from the original notebook
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences


In [2]:
# --- 1. Device Configuration (for M1/M2/M3 Mac GPU) ---
# This is the command to select the appropriate hardware for training
if torch.backends.mps.is_available():
    device = torch.device("mps")
    print("MPS (Apple Metal GPU) is available. Using MPS.")
elif torch.cuda.is_available():
    device = torch.device("cuda")
    print("CUDA is available. Using CUDA.")
else:
    device = torch.device("cpu")
    print("No GPU available. Using CPU.")

MPS (Apple Metal GPU) is available. Using MPS.


In [3]:
df = pd.read_csv("song_lyrics.csv", usecols=['lyrics'], encoding='utf-8', nrows=500)
df.dropna(inplace=True)
df.drop_duplicates(inplace=True)

In [4]:
print(f"Loaded {len(df)} unique song lyrics.")

Loaded 495 unique song lyrics.


In [5]:
def clean_text(text):
    text = str(text).lower()
    # remove \n
    text = text.replace('\n', ' ')
    # remove text in square brackets
    text = re.sub(r'\[.*?\]', '', text)
    # remove punctuation
    text = re.sub(r'[,\.!?()]', '', text)
    # remove numbers
    text = re.sub(r'\w*\d\w*',' ', text)
    # remove extra whitespace
    text = re.sub(r'\s+', ' ', text).strip()
    # replace multiple newlines/spaces with a single one 
    text = re.sub(r'\s+', ' ', text).strip()
    # Keep letters, numbers, basic punctuation, and newlines
    text = re.sub(r'[^a-z0-9\s\n\']', '', text)
    return text


In [6]:
# Apply cleaning to each lyric individually
corpus_lines = [clean_text(lyric) for lyric in df['lyrics'].tolist()]
corpus_lines = [line for line in corpus_lines if line] # Remove empty lines after cleaning
corpus_lines = corpus_lines[:1000]  # Use only 1000 lines for testing

In [7]:
# --- 3. Tokenization ---
# We can still use Keras's tokenizer for convenience
tokenizer = Tokenizer(oov_token='<oov>') # Added an OOV token
tokenizer.fit_on_texts(corpus_lines)
total_words = len(tokenizer.word_index) + 1

print(f"Total unique words in vocabulary: {total_words}")

Total unique words in vocabulary: 12767


In [8]:
# Create n-gram sequences for the model
# NOTE: This process is very memory-intensive. For larger datasets,
# creating sequences on-the-fly with a custom Dataset is recommended.
input_sequences = []
for line in corpus_lines:
    token_list = tokenizer.texts_to_sequences([line])[0]
    for i in range(1, len(token_list)):
        n_gram_sequence = token_list[:i+1]
        input_sequences.append(n_gram_sequence)


In [9]:
# Pad sequences
max_sequence_len = max([len(x) for x in input_sequences])
input_sequences = np.array(pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre'))

In [10]:
# Create predictors (X) and labels (y)
X_np, y_np = input_sequences[:,:-1], input_sequences[:,-1]

# Convert to PyTorch Tensors
X = torch.LongTensor(X_np)
y = torch.LongTensor(y_np)

print(f"\nMax sequence length: {max_sequence_len}")
print(f"Shape of predictors (X): {X.shape}")
print(f"Shape of label (y): {y.shape}")



Max sequence length: 14575
Shape of predictors (X): torch.Size([161110, 14574])
Shape of label (y): torch.Size([161110])


In [11]:
# --- 4. Define the PyTorch LSTM Model ---
class LyricGenerator(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, dropout_rate=0.2):
        super(LyricGenerator, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm1 = nn.LSTM(embedding_dim, hidden_dim, batch_first=True, bidirectional=True)
        self.dropout = nn.Dropout(dropout_rate)
        self.lstm2 = nn.LSTM(hidden_dim * 2, hidden_dim, batch_first=True) 
        self.output_layer = nn.Linear(hidden_dim, vocab_size)

    def forward(self, x):
        x = self.embedding(x)
        x, _ = self.lstm1(x)
        x = self.dropout(x)
        x, _ = self.lstm2(x)
        # We take the output of the last time step
        x = x[:, -1, :]
        # x = torch.relu(self.fc1(x))
        x = self.output_layer(x)
        return x


In [12]:
# Model parameters
EMBEDDING_DIM = 100
HIDDEN_DIM = 150
LEARNING_RATE = 0.001
EPOCHS = 30
BATCH_SIZE = 16

In [13]:
from torch.utils.data import Dataset

class LyricsDataset(Dataset):
    def __init__(self, corpus_lines, tokenizer, max_sequence_len):
        self.sequences = []
        for line in corpus_lines:
            token_list = tokenizer.texts_to_sequences([line])[0]
            for i in range(1, len(token_list)):
                n_gram_sequence = token_list[:i+1]
                self.sequences.append(n_gram_sequence)
        self.max_sequence_len = max_sequence_len

    def __len__(self):
        return len(self.sequences)

    def __getitem__(self, idx):
        seq = pad_sequences([self.sequences[idx]], maxlen=self.max_sequence_len, padding='pre')[0]
        X = torch.LongTensor(seq[:-1])
        y = torch.LongTensor([seq[-1]])
        return X, y.squeeze()

In [14]:
# Instantiate the model, loss, and optimizer
model = LyricGenerator(total_words, EMBEDDING_DIM, HIDDEN_DIM).to(device)
# Use CrossEntropyLoss which handles integer labels automatically
criterion = nn.CrossEntropyLoss()
# L2 regularization is added via the 'weight_decay' parameter in the optimizer
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE, weight_decay=0.01)

# Create DataLoader for batching
train_data = TensorDataset(X, y)

# Usage:
dataset = LyricsDataset(corpus_lines, tokenizer, max_sequence_len)
train_loader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True)

In [15]:
# --- 5. PyTorch Training Loop ---
print("\n--- Starting Model Training ---")
model.train() # Set model to training mode
for epoch in range(EPOCHS):
    total_loss = 0
    for batch_X, batch_y in train_loader:
        # Move data to the configured device (CPU or MPS)
        batch_X, batch_y = batch_X.to(device), batch_y.to(device)

        optimizer.zero_grad()    # Clear previous gradients
        outputs = model(batch_X) # Forward pass
        loss = criterion(outputs, batch_y) # Calculate loss
        loss.backward()          # Backward pass
        optimizer.step()         # Update weights

        total_loss += loss.item()

    avg_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch+1}/{EPOCHS} | Loss: {avg_loss:.4f}")




--- Starting Model Training ---


KeyboardInterrupt: 

In [None]:
def generate_lyrics(seed_text, next_words, model, max_sequence_len):
    model.eval() # Set model to evaluation mode
    generated_text = seed_text

    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([generated_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_sequence_len - 1, padding='pre')
        
        # Convert to a PyTorch tensor and move to device
        input_tensor = torch.LongTensor(token_list).to(device)

        # Get prediction (no gradients needed)
        with torch.no_grad():
            output = model(input_tensor)
            # Get the word with the highest probability (we don't need softmax, just the argmax)
            predicted_index = torch.argmax(output, dim=1).item()

        # Find the word corresponding to the predicted index
        output_word = ""
        for word, index in tokenizer.word_index.items():
            if index == predicted_index:
                output_word = word
                break
        
        generated_text += " " + output_word
        
    return generated_text

# --- 7. Generate New Lyrics ---
print("\n--- Generated Lyrics ---")
seed_text = "i see a fire"
generated_lyrics = generate_lyrics(seed_text, 50, model, max_sequence_len)
print(generated_lyrics)

In [None]:
# Save the trained model in .pt (PyTorch) format
torch.save(model.state_dict(), "lyric_generator_model.pth")
print("Model saved as lyric_generator_model.pth")