In [None]:
%%capture
# Environment setup; %%capture on the first line hides the installer output.
# Install the released Unsloth package plus the data/plotting libraries used below.
!pip install unsloth
!pip install transformers datasets tqdm matplotlib seaborn scikit-learn pandas torch
# Replace the released Unsloth with the current GitHub build; --no-deps leaves the
# dependencies installed above untouched.
# NOTE(review): none of these installs are version-pinned, so results may differ between runs.
!pip uninstall unsloth -y && pip install --upgrade --no-cache-dir --no-deps git+https://github.com/unslothai/unsloth.git

In [None]:
"""
This notebook implements sentiment and thematic analysis of song lyrics using a LoRA-tuned Llama model.
"""

# Core ML and data processing
# unsloth stays first: it patches the training stack at import time (see its banner below).
from unsloth import FastLanguageModel
from unsloth import is_bfloat16_supported
from datasets import Dataset, DatasetDict
from transformers import TrainingArguments
from trl import SFTTrainer
import pandas as pd
import numpy as np
import torch

# Visualization and analysis tools
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import seaborn as sns

# Utility imports
from sklearn.model_selection import train_test_split
import json
import math
import re
import tempfile
from tqdm import tqdm
import os

# Google Drive integration for Colab
# NOTE(review): google.colab is only importable inside a Colab runtime, so this cell
# presumably cannot run elsewhere without replacing the mount — confirm before reuse.
from google.colab import drive
drive.mount('/content/drive')

# Define project directory structure
# Everything is rooted in one Drive folder; the three sub-folders hang off BASE_PATH.
BASE_PATH = '/content/drive/MyDrive/CS229'
DATA_PATH = f'{BASE_PATH}/data'      # input files (main() reads lyrics_data.json from here)
OUTPUT_PATH = f'{BASE_PATH}/output'  # figures, feature JSON and CSV
MODEL_PATH = f'{BASE_PATH}/models'  # trainer checkpoints

# Create any missing folder up front; exist_ok makes re-running the cell harmless.
for path in [DATA_PATH, OUTPUT_PATH, MODEL_PATH]:
    os.makedirs(path, exist_ok=True)

def get_project_path(filename, folder='output'):
    """
    Resolve a file name against one of the project's folders.

    Args:
        filename (str): File name (or relative path) to place inside the folder
        folder (str): Which project folder to use: 'data', 'output' or 'models'.
            Defaults to 'output'.

    Returns:
        str: ``filename`` joined onto the selected folder's path

    Raises:
        KeyError: If ``folder`` is not one of the three known folder names
    """
    # Pair each public folder name with its module-level path constant.
    known_folders = dict(zip(
        ('data', 'output', 'models'),
        (DATA_PATH, OUTPUT_PATH, MODEL_PATH),
    ))
    base_dir = known_folders[folder]
    return os.path.join(base_dir, filename)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
Mounted at /content/drive


In [None]:
def load_lyrics_from_json(json_path):
    """
    Read the scraped-songs JSON file and return its parsed content.

    A one-line summary is printed on success. On any failure the error is
    printed with the offending path and then re-raised unchanged.

    Args:
        json_path (str): Location of the UTF-8 encoded JSON file

    Returns:
        The decoded JSON document (the pipeline expects a list of song dicts)

    Raises:
        Exception: Whatever opening, decoding or measuring the data raised
    """
    try:
        with open(json_path, mode='r', encoding='utf-8') as source:
            songs = json.load(source)
        song_count = len(songs)
        print(f"Successfully loaded {song_count} songs")
    except Exception as err:
        # Report where it went wrong, but let the caller decide what to do.
        print(f"Error loading file from {json_path}: {err}")
        raise
    return songs

def prepare_lyrics_dataset(lyrics_data):
    """
    Prepare lyrics dataset for fine-tuning the Llama model using LoRA.

    The function:
    1. Formats each song's lyrics into a structured prompt
    2. Pairs it with the fixed response template used as the training target
    3. Splits the examples 80/20 into training and validation sets

    Songs whose lyrics are missing, empty, whitespace-only or not a string are
    skipped with a message. Missing or null title/artist fall back to
    'Unknown Title' / 'Unknown Artist'.

    Args:
        lyrics_data (list): List of dictionaries containing song information
            (keys read: 'lyrics', 'music', 'singer')

    Returns:
        DatasetDict: Contains 'train' and 'validation' splits. Each example has a
            single 'conversations' field holding a user/assistant message pair.

    Raises:
        ValueError: If fewer than two songs have usable lyrics, because a
            train/validation split is then impossible
    """
    # NOTE(review): the target below is the same literal text for every song and still
    # contains "[0-1 score]" placeholders instead of numbers — confirm whether real
    # per-song labels were meant to be filled in here.
    response = """Lyrical Analysis:
1. Narrative Complexity: [0-1 score]
2. Emotional Sophistication: [0-1 score]
3. Thematic Elements:
   - Love: [0-1 score]
   - Life: [0-1 score]
   - Social: [0-1 score]
4. Temporal Focus:
   - Past: [0-1 score]
   - Present: [0-1 score]
   - Future: [0-1 score]"""

    examples = []

    for song in lyrics_data:
        lyrics = song.get('lyrics')
        # `or` instead of a .get() default: the default is not used when the key
        # exists with a null/empty value, which then crashed on .strip().
        music = song.get('music') or 'Unknown Title'
        singer = song.get('singer') or 'Unknown Artist'

        if not isinstance(lyrics, str) or not lyrics.strip():
            print(f"Skipping song: {music} by {singer} (missing lyrics)")
            continue

        lyrics_text = lyrics.strip()
        # str() keeps non-string metadata (e.g. a numeric title) from raising.
        singer_text = str(singer).strip()
        music_text = str(music).strip()

        # Create structured prompt for lyrical analysis
        prompt = f"""Analyze these song lyrics and their emotional content:

Lyrics: {lyrics_text}

Musical Context:
Artist: {singer_text}
Title: {music_text}

Please analyze the emotional tone, narrative complexity, and thematic elements."""

        # Format as conversation pairs for the model
        examples.append({
            "conversations": [
                {"role": "user", "content": prompt},
                {"role": "assistant", "content": response}
            ]
        })

    if len(examples) < 2:
        raise ValueError(
            f"Need at least 2 songs with lyrics to build train/validation splits, got {len(examples)}."
        )

    # Split the plain Python records directly. The previous pandas round-trip made
    # Dataset.from_pandas store the shuffled index as an extra '__index_level_0__'
    # column in both splits; the same random_state keeps the split membership.
    train_rows, val_rows = train_test_split(examples, test_size=0.2, random_state=42)

    return DatasetDict({
        "train": Dataset.from_list(train_rows),
        "validation": Dataset.from_list(val_rows),
    })

# Phrase that introduces the scores; matched case-insensitively.
_SCORE_MARKER_RE = re.compile(r"here are the computed scores:", re.IGNORECASE)
# Leading list decoration on a label: "- ", "* ", "• ", "1. " or "2) ".
_LIST_PREFIX_RE = re.compile(r"^(?:[-*\u2022]+|\d+[.)])\s*")
# A number at the start of a value, optionally wrapped ("[0.8]", "**0.8**", "(0.8)").
_LEADING_NUMBER_RE = re.compile(r"[\[(*`\s]*([-+]?(?:\d+\.?\d*|\.\d+)(?:[eE][-+]?\d+)?)")
# Normalised label -> (group, field); group is None for the two top-level scores.
_FEATURE_SLOTS = {
    'narrative_complexity': (None, 'narrative_complexity'),
    'emotional_sophistication': (None, 'emotional_sophistication'),
    'love': ('thematic_elements', 'love'),
    'life': ('thematic_elements', 'life'),
    'social': ('thematic_elements', 'social'),
    'past': ('temporal_focus', 'past'),
    'present': ('temporal_focus', 'present'),
    'future': ('temporal_focus', 'future'),
}


def _normalize_label(raw_label):
    """Turn a label such as '1. Narrative Complexity' or '- **Love**' into 'narrative_complexity' / 'love'."""
    label = _LIST_PREFIX_RE.sub("", raw_label.strip())
    label = label.strip("*`_ ").lower()
    return re.sub(r"[\s\-]+", "_", label)


def _parse_score(raw_value):
    """Return the finite number a value starts with, or None if there is none."""
    try:
        score = float(raw_value)
    except ValueError:
        # Tolerate decoration around the number, e.g. "0.8 (strong)" or "[0.8]".
        found = _LEADING_NUMBER_RE.match(raw_value)
        if found is None:
            return None
        score = float(found.group(1))
    # float() accepts "nan"/"inf"; those are not usable scores.
    return score if math.isfinite(score) else None


def parse_response_to_features(response_text):
    """
    Extracts numerical scores for each dimension of lyrical analysis.

    Feature dimensions:
    - Narrative Complexity: Overall structural sophistication
    - Emotional Sophistication: Depth of emotional content
    - Thematic Elements: Scores for love, life, and social themes
    - Temporal Focus: Distribution across past, present, and future

    Each ``label: value`` line is matched on its label alone, so scores are
    recognised in any order, with or without the ``thematic_elements:`` /
    ``temporal_focus:`` header lines. Labels may be written with underscores or
    spaces and may carry list numbering or bullets. If the text contains
    "here are the computed scores:" (any letter case), only what follows the
    first occurrence is parsed. When a label appears more than once the last
    value wins.

    Args:
        response_text (str): Raw text response from the model

    Returns:
        dict: Structured feature dictionary with numerical scores.
              Returns None if the input is not a string or if no score
              greater than zero was found.
    """
    if not isinstance(response_text, str):
        return None

    features = {
        'narrative_complexity': 0.0,
        'emotional_sophistication': 0.0,
        'thematic_elements': {
            'love': 0.0,
            'life': 0.0,
            'social': 0.0
        },
        'temporal_focus': {
            'past': 0.0,
            'present': 0.0,
            'future': 0.0
        }
    }

    # Slice at the regex match position. The old code tested a lowercased copy but
    # split the original text, so a differently cased marker raised IndexError and
    # discarded the whole response.
    marker = _SCORE_MARKER_RE.search(response_text)
    if marker:
        response_text = response_text[marker.end():]

    for line in response_text.split('\n'):
        if ':' not in line:
            continue
        raw_label, raw_value = line.split(':', 1)

        slot = _FEATURE_SLOTS.get(_normalize_label(raw_label))
        if slot is None:
            # Section headers and unrelated prose end up here.
            continue

        score = _parse_score(raw_value.strip())
        if score is None:
            continue

        group, field = slot
        if group is None:
            features[field] = score
        else:
            features[group][field] = score

    # Validate that at least one feature was successfully parsed
    has_values = (
        features['narrative_complexity'] > 0 or
        features['emotional_sophistication'] > 0 or
        any(v > 0 for v in features['thematic_elements'].values()) or
        any(v > 0 for v in features['temporal_focus'].values())
    )

    return features if has_values else None

In [None]:
def train_lyrics_model(training_data_json):
    """
    Fine-tune the Llama model for lyrical analysis using LoRA.

    Loads the lyrics file, builds the train/validation splits, loads the 4-bit
    base model, attaches LoRA adapters and runs the trainer. After a successful
    run the model is switched to Unsloth's inference mode.

    Args:
        training_data_json (str): Path to JSON file containing lyrics training data

    Returns:
        tuple: (model, tokenizer, log_history) or (None, None, None) if
            ``trainer.train()`` fails. The failure is printed before returning.

    Raises:
        ValueError: If the prepared dataset lacks the 'train'/'validation' splits
        Exception: Errors from data loading, model loading or trainer construction
            are not caught here and propagate to the caller
    """
    # Disable Weights & Biases logging
    os.environ["WANDB_DISABLED"] = "true"

    # Load and prepare the dataset
    lyrics_data = load_lyrics_from_json(training_data_json)
    training_dataset = prepare_lyrics_dataset(lyrics_data)

    # Validate dataset structure
    has_both_splits = (
        isinstance(training_dataset, DatasetDict)
        and "train" in training_dataset
        and "validation" in training_dataset
    )
    if not has_both_splits:
        raise ValueError("Training dataset must be a DatasetDict with 'train' and 'validation' splits.")

    # Initialize base Llama model with 4-bit quantization for memory efficiency
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name="unsloth/Llama-3.2-3B-Instruct",
        max_seq_length=2048,
        load_in_4bit=True,
    )

    # Apply LoRA adaptation to specific layers
    # r=16 sets the rank of W_LoRA matrices, controlling parameter efficiency
    model = FastLanguageModel.get_peft_model(
        model,
        r=16,
        target_modules=[
            "q_proj", "k_proj", "v_proj",        # Attention layers
            "o_proj",                            # Output projection
            "gate_proj", "up_proj", "down_proj"  # Feed-forward layers
        ],
        lora_alpha=16,     # Scaling factor for LoRA updates
        lora_dropout=0.1,  # Dropout for regularization
        bias="none",       # No bias updates
        use_gradient_checkpointing="unsloth",  # Memory optimization
    )

    # Pick the mixed-precision mode from the GPU instead of forcing fp16: Unsloth
    # refuses fp16 training when it has loaded the model in bfloat16.
    use_bf16 = is_bfloat16_supported()

    # Configure training hyperparameters
    training_args = TrainingArguments(
        output_dir=get_project_path('lyrics_model', 'models'),
        num_train_epochs=10,
        per_device_train_batch_size=4,
        gradient_accumulation_steps=4,
        learning_rate=1e-4,
        warmup_steps=100,
        lr_scheduler_type="cosine",
        logging_steps=10,
        eval_strategy="epoch",
        save_strategy="epoch",
        save_total_limit=2,
        report_to="none",
        fp16=not use_bf16,
        bf16=use_bf16,
        optim="adamw_torch",
    )

    # Initialize trainer with model and data
    # NOTE(review): the splits only carry a "conversations" column and no text field or
    # formatting function is passed here — confirm the installed trl version turns that
    # column into training text.
    trainer = SFTTrainer(
        model=model,
        tokenizer=tokenizer,
        train_dataset=training_dataset["train"],
        eval_dataset=training_dataset["validation"],
        args=training_args,
    )

    # Execute training with error handling
    try:
        trainer.train()
        FastLanguageModel.for_inference(model)
        return model, tokenizer, trainer.state.log_history
    except Exception as e:
        # Keep the documented (None, None, None) contract, but say why it failed
        # instead of discarding the error.
        print(f"Training failed: {type(e).__name__}: {e}")
        return None, None, None

In [None]:
def extract_lyric_features(model, tokenizer, lyrics):
    """
    Extract feature vectors from lyrics using the fine-tuned Llama model.

    The feature vector captures:
    1. Narrative complexity (structural sophistication)
    2. Emotional sophistication (depth of emotional content)
    3. Thematic elements (love, life, social themes)
    4. Temporal focus (past, present, future emphasis)

    The prompt is tokenized, moved to the model's device, and up to 128 new
    tokens are sampled. Only the newly generated tokens are decoded, printed
    and handed to ``parse_response_to_features``.

    Args:
        model: Fine-tuned Llama model
        tokenizer: Associated tokenizer
        lyrics (str): Raw lyrics text to analyze

    Returns:
        dict: Structured feature dictionary, or None if ``lyrics`` is empty or
            no non-zero score could be parsed from the model output
    """
    if not lyrics:
        print("Error, no lyrics")
        return None

    # Construct system prompt with detailed scoring criteria
    # NOTE(review): the <|system|>/<|user|>/<|assistant|> markers are written as plain
    # text; train_lyrics_model loads a Llama-3.2 Instruct checkpoint — confirm whether
    # tokenizer.apply_chat_template should build this prompt instead.
    prompt = f"""<|system|>You are a lyric analysis expert. Carefully analyze these lyrics and provide scores based on the content.

<|user|>Analyze these lyrics for their emotional depth, themes, and temporal focus:

{lyrics}

Based on your analysis, provide scores (0-1) where:
- narrative_complexity: measures the depth and complexity of the story/narrative
- emotional_sophistication: measures the depth and range of emotions expressed
- thematic_elements:
  * love: presence of love, relationships, connection themes
  * life: presence of life experiences, existential themes
  * social: presence of social commentary, societal themes
- temporal_focus:
  * past: references to past events/memories
  * present: focus on current moment/situations
  * future: references to future events/possibilities

Provide only the scores in this exact format:
narrative_complexity: [score]
emotional_sophistication: [score]
thematic_elements:
  love: [score]
  life: [score]
  social: [score]
temporal_focus:
  past: [score]
  present: [score]
  future: [score]

<|assistant|>Based on analyzing the lyrics, here are the computed scores:"""

    # Follow the model's own device; fall back to the previous hard-coded "cuda"
    # when the model object does not expose one.
    target_device = getattr(model, "device", None)
    if target_device is None:
        target_device = "cuda"
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True).to(target_device)
    prompt_length = inputs["input_ids"].shape[1]

    # Some tokenizers define no pad token; generate() then needs the EOS id instead.
    pad_token_id = tokenizer.pad_token_id
    if pad_token_id is None:
        pad_token_id = tokenizer.eos_token_id

    # Low-temperature sampling: outputs can still differ slightly between runs.
    with torch.no_grad():
        outputs = model.generate(
            input_ids=inputs["input_ids"],
            attention_mask=inputs.get("attention_mask"),
            max_new_tokens=128,
            temperature=0.2,
            top_p=0.9,
            do_sample=True,
            num_beams=1,
            pad_token_id=pad_token_id,
            eos_token_id=tokenizer.eos_token_id
        )

    # generate() returns prompt + continuation; drop the prompt so its
    # "name: [score]" template lines are neither printed nor parsed.
    generated_tokens = outputs[0][prompt_length:]
    response = tokenizer.decode(generated_tokens, skip_special_tokens=True)
    print("\nModel Response:")
    print(response)

    features = parse_response_to_features(response)

    # Validate feature extraction with at least one non-zero score
    if not features or all(
        v == 0 for v in [
            features['narrative_complexity'],
            features['emotional_sophistication'],
            *features['thematic_elements'].values(),
            *features['temporal_focus'].values()
        ]
    ):
        print("All zeros!")
        return None

    return features

def create_unique_key(song_data):
    """
    Create an identifier for a song from its title and artist.

    Missing or null fields contribute an empty string, and non-string values
    are converted with ``str()`` before trimming, so the function does not
    raise on incomplete metadata.

    NOTE(review): two entries with the same title and artist produce the same
    key, so a later one replaces an earlier one wherever this key indexes a dict.

    Args:
        song_data (dict): Dictionary containing song metadata
            (keys read: 'music' for the title, 'singer' for the artist)

    Returns:
        str: Key in format "title - artist"
    """
    def _clean(field):
        value = song_data.get(field)
        # A key that is present but null would otherwise crash on .strip().
        return '' if value is None else str(value).strip()

    return f"{_clean('music')} - {_clean('singer')}"

def process_all_lyrics(model, tokenizer, lyrics_data):
    """
    Run feature extraction over every song and collect the results.

    Songs without lyrics are skipped with a message; songs for which
    extraction returns None are left out; an exception raised while handling
    one song is printed and the loop moves on to the next song.

    Args:
        model: Fine-tuned Llama model
        tokenizer: Associated tokenizer
        lyrics_data (list): Song dictionaries containing lyrics and metadata

    Returns:
        dict: Song key -> {'title', 'artist', 'features'} for every song that
            yielded features
    """
    results = {}

    # tqdm wraps the iterable to show progress while the songs are processed.
    for entry in tqdm(lyrics_data, desc="Processing lyrics"):
        try:
            song_key = create_unique_key(entry)
            text = entry.get('lyrics', None)

            if text:
                extracted = extract_lyric_features(model, tokenizer, text)
                if extracted is not None:
                    # Keep the raw metadata next to the scores.
                    results[song_key] = {
                        'title': entry.get('music', ''),
                        'artist': entry.get('singer', ''),
                        'features': extracted
                    }
            else:
                print(f"Skipping {song_key}: No lyrics.")
        except Exception as err:
            # Best effort: one bad song must not abort the whole run.
            print(f"Error processing song {entry.get('music', 'Unknown')}: {str(err)}")

    return results

In [None]:
class LyricAnalysisVisualizer:
    """
    Generates figures for understanding the feature space and model performance.

    Each ``plot_*`` method draws on a new matplotlib figure and passes a file
    name to :meth:`save_figure`, which writes it into ``output_dir`` and closes
    the figure.

    Attributes:
        output_dir (str): Directory where generated figures will be saved
    """
    def __init__(self, output_dir="output"):
        """
        Initialize the visualizer with an output directory.

        Args:
            output_dir (str): Target directory; created if it does not exist
        """
        self.output_dir = output_dir
        os.makedirs(self.output_dir, exist_ok=True)

    def save_figure(self, filename):
        """
        Save the current figure to the output directory and close it.

        Args:
            filename (str): Name of the output file
        """
        filepath = os.path.join(self.output_dir, filename)
        plt.savefig(filepath)
        plt.close()

    @staticmethod
    def _loss_series(trainer_history, key):
        """
        Collect (steps, values) for every log entry that contains ``key``.

        Entries without a 'step' field fall back to their position within the
        series, which reproduces the previous 0..n-1 x-axis.
        """
        steps, values = [], []
        for entry in trainer_history:
            if key in entry:
                steps.append(entry.get('step', len(steps)))
                values.append(entry[key])
        return steps, values

    def plot_training_progress(self, trainer_history):
        """
        Visualize the model's training and validation loss over time.

        Both curves are drawn against the 'step' recorded in each log entry.
        Training loss and validation loss are logged at different intervals,
        so plotting each against its own 0..n-1 index (as before) put them on
        unrelated x-positions under an axis labelled "Steps".

        Nothing is drawn when the history holds no 'loss' entries. Errors are
        printed rather than raised.

        Args:
            trainer_history (list): List of dictionaries containing training metrics
                (keys read: 'loss', 'eval_loss', 'step')
        """
        try:
            train_steps, training_loss = self._loss_series(trainer_history, 'loss')
            eval_steps, validation_loss = self._loss_series(trainer_history, 'eval_loss')

            if not training_loss:
                return

            # Create loss curves plot
            plt.figure(figsize=(12, 6))
            plt.plot(train_steps, training_loss, label='Training Loss')
            if validation_loss:
                # Markers keep the sparser validation points visible.
                plt.plot(eval_steps, validation_loss, label='Validation Loss', marker='o')

            plt.title('Model Training Progress')
            plt.xlabel('Steps')
            plt.ylabel('Loss')
            plt.legend()
            self.save_figure('training_progress.png')
        except Exception as e:
            print(f"Error plotting training progress: {str(e)}")
            plt.close()

    def plot_feature_distributions(self, all_features):
        """
        Generate violin plots showing the distribution of each feature across all songs.

        The nested per-song feature dicts are flattened into one row per song:
        thematic scores become ``theme_<name>`` columns and temporal scores
        become ``temporal_<name>`` columns. The figure is only produced when
        at least one song is present.

        Args:
            all_features (dict): Dictionary mapping song keys to their features

        Returns:
            pd.DataFrame: Processed features in tabular format (column 'song'
                plus one column per feature); empty if ``all_features`` is empty
        """
        # Transform nested feature dictionary into flat DataFrame
        features_list = []
        for song_key, data in all_features.items():
            features = data['features']
            row = {
                'song': song_key,
                'narrative_complexity': features['narrative_complexity'],
                'emotional_sophistication': features['emotional_sophistication'],
                **{f"theme_{k}": v for k, v in features['thematic_elements'].items()},
                **{f"temporal_{k}": v for k, v in features['temporal_focus'].items()}
            }
            features_list.append(row)

        df = pd.DataFrame(features_list)

        if not df.empty:
            # Create violin plot with quartile markers
            plt.figure(figsize=(15, 8))
            feature_cols = [col for col in df.columns if col != 'song']
            # NOTE(review): newer seaborn releases rename `scale` to `density_norm` —
            # confirm against the installed version.
            sns.violinplot(data=df[feature_cols], orient='h', inner="quartile", scale="width")
            plt.xticks(rotation=45)
            plt.title('Distribution of Lyrical Features')
            plt.tight_layout()
            self.save_figure('feature_distributions.png')

        return df

    def plot_feature_correlations(self, df):
        """
        Generate a correlation heatmap between all features.

        Args:
            df (pd.DataFrame): DataFrame containing feature values; every column
                except 'song' is included
        """
        feature_cols = [col for col in df.columns if col != 'song']
        corr = df[feature_cols].corr()

        plt.figure(figsize=(12, 10))
        sns.heatmap(corr, annot=True, cmap='coolwarm', center=0)
        plt.title('Feature Correlations')
        plt.tight_layout()
        self.save_figure('feature_correlations.png')

    def plot_temporal_evolution(self, df):
        """
        Create boxplots showing the distribution of temporal focus features.

        Despite the method name, this plots one box per ``temporal_*`` column,
        not a time series.

        Args:
            df (pd.DataFrame): DataFrame containing feature values
        """
        temporal_cols = [col for col in df.columns if 'temporal_' in col]
        temporal_data = df[temporal_cols]

        plt.figure(figsize=(15, 6))
        temporal_data.boxplot()
        plt.title('Distribution of Temporal Focus')
        plt.ylabel('Score')
        plt.xticks(rotation=45)
        plt.tight_layout()
        self.save_figure('temporal_distribution.png')

    def plot_feature_space(self, df):
        """
        Generate t-SNE visualization of songs in the feature space.

        Skipped with a message when fewer than two songs are available. Any
        error during the projection or plotting is printed rather than raised.

        Args:
            df (pd.DataFrame): DataFrame containing feature values and a 'song' column
        """
        feature_cols = [col for col in df.columns if col != 'song']
        X = df[feature_cols].values
        n_samples = X.shape[0]

        if n_samples < 2:
            # min(30, n_samples - 1) would give a perplexity of 0 or less here.
            print("Skipping t-SNE visualization: need at least 2 songs.")
            return

        # Adjust perplexity based on dataset
        perplexity = min(30, n_samples - 1)

        try:
            # Apply t-SNE dimensionality reduction
            tsne = TSNE(n_components=2, random_state=42, perplexity=perplexity)
            X_tsne = tsne.fit_transform(X)

            # Create scatter plot with song labels
            plt.figure(figsize=(12, 8))
            plt.scatter(X_tsne[:, 0], X_tsne[:, 1], alpha=0.5)

            # Add labels for subset of points to avoid overcrowding
            for i, song in enumerate(df['song']):
                if i % 5 == 0:  # Label every 5th song
                    plt.annotate(song, (X_tsne[i, 0], X_tsne[i, 1]))

            plt.title('t-SNE Visualization of Songs in Feature Space')
            plt.tight_layout()
            self.save_figure('feature_space_tsne.png')
        except Exception as e:
            print(f"Error generating t-SNE visualization: {str(e)}")
            plt.close()

In [None]:
def save_features_to_json(features, output_path):
    """
    Write the extracted features to a human-readable JSON file.

    The file is UTF-8 encoded, indented by four spaces, and non-ASCII
    characters are written as-is rather than escaped. Failures are printed
    and not raised, so the caller continues either way.

    Args:
        features (dict): Dictionary containing song features and metadata
        output_path (str): Path where the JSON file will be saved
    """
    dump_options = {"indent": 4, "ensure_ascii": False}
    try:
        with open(output_path, mode='w', encoding='utf-8') as sink:
            json.dump(features, sink, **dump_options)
    except Exception as err:
        print(f"Error in json: {err}")

In [None]:
def main():
    """
    Main execution pipeline for lyrical analysis using the fine-tuned Llama model.

    The pipeline does the following:
    1. Data Loading: Load lyrics from JSON storage
    2. Model Training: Fine-tune Llama model using LoRA
    3. Feature Extraction: Process lyrics to extract feature vectors
    4. Visualization: Generate analysis plots and save results

    Returns:
        tuple: (all_features, df) where:
            - all_features (dict): Extracted features for all songs
            - df (pd.DataFrame): Processed features in tabular format
        ``(None, None)`` is returned when no lyrics are found, when training
        yields no model, or when any step raises (the error is printed).
        ``(all_features, empty DataFrame)`` is returned when extraction
        produced no features.
    """
    try:
        # Step 1: Load lyrics data
        print("Loading lyrics data...")
        lyrics_path = get_project_path('lyrics_data.json', 'data')
        all_lyrics_data = load_lyrics_from_json(lyrics_path)
        if not all_lyrics_data:
            print("No lyrics found.")
            return None, None

        # Step 2: Fine-tune the model
        # The trainer reads the lyrics file itself, so hand it the original path. The
        # earlier temp-file copy held the same data and was never deleted.
        print("Fine-tuning model...")
        model, tokenizer, training_history = train_lyrics_model(lyrics_path)
        if model is None or tokenizer is None:
            # Without a model every song would fail one by one during extraction.
            print("Model training failed; skipping feature extraction.")
            return None, None

        # Initialize visualization class
        output_dir = get_project_path('', 'output')
        visualizer = LyricAnalysisVisualizer(output_dir=output_dir)

        # Plot training metrics if available
        if training_history and any('loss' in h for h in training_history):
            visualizer.plot_training_progress(training_history)

        # Step 3: Extract features from all lyrics
        print("Extracting features from lyrics...")
        all_features = process_all_lyrics(model, tokenizer, all_lyrics_data)

        # Save extracted features to JSON
        output_path = os.path.join(output_dir, 'lyrical_features.json')
        save_features_to_json(all_features, output_path)

        if not all_features:
            print("No features found.")
            return all_features, pd.DataFrame()

        # Step 4: Generate visualizations and analysis
        df = visualizer.plot_feature_distributions(all_features)

        if not df.empty:
            visualizer.plot_feature_correlations(df)
            visualizer.plot_temporal_evolution(df)
            visualizer.plot_feature_space(df)

            # Save processed features to CSV
            csv_path = os.path.join(output_dir, 'feature_analysis.csv')
            df.to_csv(csv_path, index=False)

        return all_features, df

    except Exception as e:
        # Same return contract as before, but the cause is no longer hidden.
        print(f"Pipeline failed: {type(e).__name__}: {e}")
        return None, None

In [None]:
# Run the full pipeline: load lyrics -> fine-tune -> extract features -> plot.
# main() returns (None, None) when it fails, so check `features` before using it.
features, df = main()