# Music Sentiment Analysis: Comprehensive Sentiment Analysis

This notebook performs comprehensive sentiment analysis on music lyrics using multiple methods:
- **VADER**: Rule-based sentiment analysis optimized for social media text
- **TextBlob**: Simple polarity and subjectivity analysis
- **Transformers**: State-of-the-art transformer-based sentiment analysis

## Goals
1. Analyze sentiment of all songs in the dataset
2. Compare different sentiment analysis methods
3. Create composite sentiment scores
4. Prepare data for temporal trend analysis


In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import warnings
# Blanket filter: every warning category is suppressed from here on, including
# any emitted while the project modules below are imported.
warnings.filterwarnings('ignore')

# Set up plotting style
# NOTE(review): the 'seaborn-v0_8' style name is believed to require
# Matplotlib 3.6+ -- confirm against the pinned environment.
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

# Import our analysis utilities
import sys
# '../src' is a relative entry, so the project packages only resolve when the
# kernel's working directory is the folder this notebook lives in.
# This append must stay ahead of the two project imports that follow.
sys.path.append('../src')
from data.loader import load_music_dataset
from analysis.sentiment import MusicSentimentAnalyzer, analyze_music_sentiment

# Load the song dataset, preferring the processed CSV cache when it exists.
# On a cache miss the raw file is loaded through load_music_dataset and the
# result is written to the cache so later runs take the fast path.
cleaned_path = Path('../data/processed/cleaned_music_dataset.csv')
if cleaned_path.exists():
    print("Loading cleaned dataset...")
    df = pd.read_csv(cleaned_path)
else:
    print("Loading and cleaning dataset...")
    df = load_music_dataset('../data/tcc_ceds_music.csv')
    # to_csv does not create missing parent directories, so make sure
    # data/processed exists before writing (e.g. on a fresh checkout).
    cleaned_path.parent.mkdir(parents=True, exist_ok=True)
    # Write to the same Path that was tested above so the check and the
    # cache location cannot drift apart.
    df.to_csv(cleaned_path, index=False)

# Quick sanity summary of what was loaded.
print(f"Dataset loaded: {len(df)} songs")
print(f"Year range: {df['year'].min()}-{df['year'].max()}")
print(f"Columns: {list(df.columns)}")


## 1. Initialize Sentiment Analyzer


In [None]:
# Build the analyzer that the remaining cells reuse.
# Note: For large datasets, consider using GPU if available
use_gpu = False  # Set to True if you have CUDA available
analyzer = MusicSentimentAnalyzer(use_gpu=use_gpu)

# A few hand-written sentences used as a quick smoke test of the analyzer.
sample_lyrics = [
    "I love this song, it makes me so happy!",
    "This is terrible, I hate it.",
    "The weather is okay today, nothing special.",
    "I'm feeling neutral about this situation."
]

# (display label, key in the analyzer's result) for each score that is shown.
score_fields = (
    ("VADER Compound", "vader_compound"),
    ("TextBlob Polarity", "textblob_polarity"),
    ("Transformer Positive", "transformer_positive"),
    ("Composite Sentiment", "composite_sentiment"),
    ("Composite Confidence", "composite_confidence"),
)

print("Testing sentiment analyzer on sample texts:")
print("=" * 50)

for i, text in enumerate(sample_lyrics, 1):
    print(f"\n{i}. Text: '{text}'")
    scores = analyzer.analyze_single_text(text)

    # A key missing from the result is reported as 0 instead of failing.
    for label, key in score_fields:
        print(f"   {label}: {scores.get(key, 0):.3f}")


## 2. Analyze Sentiment for All Songs

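**Note**: This may take a while for large datasets. The analysis will process songs in batches and show progress. If `../data/processed/music_with_sentiment.csv` already exists, the saved results are loaded instead of being recomputed; delete that file to force a fresh analysis.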


In [None]:
# Sentiment scores are cached on disk: compute them only when the cache file
# is absent, otherwise read the saved results back.
sentiment_path = Path('../data/processed/music_with_sentiment.csv')

if not sentiment_path.exists():
    print("Performing sentiment analysis on all songs...")
    print("This may take several minutes depending on dataset size...")

    # Score every song; a smaller batch size gives finer-grained progress.
    df_with_sentiment = analyzer.analyze_dataframe(df, batch_size=50)

    # Persist the scored frame so the next run can skip the analysis.
    df_with_sentiment.to_csv(sentiment_path, index=False)
    print(f"Sentiment analysis completed and saved to {sentiment_path}")
else:
    print("Loading existing sentiment analysis results...")
    df_with_sentiment = pd.read_csv(sentiment_path)
    print(f"Loaded {len(df_with_sentiment)} songs with sentiment scores")

# Report the frame's shape and every column whose name mentions a method.
method_tags = ('vader', 'textblob', 'transformer', 'composite')
sentiment_columns = [
    name for name in df_with_sentiment.columns
    if any(tag in name for tag in method_tags)
]
print(f"\nDataset with sentiment: {df_with_sentiment.shape}")
print(f"Sentiment columns added: {sentiment_columns}")


## 3. Sentiment Analysis Results Overview


In [None]:
# Per-method summary statistics as returned by the analyzer.
sentiment_summary = analyzer.get_sentiment_summary(df_with_sentiment)

print("SENTIMENT ANALYSIS SUMMARY")
print("=" * 60)

# (printed prefix, key in the per-method stats) in display order; the prefixes
# carry their own padding so the values line up as before.
stat_rows = (
    ("Mean: ", 'mean'),
    ("Std:  ", 'std'),
    ("Min:  ", 'min'),
    ("Max:  ", 'max'),
    ("Median: ", 'median'),
)
for method, stats in sentiment_summary.items():
    print(f"\n{method.upper()}:")
    for prefix, key in stat_rows:
        print(f"  {prefix}{stats[key]:.4f}")

# Split songs into positive / neutral / negative using +/-0.1 cut-offs on the
# composite score; each share is taken over all rows of the column.
print("\nCOMPOSITE SENTIMENT DISTRIBUTION:")
composite_sentiment = df_with_sentiment['composite_sentiment']
total_songs = len(composite_sentiment)
n_positive = int((composite_sentiment > 0.1).sum())
n_neutral = int(composite_sentiment.between(-0.1, 0.1).sum())
n_negative = int((composite_sentiment < -0.1).sum())
print(f"Positive songs (>0.1): {n_positive:,} ({n_positive/total_songs*100:.1f}%)")
print(f"Neutral songs (-0.1 to 0.1): {n_neutral:,} ({n_neutral/total_songs*100:.1f}%)")
print(f"Negative songs (<-0.1): {n_negative:,} ({n_negative/total_songs*100:.1f}%)")


## 4. Sentiment Method Comparison


In [None]:
# Compare different sentiment methods
def plot_score_distribution(ax, scores, title, xlabel):
    """Draw a histogram of ``scores`` on ``ax`` and mark the mean.

    Args:
        ax: Matplotlib axes to draw on.
        scores: pandas Series of numeric scores for one method.
        title: Title shown above the panel.
        xlabel: Label for the x axis (the y axis is always 'Frequency').

    Returns:
        None. The axes are modified in place.
    """
    mean_score = scores.mean()
    ax.hist(scores, bins=50, alpha=0.7, edgecolor='black')
    ax.set_title(title)
    ax.set_xlabel(xlabel)
    ax.set_ylabel('Frequency')
    # Dashed red line at the mean, with the value shown in the legend.
    ax.axvline(mean_score, color='red', linestyle='--', label=f'Mean: {mean_score:.3f}')
    ax.legend()


fig, axes = plt.subplots(2, 3, figsize=(18, 12))

# One histogram panel per score column: (axes, column, title, x label).
# Panel [1, 1] is left out here because it holds the correlation heatmap.
histogram_panels = [
    (axes[0, 0], 'vader_compound', 'VADER Compound Score Distribution', 'VADER Compound Score'),
    (axes[0, 1], 'textblob_polarity', 'TextBlob Polarity Distribution', 'TextBlob Polarity'),
    (axes[0, 2], 'transformer_positive', 'Transformer Positive Score Distribution', 'Transformer Positive Score'),
    (axes[1, 0], 'composite_sentiment', 'Composite Sentiment Distribution', 'Composite Sentiment Score'),
    (axes[1, 2], 'composite_confidence', 'Composite Confidence Distribution', 'Composite Confidence Score'),
]
for ax, column, title, xlabel in histogram_panels:
    plot_score_distribution(ax, df_with_sentiment[column], title, xlabel)

# Correlation between methods
correlation_data = df_with_sentiment[['vader_compound', 'textblob_polarity', 'transformer_positive', 'composite_sentiment']].corr()
sns.heatmap(correlation_data, annot=True, cmap='coolwarm', center=0, square=True, fmt='.3f', ax=axes[1, 1])
axes[1, 1].set_title('Sentiment Method Correlations')

plt.tight_layout()
plt.show()

# Print correlation matrix
print("CORRELATION BETWEEN SENTIMENT METHODS:")
print("=" * 50)
print(correlation_data.round(3))


## 5. Sample Songs with Sentiment Analysis


In [None]:
# Banner for the sample-song listing.
print("SAMPLE SONGS WITH SENTIMENT ANALYSIS")
print("=" * 80)

# Take the first three songs from each band of the composite score:
# above 0.3, below -0.3, and within [-0.1, 0.1] inclusive.
composite = df_with_sentiment['composite_sentiment']
positive_songs = df_with_sentiment[composite > 0.3].head(3)
negative_songs = df_with_sentiment[composite < -0.3].head(3)
neutral_songs = df_with_sentiment[composite.between(-0.1, 0.1)].head(3)

def display_song_sentiment(songs, category):
    """Print a short sentiment report for every song in ``songs``.

    Args:
        songs: DataFrame of songs to show. Each row must provide the columns
            artist_name, track_name, year, genre, lyrics, vader_compound,
            textblob_polarity, transformer_positive, composite_sentiment and
            composite_confidence.
        category: Label for the group; printed upper-cased as the heading.

    Returns:
        None. The report is written to stdout.
    """
    print(f"\n{category.upper()} SONGS:")
    print("-" * 40)

    if songs.empty:
        # A threshold filter can legitimately match nothing; say so instead
        # of leaving a bare heading.
        print("(no songs in this category)")
        return

    for _, song in songs.iterrows():
        print(f"\n{song['artist_name']}: '{song['track_name']}' ({song['year']})")
        print(f"Genre: {song['genre']}")
        # Empty lyrics come back from a CSV round-trip as NaN (a float), and
        # slicing a float raises TypeError, so only slice real strings.
        lyrics = song['lyrics']
        if isinstance(lyrics, str):
            print(f"Lyrics preview: {lyrics[:150]}...")
        else:
            print("Lyrics preview: (no lyrics available)")
        print("Sentiment scores:")
        print(f"  VADER Compound: {song['vader_compound']:.3f}")
        print(f"  TextBlob Polarity: {song['textblob_polarity']:.3f}")
        print(f"  Transformer Positive: {song['transformer_positive']:.3f}")
        print(f"  Composite Sentiment: {song['composite_sentiment']:.3f}")
        print(f"  Confidence: {song['composite_confidence']:.3f}")

# Print the three sample groups in a fixed order: positive, negative, neutral.
for group_label, group_songs in (
    ("Positive", positive_songs),
    ("Negative", negative_songs),
    ("Neutral", neutral_songs),
):
    display_song_sentiment(group_songs, group_label)


## 6. Sentiment Analysis Complete

The sentiment analysis has been completed successfully! The dataset now includes:

- **VADER sentiment scores**: Compound, positive, neutral, negative
- **TextBlob scores**: Polarity and subjectivity
- **Transformer scores**: Positive, neutral, negative probabilities
- **Composite sentiment**: Weighted average of all methods
- **Composite confidence**: Agreement between methods

The results have been saved to `../data/processed/music_with_sentiment.csv` and are ready for temporal trend analysis in the next notebook.
