# Music Sentiment Analysis: Temporal Trends (1950-2019)

This notebook analyzes how music sentiment has evolved over the 70-year period from 1950 to 2019. We'll examine:

1. **Overall sentiment trends** over time
2. **Sentiment by decade** and year
3. **Genre-specific sentiment trends**
4. **Statistical analysis** of sentiment changes
5. **Cultural and historical context** for sentiment shifts

## Key Questions
- Has music become more positive or negative over time?
- Which decades showed the most significant sentiment changes?
- How do different genres compare in their sentiment evolution?
- Are there correlations between historical events and music sentiment?


In [None]:
# Import necessary libraries
import sys
import warnings
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import stats

warnings.filterwarnings('ignore')

# Set up plotting style
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

# Import our analysis utilities
# The project source directory must be on sys.path before the local import below.
sys.path.append('../src')
from analysis.sentiment import MusicSentimentAnalyzer

# Load the dataset with sentiment analysis
sentiment_path = Path('../data/processed/music_with_sentiment.csv')
if not sentiment_path.exists():
    print("Error: Sentiment analysis not found. Please run the sentiment analysis notebook first.")
    raise FileNotFoundError("Sentiment analysis file not found")

print("Loading dataset with sentiment analysis...")
df = pd.read_csv(sentiment_path)
print(f"Loaded {len(df)} songs with sentiment scores")

# Later cells group by 'decade' and compare it against values such as 1950 and
# 2000, so when the column is absent it is derived from 'year' in that form.
# NOTE(review): assumes 'year' is numeric -- confirm against the CSV schema.
if 'decade' not in df.columns and 'year' in df.columns:
    df['decade'] = (df['year'] // 10) * 10

# Fail fast with one clear message instead of a KeyError deep inside a later
# groupby/plot cell. Every column listed here is read further down the notebook.
required_columns = [
    'year', 'decade', 'genre',
    'composite_sentiment', 'composite_confidence',
    'vader_compound', 'textblob_polarity', 'transformer_positive',
]
missing_columns = [col for col in required_columns if col not in df.columns]
if missing_columns:
    raise KeyError(f"Dataset is missing required columns: {missing_columns}")

# Basic dataset info
sentiment_methods = ['vader', 'textblob', 'transformer', 'composite']
sentiment_columns = [col for col in df.columns
                     if any(method in col for method in sentiment_methods)]
print(f"Year range: {df['year'].min()}-{df['year'].max()}")
print(f"Unique genres: {df['genre'].nunique()}")
print(f"Sentiment columns: {sentiment_columns}")


## 1. Overall Sentiment Trends Over Time


In [None]:
# Calculate yearly and decade sentiment averages.
# Maps each source column to its aggregation(s); a list-valued entry produces
# one output column per statistic.
sentiment_aggregations = {
    'composite_sentiment': ['mean', 'std', 'count'],
    'vader_compound': 'mean',
    'textblob_polarity': 'mean',
    'transformer_positive': 'mean',
    'composite_confidence': 'mean'
}


def _summarize_sentiment(group_key):
    """Aggregate the sentiment columns of ``df`` for each ``group_key`` value.

    Returns a frame with ``group_key`` as an ordinary column and flattened
    ``<column>_<statistic>`` column names; values are rounded to 4 decimals.
    """
    summary = df.groupby(group_key).agg(sentiment_aggregations).round(4)
    # The aggregation yields two-level column labels; join them into flat names.
    summary.columns = ['_'.join(parts).strip() for parts in summary.columns]
    return summary.reset_index()


yearly_sentiment = _summarize_sentiment('year')
decade_sentiment = _summarize_sentiment('decade')

print("YEARLY SENTIMENT TRENDS (1950-2019)")
print("=" * 50)

composite_scores = df['composite_sentiment']
print(f"Overall average sentiment: {composite_scores.mean():.4f}")
print(f"Sentiment range: {composite_scores.min():.4f} to {composite_scores.max():.4f}")

yearly_means = yearly_sentiment['composite_sentiment_mean']
most_positive_year = yearly_sentiment.loc[yearly_means.idxmax(), 'year']
print(f"Most positive year: {most_positive_year} ({yearly_means.max():.4f})")
most_negative_year = yearly_sentiment.loc[yearly_means.idxmin(), 'year']
print(f"Most negative year: {most_negative_year} ({yearly_means.min():.4f})")

print("\nDECADE AVERAGES:")
print("-" * 30)
for entry in decade_sentiment.itertuples(index=False):
    print(f"{int(entry.decade)}s: {entry.composite_sentiment_mean:.4f} (±{entry.composite_sentiment_std:.4f}) [{int(entry.composite_sentiment_count)} songs]")


In [None]:
# Create comprehensive temporal trend visualizations (3x2 panel figure)
def _style_axis(ax, title, xlabel, ylabel, grid_axis='both'):
    """Apply the title, axis labels and light grid shared by every panel."""
    ax.set_title(title, fontsize=14, fontweight='bold')
    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)
    ax.grid(True, alpha=0.3, axis=grid_axis)


fig, axes = plt.subplots(3, 2, figsize=(20, 18))
plot_years = yearly_sentiment['year']
yearly_mean = yearly_sentiment['composite_sentiment_mean']
yearly_std = yearly_sentiment['composite_sentiment_std']

# 1. Yearly sentiment trend with a +/-1 standard deviation band.
#    The band is the spread of song scores within each year, not a
#    confidence interval for the yearly mean.
ax = axes[0, 0]
ax.plot(plot_years, yearly_mean, linewidth=2, label='Composite Sentiment', color='blue')
ax.fill_between(plot_years, yearly_mean - yearly_std, yearly_mean + yearly_std,
                alpha=0.3, color='blue', label='±1 std dev')
_style_axis(ax, 'Music Sentiment Trends Over Time (1950-2019)', 'Year', 'Average Sentiment Score')
ax.axhline(y=0, color='red', linestyle='--', alpha=0.5, label='Neutral')
ax.legend()

# 2. Comparison of different sentiment methods over time
# NOTE(review): 'transformer_positive' is presumably a probability in [0, 1]
# while the other scores look like polarities in [-1, 1] -- confirm before
# comparing absolute levels between the lines.
ax = axes[0, 1]
method_columns = [
    ('vader_compound_mean', 'VADER'),
    ('textblob_polarity_mean', 'TextBlob'),
    ('transformer_positive_mean', 'Transformer'),
]
for column, label in method_columns:
    ax.plot(plot_years, yearly_sentiment[column], label=label, linewidth=2, alpha=0.8)
ax.plot(plot_years, yearly_mean, label='Composite', linewidth=3, color='black')
_style_axis(ax, 'Sentiment Method Comparison Over Time', 'Year', 'Average Sentiment Score')
ax.legend()

# 3. Decade sentiment comparison
# Bars are drawn on categorical positions ("1950s", "1960s", ...). Plotting
# them at the numeric decade values with the default bar width of 0.8 would
# give hair-thin bars, because neighbouring decades are 10 units apart.
ax = axes[1, 0]
decade_sorted = decade_sentiment.sort_values('decade')
decade_order = decade_sorted['decade'].tolist()
decade_tick_labels = [f"{int(decade)}s" for decade in decade_order]
decade_means = decade_sorted['composite_sentiment_mean'].tolist()
bars = ax.bar(decade_tick_labels, decade_means, alpha=0.7, edgecolor='black')
_style_axis(ax, 'Average Sentiment by Decade', 'Decade', 'Average Sentiment Score', grid_axis='y')
ax.tick_params(axis='x', rotation=45)

# Add value labels just beyond the end of each bar (below it for negative bars)
for bar, value in zip(bars, decade_means):
    is_positive = value >= 0
    ax.text(bar.get_x() + bar.get_width() / 2.,
            value + (0.001 if is_positive else -0.001),
            f'{value:.3f}', ha='center',
            va='bottom' if is_positive else 'top', fontsize=10)

# 4. Number of songs per year
ax = axes[1, 1]
ax.plot(plot_years, yearly_sentiment['composite_sentiment_count'],
        linewidth=2, color='green', marker='o', markersize=4)
_style_axis(ax, 'Number of Songs per Year', 'Year', 'Number of Songs')

# 5. Sentiment distribution by decade (box plot)
decade_data = []
decade_labels = []
for decade, decade_songs in df.groupby('decade')['composite_sentiment']:
    # A NaN score would blank out the whole box, so drop missing values first.
    decade_scores = decade_songs.dropna()
    if len(decade_scores) > 0:
        decade_data.append(decade_scores)
        decade_labels.append(f"{int(decade)}s")

ax = axes[2, 0]
ax.boxplot(decade_data)
# Tick labels are set explicitly because boxplot's `labels=` keyword is
# deprecated in recent matplotlib releases.
ax.set_xticklabels(decade_labels)
_style_axis(ax, 'Sentiment Distribution by Decade', 'Decade', 'Sentiment Score', grid_axis='y')
ax.tick_params(axis='x', rotation=45)

# 6. Confidence over time
ax = axes[2, 1]
ax.plot(plot_years, yearly_sentiment['composite_confidence_mean'],
        linewidth=2, color='purple', marker='s', markersize=4)
_style_axis(ax, 'Sentiment Analysis Confidence Over Time', 'Year', 'Average Confidence Score')

plt.tight_layout()
plt.show()


## 2. Genre-Specific Sentiment Trends


In [None]:
# Analyze sentiment trends by genre
top_genres = df['genre'].value_counts().head(8).index
print(f"Analyzing sentiment trends for top {len(top_genres)} genres:")
print(top_genres.tolist())

# Calculate genre-specific sentiment over time
genre_sentiment_trends = {}

for genre in top_genres:
    genre_df = df[df['genre'] == genre]
    if len(genre_df) > 50:  # Only analyze genres with sufficient data
        yearly_genre = genre_df.groupby('year')['composite_sentiment'].agg(['mean', 'count']).reset_index()
        yearly_genre = yearly_genre[yearly_genre['count'] >= 5]  # At least 5 songs per year
        if len(yearly_genre) >= 5:  # At least 5 years of data
            genre_sentiment_trends[genre] = yearly_genre

print(f"\nGenres with sufficient data for trend analysis: {len(genre_sentiment_trends)}")

# Create genre sentiment trend visualization (2x2 panel figure)
fig, axes = plt.subplots(2, 2, figsize=(20, 12))

# 1. Genre sentiment trends over time (one line per qualifying genre)
ax = axes[0, 0]
colors = plt.cm.Set3(np.linspace(0, 1, len(genre_sentiment_trends)))
for color, (genre, data) in zip(colors, genre_sentiment_trends.items()):
    ax.plot(data['year'], data['mean'], label=genre, linewidth=2, color=color, alpha=0.8)

ax.set_title('Sentiment Trends by Genre Over Time', fontsize=14, fontweight='bold')
ax.set_xlabel('Year')
ax.set_ylabel('Average Sentiment Score')
ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
ax.grid(True, alpha=0.3)
ax.axhline(y=0, color='red', linestyle='--', alpha=0.5)

# 2. Average sentiment by genre (overall), sorted from most to least positive
genre_avg_sentiment = df.groupby('genre')['composite_sentiment'].agg(['mean', 'count']).reset_index()
genre_avg_sentiment = genre_avg_sentiment[genre_avg_sentiment['count'] >= 20]  # At least 20 songs
genre_avg_sentiment = genre_avg_sentiment.sort_values('mean', ascending=False)

top_10_genres = genre_avg_sentiment.head(10)
ax = axes[0, 1]
bar_positions = range(len(top_10_genres))
bars = ax.barh(bar_positions, top_10_genres['mean'], alpha=0.7)
ax.set_yticks(bar_positions)
ax.set_yticklabels(top_10_genres['genre'])
# barh draws position 0 at the bottom; flip the axis so the most positive
# genre is the first row of the ranking.
ax.invert_yaxis()
ax.set_title('Average Sentiment by Genre (Top 10)', fontsize=14, fontweight='bold')
ax.set_xlabel('Average Sentiment Score')
ax.grid(True, alpha=0.3, axis='x')

# Add value labels just beyond the end of each bar (left of it for negative bars)
for bar, value in zip(bars, top_10_genres['mean']):
    is_positive = value >= 0
    ax.text(value + (0.001 if is_positive else -0.001),
            bar.get_y() + bar.get_height() / 2,
            f'{value:.3f}', va='center',
            ha='left' if is_positive else 'right', fontsize=10)

# 3. Genre sentiment distribution (box plot)
genre_data = []
genre_labels = []
for genre in top_10_genres['genre']:
    # A NaN score would blank out the whole box, so drop missing values first.
    genre_songs = df[df['genre'] == genre]['composite_sentiment'].dropna()
    genre_data.append(genre_songs)
    genre_labels.append(genre)

ax = axes[1, 0]
ax.boxplot(genre_data)
# Tick labels are set explicitly because boxplot's `labels=` keyword is
# deprecated in recent matplotlib releases.
ax.set_xticklabels(genre_labels)
ax.set_title('Sentiment Distribution by Genre', fontsize=14, fontweight='bold')
ax.set_xlabel('Genre')
ax.set_ylabel('Sentiment Score')
ax.tick_params(axis='x', rotation=45)
ax.grid(True, alpha=0.3, axis='y')

# 4. Genre sentiment change over decades
decade_genre_sentiment = df.groupby(['decade', 'genre'])['composite_sentiment'].mean().reset_index()
decade_genre_sentiment = decade_genre_sentiment[decade_genre_sentiment['genre'].isin(top_genres)]

# Pivot for heatmap (rows: genre, columns: decade)
pivot_data = decade_genre_sentiment.pivot(index='genre', columns='decade', values='composite_sentiment')
sns.heatmap(pivot_data, annot=True, cmap='RdBu_r', center=0, fmt='.3f', ax=axes[1, 1])
axes[1, 1].set_title('Genre Sentiment Heatmap by Decade', fontsize=14, fontweight='bold')
axes[1, 1].set_xlabel('Decade')
axes[1, 1].set_ylabel('Genre')

plt.tight_layout()
plt.show()

# Print genre sentiment statistics
print("\nGENRE SENTIMENT STATISTICS:")
print("=" * 50)
for _, row in top_10_genres.iterrows():
    print(f"{row['genre']:15s}: {row['mean']:.4f} ({int(row['count'])} songs)")


## 3. Statistical Analysis of Sentiment Changes


In [None]:
# Statistical analysis of sentiment trends
print("STATISTICAL ANALYSIS OF SENTIMENT TRENDS")
print("=" * 60)

# 1. Overall trend analysis
years = yearly_sentiment['year'].values
sentiment = yearly_sentiment['composite_sentiment_mean'].values

# Linear regression of the yearly mean sentiment on the year.
# The regression p-value gets its own name so the t-test below cannot
# silently overwrite it.
slope, intercept, r_value, trend_p_value, std_err = stats.linregress(years, sentiment)

print("1. OVERALL SENTIMENT TREND (1950-2019):")
print(f"   Slope: {slope:.6f} (sentiment change per year)")
print(f"   R-squared: {r_value**2:.4f}")
print(f"   P-value: {trend_p_value:.6f}")
print(f"   Trend: {'Increasing' if slope > 0 else 'Decreasing'} sentiment over time")

# 2. Decade comparison
print("\n2. DECADE COMPARISON:")
early_decades = [1950, 1960, 1970]  # 1950s, 1960s, 1970s
late_decades = [2000, 2010]  # 2000s, 2010s

early_sentiment = df[df['decade'].isin(early_decades)]['composite_sentiment']
late_sentiment = df[df['decade'].isin(late_decades)]['composite_sentiment']

t_stat, ttest_p_value = stats.ttest_ind(early_sentiment, late_sentiment)
# Legacy alias: `p_value` previously ended this cell holding the t-test
# result, so it keeps that value for any code that still reads it. New code
# should use `trend_p_value` or `ttest_p_value` explicitly.
p_value = ttest_p_value

print(f"   Early decades (1950s-1970s) mean: {early_sentiment.mean():.4f}")
print(f"   Late decades (2000s-2010s) mean: {late_sentiment.mean():.4f}")
print(f"   Difference: {late_sentiment.mean() - early_sentiment.mean():.4f}")
print(f"   T-test p-value: {ttest_p_value:.6f}")
print(f"   Significant difference: {'Yes' if ttest_p_value < 0.05 else 'No'}")

# 3. Genre sentiment stability
print("\n3. GENRE SENTIMENT STABILITY:")
genre_stability = {}
for genre in top_genres:
    genre_df = df[df['genre'] == genre]
    if len(genre_df) > 100:  # Sufficient data
        decade_means = genre_df.groupby('decade')['composite_sentiment'].mean()
        if len(decade_means) >= 3:  # At least 3 decades
            stability = 1 - decade_means.std()  # Higher = more stable
            genre_stability[genre] = stability

# Sort by stability, most stable first
sorted_stability = sorted(genre_stability.items(), key=lambda x: x[1], reverse=True)
print("   Most stable genres (least variation over time):")
for i, (genre, stability) in enumerate(sorted_stability[:5], 1):
    print(f"   {i}. {genre}: {stability:.4f}")

# Walk the ranking from the bottom so that rank 1 is the MOST variable genre.
print("   Most variable genres (most variation over time):")
for i, (genre, stability) in enumerate(sorted_stability[::-1][:5], 1):
    print(f"   {i}. {genre}: {stability:.4f}")

# 4. Correlation with time
print("\n4. CORRELATION WITH TIME:")
correlations = {}
for genre in top_genres:
    genre_df = df[df['genre'] == genre]
    if len(genre_df) > 100:
        corr, p_val = stats.pearsonr(genre_df['year'], genre_df['composite_sentiment'])
        correlations[genre] = (corr, p_val)

# Sort by correlation strength (absolute value, strongest first)
sorted_corr = sorted(correlations.items(), key=lambda x: abs(x[1][0]), reverse=True)
print("   Genres with strongest time correlation:")
for i, (genre, (corr, p_val)) in enumerate(sorted_corr[:5], 1):
    direction = "increasing" if corr > 0 else "decreasing"
    significance = "significant" if p_val < 0.05 else "not significant"
    print(f"   {i}. {genre}: {corr:.4f} ({direction}, {significance})")


## 4. Key Findings and Insights


In [None]:
# Generate key insights and findings
print("KEY FINDINGS FROM MUSIC SENTIMENT ANALYSIS (1950-2019)")
print("=" * 70)

# The shared `p_value` variable is reused by more than one statistical test in
# the analysis cell, so the p-value of each test is recomputed here explicitly
# instead of relying on whichever test happened to assign `p_value` last.
trend_p_value = stats.linregress(
    yearly_sentiment['year'].values,
    yearly_sentiment['composite_sentiment_mean'].values,
).pvalue
decade_p_value = stats.ttest_ind(early_sentiment, late_sentiment).pvalue

# 1. Overall sentiment trend
overall_trend = "increasing" if slope > 0 else "decreasing"
trend_strength = "strong" if abs(r_value) > 0.5 else "moderate" if abs(r_value) > 0.3 else "weak"
significance = "statistically significant" if trend_p_value < 0.05 else "not statistically significant"

print("1. OVERALL TREND:")
# 1950 through 2019 inclusive is 70 years.
print(f"   Music sentiment has been {overall_trend} over the 70-year period")
print(f"   This is a {trend_strength} trend ({r_value**2:.1%} of variance explained)")
print(f"   The trend is {significance} (p = {trend_p_value:.4f})")

# 2. Decade insights
early_mean = early_sentiment.mean()
late_mean = late_sentiment.mean()
change = late_mean - early_mean
change_pct = (change / abs(early_mean)) * 100 if early_mean != 0 else 0

print("\n2. DECADE COMPARISON:")
print(f"   Early decades (1950s-1970s): {early_mean:.4f} average sentiment")
print(f"   Recent decades (2000s-2010s): {late_mean:.4f} average sentiment")
print(f"   Change: {change:+.4f} ({change_pct:+.1f}%)")
print(f"   This change is {'significant' if decade_p_value < 0.05 else 'not significant'}")

# 3. Genre insights
# `genre_avg_sentiment` is sorted from most to least positive, so its first
# and last rows are the extremes. The last row of `top_10_genres` would only
# be the 10th most positive genre, not the most negative one.
most_positive_genre = genre_avg_sentiment.iloc[0]
most_negative_genre = genre_avg_sentiment.iloc[-1]
most_stable_genre = sorted_stability[0] if sorted_stability else ("N/A", 0)
most_variable_genre = sorted_stability[-1] if sorted_stability else ("N/A", 0)

print("\n3. GENRE INSIGHTS:")
print(f"   Most positive genre: {most_positive_genre['genre']} ({most_positive_genre['mean']:.4f})")
print(f"   Most negative genre: {most_negative_genre['genre']} ({most_negative_genre['mean']:.4f})")
print(f"   Most stable genre: {most_stable_genre[0]} (stability: {most_stable_genre[1]:.4f})")
print(f"   Most variable genre: {most_variable_genre[0]} (stability: {most_variable_genre[1]:.4f})")

# 4. Method agreement (pairwise correlation of the per-song scores)
vader_textblob_corr = df['vader_compound'].corr(df['textblob_polarity'])
vader_transformer_corr = df['vader_compound'].corr(df['transformer_positive'])
textblob_transformer_corr = df['textblob_polarity'].corr(df['transformer_positive'])

print("\n4. SENTIMENT METHOD AGREEMENT:")
print(f"   VADER vs TextBlob correlation: {vader_textblob_corr:.3f}")
print(f"   VADER vs Transformer correlation: {vader_transformer_corr:.3f}")
print(f"   TextBlob vs Transformer correlation: {textblob_transformer_corr:.3f}")

# 5. Data quality
avg_confidence = df['composite_confidence'].mean()
high_confidence_songs = len(df[df['composite_confidence'] > 0.7])
confidence_pct = (high_confidence_songs / len(df)) * 100

print("\n5. ANALYSIS QUALITY:")
print(f"   Average confidence score: {avg_confidence:.3f}")
print(f"   High confidence songs (>0.7): {high_confidence_songs:,} ({confidence_pct:.1f}%)")
print(f"   Total songs analyzed: {len(df):,}")

# 6. Historical context
print("\n6. HISTORICAL CONTEXT:")
print("   The analysis spans major historical periods:")
print("   - Post-WWII era (1950s-1960s)")
print("   - Social movements (1960s-1970s)")
print("   - Economic changes (1980s-1990s)")
print("   - Digital age (2000s-2010s)")
print("   These periods may have influenced music sentiment trends.")

print("\n" + "="*70)
print("ANALYSIS COMPLETE - Ready for visualization and reporting")
