# Spotify Songs Project - Insights calculations
The aim of this notebook is to quantify the impact of the insights we are building for this project.
### Link to full project:
- https://lemagaston.wordpress.com/2024/11/11/01-first-portfolio-project-spotify-songs-insights/
### Previous steps:
- Exploratory data analysis: https://www.kaggle.com/code/gastnezequiellema/spotify-songs-data-exploratory/notebook
- Data visualization: https://public.tableau.com/app/profile/gast.n.lema/viz/2020_30k_spotify_songs/Topartists?publish=yes

In [1]:
# Import dataset as data frame
import pandas as pd

# Read the cleaned songs CSV into a data frame (point the path at another file
# to analyse a different dataset) and show which columns are available.
dataset_path = '/kaggle/input/spotify-songs-data-exploratory/spotify_songs_clean.csv'
df = pd.read_csv(dataset_path)
print(list(df.columns))



['track_id', 'track_name', 'track_artist', 'track_popularity', 'track_album_id', 'track_album_name', 'track_album_release_date', 'danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo', 'duration_ms']


In [2]:
import numpy as np
from scipy import stats

def _range_popularity_stats(top_tracks, column, bounds, baseline_mean):
    """Summarize the popularity of top tracks whose `column` lies within `bounds` (inclusive)."""
    low, high = bounds
    in_range = top_tracks[top_tracks[column].between(low, high)]
    mean_popularity = in_range['track_popularity'].mean()

    # A zero baseline makes a relative difference meaningless; report NaN
    # instead of dividing by zero.
    if baseline_mean == 0:
        percentage_difference = float('nan')
    else:
        percentage_difference = ((mean_popularity - baseline_mean) / baseline_mean) * 100

    top_count = len(top_tracks)
    return {
        'mean_popularity': mean_popularity,
        'percentage_difference': percentage_difference,
        'count_in_range': len(in_range),
        # Guard against an empty selection (e.g. an empty input frame).
        'percentage_in_range': (len(in_range) / top_count) * 100 if top_count else 0.0
    }


def _popularity_correlation(top_tracks, feature):
    """Pearson correlation between `feature` and popularity within the top tracks."""
    # Correlate only rows where both values are present.
    pairs = top_tracks[[feature, 'track_popularity']].dropna()

    # Pearson's r needs at least two points and variation in both variables;
    # otherwise it is undefined, so report NaN rather than raising.
    computable = (
        len(pairs) >= 2
        and pairs[feature].nunique() > 1
        and pairs['track_popularity'].nunique() > 1
    )
    if computable:
        corr = stats.pearsonr(pairs[feature], pairs['track_popularity'])
        coefficient, p_value = corr[0], corr[1]
        magnitude = abs(coefficient)
        strength = 'Strong' if magnitude > 0.5 else 'Moderate' if magnitude > 0.3 else 'Weak'
    else:
        coefficient = p_value = float('nan')
        strength = 'Undefined'

    return {
        'correlation_coefficient': coefficient,
        'p_value': p_value,
        'correlation_strength': strength,
        f'mean_{feature}': top_tracks[feature].mean(),
        f'median_{feature}': top_tracks[feature].median()
    }


def analyze_top_tracks_factors(df, top_n=1000, tempo_range=(91, 127), duration_range_ms=(159000, 246000)):
    """
    Analyze success factors focusing on top n most popular tracks

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'track_popularity', 'tempo', 'duration_ms',
        'danceability' and 'energy' columns. It is not modified.
    top_n : int, default 1000
        Number of most popular tracks to analyze (must be >= 1).
    tempo_range : tuple, default (91, 127)
        Inclusive (low, high) tempo bounds in BPM.
    duration_range_ms : tuple, default (159000, 246000)
        Inclusive (low, high) duration bounds in milliseconds
        (159000-246000 ms is 2.65-4.1 minutes).

    Returns
    -------
    dict
        Keys 'reference', 'tempo_analysis', 'duration_analysis',
        'danceability_correlation' and 'energy_correlation'.

    Raises
    ------
    ValueError
        If `top_n` is smaller than 1.

    Notes
    -----
    'percentage_difference' compares the mean popularity of the *top* tracks
    that fall inside a range against the mean popularity of the *whole*
    dataset, so it also reflects the fact that these tracks were selected
    for being popular.
    """
    if top_n < 1:
        raise ValueError(f"top_n must be at least 1, got {top_n}")

    # Sort by popularity and get top n tracks
    top_tracks = df.nlargest(top_n, 'track_popularity')

    # Baseline used by every percentage difference below
    total_mean_popularity = df['track_popularity'].mean()

    results = {
        # Reference metrics
        'reference': {
            'total_tracks': len(df),
            'total_mean_popularity': total_mean_popularity,
            'top_mean_popularity': top_tracks['track_popularity'].mean(),
            'top_tracks_count': len(top_tracks)
        },
        # 1. Tempo analysis (default 91-127 bpm)
        'tempo_analysis': _range_popularity_stats(
            top_tracks, 'tempo', tempo_range, total_mean_popularity
        ),
        # 2. Duration analysis (default 159000-246000 ms)
        'duration_analysis': _range_popularity_stats(
            top_tracks, 'duration_ms', duration_range_ms, total_mean_popularity
        ),
        # 3. Danceability correlation in top tracks
        'danceability_correlation': _popularity_correlation(top_tracks, 'danceability'),
        # 4. Energy correlation in top tracks
        'energy_correlation': _popularity_correlation(top_tracks, 'energy')
    }

    return results

def format_top_tracks_insights(results, top_n=1000):
    """
    Format the analysis results into presentation-ready insights

    Parameters
    ----------
    results : dict
        Must provide 'tempo_analysis', 'duration_analysis',
        'danceability_correlation' and 'energy_correlation' entries with the
        fields read below.
    top_n : int, default 1000
        Size of the top-track selection the results were computed on; it is
        only used in the wording, so pass the same value given to the analysis.

    Returns
    -------
    dict
        Maps 'tempo_insight', 'duration_insight', 'danceability_insight' and
        'energy_insight' to one sentence-style string each.
    """
    def performance_phrase(percentage_difference):
        # A negative difference must read as "worse", not as "-x% better".
        direction = 'worse' if percentage_difference < 0 else 'better'
        return f"{abs(percentage_difference):.1f}% {direction} than the overall average."

    tempo = results['tempo_analysis']
    duration = results['duration_analysis']
    dance = results['danceability_correlation']
    energy = results['energy_correlation']

    insights = {
        'tempo_insight': (
            f"Among the top {top_n} tracks, {tempo['percentage_in_range']:.1f}% "
            f"have a tempo between 91-127 BPM. These tracks perform "
            f"{performance_phrase(tempo['percentage_difference'])}"
        ),
        'duration_insight': (
            # 159000 ms is 2.65 minutes, so the lower bound is stated as 2.65.
            f"Among the top {top_n} tracks, {duration['percentage_in_range']:.1f}% "
            f"have a duration between 2.65-4.1 minutes. These tracks perform "
            f"{performance_phrase(duration['percentage_difference'])}"
        ),
        'danceability_insight': (
            f"In top {top_n} tracks, danceability shows a {dance['correlation_strength'].lower()} "
            f"correlation of {dance['correlation_coefficient']:.3f} with popularity. "
            f"The average danceability is {dance['mean_danceability']:.3f}."
        ),
        'energy_insight': (
            f"In top {top_n} tracks, energy shows a {energy['correlation_strength'].lower()} "
            f"correlation of {energy['correlation_coefficient']:.3f} with popularity. "
            f"The average energy is {energy['mean_energy']:.3f}."
        )
    }
    return insights

# Run the top-track analysis on the loaded data and print each insight
# under a title-cased heading derived from its dictionary key.
results = analyze_top_tracks_factors(df)
insights = format_top_tracks_insights(results)
for key in insights:
    insight = insights[key]
    heading = key.replace('_', ' ').title()
    print(f"\n{heading}:")
    print(insight)


Tempo Insight:
Among the top 1000 tracks, 50.5% have a tempo between 91-127 BPM. These tracks perform 106.5% better than the overall average.

Duration Insight:
Among the top 1000 tracks, 72.3% have a duration between 2.5-4.1 minutes. These tracks perform 106.3% better than the overall average.

Danceability Insight:
In top 1000 tracks, danceability shows a weak correlation of 0.101 with popularity. The average danceability is 0.682.

Energy Insight:
In top 1000 tracks, energy shows a weak correlation of -0.142 with popularity. The average energy is 0.653.


In [3]:
import pandas as pd
import numpy as np
from scipy import stats

def analyze_danceability_relationship(df):
    """
    Deep analysis of danceability's relationship with track popularity using multiple approaches

    The input frame is only read: the quartile and tier labels are kept in
    local variables instead of being written back as new columns, so calling
    this function does not change `df` for later cells.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'danceability' and 'track_popularity' columns.

    Returns
    -------
    dict
        'top_100_analysis'  - danceability mean/median and Pearson correlation
                              with popularity among the 100 most popular tracks.
        'quartile_analysis' - popularity mean and count per danceability
                              quartile ('Q1'..'Q4').
        'success_rates'     - share (in %) of tracks reaching the top 10% of
                              popularity for high (top 25%) vs. lower
                              danceability, plus the relative difference.
        'popularity_tiers'  - mean danceability per popularity quintile.
    """
    danceability = df['danceability']
    popularity = df['track_popularity']

    # 1. Top 100 Analysis
    top_100 = df.nlargest(100, 'track_popularity')
    dance_corr_100 = stats.pearsonr(top_100['danceability'], top_100['track_popularity'])

    # 2. Quartile Analysis (labels stay local; df is not modified)
    danceability_quartile = pd.qcut(danceability, q=4, labels=['Q1', 'Q2', 'Q3', 'Q4'])
    quartile_stats = popularity.groupby(danceability_quartile, observed=True).agg(['mean', 'count'])

    # 3. Success Rate Analysis
    # Define 'high danceability' as top 25% of danceability
    high_dance_threshold = danceability.quantile(0.75)
    # Define 'successful' as top 10% of popularity
    success_threshold = popularity.quantile(0.9)

    high_dance_tracks = df[danceability >= high_dance_threshold]
    success_rate_high_dance = (high_dance_tracks['track_popularity'] >= success_threshold).mean() * 100

    low_dance_tracks = df[danceability < high_dance_threshold]
    success_rate_low_dance = (low_dance_tracks['track_popularity'] >= success_threshold).mean() * 100

    # 4. Average Danceability by Popularity Tiers (labels stay local as well)
    popularity_tier = pd.qcut(
        popularity, q=5,
        labels=['Bottom 20%', '20-40%', '40-60%', '60-80%', 'Top 20%']
    )
    tier_stats = danceability.groupby(popularity_tier, observed=True).mean()

    results = {
        'top_100_analysis': {
            'mean_danceability': top_100['danceability'].mean(),
            'median_danceability': top_100['danceability'].median(),
            'correlation': dance_corr_100[0],
            'p_value': dance_corr_100[1]
        },
        'quartile_analysis': quartile_stats.to_dict(),
        'success_rates': {
            'high_danceability_success_rate': success_rate_high_dance,
            'low_danceability_success_rate': success_rate_low_dance,
            'relative_improvement': ((success_rate_high_dance - success_rate_low_dance) / success_rate_low_dance) * 100
        },
        'popularity_tiers': tier_stats.to_dict()
    }

    return results

def format_danceability_insights(results):
    """
    Format the danceability analysis results into presentation-ready insights

    The wording follows the numbers: a negative relative improvement reads as
    "less likely", a negative tier difference reads as "lower", and the
    "progressively increases" statement is only made when the tier means that
    are present actually rise from the bottom tier to the top tier.

    Parameters
    ----------
    results : dict
        Must provide 'top_100_analysis' (mean_danceability, correlation),
        'success_rates' (relative_improvement, high_danceability_success_rate)
        and 'popularity_tiers' (at least 'Bottom 20%' and 'Top 20%').

    Returns
    -------
    dict
        Maps 'top_100_insight', 'success_rate_insight' and
        'popularity_tier_insight' to one string each.
    """
    top_100 = results['top_100_analysis']
    success = results['success_rates']
    tiers = results['popularity_tiers']

    relative_improvement = success['relative_improvement']
    likelihood_word = 'less' if relative_improvement < 0 else 'more'

    bottom_mean = tiers['Bottom 20%']
    top_mean = tiers['Top 20%']
    # A zero bottom-tier mean has no meaningful ratio; report NaN instead of raising.
    if bottom_mean != 0:
        tier_change = ((top_mean / bottom_mean) - 1) * 100
    else:
        tier_change = float('nan')
    level_word = 'lower' if tier_change < 0 else 'higher'

    # Only claim a progressive increase when every available tier mean is
    # higher than the one below it.
    tier_order = ['Bottom 20%', '20-40%', '40-60%', '60-80%', 'Top 20%']
    tier_means = [tiers[name] for name in tier_order if name in tiers]
    is_progressive = all(lower < upper for lower, upper in zip(tier_means, tier_means[1:]))
    if is_progressive:
        trend_sentence = "Danceability progressively increases with popularity tiers. "
    else:
        trend_sentence = "Danceability does not increase progressively across popularity tiers. "

    insights = {
        'top_100_insight': (
            f"Top 100 tracks have an average danceability of {top_100['mean_danceability']:.3f}, "
            f"with a correlation of {top_100['correlation']:.3f} between danceability and popularity."
        ),
        'success_rate_insight': (
            f"Highly danceable tracks are {abs(relative_improvement):.1f}% {likelihood_word} likely "
            f"to be successful compared to less danceable tracks. "
            f"{success['high_danceability_success_rate']:.1f}% of highly danceable tracks "
            f"reach the top 10% in popularity."
        ),
        'popularity_tier_insight': (
            f"{trend_sentence}"
            f"Top 20% tracks have {abs(tier_change):.1f}% "
            f"{level_word} danceability than bottom 20% tracks."
        )
    }
    return insights

# Run the danceability analysis on the loaded data and print each insight
# under a title-cased heading derived from its dictionary key.
results = analyze_danceability_relationship(df)
insights = format_danceability_insights(results)
for key in insights:
    insight = insights[key]
    heading = key.replace('_', ' ').title()
    print(f"\n{heading}:")
    print(insight)


Top 100 Insight:
Top 100 tracks have an average danceability of 0.703, with a correlation of 0.052 between danceability and popularity.

Success Rate Insight:
Highly danceable tracks are 23.3% more likely to be successful compared to less danceable tracks. 12.2% of highly danceable tracks reach the top 10% in popularity.

Popularity Tier Insight:
Danceability progressively increases with popularity tiers. Top 20% tracks have 3.4% higher danceability than bottom 20% tracks.
