# Data Description

#### Data Set:  
[Spotify Tracks](https://www.kaggle.com/datasets/gauthamvijayaraj/spotify-tracks-dataset-updated-every-week/data)

#### Column Description:

- track_id: Spotify ID for the track.

- track_name: Name of the track.

- artist_name: The names of artists who performed the track, separated by commas if there are multiple artists.

- year: The release year of the track.

- popularity: A value between 0 and 100 indicating how popular a track is based on plays and recency.

- artwork_url: URL of the album or track's artwork.

- album_name: The album in which the track appears.

- acousticness: A confidence measure from 0.0 to 1.0 of whether the track is acoustic.

- danceability: A measure of how suitable a track is for dancing (0.0 = least danceable, 1.0 = most danceable).

- duration_ms: Track length in milliseconds.

- energy: A perceptual measure from 0.0 to 1.0 of intensity and activity.

- key: The musical key of the track, using standard pitch class notation (e.g., 0 = C, 1 = C♯/D♭).

- liveness: A measure of the likelihood that the track was recorded live (higher values indicate live performances).

- loudness: The overall loudness of the track in decibels (dB).

- mode: Indicates the modality of the track (1 = major, 0 = minor).

- speechiness: Measures the presence of spoken words in a track (closer to 1.0 indicates more speech-like content).

- tempo: The estimated tempo of the track in beats per minute (BPM).

- time_signature: The number of beats per measure, ranging from 3 to 7.

- valence: A measure from 0.0 to 1.0 indicating the track's musical positiveness (higher values are happier).

- track_url: The Spotify URL for the track.

- language: The language of the track (English, Tamil, Hindi, Telugu, Malayalam, Korean).

# Load Data

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import plotly.express as px
print("Setup Complete")

Setup Complete


In [2]:
import plotly.io as pio

pio.templates["my_white"] = pio.templates["plotly_white"]

# 全域套用自定義白色樣式
pio.templates.default = "my_white"


In [3]:
filepath = "../input/spotify-tracks-dataset-updated-every-week/spotify_tracks.csv"
sp_raw_data = pd.read_csv(filepath)

In [4]:
sp_raw_data.shape

(62317, 22)

In [5]:
sp_raw_data.head()

Unnamed: 0,track_id,track_name,artist_name,year,popularity,artwork_url,album_name,acousticness,danceability,duration_ms,...,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence,track_url,language
0,2r0ROhr7pRN4MXDMT1fEmd,"Leo Das Entry (From ""Leo"")",Anirudh Ravichander,2024,59,https://i.scdn.co/image/ab67616d0000b273ce9c65...,"Leo Das Entry (From ""Leo"")",0.0241,0.753,97297.0,...,8.0,0.1,-5.994,0.0,0.103,110.997,4.0,0.459,https://open.spotify.com/track/2r0ROhr7pRN4MXD...,Tamil
1,4I38e6Dg52a2o2a8i5Q5PW,AAO KILLELLE,"Anirudh Ravichander, Pravin Mani, Vaishali Sri...",2024,47,https://i.scdn.co/image/ab67616d0000b273be1b03...,AAO KILLELLE,0.0851,0.78,207369.0,...,10.0,0.0951,-5.674,0.0,0.0952,164.995,3.0,0.821,https://open.spotify.com/track/4I38e6Dg52a2o2a...,Tamil
2,59NoiRhnom3lTeRFaBzOev,Mayakiriye Sirikiriye - Orchestral EDM,"Anirudh Ravichander, Anivee, Alvin Bruno",2024,35,https://i.scdn.co/image/ab67616d0000b27334a1dd...,Mayakiriye Sirikiriye (Orchestral EDM),0.0311,0.457,82551.0,...,2.0,0.0831,-8.937,0.0,0.153,169.996,4.0,0.598,https://open.spotify.com/track/59NoiRhnom3lTeR...,Tamil
3,5uUqRQd385pvLxC8JX3tXn,Scene Ah Scene Ah - Experimental EDM Mix,"Anirudh Ravichander, Bharath Sankar, Kabilan, ...",2024,24,https://i.scdn.co/image/ab67616d0000b27332e623...,Scene Ah Scene Ah (Experimental EDM Mix),0.227,0.718,115831.0,...,7.0,0.124,-11.104,1.0,0.445,169.996,4.0,0.362,https://open.spotify.com/track/5uUqRQd385pvLxC...,Tamil
4,1KaBRg2xgNeCljmyxBH1mo,Gundellonaa X I Am A Disco Dancer - Mashup,"Anirudh Ravichander, Benny Dayal, Leon James, ...",2024,22,https://i.scdn.co/image/ab67616d0000b2735a59b6...,Gundellonaa X I Am a Disco Dancer (Mashup),0.0153,0.689,129621.0,...,7.0,0.345,-9.637,1.0,0.158,128.961,4.0,0.593,https://open.spotify.com/track/1KaBRg2xgNeCljm...,Tamil


In [6]:
sp_raw_data.describe(include='all')

  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()


Unnamed: 0,track_id,track_name,artist_name,year,popularity,artwork_url,album_name,acousticness,danceability,duration_ms,...,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence,track_url,language
count,62317,62317,62317,62317.0,62317.0,62317,62317,62317.0,62317.0,62317.0,...,62317.0,62317.0,62317.0,62317.0,62317.0,62317.0,62317.0,62317.0,62317,62317
unique,62239,41521,12513,,,21110,19898,,,,...,,,,,,,,,62239,7
top,0nLLe981VCdAkZXXxXrbPn,"Merry Christmas, Happy Holidays",Shankar Mahadevan,,,https://i.scdn.co/image/ab67616d0000b273bf257e...,Nowhere to Hide (Unabridged),,,,...,,,,,,,,,https://open.spotify.com/track/0nLLe981VCdAkZX...,English
freq,2,151,1391,,,186,186,,,,...,,,,,,,,,2,23392
mean,,,,2014.425935,15.358361,,,0.362292,0.596807,242527.0,...,5.101658,0.194143,-65.103433,0.586052,0.087722,117.931247,3.857086,0.495226,,
std,,,,9.645113,18.626908,,,0.314609,0.186209,112999.9,...,3.553469,0.17203,2369.051478,0.493682,0.11515,28.509459,0.50266,0.264787,,
min,,,,1971.0,0.0,,,-1.0,-1.0,5000.0,...,-1.0,-1.0,-100000.0,-1.0,-1.0,-1.0,-1.0,-1.0,,
25%,,,,2011.0,0.0,,,0.0671,0.497,192160.0,...,2.0,0.0932,-10.727,0.0,0.0367,95.942,4.0,0.292,,
50%,,,,2017.0,7.0,,,0.286,0.631,236267.0,...,5.0,0.125,-7.506,1.0,0.0489,117.991,4.0,0.507,,
75%,,,,2022.0,26.0,,,0.632,0.73,286240.0,...,8.0,0.243,-5.456,1.0,0.0891,135.081,4.0,0.71,,


In [7]:
sp_raw_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 62317 entries, 0 to 62316
Data columns (total 22 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   track_id          62317 non-null  object 
 1   track_name        62317 non-null  object 
 2   artist_name       62317 non-null  object 
 3   year              62317 non-null  int64  
 4   popularity        62317 non-null  int64  
 5   artwork_url       62317 non-null  object 
 6   album_name        62317 non-null  object 
 7   acousticness      62317 non-null  float64
 8   danceability      62317 non-null  float64
 9   duration_ms       62317 non-null  float64
 10  energy            62317 non-null  float64
 11  instrumentalness  62317 non-null  float64
 12  key               62317 non-null  float64
 13  liveness          62317 non-null  float64
 14  loudness          62317 non-null  float64
 15  mode              62317 non-null  float64
 16  speechiness       62317 non-null  float6

# Exploratory Data Analysis (EDA) 

## Check for missing values

In [8]:
sp_raw_data = sp_raw_data.replace(-1, np.nan)
missing = sp_raw_data.isnull().sum()

missing[missing > 0].sort_values(ascending=False)

acousticness        35
danceability        35
energy              35
instrumentalness    35
key                 35
liveness            35
mode                35
speechiness         35
tempo               35
time_signature      35
valence             35
dtype: int64

In [9]:
sp_raw_data.loc[:, missing > 0].describe()

Unnamed: 0,acousticness,danceability,energy,instrumentalness,key,liveness,mode,speechiness,tempo,time_signature,valence
count,62282.0,62282.0,62282.0,62282.0,62282.0,62282.0,62282.0,62282.0,62282.0,62282.0,62282.0
mean,0.363057,0.597704,0.603397,0.146859,5.105087,0.194814,0.586943,0.088333,117.998081,3.859815,0.496066
std,0.313035,0.182372,0.243264,0.306688,3.551521,0.169733,0.492387,0.112257,28.377679,0.489432,0.262477
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0673,0.498,0.44,0.0,2.0,0.0932,0.0,0.0367,95.953,4.0,0.293
50%,0.286,0.631,0.639,2.5e-05,5.0,0.125,1.0,0.0489,117.997,4.0,0.508
75%,0.633,0.73,0.803,0.0152,8.0,0.243,1.0,0.0892,135.09,4.0,0.71
max,0.996,0.986,1.0,0.999,11.0,0.998,1.0,0.959,239.97,5.0,0.995


## 🎯 Imputer Stratgies

| Feature| Distribution Type|Imputation Method|Explanation|
|--------|------------------|-----------------|-----------|
| `valence` |Semi-categorical (0~1, clustered)|✅ mode|High concentration, mode is reasonable; more meaningful for classification visualization or clustering sentiment|
| `time_signature` |Categorical (commonly 4/4)|✅ mode|Almost all music is 4/4, directly impute with 4|
| `mode`|Binary categorical (major/minor)|✅ mode|Usually 1 (major), using mode is safe|

---

### Other Features (e.g., `acousticness`, `tempo`...)
These features are all continuous with wide ranges and skewed distributions. Choosing median as the imputation method is the most robust strategy, especially suitable for handling outliers (like the highly skewed values in instrumentalness).


In [10]:
from sklearn.impute import SimpleImputer

# Impute with Median
median_cols = ['acousticness', 'danceability', 'energy', 'instrumentalness',
               'liveness', 'speechiness', 'tempo', 'key']

# Impute with Mode
mode_cols = ['valence', 'time_signature', 'mode']

median_imputer = SimpleImputer(strategy='median')
sp_raw_data[median_cols] = median_imputer.fit_transform(sp_raw_data[median_cols])

mode_imputer = SimpleImputer(strategy='most_frequent')
sp_raw_data[mode_cols] = mode_imputer.fit_transform(sp_raw_data[mode_cols])

In [11]:
sp_raw_data.columns[sp_raw_data.isnull().sum() > 0]

Index([], dtype='object')

## 🎯 Normalization
### 📌 1. Significant Range Differences Across Features

- Features like `acousticness`, `energy`, `valence` have values between 0 and 1
tempo ranges approximately from 60–200.
- `duration_ms` reaches as high as 200,000–400,000 milliseconds.
- If these features are directly used in unified charts or distance calculations, larger values (such as duration_ms) would dominate the comparison results, leading to misleading conclusions.

### 📊 2. Improving Visualization and Fairness
- Fairly compare musical styles across different languages
- All features have consistency and comparability in visualizations
No single feature appears "more important" visually due to its numerical magnitude


In [12]:
from sklearn.preprocessing import MinMaxScaler

df = sp_raw_data.copy()

exclude_columns = ['year']
keep_cols = ['language']
keep_df = df[keep_cols+exclude_columns]

numerical_columns = df.select_dtypes(include=['int64', 'float64']).columns
columns_to_normalize = [col for col in numerical_columns if col not in exclude_columns]

# Normalization
scaler = MinMaxScaler()
normalized_features = scaler.fit_transform(df[columns_to_normalize])

normalized_df = pd.DataFrame(normalized_features, columns=columns_to_normalize)
normalized_df = pd.concat([keep_df.reset_index(drop=True), normalized_df.reset_index(drop=True)], axis=1)

normalized_df

Unnamed: 0,language,year,popularity,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence
0,Tamil,2024,0.634409,0.024197,0.763692,0.020168,0.970,0.055355,0.727273,0.100200,0.999928,0.0,0.107404,0.462545,0.8,0.461307
1,Tamil,2024,0.505376,0.085442,0.791075,0.044219,0.793,0.000000,0.909091,0.095291,0.999931,0.0,0.099270,0.687565,0.6,0.825126
2,Tamil,2024,0.376344,0.031225,0.463489,0.016946,0.491,0.000000,0.181818,0.083267,0.999898,0.0,0.159541,0.708405,0.8,0.601005
3,Tamil,2024,0.258065,0.227912,0.728195,0.024218,0.630,0.000728,0.636364,0.124248,0.999877,1.0,0.464025,0.708405,0.8,0.363819
4,Tamil,2024,0.236559,0.015361,0.698783,0.027231,0.748,0.000001,0.636364,0.345691,0.999891,1.0,0.164755,0.537405,0.8,0.595980
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
62312,Tamil,2021,0.322581,0.085241,0.889452,0.077509,0.469,0.000000,0.636364,0.110220,0.999873,1.0,0.368092,0.416819,0.8,0.506533
62313,Tamil,2021,0.322581,0.079116,0.844828,0.032637,0.689,0.000000,0.727273,0.267535,0.999937,1.0,0.084776,0.458561,0.8,0.526633
62314,Tamil,2021,0.193548,0.009217,0.853955,0.050038,0.765,0.000000,0.636364,0.079158,0.999929,1.0,0.142857,0.508555,0.8,0.672362
62315,Tamil,2021,0.010753,0.101406,0.691684,0.038415,0.861,0.000000,0.090909,0.320641,0.999927,1.0,0.297185,0.312510,0.8,0.441206


In [13]:
normalized_df.to_csv('Spotify_Normalized.csv', index=False)

# PART 0 - General Trends in Music Features

### 📊 Distribution of Songs by Language
The dataset contains songs from multiple languages:
- English tracks dominate the dataset (37.5%), followed by Tamil (20.3%) and Unknown (20.9%).
- Korean tracks make up smaller proportions, yet still represent significant cultural movements.

### ⏳ Trends in Music Features Over Time
- 🎧 Shorter Songs: The average song duration (`duration_ms`) has been steadily decreasing since the 2000s, likely reflecting changes in streaming behavior and attention span.

- 💃 More Danceable & Energetic: Both `danceability` and `energy` show clear upward trends, indicating a growing preference for upbeat, performance-friendly music.

- 🌫️ Lower Valence: The `valence` feature, which indicates musical positivity, shows a gradual decline—possibly pointing to a shift toward more emotionally neutral or melancholic songs.

These long-term shifts help explain the emergence of genres like K-pop and why certain types of music gained popularity during specific periods.

## 🎯 Counts of Songs

**⚠️ Note on 'Unknown' Language Tracks**

As shown in the yearly distribution chart, tracks labeled as **'Unknown'** appear consistently across all years — not just in early periods. This indicates that:
- Some tracks, even in recent years, are missing reliable language metadata.
- These may include instrumentals, independent music, or metadata import issues.

For clarity and analytical focus, **exclude 'Unknown' tracks from popularity, genre, and trend-based analysis**, unless explicitly noted.

In [14]:
custom_color_map = {
    'Unknown': 'lightgray'
}

In [15]:
song_count = normalized_df['language'].value_counts().reset_index()

fig = px.pie(
    song_count,
    names='language',
    values='count',
    color='language',
    color_discrete_map=custom_color_map,
    title='Overall Language Distribution of Songs'
)

fig.update_traces(textinfo='label+percent')
fig.update_layout(showlegend=False) 

fig.show()

In [16]:
year_change = normalized_df.copy()
year_change = year_change.drop('language', axis=1)

In [17]:
year_change = year_change.groupby('year').mean().reset_index()

import plotly.graph_objects as go
from plotly.subplots import make_subplots

# 設定子圖行列數
rows = 5
cols = 3
fig = make_subplots(rows=rows, cols=cols, subplot_titles=year_change.columns[2:])

for i, feature in enumerate(year_change.columns[2:]):  # 跳過 'year', 'popularity'
    row = i // cols + 1
    col = i % cols + 1

    fig.add_trace(
        go.Scatter(
            x=year_change['year'], 
            y=year_change[feature], 
            name=feature, 
            mode='lines'
        ),
        row=row,
        col=col
    )

fig.update_layout(
    height=1000, width=900,
    title_text="Trends of Music Features Over Years",
    showlegend=False
)
fig.show()


# PART 1 - Which Language of Songs Became Popular Over the Years?
**Examined the average popularity of songs by language over time to uncover major shifts in musical influence.**
### 🇬🇧 1. Early Dominance of English Songs
- Exclusive in Early Years: English tracks were the only songs represented in the early part of the dataset, naturally leading to the highest popularity.

- Possible Dataset Bias: This early dominance may be partially due to data limitations or Spotify's catalog during its formative years.

### 🇮🇳 2. Sharp Rise of Hindi Music in 2007
- Cultural Surge: A sudden spike in Hindi track popularity appears around 2007.

- Bollywood Influence: This aligns with major Bollywood movie releases, where hit soundtracks often contribute significantly to overall popularity.

### 🌍 3. Decline in English Popularity
- More Language Diversity: While English still accounts for the largest number of tracks, its average popularity has declined.

- Emergence of Other Languages: As Spotify expanded globally, songs in languages like Hindi, Korean, and Tamil began gaining traction.

### 🎤 4. Rise of K-pop After 2014
- Global Phenomenon: The rise of K-pop is clearly visible after 2010, driven by the success of iconic groups and viral hits.

- Not Just Quantity: Despite having fewer songs overall, Korean tracks have achieved high popularity — showing how strategic marketing, strong visuals, and passionate fan engagement can outweigh volume.

## 🎯 Popularity of Songs

In [18]:
popularity_year_language = normalized_df.groupby(['year', 'language'])['popularity'].mean().reset_index()
popularity_year_language

Unnamed: 0,year,language,popularity
0,1971,English,0.329749
1,1972,English,0.288274
2,1973,English,0.291016
3,1974,English,0.333333
4,1975,English,0.401811
...,...,...,...
208,2024,Korean,0.342891
209,2024,Malayalam,0.101965
210,2024,Tamil,0.139959
211,2024,Telugu,0.171595


In [19]:
fig = px.line(
    popularity_year_language,
    x='year',
    y='popularity',
    color='language',
    color_discrete_map=custom_color_map,
    title='Popularity by Year and Language',
    labels={'popularity': 'Popularity', 'year': 'Year'},
)

fig.show()

# Part 2 - Sharp Rise of Hindi Music in 2007.

**In 2007, Hindi songs experienced a dramatic surge in popularity — the highest among all languages that year. -> Analyzed the musical characteristics of Hindi tracks and compared them with other languages released in the same year.**

### 🎵 1. Energetic and Upbeat Style
- `Danceability` & `Valence` Lead: Hindi tracks had significantly higher danceability and valence than other languages in 2007.

- Bollywood Influence: This reflects Bollywood’s tradition of using lively, rhythmic music that emphasizes joy and entertainment — often tied to dance-heavy film sequences.

### 🎛️ 2. Simpler Musical Structure
- Low `Liveness` & `Mode`: Hindi songs showed lower liveness and mode values, suggesting a focus on studio production rather than live performance elements.

- Minor Tonality Preference: Despite their energetic feel, the lower mode scores may indicate more frequent use of minor keys, adding emotional depth beneath the upbeat rhythm.

### 📊 Visualizing Differences in Music Features
To better understand which features made Hindi songs unique in 2007, calculated the average difference between Hindi tracks and all other languages in that year. 

- 📈 Bar Chart: Differences in Music Features – Hindi vs. Others (2007)
(This chart highlights which features contributed most to Hindi’s uniqueness. A dashed line shows the average difference across all features.)

- 🧭 Outstanding Features Radar Comparison: Selected the top features where Hindi songs differ the most and visualized them in a radar chart. This allows for a quick comparison between Hindi music and all other songs in 2007 across key musical dimensions.

## 🎯 Counts of Hindi Songs in 2007.

In [20]:
sp_2007 = normalized_df[(normalized_df['year'] == 2007) & (normalized_df['language'] != 'Unknown')]
sp_2007_count = sp_2007['language'].value_counts().reset_index()

In [21]:
fig = px.bar(
    sp_2007_count,
    x='language',
    y='count',
    color='language',
    color_discrete_map=custom_color_map,
    title='Counts of Languages of Songs in 2007',
    labels={'language': 'Language', 'count': 'Number of Songs'}
)

fig.update_layout(xaxis_tickangle=-45) 
fig.show()

## 🎯 Find 3 songs of Hindi.

In 2007, Hindi songs achieved unprecedented popularity despite only three songs being in the dataset compared to 344 English songs. This success highlights the cultural power of Bollywood, as all three songs were tied to blockbuster films. Their immense popularity, driven by the success of these movies and the growing accessibility of online music platforms, marked a pivotal moment when Bollywood's influence expanded beyond India.

In [22]:
hindi_2007 = sp_raw_data[(sp_raw_data['year'] == 2007) & (sp_raw_data['language'] == 'Hindi')]
hindi_2007

Unnamed: 0,track_id,track_name,artist_name,year,popularity,artwork_url,album_name,acousticness,danceability,duration_ms,...,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence,track_url,language
12090,1O5mTSQa0sCce9ghhDqIZl,Jashn-E-Bahaaraa,"A.R. Rahman, Javed Ali",2007,69,https://i.scdn.co/image/ab67616d0000b27346131d...,Jodhaa Akbar (Original Motion Picture Soundtrack),0.767,0.7,315173.0,...,2.0,0.0789,-10.474,1.0,0.0308,137.864,4.0,0.488,https://open.spotify.com/track/1O5mTSQa0sCce9g...,Hindi
36903,3gLf7ctpqnklhoQfeveBw0,Ab To Forever,"Vishal-Shekhar, KK, Shreya Ghoshal, Vishal Dad...",2007,53,https://i.scdn.co/image/ab67616d0000b273ae7fac...,Ta Ra Rum Pum,0.201,0.836,301714.0,...,6.0,0.0389,-5.745,0.0,0.0445,109.03,4.0,0.792,https://open.spotify.com/track/3gLf7ctpqnklhoQ...,Hindi
36939,5TwBs8jF4YYmnSFDkxSXKF,Saaiyaan,"Vishal-Shekhar, Vishal Dadlani, Javed Akhtar",2007,42,https://i.scdn.co/image/ab67616d0000b273ae7fac...,Ta Ra Rum Pum,0.171,0.569,295314.0,...,4.0,0.09,-6.018,0.0,0.0393,173.964,4.0,0.781,https://open.spotify.com/track/5TwBs8jF4YYmnSF...,Hindi


In [23]:
hindi_popularity = sp_2007.groupby('language')[numerical_columns].mean().reset_index()
hindi_popularity.set_index('language', inplace=True)
hindi_popularity

Unnamed: 0_level_0,year,popularity,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence
language,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
English,2007.0,0.168917,0.369641,0.482232,0.047996,0.548988,0.310748,0.52537,0.238337,0.999885,0.604651,0.077917,0.490435,0.780233,0.35366
Hindi,2007.0,0.587814,0.381191,0.711629,0.065349,0.673667,0.00038,0.363636,0.069405,0.999914,0.333333,0.039833,0.584598,0.8,0.690452
Korean,2007.0,0.173425,0.142209,0.617662,0.046258,0.774171,0.001428,0.511688,0.279084,0.999935,0.657143,0.07867,0.514757,0.8,0.597085
Tamil,2007.0,0.123131,0.289125,0.674925,0.059929,0.697445,0.047721,0.497783,0.198431,0.999906,0.512195,0.104206,0.497859,0.762195,0.631315
Telugu,2007.0,0.149002,0.583491,0.536076,0.073114,0.424571,0.261462,0.376623,0.109462,0.999884,0.857143,0.040191,0.541208,0.8,0.319311


## 🎯 Highlight Key Differences in Hindi Music Features

To understand why Hindi songs had the highest popularity in 2007, compare the average values of each music feature in Hindi tracks against those of all other languages. 

Identify the features with the most significant differences and visualize them using a radar chart to clearly highlight how Hindi music stands out in terms of musical characteristics.

In [24]:
def language_feature_diff(df, col, language, bar_title, radar_title, bar_main_color, bar_secondary_color, radar_main_color, radar_line_color):
    #Group by language and compute mean
    lang_popularity = df.groupby('language')[col].mean().reset_index()
    lang_popularity.set_index('language', inplace=True)
    if 'year' in lang_popularity.columns:
        lang_popularity.drop('year', axis=1, inplace=True)

    # Compute average difference
    lang_avg = lang_popularity.loc[language]
    others_avg = lang_popularity.drop(language).mean()

    diff = (lang_avg - others_avg).abs().sort_values().reset_index()
    diff.columns = ['feature', 'diff']
    diff_avg = diff['diff'].mean()

    # Add color category
    diff['color'] = ['highlight' if val > diff_avg else 'neutral' for val in diff['diff']]
    
    # Bar chart of differences
    bar_fig = px.bar(
        diff,
        x='diff',
        y='feature',
        color='color',
        orientation='h',
        color_discrete_map={
            'highlight': bar_main_color, 
            'neutral': bar_secondary_color     
        },
        labels={'diff': 'Difference', 'feature': 'Feature'},
        title=bar_title
    )
    bar_fig.add_vline(x=diff_avg, line_dash='dash', line_color='blue', annotation_text='Mean diff')
    bar_fig.update_layout(plot_bgcolor='white', showlegend=False)
    bar_fig.show()

    # Radar chart of outstanding features
    outstanding = diff[diff['diff'] > diff_avg]['feature'].tolist()
    lang_outstanding = lang_avg.loc[outstanding]
    others_outstanding = others_avg.loc[outstanding]

    radar_fig = go.Figure()
    radar_fig.add_trace(go.Scatterpolar(
        r=lang_outstanding.values,
        theta=lang_outstanding.index,
        fill='toself',
        name=language,
        fillcolor=radar_main_color,
        line=dict(color=radar_line_color),
        marker=dict(size=6),
    ))
    radar_fig.add_trace(go.Scatterpolar(
        r=others_outstanding.values,
        theta=others_outstanding.index,
        fill='toself',
        name='Others',
        fillcolor='rgba(17, 12, 15, 0.25)',
        line=dict(color='rgba(17, 12, 15, 0.5)'),
        marker=dict(size=6),
    ))

    radar_fig.update_layout(
        polar=dict(
            radialaxis=dict(visible=True, tickvals=[0.2, 0.4, 0.6, 0.8, 1.0]),
            angularaxis=dict(tickfont=dict(size=12, color='black'), rotation=90, direction='clockwise')
        ),
        showlegend=True,
        title=dict(text=radar_title, font=dict(size=16, color='black')),
        plot_bgcolor='white'
    )
    radar_fig.show()


In [25]:
language_feature_diff(sp_2007, numerical_columns, 'Hindi', 
                      'Differences of Hindi Compare to Others in 2007', 
                      "Outstanding Music Features: Hindi vs Others",
                     'rgba(247, 128, 9, 1)', 'rgba(255, 239, 184, 1)', 'rgba(247, 128, 9, 0.5)', 'rgba(247, 128, 9, 1)')

# Part 3 - Rise of K-pop After 2014 and Its Musical Features

After 2014, K-pop emerged as one of the most popular music genres, even though Korean songs were not the most numerous. To explore why K-pop gained such widespread global popularity, examined how its musical features differ from other languages during the same period.

### ⚡ 1. High Energy and Electronic Production
- Top in `Energy`: K-pop tracks generally exhibit the highest energy levels across all languages. This creates a dynamic and powerful sound — ideal for choreographed performances and high-impact visuals.

- Low `Acousticness`: The relatively low acousticness indicates strong reliance on electronic production and studio effects, rather than traditional acoustic instruments.

### 🎭 2. Designed for Performance and Stage Presence
- High `Liveness`: K-pop shows a distinct focus on liveness, reflecting the genre’s strong connection to live performances, TV shows, and real-time fan events.

- Visual-Performance Synergy: This emphasis on live appeal sets K-pop apart from many genres that prioritize polished studio recordings over interactive or performative elements.

### 📊 Visualizing Differences in Music Features
To better understand what makes K-pop stand out, calculated the average difference in musical features between Korean tracks and all other languages after 2014.

- 📈 Bar Chart: Differences in Music Features – Korean vs. Others 
(This chart highlights which features contributed most to K-pop’s uniqueness. A dashed line shows the average difference across all features.)

- 🧭 Radar Chart: Outstanding Music Features – K-pop vs. Others
(Only the most distinctive features — such as energy, acousticness, liveness, and popularity — are visualized to clearly show K-pop’s strengths.)

## 🎯 Counts of K-pop songs after 2014

In [26]:
kpop_2014 = normalized_df[(normalized_df['year'] >= 2014) & (normalized_df['language'] != 'Unknown')]
kpop_2014_count = kpop_2014['language'].value_counts().reset_index()

In [27]:
fig = px.pie(
    kpop_2014_count,
    names='language',
    values='count',
    color='language',
    color_discrete_map=custom_color_map,
    title='Overall Language Distribution of Songs After 2014'
)

fig.update_traces(textinfo='label+percent')
fig.update_layout(showlegend=False) 
fig.show()

## 🎯 Highlight Key Differences in K-pop Music Features

Compare the average values of each music feature in K-pop tracks against those of all other languages. 

Identify the features with the most significant differences and visualize them using a radar chart to clearly highlight how K-pop stands out in terms of musical characteristics.

In [28]:
language_feature_diff(kpop_2014, numerical_columns, 'Korean',  
                      'Differences of K-pop Compare to Others After 2014', 
                      "Outstanding Music Features: K-pop vs Others",
                      'rgba(255, 52, 120, 1)', 'rgba(247, 193, 211, 1)', 'rgba(255, 52, 120, 0.5)', 'rgba(255, 52, 120, 1)')

# 🔥 PART 4 - Characteristics of Recent Popular Songs (2018–2023)
To understand what makes a song popular in recent years, analyzed the top 5% most popular tracks between 2018 and 2023 and compared their musical features with all other songs during the same period.

### 🔥 1. More Danceable, More Energetic
- `Danceability` & `Energy`: Popular tracks tend to have significantly higher danceability and energy, suggesting a strong preference for rhythmic and upbeat music.

- This aligns with current streaming and short-form content trends (e.g., TikTok), where catchy, performance-ready music thrives.

### 🎸 2. Less Acoustic, Less Instrumental
- `Acousticness` & `Instrumentalness`: Popular songs show noticeably lower values in these features, indicating fewer traditional or acoustic elements and a strong tilt toward electronic production.

- These results reflect the dominance of studio-produced pop, hip-hop, and dance genres in recent years.

### 📊 Visualizing Feature Differences
To pinpoint the most distinctive features of popular songs:

📈 Bar Chart: Difference in Musical Features – Popular vs. Non-Popular Songs (2018–2023)
(Positive values represent features more prominent in popular songs; negative values indicate features that are less common.)

🧭 Radar Chart: Distinctive Features of Popular Songs
(Only the top contributing features are shown — such as energy, mode, danceability, and acousticness — for quick visual comparison.)



In [29]:
recent_years = normalized_df[(normalized_df['year'] >= 2018) & (normalized_df['language'] != 'Unknown')]
threshold = recent_years['popularity'].quantile(0.95)

popular_tracks = recent_years[recent_years['popularity'] >= threshold]
non_popular_tracks = recent_years[recent_years['popularity'] < threshold]

feature_cols = [col for col in numerical_columns if col not in ['popularity', 'year']]
popular_avg = popular_tracks[feature_cols].mean()
non_popular_avg = non_popular_tracks[feature_cols].mean()

feature_diff = (popular_avg - non_popular_avg).abs().sort_values().reset_index()
feature_diff.columns = ['feature', 'diff']
feature_diff

Unnamed: 0,feature,diff
0,duration_ms,0.000756
1,loudness,0.001175
2,key,0.00198
3,speechiness,0.014343
4,time_signature,0.015268
5,tempo,0.017443
6,liveness,0.024274
7,valence,0.030428
8,danceability,0.041248
9,mode,0.057542


In [30]:
# Add color category
feature_diff['color'] = ['highlight' if val > feature_diff['diff'].mean() else 'neutral' for val in feature_diff['diff']]
    
# Bar chart of differences
bar_fig = px.bar(
        feature_diff,
        x='diff',
        y='feature',
        color='color',
        orientation='h',
        color_discrete_map={
            'highlight': 'rgba(255, 197, 0, 1)', 
            'neutral': 'rgba(239, 222, 167, 1)'    
        },
        labels={'diff': 'Difference', 'feature': 'Feature'},
        title='Differences of Popular Songs Compare to Non Songs Compare in Recent 5 Years'
    )
bar_fig.add_vline(x=feature_diff['diff'].mean(), line_dash='dash', line_color='blue', annotation_text='Mean diff')
bar_fig.update_layout(plot_bgcolor='white', showlegend=False)
bar_fig.show()

In [31]:
# Radar chart of outstanding features
outstanding = feature_diff[feature_diff['diff'] > feature_diff['diff'].mean()]['feature'].tolist()
popular_outstanding = popular_avg.loc[outstanding]
npopular_outstanding = non_popular_avg.loc[outstanding]

radar_fig = go.Figure()
radar_fig.add_trace(go.Scatterpolar(
        r=popular_outstanding.values,
        theta=popular_outstanding.index,
        fill='toself',
        name='Popular',
        fillcolor='rgba(255, 197, 0, 0.5)',
        line=dict(color='rgba(255, 197, 0, 1)'),
        marker=dict(size=6),
    ))
radar_fig.add_trace(go.Scatterpolar(
        r=npopular_outstanding.values,
        theta=npopular_outstanding.index,
        fill='toself',
        name='Non-Popular',
        fillcolor='rgba(17, 12, 15, 0.25)',
        line=dict(color='rgba(17, 12, 15, 0.5)'),
        marker=dict(size=6),
    ))

radar_fig.update_layout(
        polar=dict(
            radialaxis=dict(visible=True, tickvals=[0.2, 0.4, 0.6, 0.8, 1.0]),
            angularaxis=dict(tickfont=dict(size=12, color='black'), rotation=90, direction='clockwise')
        ),
        showlegend=True,
        title=dict(text='Outstanding Music Features: Popular vs Non Popular', 
                   font=dict(size=16, color='black')),
        plot_bgcolor='white'
    )
radar_fig.show()