In [1]:
## 사전실행코드
import polars as pl
df_spotify = (pl.read_csv("./universal_top_spotify_songs.csv", try_parse_dates = True,
null_values = [""])
    .filter(pl.col('snapshot_date').dt.year() == 2024).sort('snapshot_date'))

df_spotify = (
    df_spotify.with_columns(pl.when(pl.col('country').is_null() == True) ## country 열이 null이면
        .then(pl.lit('WW')) ## WW로 변경
        .otherwise(pl.col('country')).alias('country')) ## 아니면 원래대로
    .drop_nulls()) ## 그 외 null이 들어간 행 삭제

key_levels = pl.Enum(["C", "C#", "D", "Eb", "E", "F", "F#", "G", "G#", "A", "Bb", "B"])

df_spotify = (df_spotify.with_columns(pl.col('key').cast(pl.String)
    .replace(["0", "1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11"],
        ["C", "C#", "D", "Eb", "E", "F", "F#", "G", "G#", "A", "Bb", "B"]))
    .with_columns(pl.col('key').cast(key_levels)).sort('key'))

df_spotify = (df_spotify.with_columns(pl.col('artists').str.split(', ')) ## ,를 기준으로 문자열을 분리
    ## 리스트의 첫 번째 아이템을 가져와서 main_vocal로 저장
    .with_columns(pl.col('artists').list.get(0, null_on_oob = True).alias('main_vocal'),
        pl.col('artists').list.tail(-1).alias('featuring')) ## 첫 번째 아이템을 제외한 나머지를featuring으로 저장
    .with_columns(pl.when(pl.col('featuring').list.len() == 0) ## 리스트 길이가 0이면
        .then(None) ## None으로 설정
        .otherwise(pl.col('featuring')).name.keep())) ## 아니면 그대로 유지

import pycountry_convert as pc

def get_continent_name(nation_code: str) -> str:
    if nation_code != 'WW':
        continent_code = pc.country_alpha2_to_continent_code(nation_code)
    else:
        continent_code = 'WW'
    continent_dict = {"NA": "North America","SA": "South America", "AS": "Asia", "AF": "Africa",
        "OC": "Oceania", "EU": "Europe", "AQ": "Antarctica", "WW": "Global"}
    return continent_dict[continent_code]

df_spotify = (df_spotify.with_columns(pl.col('country')
    ## 앞서 정의한 함수를 country 열에 적용
    .map_elements(get_continent_name, return_dtype = pl.String).alias('continent')))

In [2]:
pl.Config(set_tbl_cols = 10, set_tbl_rows = 25)
(df_spotify.select(pl.col([pl.Int64, pl.Float64])).describe()
    .transpose(include_header = True, header_name = 'columns',
column_names = ["count", "null_count", "mean", "std", "min", "25%", "50%", "75%", "max"])[1:])

columns,count,null_count,mean,std,min,25%,50%,75%,max
str,str,str,str,str,str,str,str,str,str
"""daily_rank""","""1281585.0""","""0.0""","""25.488521635318765""","""14.428953828238736""","""1.0""","""13.0""","""25.0""","""38.0""","""50.0"""
"""daily_movement""","""1281585.0""","""0.0""","""0.9522794040192418""","""7.048585204849036""","""-49.0""","""-1.0""","""0.0""","""2.0""","""49.0"""
"""weekly_movement""","""1281585.0""","""0.0""","""2.6433954829371444""","""11.958440767017487""","""-49.0""","""-3.0""","""0.0""","""5.0""","""49.0"""
"""popularity""","""1281585.0""","""0.0""","""75.99977215713355""","""15.612366849702711""","""0.0""","""65.0""","""79.0""","""88.0""","""100.0"""
"""duration_ms""","""1281585.0""","""0.0""","""192147.9038237807""","""49510.28735642052""","""16320.0""","""160413.0""","""184250.0""","""216338.0""","""939666.0"""
"""danceability""","""1281585.0""","""0.0""","""0.6840186434766325""","""0.13658257971471674""","""0.0""","""0.594""","""0.703""","""0.785""","""0.988"""
"""energy""","""1281585.0""","""0.0""","""0.6552570154470442""","""0.1615410243104661""","""0.0000201""","""0.557""","""0.673""","""0.768""","""0.998"""
"""loudness""","""1281585.0""","""0.0""","""-6.383302003378629""","""2.551654916926727""","""-37.334""","""-7.754""","""-5.952""","""-4.668""","""3.233"""
"""mode""","""1281585.0""","""0.0""","""0.5399329736225065""","""0.49840300103880864""","""0.0""","""0.0""","""1.0""","""1.0""","""1.0"""
"""speechiness""","""1281585.0""","""0.0""","""0.09353411915713745""","""0.08890688930938012""","""0.0""","""0.0391""","""0.0574""","""0.108""","""0.937"""


In [None]:
## plotly가 주피터 노트북, 주피터 랩에서 표시되지 않는 경우 아래의 코드를 실행시키세요
import plotly.io as pio
pio.renderers.default = "notebook_connected"

import plotly.express as px
fig = px.histogram(df_spotify, x = 'daily_rank')
fig.show()

In [None]:
fig = px.histogram(df_spotify, x = 'daily_movement')
fig.show()

In [None]:
fig = px.histogram(df_spotify, x = 'weekly_movement')
fig.show()

In [None]:
fig = px.histogram(df_spotify, x = 'popularity')
fig.show()

In [None]:
fig = px.pie(df_spotify.group_by('is_explicit').len('count'), values = "count")
fig.update_traces(textinfo = "percent+label")
fig.show()

In [8]:
pl.Config(set_tbl_cols = 10, set_tbl_rows = 25)
(df_spotify.select(pl.col([pl.String, pl.List(pl.String)])). describe()
    .transpose(include_header = True, header_name = 'columns',
column_names = ['count', 'null_count', 'mean', 'std', 'min', '25%', '50%', '75%', 'max'])[1:])

columns,count,null_count,mean,std,min,25%,50%,75%,max
str,str,str,str,str,str,str,str,str,str
"""spotify_id""","""1281585""","""0""",,,"""003vvx7Niy0yvhvHt4a68B""",,,,"""7zyWm8JihcIiYmfNkbzeHE"""
"""name""","""1281585""","""0""",,,"""""Se""""",,,,"""한 페이지가 될 수 있게"""
"""artists""","""1281585.0""","""0.0""",,,,,,,
"""country""","""1281585""","""0""",,,"""AE""",,,,"""ZA"""
"""album_name""","""1281585""","""0""",,,"""""Cold Sweats""""",,,,"""黑玻璃"""
"""main_vocal""","""1281585""","""0""",,,"""$$Double-Dolla$$""",,,,"""高爾宣 OSN"""
"""featuring""","""519897.0""","""761688.0""",,,,,,,
"""continent""","""1281585""","""0""",,,"""Africa""",,,,"""South America"""


In [9]:
df_spotify.select(pl.col([pl.String]).unique().len())

spotify_id,name,country,album_name,main_vocal,continent
u32,u32,u32,u32,u32,u32
16360,14550,73,11086,5770,7


In [None]:
fig = px.bar((df_spotify.group_by('country').agg(pl.len().alias('count'))
    .with_columns(pl.when(pl.col('country') == "KR").then(pl.lit("KR"))
        .when(pl.col('country') == "WW").then(pl.lit("Global")).otherwise(pl.lit("Others")).
alias('국가'))),
    x = 'country', y = 'count', color = '국가')
fig.update_xaxes(categoryorder = "total descending")
fig.show()

In [11]:
df_spotify.group_by('country').len().sort('len', descending = True)

country,len
str,u32
"""IT""",17720
"""DO""",17718
"""NI""",17715
"""PL""",17709
"""CZ""",17709
"""EG""",17708
"""HN""",17708
"""SV""",17708
"""CR""",17708
"""FI""",17707


In [None]:
fig = px.bar(df_spotify.group_by('continent').agg(pl.col('country').unique().len()),
    x = 'continent', y = 'country', text = 'country')
fig.update_xaxes(categoryorder = "total descending")
fig.show()

In [13]:
(df_spotify.group_by('continent').agg(pl.col('country').unique().len()).sort('country',
descending = True))

continent,country
str,u32
"""Europe""",29
"""Asia""",17
"""North America""",10
"""South America""",10
"""Africa""",4
"""Oceania""",2
"""Global""",1


In [None]:
fig = px.histogram(df_spotify, x = 'snapshot_date')
fig.show()

In [None]:
fig = px.box(df_spotify, x = 'continent', y = 'popularity')
fig.show()

In [None]:
fig = px.box((df_spotify. filter(pl.col('country').is_in(["WW", "KR", "US", "BR", "GB", "AU",
"NG"]))), x = 'country', y = 'popularity')
fig.show()

In [17]:
(df_spotify.filter(pl.col('country').is_in(["KR", "GB", "WW", "US"]))
    .group_by('country', 'name').agg(pl.all().sort_by('snapshot_date').first())
    .select(pl.col('country', 'name'), (pl.col('snapshot_date')-pl.col('album_release_date')).
alias('duration'))
    .group_by('country').agg(pl.col('duration').mean().dt.total_days().alias('duration_mean'),
        pl.col('duration').median().dt.total_days().alias('duration_median'))
    .sort('duration_mean', descending = True))

country,duration_mean,duration_median
str,i64,i64
"""GB""",2948,19
"""US""",1870,3
"""WW""",1722,4
"""KR""",610,2


In [None]:
fig = px.box((df_spotify.filter(pl.col('country').is_in(["KR", "GB", "WW", "US"]))
        .group_by('country', 'name').agg(pl.all().sort_by('snapshot_date').first())
        .select(pl.col('country', 'name'),
            (pl.col('snapshot_date')-pl.col('album_release_date')).dt.total_days().alias('duration'))),
x = 'country', y = 'duration')
fig.show()

In [None]:
fig = px.box(df_spotify.filter(pl.col('country').is_in(["WW", "KR", "US", "BR", "GB", "AU",
"NG"])), x = 'country', y = 'popularity', color = 'is_explicit')
fig.show()

In [20]:
(df_spotify.filter(pl.col('country').is_in(["WW", "KR", "US", "BR", "GB", "AU", "NG"]))
    .group_by('country').agg(((pl.col('is_explicit')
    .filter(pl.col('is_explicit') == True).len()) / (pl.col('is_explicit').len())*100).round(2).
alias('Explicit_True(%)'),
            ((pl.col('is_explicit').filter(pl.col('is_explicit') == False).len()) / (pl.col('is_explicit')
.len())*100).round(2).alias('Explicit_False(%)'))
    .sort('Explicit_True(%)', descending = True))

country,Explicit_True(%),Explicit_False(%)
str,f64,f64
"""US""",46.25,53.75
"""WW""",37.39,62.61
"""AU""",33.92,66.08
"""GB""",32.69,67.31
"""NG""",25.98,74.02
"""BR""",25.55,74.45
"""KR""",9.28,90.72


## 10.2 수치형 변수 간 상관관계 회귀분석하기

In [21]:
df_spotify.select(pl.col(pl.Int64), pl.col(pl.Float64)).corr()

daily_rank,daily_movement,weekly_movement,popularity,duration_ms,…,acousticness,instrumentalness,liveness,valence,tempo
f64,f64,f64,f64,f64,…,f64,f64,f64,f64,f64
1.0,-0.145062,-0.264775,-0.112801,0.036033,…,0.040069,0.013488,0.030228,-0.057007,0.018175
-0.145062,1.0,0.368973,-0.156114,0.009213,…,0.000971,0.013345,0.004058,-0.003661,-0.002467
-0.264775,0.368973,1.0,-0.176558,0.009557,…,-0.000584,0.008384,0.012274,-0.007706,0.000593
-0.112801,-0.156114,-0.176558,1.0,0.015487,…,-0.111156,-0.010133,-0.056468,-0.008356,0.001929
0.036033,0.009213,0.009557,0.015487,1.0,…,0.040664,0.02811,-0.024323,-0.200502,-0.025502
0.011965,0.009721,0.010423,0.103172,0.043542,…,0.034684,0.036304,-0.002485,-0.08638,0.022574
0.027519,0.00442,0.008576,-0.133768,0.033176,…,-0.104765,0.013961,0.00932,0.012608,-0.067961
-0.062886,-0.017507,-0.029831,-0.033194,-0.196576,…,-0.230753,-0.006804,-0.112974,0.417625,-0.183312
-0.042519,-0.00345,-0.00634,0.015687,-0.119117,…,-0.527654,-0.080463,0.141553,0.358156,0.10058
-0.044374,-0.018784,-0.025519,0.136663,-0.135113,…,-0.435017,-0.220426,0.056927,0.283396,0.048295


In [None]:
fig = px.imshow((df_spotify.select(pl.col(pl.Int64), pl.col(pl.Float64)).corr().with_columns(pl.
all().round(1))),
    y = df_spotify.select(pl.col(pl.Int64), pl.col(pl.Float64)).columns,
    text_auto = True, aspect = "auto", color_continuous_scale = "RdBu_r")
fig.show()

In [23]:
(df_spotify.select(pl.col(pl.Int64), pl.col(pl.Float64)).corr()
    .with_columns(index = pl.lit(pl.Series(df_spotify.select(pl.col(pl.Int64), pl.col(pl.Float64)).
columns)))
    .unpivot(index = 'index')
    .filter(pl.col('index') != pl.col('variable'))
    .filter((pl.col('value') > 0.5) | (pl.col('value') < -0.5))
    .sort('value', descending = True))

index,variable,value
str,str,f64
"""loudness""","""energy""",0.724866
"""energy""","""loudness""",0.724866
"""acousticness""","""energy""",-0.527654
"""energy""","""acousticness""",-0.527654


In [None]:
fig = px.scatter(df_spotify.sample(fraction = 0.1, seed = 123),
    x = 'loudness', y = 'energy', trendline = 'ols', trendline_color_override = "red", opacity = 0.1,
    range_y = [0, 1])
fig.show()

In [25]:
result = px.get_trendline_results(fig)
result.px_fit_results.iloc[0].summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.528
Model:,OLS,Adj. R-squared:,0.528
Method:,Least Squares,F-statistic:,143400.0
Date:,"Sat, 06 Dec 2025",Prob (F-statistic):,0.0
Time:,19:28:01,Log-Likelihood:,100000.0
No. Observations:,128158,AIC:,-200000.0
Df Residuals:,128156,BIC:,-200000.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,0.9488,0.001,1138.448,0.000,0.947,0.950
x1,0.0460,0.000,378.679,0.000,0.046,0.046

0,1,2,3
Omnibus:,408.453,Durbin-Watson:,0.49
Prob(Omnibus):,0.0,Jarque-Bera (JB):,508.583
Skew:,-0.058,Prob(JB):,3.65e-111
Kurtosis:,3.286,Cond. No.,18.8


In [None]:
fig = px.scatter(df_spotify.sample(fraction = 0.1, seed = 123),
    x = 'acousticness', y = 'energy', trendline_color_override = "red", trendline = "ols",
opacity = 0.1)
fig.show()

In [27]:
result = px.get_trendline_results(fig)
result.px_fit_results.iloc[0].summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.274
Model:,OLS,Adj. R-squared:,0.274
Method:,Least Squares,F-statistic:,48390.0
Date:,"Sat, 06 Dec 2025",Prob (F-statistic):,0.0
Time:,19:28:01,Log-Likelihood:,72408.0
No. Observations:,128158,AIC:,-144800.0
Df Residuals:,128156,BIC:,-144800.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,0.7473,0.001,1319.663,0.000,0.746,0.748
x1,-0.3441,0.002,-219.967,0.000,-0.347,-0.341

0,1,2,3
Omnibus:,1380.436,Durbin-Watson:,0.984
Prob(Omnibus):,0.0,Jarque-Bera (JB):,1356.962
Skew:,-0.231,Prob(JB):,2.18e-295
Kurtosis:,2.798,Cond. No.,4.38


## 10.3 스포티파이 데이터로 글로벌 인기도 파악하기

In [28]:
expr_1 = pl.col('name').unique().len().over('main_vocal')
df_spotify_EDA1 = (
pl.concat([
    ## 글로벌 메인보컬 Top 10 산출
    (df_spotify.filter(pl.col('country') == 'WW').select(pl.col('main_vocal').alias('Global_Main_Vocal'),
        expr_1.alias('Global_Songs')).unique().sort('Global_Songs', descending = True).head(10)),
    ## 우리나라 메인보컬 Top 10 산출
    (df_spotify.filter(pl.col('country') == 'KR').select(pl.col('main_vocal').alias('KR_Main_Vocal'),
        expr_1.alias('KR_Songs')).unique().sort('KR_Songs', descending = True).head(10)),
    ## 미국 메인보컬 Top 10 산출
    (df_spotify.filter(pl.col('country') == 'US').select(pl.col('main_vocal').alias('US_Main_Vocal'),
        expr_1.alias('US_Songs')).unique().sort('US_Songs', descending = True).head(10)),
    ## 영국 메인보컬 Top 10 산출
    (df_spotify.filter(pl.col('country') == 'GB').select(pl.col('main_vocal').alias('GB_Main_Vocal'),
        expr_1.alias('GB_Songs')).unique().sort('GB_Songs', descending = True).head(10))],
    how = 'horizontal')
    .with_columns(pl.int_range(1, 11).alias('rank'))
    .select(pl.col('rank'), pl.all().exclude('rank')))
df_spotify_EDA1

rank,Global_Main_Vocal,Global_Songs,KR_Main_Vocal,KR_Songs,US_Main_Vocal,US_Songs,GB_Main_Vocal,GB_Songs
i64,str,u32,str,u32,str,u32,str,u32
1,"""Taylor Swift""",36,"""Jimin""",21,"""Taylor Swift""",37,"""Taylor Swift""",36
2,"""Beyoncé""",17,"""Lim Young Woong""",19,"""Future""",35,"""Oasis""",29
3,"""Kendrick Lamar""",17,"""DAY6""",16,"""Beyoncé""",23,"""Kanye West""",20
4,"""Future""",16,"""aespa""",16,"""Kendrick Lamar""",20,"""Beyoncé""",19
5,"""Kanye West""",15,"""NewJeans""",15,"""Zach Bryan""",18,"""Eminem""",19
6,"""Tyler""",15,"""Jung Kook""",13,"""Kanye West""",18,"""Ariana Grande""",16
7,"""Sabrina Carpenter""",15,"""V""",12,"""Post Malone""",17,"""Kendrick Lamar""",16
8,"""Ariana Grande""",14,"""LE SSERAFIM""",11,"""Ariana Grande""",17,"""Tyler""",15
9,"""Eminem""",14,"""YANGHONGWON""",10,"""Eminem""",17,"""Sabrina Carpenter""",15
10,"""Billie Eilish""",11,"""Kanye West""",10,"""Tyler""",16,"""Charli xcx""",14


In [None]:
fig = px.bar(
    (df_spotify.filter(pl.col('country').is_in(["WW", "KR", "US", "GB"])) ## 대상 국가 필터링
        .group_by('country', 'main_vocal') ## 국가와 메인보컬로 그룹화
        .agg(pl.col('name').unique().len()) ## 노래 이름의 고윳값에 대한 개수 산출
        .sort(['country', 'name'], descending = True) ## 결과를 국가와 노래 이름으로 정렬
        .group_by('country', maintain_order = True) ## 결과를 국가명으로 다시 그룹화
        .head(10)), ## 상위 10개만 선택
    ## X축은 main_vocal, Y축과 막대 텍스트는 name으로 설정
    x = 'main_vocal', y = 'name', text = 'name',
    facet_row = 'country', facet_row_spacing = 0.07, ## facet 설정
    labels = {"main_vocal": "메인보컬", "name": "노래수"}) ## 축 라벨 설정
fig.update_xaxes(matches = None, showticklabels = True) ## X축 간의 매칭과 틱라벨을 제거
fig.show()

In [30]:
(df_spotify.filter(pl.col('country') == "WW", ## 글로벌만 필터링
        pl.col('main_vocal') == "Taylor Swift") ## 테일러 스위프트만 필터링
    .group_by(['main_vocal', 'name']) ## 메인보컬과 노래명으로 그룹화
    .len('chart in days') ## 전체 개수 산출
    .sort('chart in days', descending = True).head(10))

main_vocal,name,chart in days
str,str,u32
"""Taylor Swift""","""Cruel Summer""",290
"""Taylor Swift""","""Fortnight (feat. Post Malone)""",119
"""Taylor Swift""","""I Can Do It With a Broken Hear…",76
"""Taylor Swift""","""Down Bad""",32
"""Taylor Swift""","""Who’s Afraid of Little Old Me?""",27
"""Taylor Swift""","""Guilty as Sin?""",27
"""Taylor Swift""","""But Daddy I Love Him""",24
"""Taylor Swift""","""So Long, London""",24
"""Taylor Swift""","""My Boy Only Breaks His Favorit…",24
"""Taylor Swift""","""Florida!!! (feat. Florence + T…",16


In [31]:
(df_spotify.filter(pl.col('country') == "KR", pl.col('main_vocal') == "Jimin")
    .group_by(['main_vocal', 'name']).len('chart in days').sort('chart in days', descending = 
True).head(10))

main_vocal,name,chart in days
str,str,u32
"""Jimin""","""Closer Than This""",353
"""Jimin""","""Like Crazy""",352
"""Jimin""","""Set Me Free Pt.2""",201
"""Jimin""","""Like Crazy (English Version)""",201
"""Jimin""","""Alone""",180
"""Jimin""","""Face-off""",179
"""Jimin""","""Smeraldo Garden Marching Band …",176
"""Jimin""","""Interlude : Showtime""",155
"""Jimin""","""Rebirth (Intro)""",155
"""Jimin""","""Slow Dance (feat. Sofia Carson…",155


In [32]:
df_spotify_EDA2 = ( pl.concat([
    (df_spotify.filter(pl.col('country') == "WW") ## 글로벌 차트만 필터링
        .select(pl.col('name').alias('Global_Song'), ## 노래명 열 선택
            pl.col('main_vocal').alias('Global_Vocal'), ## 메인보컬 열 선택
            pl.col('name').len().over('name').alias('Global_Day')) ## 노래명별 노래 수 산출
    ## 고유 행만 산출 정렬 후 상위 10곡만 출력
    .unique().sort('Global_Day', descending = True).head(10)),
    (df_spotify.filter(pl.col('country') == "KR") ## 한국 차트만 필터링
        .select(pl.col('name').alias('KR_Song'),
            pl.col('main_vocal').alias('KR_Vocal'),
            pl.col('name').len().over('name').alias('KR_Day'))
        .unique().sort('KR_Day', descending = True).head(10)),
    (df_spotify.filter(pl.col('country') == "US") ## 미국 차트만 필터링
        .select(pl.col('name').alias('US_Song'),
            pl.col('main_vocal').alias('US_Vocal'),
            pl.col('name').len().over('name').alias('US_Day'))
        .unique().sort('US_Day', descending = True).head(10)),
    (df_spotify.filter(pl.col('country') == "GB")
        .select(pl.col('name').alias('GB_Song'), ## 영국 차트만 핕터링
            pl.col('main_vocal').alias('GB_Vocal'),
            pl.col('name').len().over('name').alias('GB_Day'))
        .unique().sort('GB_Day', descending = True).head(10))],
    how = 'horizontal')
    .with_columns(pl.int_range(1, 11).alias('rank')) ## 순위 열 생성
    .select(pl.col('rank'), pl.all().exclude('rank')) ## 순위 열을 앞으로 재배치
)
df_spotify_EDA2

rank,Global_Song,Global_Vocal,Global_Day,KR_Song,…,US_Vocal,US_Day,GB_Song,GB_Vocal,GB_Day
i64,str,str,u32,str,…,str,u32,str,str,u32
1,"""One Of The Girls (with JENNIE,…","""The Weeknd""",341,"""3D (feat. Jack Harlow)""",…,"""Noah Kahan""",326,"""Stick Season""","""Noah Kahan""",327
2,"""I Wanna Be Yours""","""Arctic Monkeys""",341,"""Seven (feat. Latto) (Explicit …",…,"""Teddy Swims""",325,"""Lose Control""","""Teddy Swims""",320
3,"""Lose Control""","""Teddy Swims""",340,"""Closer Than This""",…,"""Zach Bryan""",322,"""Beautiful Things""","""Benson Boone""",309
4,"""Beautiful Things""","""Benson Boone""",328,"""Like Crazy""",…,"""Zach Bryan""",317,"""Mr. Brightside""","""The Killers""",300
5,"""The Night We Met""","""Lord Huron""",324,"""Standing Next to You""",…,"""Benson Boone""",313,"""Cruel Summer""","""Taylor Swift""",272
6,"""Cruel Summer""","""Taylor Swift""",290,"""Love Me Again""",…,"""Morgan Wallen""",272,"""Scared To Start""","""Michael Marcagi""",266
7,"""LUNA""","""Feid""",276,"""Grain of Sand""",…,"""Tyler""",263,"""Too Sweet""","""Hozier""",244
8,"""End of Beginning""","""Djo""",266,"""Do or Die""",…,"""Chappell Roan""",247,"""Unwritten""","""Natasha Bedingfield""",243
9,"""we can't be friends (wait for …","""Ariana Grande""",255,"""London Boy""",…,"""Sabrina Carpenter""",241,"""The Night We Met""","""Lord Huron""",232
10,"""Too Sweet""","""Hozier""",251,"""Polaroid""",…,"""Shaboozey""",239,"""Good Luck, Babe!""","""Chappell Roan""",229


In [33]:
(df_spotify_EDA2.style
    .tab_header(title = "2024년 노래 Top 10") ## 표 제목 설정
    .tab_stub(rowname_col = 'rank') ## 스텁 설정
    ## 스패너 설정
    .tab_spanner("글로벌", ['Global_Song', 'Global_Vocal', 'Global_Day'])
    .tab_spanner("한국", ['KR_Song', 'KR_Vocal', 'KR_Day'])
    .tab_spanner("미국", ['US_Song', 'US_Vocal', 'US_Day'])
    .tab_spanner("영국", ['GB_Song', 'GB_Vocal', 'GB_Day'])
    ## 열 정렬 설정
    .cols_align(align = "center")
    ## 열 라벨 설정
    .cols_label(Global_Song = "노래", Global_Vocal = "메인보컬", Global_Day = "차트일수",
        KR_Song = "노래", KR_Vocal = "메인보컬", KR_Day = "차트일수",
        US_Song = "노래", US_Vocal = "메인보컬", US_Day = "차트일수",
        GB_Song = "노래", GB_Vocal = "메인보컬", GB_Day = "차트일수"))

2024년 노래 Top 10,2024년 노래 Top 10,2024년 노래 Top 10,2024년 노래 Top 10,2024년 노래 Top 10,2024년 노래 Top 10,2024년 노래 Top 10,2024년 노래 Top 10,2024년 노래 Top 10,2024년 노래 Top 10,2024년 노래 Top 10,2024년 노래 Top 10,2024년 노래 Top 10
Unnamed: 0_level_1,글로벌,글로벌,글로벌,한국,한국,한국,미국,미국,미국,영국,영국,영국
Unnamed: 0_level_2,노래,메인보컬,차트일수,노래,메인보컬,차트일수,노래,메인보컬,차트일수,노래,메인보컬,차트일수
1,"One Of The Girls (with JENNIE, Lily Rose Depp)",The Weeknd,341,3D (feat. Jack Harlow),Jung Kook,353,Stick Season,Noah Kahan,326,Stick Season,Noah Kahan,327
2,I Wanna Be Yours,Arctic Monkeys,341,Seven (feat. Latto) (Explicit Ver.),Jung Kook,353,Lose Control,Teddy Swims,325,Lose Control,Teddy Swims,320
3,Lose Control,Teddy Swims,340,Closer Than This,Jimin,353,I Remember Everything (feat. Kacey Musgraves),Zach Bryan,322,Beautiful Things,Benson Boone,309
4,Beautiful Things,Benson Boone,328,Like Crazy,Jimin,352,Something in the Orange,Zach Bryan,317,Mr. Brightside,The Killers,300
5,The Night We Met,Lord Huron,324,Standing Next to You,Jung Kook,352,Beautiful Things,Benson Boone,313,Cruel Summer,Taylor Swift,272
6,Cruel Summer,Taylor Swift,290,Love Me Again,V,336,Last Night,Morgan Wallen,272,Scared To Start,Michael Marcagi,266
7,LUNA,Feid,276,Grain of Sand,Lim Young Woong,327,See You Again (feat. Kali Uchis),Tyler,263,Too Sweet,Hozier,244
8,End of Beginning,Djo,266,Do or Die,Lim Young Woong,322,"Good Luck, Babe!",Chappell Roan,247,Unwritten,Natasha Bedingfield,243
9,we can't be friends (wait for your love),Ariana Grande,255,London Boy,Lim Young Woong,299,Espresso,Sabrina Carpenter,241,The Night We Met,Lord Huron,232
10,Too Sweet,Hozier,251,Polaroid,Lim Young Woong,282,A Bar Song (Tipsy),Shaboozey,239,"Good Luck, Babe!",Chappell Roan,229


In [34]:
## 국가별, 메인보컬별, 노래별 차트일 수 붙이기
(pl.concat([
    (df_spotify.filter(pl.col('country') == "WW", pl.col('daily_rank') == 1)
        .group_by('name')
        .agg(pl.col('main_vocal').first().alias('Global_Main_Vocal'),
        pl.len().alias('Global_Chart_Days'))
        .rename({"name": "Global_Song"}).sort('Global_Chart_Days', descending = True).head(10)),
    (df_spotify.filter(pl.col('country') == "KR", pl.col('daily_rank') == 1)
        .group_by('name')
        .agg(pl.col('main_vocal').first().alias('KR_Main_Vocal'), pl.len().alias('KR_Chart_Days'))
        .rename({"name": "KR_Song"}).sort('KR_Chart_Days', descending = True).head(10)),
    (df_spotify.filter(pl.col('country') == "US", pl.col('daily_rank') == 1)
        .group_by('name')
        .agg(pl.col('main_vocal').first().alias('US_Main_Vocal'), pl.len().alias('US_Chart_Days'))
        .rename({"name": "US_Song"}).sort('US_Chart_Days', descending = True).head(10)),
    (df_spotify.filter(pl.col('country') == "GB", pl.col('daily_rank') == 1)
        .group_by('name')
        .agg(pl.col('main_vocal').first().alias('GB_Main_Vocal'), pl.len().alias('GB_Chart_Days'))
        .rename({"name": "GB_Song"}).sort('GB_Chart_Days', descending = True).head(10))],
    how = 'horizontal')
    .with_columns(pl.int_range(1, 11).alias('rank')) ## 순위 열 만들기
    .select(pl.col('rank'), pl.all().exclude('rank')).style ## 순위 열 순서 설정
    .tab_header(title = "2024년 차트 1위 노래 Top 10") ## 표 제목 설정
    .tab_stub(rowname_col = 'rank')
    ## 스패너 설정
    .tab_spanner("글로벌", ['Global_Song', 'Global_Main_Vocal', 'Global_Chart_Days'])
    .tab_spanner("한국", ['KR_Song', 'KR_Main_Vocal', 'KR_Chart_Days'])
    .tab_spanner("미국", ['US_Song', 'US_Main_Vocal', 'US_Chart_Days'])
    .tab_spanner("영국", ['GB_Song', 'GB_Main_Vocal', 'GB_Chart_Days'])
.cols_align(align = "center") ## 열 제목 정렬
    ## 열 라벨 설정
    .cols_label(Global_Song = "노래", Global_Main_Vocal = "메인보컬", Global_Chart_Days = "차트일수",
        KR_Song = "노래", KR_Main_Vocal = "메인보컬", KR_Chart_Days = "차트일수",
        US_Song = "노래", US_Main_Vocal = "메인보컬", US_Chart_Days = "차트일수",
        GB_Song = "노래", GB_Main_Vocal = "메인보컬", GB_Chart_Days = "차트일수"))

2024년 차트 1위 노래 Top 10,2024년 차트 1위 노래 Top 10,2024년 차트 1위 노래 Top 10,2024년 차트 1위 노래 Top 10,2024년 차트 1위 노래 Top 10,2024년 차트 1위 노래 Top 10,2024년 차트 1위 노래 Top 10,2024년 차트 1위 노래 Top 10,2024년 차트 1위 노래 Top 10,2024년 차트 1위 노래 Top 10,2024년 차트 1위 노래 Top 10,2024년 차트 1위 노래 Top 10,2024년 차트 1위 노래 Top 10
Unnamed: 0_level_1,글로벌,글로벌,글로벌,한국,한국,한국,미국,미국,미국,영국,영국,영국
Unnamed: 0_level_2,노래,메인보컬,차트일수,노래,메인보컬,차트일수,노래,메인보컬,차트일수,노래,메인보컬,차트일수
1,Die With A Smile,Lady Gaga,95,Like Crazy,Jimin,170.0,Not Like Us,Kendrick Lamar,48,Stick Season,Noah Kahan,73
2,Beautiful Things,Benson Boone,36,Who,Jimin,155.0,Taste,Sabrina Carpenter,35,Espresso,Sabrina Carpenter,65
3,Espresso,Sabrina Carpenter,33,Magnetic,ILLIT,11.0,CARNIVAL,Kanye West,34,Taste,Sabrina Carpenter,59
4,BIRDS OF A FEATHER,Billie Eilish,24,How Sweet,NewJeans,9.0,Please Please Please,Sabrina Carpenter,27,Please Please Please,Sabrina Carpenter,23
5,La Diabla,Xavi,21,Supernova,aespa,7.0,Lovin On Me,Jack Harlow,23,Last Christmas,Wham!,20
6,Who,Jimin,20,Supernatural,NewJeans,1.0,That’s So True,Gracie Abrams,18,Too Sweet,Hozier,17
7,APT.,ROSÉ,19,,,,Too Sweet,Hozier,17,That’s So True,Gracie Abrams,13
8,Please Please Please,Sabrina Carpenter,19,,,,Rockin' Around The Christmas Tree,Brenda Lee,16,BACKBONE,Chase & Status,9
9,i like the way you kiss me,Artemas,13,,,,Die With A Smile,Lady Gaga,16,Sailor Song,Gigi Perez,8
10,MILLION DOLLAR BABY,Tommy Richman,11,,,,"Good Luck, Babe!",Chappell Roan,13,BIRDS OF A FEATHER,Billie Eilish,8


In [35]:
(df_spotify.filter(pl.col('country').is_in(["WW", "KR", "US", "GB"]), ## 국가 필터링
        pl.col('daily_rank') == 1) ## 1위만 필터링
    ## 국가별 1위곡 수 산출
    .select(pl.col('country'), pl.col('name').unique().len().over('country'))
    .unique().sort('name', descending = True))

country,name
str,u32
"""US""",35
"""GB""",22
"""WW""",21
"""KR""",6


In [36]:
fig = px.line((df_spotify.
    ## 우리나라 1위곡들만 필터링
    filter(pl.col('name').is_in(["Like Crazy", "Who", "Magnetic", "How Sweet", "Supernova",
"Supernatural"]),
        pl.col('country') == 'KR')),
    x = 'snapshot_date', y = 'daily_rank', color = 'name', line_dash = 'name',
    labels = {"snapshot_date": "날짜", "daily_rank": "순위", "name": "노래"})
fig.update_yaxes(autorange = "reversed")
fig.show()

In [37]:
(df_spotify.filter(pl.col('name') == "APT.", pl.col('daily_rank') == 1)
    .select(pl.col('continent'), pl.col('country').unique().len().over('continent').alias('NO.1'))
    .unique().sort('NO.1', descending = True))

continent,NO.1
str,u32
"""Asia""",9
"""Europe""",2
"""Oceania""",2
"""North America""",2
"""Global""",1


In [38]:
(df_spotify.filter(pl.col('name') == "APT.", pl.col('daily_rank') == 1,
    pl.col('continent') == "Global").select(pl.col('snapshot_date')))

snapshot_date
date
2024-10-22
2024-10-23
2024-10-24
2024-10-25
2024-10-26
2024-10-27
2024-10-28
2024-10-29
2024-10-30
2024-10-31


In [39]:
(df_spotify.filter(pl.col('country') == "KR", pl.col('name') == "APT.")
.select(pl.col('daily_rank').min()))

daily_rank
i64
2


In [40]:
fig = px.line((df_spotify.filter(pl.col('country').is_in(["WW", "KR", "US", "GB"]),
pl.col('name') == "APT.")),
    x = 'snapshot_date', y = 'daily_rank', color = 'country', line_dash = 'country')
fig.update_yaxes(autorange = "reversed")
fig.show()

In [41]:
df_spotify_EDA4 = (
    df_spotify.filter(pl.col('name') == "APT.").select(pl.col('country'), pl.col('continent'),
        pl.col('country').map_elements(
            lambda x: pc.country_name_to_country_alpha3(pc.country_alpha2_to_country_name(x))
                if x != "WW" else "WW", return_dtype = pl.String).alias('nation'),
                    (pl.col('country').map_elements(
                        lambda x: pc.country_alpha2_to_country_name(x) if x != "WW" else "WW",
                            return_dtype = pl.String).alias('nation_name')),
        pl.col('popularity'), pl.col('daily_rank'), pl.col('name').len().over('country').
alias('chart_days'))
            .group_by('nation').agg(pl.col('country').first(),
        pl.col('nation_name').first(), pl.col('continent').first(),
        pl.col('popularity').mean(), pl.col('daily_rank').mean(),
        pl.col('chart_days').first()))
df_spotify_EDA4.sort('daily_rank')

nation,country,nation_name,continent,popularity,daily_rank,chart_days
str,str,str,str,f64,f64,u32
"""SGP""","""SG""","""Singapore""","""Asia""",90.301587,1.238095,63
"""HKG""","""HK""","""Hong Kong""","""Asia""",90.301587,1.301587,63
"""MYS""","""MY""","""Malaysia""","""Asia""",90.301587,1.31746,63
"""TWN""","""TW""","""Taiwan, Province of China""","""Asia""",90.301587,2.31746,63
"""ARE""","""AE""","""United Arab Emirates""","""Asia""",90.301587,2.412698,63
"""JPN""","""JP""","""Japan""","""Asia""",90.532258,2.467742,62
"""WW""","""WW""","""WW""","""Global""",90.301587,2.507937,63
"""PHL""","""PH""","""Philippines""","""Asia""",90.301587,2.587302,63
"""SAU""","""SA""","""Saudi Arabia""","""Asia""",89.809524,2.873016,63
"""KAZ""","""KZ""","""Kazakhstan""","""Asia""",90.301587,3.587302,63


In [42]:
df_spotify.select(pl.col('country').unique()).join(df_spotify_EDA4, on = 'country', how = "anti")

country
str
"""AR"""
"""BY"""
"""NG"""
"""UY"""
"""EG"""


In [43]:
fig = px.choropleth(df_spotify_EDA4, locations = 'nation', color = 'popularity', scope = "world",
    hover_name = 'nation_name', color_continuous_scale = "greens", width = 800, height = 600,
    title = "로제의 APT. 인기도")
fig.show()

In [44]:
fig = px.choropleth(df_spotify_EDA4, locations = 'nation', color = 'daily_rank', scope = "world",
    hover_name = 'nation_name', color_continuous_scale = "greens_r", width = 800, height = 600,
    title = "로제의 APT. 평균 순위")
fig.show()