In [1]:
# IMPORT LIBRARY
import pandas as pd
from bs4 import BeautifulSoup as bs
import requests
import datetime

In [2]:
# GET CURRENT TIME (KST, UTC+9) FOR FILENAME
# datetime.utcnow() is deprecated since Python 3.12, so take a timezone-aware
# "now" in a fixed UTC+9 offset instead. The formatted string is the same as before.
KST = datetime.timezone(datetime.timedelta(hours=9))
now_kst = datetime.datetime.now(KST).strftime('%Y-%m-%d %H:%M:%S')

In [3]:
# TYPE OPTIONS (path segment inserted into the chart URL below)
type_1 = ""         # TOP 100
type_2 = "day/"     # DAILY
type_3 = "week/"    # WEEKLY
type_4 = "month/"   # MONTHLY

# SELECT TYPE
# Stored as chart_type rather than `type`, so the built-in type() is not
# shadowed for the rest of the kernel session.
chart_type = type_1

# URL WITH TYPE
url = f"https://www.melon.com/chart/{chart_type}index.htm"

In [4]:
# REQUEST HTML DOCUMENT
# timeout keeps the cell from hanging indefinitely on a stalled connection;
# raise_for_status() stops here on an HTTP error instead of parsing an error page.
response = requests.get(url, headers={"user-agent":"Chrome/105.0.0.0"}, timeout=10)
response.raise_for_status()

# BEAUTIFUL SOUP HTML PARSING
html = bs(response.text, 'html.parser')

# LOCATE <tbody> WHICH CONTAINS THE RANKINGS
tbody = html.select_one('#frm > div > table > tbody')
if tbody is None:
    # select_one() returns None when nothing matches; fail here with a clear
    # message rather than with an AttributeError in the parsing cell.
    raise ValueError(f"Could not find the chart <tbody> in the page fetched from {url}")

In [5]:
item_list = []              # COLLECTS ONE DICTIONARY PER SONG
col = [                     # COLUMN LABEL LIST (ORDER MATCHES col_data BELOW)
    'Rank',
    'Id',
    'Title',
    'Artist',
    'Album',
    'Image',
    'Likes',
]

trs = tbody.select('tr')    # LOCATE ALL <tr> IN <tbody> (EACH <tr> REPRESENTS ONE SONG)

for tr in trs:
    cells = tr.select('td') # LOOK UP THE ROW'S <td> CELLS ONCE, REUSED FOR ALBUM AND IMAGE

    rank = tr.select_one('td:nth-of-type(2) > div > span.rank').text
    # Named song_id rather than `id`, so the built-in id() is not shadowed.
    song_id = tr['data-song-no']    # REQUIRED FOR LIKE COUNT SEARCH
    title = tr.select_one('td:nth-of-type(6) > div > div > div.ellipsis.rank01 > span > a').text
    artist = tr.select_one('td:nth-of-type(6) > div > div > div.ellipsis.rank02 > span > a').text
    album = cells[6].text.strip()
    image = cells[3].img['src']
    likes = 0                       # PLACEHOLDER; OVERWRITTEN WHEN LIKE COUNTS ARE FETCHED

    # VALUES IN THE SAME ORDER AS THE LABELS IN col
    col_data = [rank, song_id, title, artist, album, image, likes]

    # PAIR EACH LABEL WITH ITS VALUE -> {'Rank': rank, 'Id': song_id, ...}
    # AND APPEND TO THE PARENT LIST BEFORE MOVING TO THE NEXT SONG (<tr>)
    item_list.append(dict(zip(col, col_data)))

In [6]:
# On Melon the like count is a dynamic value: the HTML is only filled in after JavaScript runs.
# Crawling the chart URL above with BeautifulSoup therefore yields a like count of 0.
# In Chrome Inspect, open Network, click Fetch/XHR and take the Request URL of
# getSongLike.json? to get an endpoint that returns the like counts.
# Dynamic pages can be crawled by adding Selenium, but the approach below gets the
# counts without it.

# GATHER ALL MUSIC IDS INTO A STRING (SEPARATED WITH COMMA)
contsId = ",".join(item['Id'] for item in item_list)

# URL TO GET LIVE LIKE COUNTS FOR ALL MUSIC
like_url = f"https://www.melon.com/commonlike/getSongLike.json?contsIds={contsId}"
# timeout avoids hanging on a stalled connection; raise_for_status() surfaces an
# HTTP error here instead of failing later while decoding the body.
like_response = requests.get(like_url, headers={"user-agent":"Chrome/105.0.0.0"}, timeout=10)
like_response.raise_for_status()
# JSON FORMAT
like_list = like_response.json()['contsLike']

# EDIT LIKE COUNTS OF EACH MUSIC WITHIN THE ITEM_LIST
# NOTE(review): entries are matched to item_list by position, which assumes the
# response lists songs in the same order as the requested ids - confirm.
for item, like in zip(item_list, like_list):
    item['Likes'] = like['SUMMCNT']

In [7]:
# BUILD A PANDAS DATA FRAME FROM THE LIST OF PER-SONG DICTIONARIES
# (one row per song, one column per dictionary key)
df = pd.DataFrame(item_list)
display(df)

Unnamed: 0,Rank,Id,Title,Artist,Album,Image,Likes
0,1,35595136,새삥 (Prod. ZICO) (Feat. 호미들),지코 (ZICO),스트릿 맨 파이터(SMF) Original Vol.3 (계급미션),https://cdnimg.melon.co.kr/cm2/album/images/11...,84765
1,2,35546497,After LIKE,IVE (아이브),After LIKE,https://cdnimg.melon.co.kr/cm2/album/images/11...,168701
2,3,35454425,Attention,NewJeans,NewJeans 1st EP 'New Jeans',https://cdnimg.melon.co.kr/cm2/album/images/11...,166310
3,4,35640077,Shut Down,BLACKPINK,BORN PINK,https://cdnimg.melon.co.kr/cm2/album/images/11...,81163
4,5,35454426,Hype boy,NewJeans,NewJeans 1st EP 'New Jeans',https://cdnimg.melon.co.kr/cm2/album/images/11...,136808
...,...,...,...,...,...,...,...
95,96,32698101,OHAYO MY NIGHT,디핵 (D-Hack),OHAYO MY NIGHT,https://cdnimg.melon.co.kr/cm2/album/images/10...,196041
96,97,33625988,바라만 본다,MSG워너비(M.O.M),MSG워너비 1집,https://cdnimg.melon.co.kr/cm2/album/images/10...,182026
97,98,35609035,인생은 뷰티풀,김호중,인생은 뷰티풀: 비타돌체,https://cdnimg.melon.co.kr/cm2/album/images/11...,13375
98,99,35640083,Ready For Love,BLACKPINK,BORN PINK,https://cdnimg.melon.co.kr/cm2/album/images/11...,29251


In [8]:
# SAVE DATAFRAME AS A CSV FILE
# now_kst is formatted with ':' between the time fields, and ':' is not allowed
# in Windows file names, so replace it with '-' to keep the notebook portable.
filename = f"{now_kst}_melon_top_100_raw.csv".replace(":", "-")
df.to_csv(filename, index=False, encoding="utf-8-sig")
# READ THE FILE BACK SO THE CELL OUTPUT SHOWS WHAT WAS ACTUALLY WRITTEN
pd.read_csv(filename)

Unnamed: 0,Rank,Id,Title,Artist,Album,Image,Likes
0,1,35595136,새삥 (Prod. ZICO) (Feat. 호미들),지코 (ZICO),스트릿 맨 파이터(SMF) Original Vol.3 (계급미션),https://cdnimg.melon.co.kr/cm2/album/images/11...,84765
1,2,35546497,After LIKE,IVE (아이브),After LIKE,https://cdnimg.melon.co.kr/cm2/album/images/11...,168701
2,3,35454425,Attention,NewJeans,NewJeans 1st EP 'New Jeans',https://cdnimg.melon.co.kr/cm2/album/images/11...,166310
3,4,35640077,Shut Down,BLACKPINK,BORN PINK,https://cdnimg.melon.co.kr/cm2/album/images/11...,81163
4,5,35454426,Hype boy,NewJeans,NewJeans 1st EP 'New Jeans',https://cdnimg.melon.co.kr/cm2/album/images/11...,136808
...,...,...,...,...,...,...,...
95,96,32698101,OHAYO MY NIGHT,디핵 (D-Hack),OHAYO MY NIGHT,https://cdnimg.melon.co.kr/cm2/album/images/10...,196041
96,97,33625988,바라만 본다,MSG워너비(M.O.M),MSG워너비 1집,https://cdnimg.melon.co.kr/cm2/album/images/10...,182026
97,98,35609035,인생은 뷰티풀,김호중,인생은 뷰티풀: 비타돌체,https://cdnimg.melon.co.kr/cm2/album/images/11...,13375
98,99,35640083,Ready For Love,BLACKPINK,BORN PINK,https://cdnimg.melon.co.kr/cm2/album/images/11...,29251
