- html에 대한 1차 파싱을 진행한다.
- 이 파싱 정보로 artist_info, song_info 등을 가져와 저장해야 한다.

In [1]:
import os
import json
from bs4 import BeautifulSoup

## get file names

In [3]:
def getHtmlFileNames(target_year):
    """Return the sorted names of the ``.html`` files saved for one year.

    Args:
        target_year: Value formatted into the ``htmls/<target_year>`` folder
            path that is listed.

    Returns:
        A list of file names (not full paths) ending in ``.html``, in
        ascending order. Other directory entries are ignored.
    """
    year_dir = "htmls/{}".format(target_year)
    return sorted(name for name in os.listdir(year_dir) if name.endswith(".html"))

## parse a file

In [8]:
def openHtml(target_year, file_name, encoding="utf-8"):
    """Read one saved chart page and return its full text.

    Args:
        target_year: Value formatted into the ``htmls/<target_year>`` folder.
        file_name: Name of the file inside that folder.
        encoding: Text encoding used to decode the file. Defaults to UTF-8,
            which assumes the pages were saved as UTF-8 (TODO confirm against
            the code that downloads them); pass another codec otherwise.

    Returns:
        The file content as a string.

    Raises:
        FileNotFoundError: If the file does not exist.
        UnicodeDecodeError: If the file is not valid in ``encoding``.
    """
    html_path = "htmls/{}/{}".format(target_year, file_name)
    # Decode explicitly: without ``encoding`` Python falls back to the
    # platform locale, so the same file could read differently (or fail)
    # from one machine to another.
    with open(html_path, "r", encoding=encoding) as f:
        return f.read()

In [9]:
def soupHtml(html):
    """Parse an HTML string with the ``lxml`` parser and return the soup."""
    return BeautifulSoup(html, "lxml")

In [10]:
def splitTrs(soup):
    """Return one ``<tr>`` tag per chart entry.

    Every ``<tr>`` in the document is collected; the first one is treated as
    the table header and dropped.
    """
    return soup.find_all("tr")[1:]

In [11]:
def _stripJsCall(href, func_name):
    """Return what is left of a ``javascript:melon.link.<func_name>('...');`` href
    after removing the call wrapper, i.e. the quoted argument."""
    prefix = "javascript:melon.link.{}('".format(func_name)
    return href.replace(prefix, "").replace("');", "")


def parseAtr(a_tr):
    """Extract one chart entry from a table-row tag.

    Args:
        a_tr: A parsed ``<tr>`` tag (as returned by ``splitTrs``).

    Returns:
        A dict with the keys ``ranking``, ``title_num``, ``title_name``,
        ``album_num``, ``album_name`` (all strings) and ``artists``, a list of
        unique ``(artist_num, artist_name)`` tuples in document order.

    Raises:
        AttributeError: If an expected tag is missing from the row (the
            lookup returns ``None``).
        KeyError: If a found tag lacks the attribute that is read from it.
    """
    # ranking: prefer the "rank top" span and fall back to the plain "rank"
    # span. An explicit None check replaces the former bare ``except:``, which
    # also swallowed unrelated errors such as KeyboardInterrupt.
    rank_tag = a_tr.find("span", class_="rank top")
    if rank_tag is None:
        rank_tag = a_tr.find("span", class_="rank")
    ranking = rank_tag.get_text(strip=True)

    # title_num: the song number is carried by the like-button's data attribute.
    title_num = a_tr.find("button", class_="btn_icon like")["data-song-no"]

    # title_name
    title_name = a_tr.find("div", class_="ellipsis rank01").strong.get_text(strip=True)

    # album_name / album_num: both live in the same cell, so look it up once.
    # The name comes from the "fc_mgray" link, the number from the first link.
    album_cell = a_tr.find("div", class_="ellipsis rank03")
    album_name = album_cell.find("a", class_="fc_mgray").get_text(strip=True)
    album_num = _stripJsCall(album_cell.find("a")["href"], "goAlbumDetail")

    # artists: one (num, name) pair per link; dict.fromkeys drops repeated
    # pairs while keeping first-seen order.
    artist_links = a_tr.find("div", class_="ellipsis rank02").find_all("a")
    artist_pairs = (
        (_stripJsCall(tag["href"], "goArtistDetail"), tag.get_text(strip=True))
        for tag in artist_links
    )
    artist = list(dict.fromkeys(artist_pairs))

    return {
        "ranking": ranking,
        "title_num": title_num,
        "title_name": title_name,
        "album_num": album_num,
        "album_name": album_name,
        "artists": artist,
    }

## do all html

In [7]:
# Chart year to process; it selects the htmls/<year> input folder and is
# reused below for the week labels and the output file name.
target_year = "2023"
# Sorted names of the saved .html pages for that year.
html_file_names = getHtmlFileNames(target_year)
print("files: {}".format(len(html_file_names)))

files: 20


In [12]:
# Parse every saved chart page of the year into one flat list of row dicts.
lines = []
for cnt, a_name in enumerate(html_file_names, 1):
    # Week label = year + 1-based position of the file in sorted order.
    # NOTE(review): the number is not read from the file name, so this assumes
    # the names sort chronologically and no week is missing - confirm.
    yearweek = "{}{:02}".format(target_year, cnt)
    # The file name without ".html" is stored as the chart period.
    period = a_name.replace(".html", "")
    
    print("yearweek: {}".format(yearweek))
    
    html = openHtml(target_year, a_name)
    soup = soupHtml(html)
    trs = splitTrs(soup)
    for a_tr in trs:
        line = parseAtr(a_tr)
        # Tag each row with the week/period of the page it came from.
        line["yearweek"] = yearweek
        line["period"] = period
        lines.append(line)
        

yearweek: 202301
yearweek: 202302
yearweek: 202303
yearweek: 202304
yearweek: 202305
yearweek: 202306
yearweek: 202307
yearweek: 202308
yearweek: 202309
yearweek: 202310
yearweek: 202311
yearweek: 202312
yearweek: 202313
yearweek: 202314
yearweek: 202315
yearweek: 202316
yearweek: 202317
yearweek: 202318
yearweek: 202319
yearweek: 202320


In [13]:
# Sanity check: report how many chart rows were collected across all files.
print("total {} lines".format(len(lines)))

total 2000 lines


## save json

In [14]:
# Output path: one .jsonl file per chart year under parsed_chart_html/.
file_name = "parsed_chart_html/{}.jsonl".format(target_year)

In [15]:
# Create the output folder on first run: open(..., "w") does not create
# missing parent directories and would raise FileNotFoundError.
out_dir = os.path.dirname(file_name)
if out_dir:
    os.makedirs(out_dir, exist_ok=True)

# Write one JSON object per line. ensure_ascii=False stores non-ASCII text
# as-is in the UTF-8 file instead of \uXXXX escapes.
with open(file_name, "w", encoding="utf-8") as of:
    for line in lines:
        print(json.dumps(line, ensure_ascii=False), file=of)