In [1]:
import os
import json
from bs4 import BeautifulSoup
from tqdm import tqdm

## parse artist info

In [8]:
def parseArtistInfo(artist_num):
    artist_info = {"artist_num": str(artist_num)}
    #
    artist_html = "htmls/artists/{}.html".format(artist_num)
    with open(artist_html, "r", encoding='UTF-8') as f:
        html = f.read()
        
    if "존재하지 않는 아티스트 정보입니다" in html:
        return artist_info
    #
    soup = BeautifulSoup(html, "lxml")
    #
    artist_name = soup.find("meta", attrs={"property": "og:title"})["content"]
    artist_info["이름"] = artist_name
    #
    info_part_tags = soup.find_all("dl", class_="list_define clfix")
    info_part_tag = [_ for _ in info_part_tags if "유형" in _.get_text()][0]
    #info_part_tag = soup.find_all("dl", class_="list_define clfix")[-2]
    
    dt_list = info_part_tag.find_all("dt")
    dd_list = info_part_tag.find_all("dd")
    #
    for dt, dd in zip(dt_list, dd_list):
        key = dt.get_text(strip=True).strip()
        val = dd.get_text(strip=True).strip()
        artist_info[key] = val
    #
    return artist_info

### read all htmls list

In [9]:
artist_nums = [_.replace(".html", "") for _ in os.listdir("htmls/artists/")]
len(artist_nums)

47

### check alread parsed

In [10]:
artist_info_file = "artist_infos.jsonl"
with open(artist_info_file, "r", encoding='utf-8') as f:
    lines = f.readlines()

already = [json.loads(_)["artist_num"] for _ in lines]
print("already: {}".format(len(already)))

already: 2553


In [11]:
artist_nums = [_ for _ in artist_nums if _ not in already]
print("artist_nums to parse: {}".format(len(artist_nums)))

artist_nums to parse: 18


### start parsing

In [12]:
artist_infos = []
for artist_num in tqdm(artist_nums):
    info = parseArtistInfo(artist_num)
    artist_infos.append(info)

100%|██████████| 18/18 [00:00<00:00, 33.70it/s]


In [13]:
artist_infos[0]

{'artist_num': '2006344',
 '이름': 'Stray Kids (스트레이 키즈)',
 '데뷔': '2018.03.25',
 '활동년대': '2010, 2020 년대',
 '유형': '그룹|남성',
 '장르': '랩/힙합, 댄스, 록/메탈, R&B/Soul, 발라드, 일렉트로니카, 국내드라마, 애니메이션/웹툰, 포크/블루스',
 '소속사명': '(주)JYP엔터테인먼트'}

### save

In [15]:
# 저장
artist_info_file = "artist_infos.jsonl"
with open(artist_info_file, "a", encoding='UTF-8') as of:
    for line in tqdm(artist_infos):
        print(json.dumps(line, ensure_ascii=False), file=of)

100%|██████████| 18/18 [00:00<00:00, 18005.60it/s]


## Parse Songinfo

### def parser

In [27]:
def parseSongHtml(song_id):
    #
    song_info = {"song_id": song_id}
    #
    song_html = "htmls/songs/{}.html".format(song_id)
    with open(song_html, "r", encoding='utf-8') as f:
        html = f.read()
        #
    if "존재하지 않는 곡 정보입니다." in html:
        return song_info
    #
    soup = BeautifulSoup(html, "lxml")
    print(soup.text)
    #
    #
    sub_tag = soup.find("div", class_="section_info")
    #
    # song_name
    song_name = list(sub_tag.find("div", class_="song_name").children)[-1].strip()
    song_info["song_name"] = song_name
    #
    # artist_name
    #artist_name = sub_tag.find("div", class_="artist").get_text(strip=True)
    artist_names = [_["title"] for _ in sub_tag.find("div", class_="artist").find_all("a", class_="artist_name")]
    artist_name = " | ".join(artist_names)
    song_info["artist_name"] = artist_name
    #
    dt_list = sub_tag.find("div", class_="meta").find_all("dt")
    dd_list = sub_tag.find("div", class_="meta").find_all("dd")

    for dt, dd in zip(dt_list, dd_list):
        key = dt.get_text(strip=True)
        val = dd.get_text(strip=True)

        if key == "FLAC":
            continue
        song_info[key]= val

        if key == "앨범":
            song_info["album_num"] = dd.a["href"].replace("javascript:melon.link.goAlbumDetail('", "").replace("');", "")
    #
    lyric_tag = soup.find("div", class_="lyric")
    if not lyric_tag:
        lyric = ""
    else:
        lyric = lyric_tag.get_text(separator="\n").strip()
    song_info["lyric"] = lyric
    #
    return song_info

### read song html list

In [19]:
song_nums = [_.replace(".html", "") for _ in os.listdir("htmls/songs/")]
song_nums = [_ for _ in song_nums if _.isdigit()]
len(song_nums)

5140

### cheak already parsed

In [20]:
song_info_file = "song_infos.jsonl"
try:
    with open(song_info_file, "r", encoding='utf-8') as f:
        lines = f.readlines()
except:
    lines = []

already = [json.loads(_)["song_id"] for _ in lines]
print("already: {}".format(len(already)))

already: 14897


In [21]:
song_nums = [_ for _ in song_nums if _ not in already]
print("song_nums to parse: {}".format(len(song_nums)))

song_nums to parse: 10


### start parsing

In [28]:
song_infos = []
for song_id in tqdm(song_nums):
    info = parseSongHtml(song_id)
    song_infos.append(info)

  0%|          | 0/10 [00:00<?, ?it/s]







AttributeError: 'NoneType' object has no attribute 'find'

In [24]:
song_infos[-1]

IndexError: list index out of range

### save

In [15]:
song_info_file = "song_infos.jsonl"
with open(song_info_file, "a") as of:
    for line in tqdm(song_infos):
        print(json.dumps(line, ensure_ascii=False), file=of)

100%|█████████████████████████████████████████████████████████████| 6/6 [00:00<00:00, 10658.97it/s]
