In [1]:
# Import
import pandas as pd

import requests
from bs4 import BeautifulSoup as bs

import src.mongos as mg

# Functions

In [2]:
def get_soup(year, week, service):
    """Download one weekly Gaon online-chart page and parse it.

    Args:
        year: Value sent as the ``hitYear`` query parameter.
        week: Value sent as the ``targetTime`` query parameter.
        service: Value sent as the ``serviceGbn`` query parameter.

    Returns:
        A BeautifulSoup document built from the response body.

    Raises:
        requests.exceptions.RequestException: If the request fails or does
            not complete within the timeout.
    """
    url = 'http://www.gaonchart.co.kr/main/section/chart/online.gaon'
    # The query is passed through ``params`` instead of a backslash-continued
    # f-string: a continued string literal keeps the indentation of every
    # continuation line, which put stray spaces inside the query string
    # (before ``nationGbn`` and after the ``hitYear`` value).
    params = {
        'nationGbn': 'T',
        'serviceGbn': service,
        'targetTime': week,
        'hitYear': year,
        'termGbn': 'week',
    }

    # Without a timeout a stalled connection would block the scrape forever.
    r = requests.get(url, params=params, timeout=30)

    # Name the parser explicitly so the parse tree does not depend on which
    # parsers happen to be installed; "lxml" is the one BeautifulSoup's own
    # "no parser was explicitly specified" warning reported it had picked.
    return bs(r.text, 'lxml')

In [3]:
def get_date_and_tr_list(soup):
    """Pull the selected week label and the chart table rows out of a page.

    Args:
        soup: Parsed chart page exposing a CSS ``select`` method.

    Returns:
        A ``(date, tr_list)`` tuple. ``date`` is the text of the first
        ``option`` matched under ``#chart_week_select`` with an empty
        ``selected`` attribute, or ``None`` when nothing matches.
        ``tr_list`` is every match of ``div.chart tr``.
    """
    selected_options = soup.select('#chart_week_select option[selected=""]')
    try:
        date = selected_options[0].get_text()
    except IndexError:
        # No selected option on the page; callers handle the missing label.
        date = None

    return date, soup.select('div.chart tr')

In [4]:
def get_ranking_dict(tr):
    """Extract one chart entry from a ranking table row.

    Args:
        tr: Row element exposing the BeautifulSoup-style ``select_one`` API.

    Returns:
        dict with the keys ``ranking``, ``change``, ``album_img_src``,
        ``subject``, ``singer``, ``album``, ``gaon_score``, ``production``
        and ``distribution``. ``album_img_src`` is ``''`` when the image
        cannot be read and ``gaon_score`` is ``None`` when the row has no
        ``td.count`` cell; every other value is a string taken from the row.

    Raises:
        ValueError: If the singer/album title does not split on ``'|'`` into
            exactly two parts.
        IndexError: If the subject or production cell holds fewer than two
            ``<p>`` elements.

    Lookups for the other required cells are not guarded, so their errors
    propagate to the caller as well.
    """
    ranking = tr.select_one('td.ranking').get_text()

    # Movement is stored as "<first css class><text>", e.g. "up10" or "down2".
    change_span = tr.select_one('td.change').select_one('span')
    change = change_span.get('class', [''])[0] + change_span.get_text()

    # The cover image is optional. Catch only the failures a missing cell
    # (AttributeError), missing <img> (TypeError) or missing src (KeyError)
    # can produce; a bare ``except`` would also swallow KeyboardInterrupt.
    # NOTE(review): the sample output recorded in this notebook shows an empty
    # album_img_src on every row, so this selector may never match (possibly
    # a typo in the class name) -- confirm against the live markup.
    try:
        album_img_src = tr.select_one('td.albuming').find('img')['src']
    except (AttributeError, KeyError, TypeError):
        album_img_src = ''

    # First <p> title is the song; the second is "singer | album".
    subject_p = tr.select_one('td.subject').select('p')
    subject = subject_p[0]['title']
    singer, album = [x.strip() for x in subject_p[1]['title'].split('|')]

    count_td = tr.select_one('td.count')
    if count_td:
        gaon_score = count_td.find('p').get_text()
    else:
        gaon_score = None

    production_p = tr.select_one('td.production').select('p')
    production = production_p[0]['title']
    distribution = production_p[1]['title']

    return {
        'ranking': ranking,
        'change': change,
        'album_img_src': album_img_src,
        'subject': subject,
        'singer': singer,
        'album': album,
        'gaon_score': gaon_score,
        'production': production,
        'distribution': distribution
    }

In [5]:
def get_ranking_list(tr_list):
    """Convert each table row into a ranking dict, preserving row order.

    Args:
        tr_list: Iterable of row elements accepted by ``get_ranking_dict``.

    Returns:
        list with one ``get_ranking_dict`` result per row.
    """
    return [get_ranking_dict(row) for row in tr_list]

In [6]:
def get_ranking_df(ranking_list, year, week, date):
    """Build a DataFrame of chart rows tagged with their year, week and date.

    Args:
        ranking_list: List of per-row dicts.
        year: Value stored in the ``year`` column of every row.
        week: Value stored in the ``week`` column of every row.
        date: Value stored in the ``date`` column; when falsy,
            ``year + week`` is stored instead.

    Returns:
        pandas.DataFrame with the row data followed by the ``year``,
        ``week`` and ``date`` columns.
    """
    date_label = date or (year + week)
    return pd.DataFrame(ranking_list).assign(
        year=year,
        week=week,
        date=date_label,
    )

In [7]:
def get_year_week_list():
    """List every (year, week) pair to fetch, in chronological order.

    Last week covered for each year:
        2010: 52
        2011: 53
        2012: 52
        2013: 52
        2014: 52
        2015: 53
        2016: 53
        2017: 52
        2018: 52
        2019: 12

    Returns:
        list of ``(year, week)`` tuples of strings, where ``week`` is
        zero-padded to two digits (e.g. ``('2010', '01')``).
    """
    # One table replaces the nested skip conditions; weeks always start at 1.
    last_week_by_year = {
        2010: 52,
        2011: 53,
        2012: 52,
        2013: 52,
        2014: 52,
        2015: 53,
        2016: 53,
        2017: 52,
        2018: 52,
        2019: 12,
    }

    return [
        (str(year), f'{week:0>2}')
        for year, last_week in sorted(last_week_by_year.items())
        for week in range(1, last_week + 1)
    ]

# get_year_week_list()

In [8]:
def fetch_ranking_df(year, week, service):
    """Fetch one weekly chart page and return its rows as a DataFrame.

    Args:
        year: Chart year, forwarded to ``get_soup`` and ``get_ranking_df``.
        week: Chart week, forwarded to ``get_soup`` and ``get_ranking_df``.
        service: Service code forwarded to ``get_soup``.

    Returns:
        The DataFrame produced by ``get_ranking_df`` for this page.

    Raises:
        IndexError: Re-raised from row parsing after ``year + week`` has
            been printed to identify the page that failed.
    """
    page = get_soup(year, week, service)
    date, rows = get_date_and_tr_list(page)

    # The first row is the table header, so parsing starts at the second.
    data_rows = rows[1:]
    try:
        entries = get_ranking_list(data_rows)
    except IndexError as err:
        print(year + week)
        raise err

    return get_ranking_df(entries, year, week, date)

In [9]:
def fetch_all_ranking_and_save(service='ALL'):
    """Fetch every week from ``get_year_week_list`` and store each one.

    Each week's DataFrame is handed to ``mg.to_mongo`` with the arguments
    ``'gaon'`` and ``'all'`` (presumably the database and collection names;
    confirm against ``src.mongos``). Those arguments are fixed and do not
    vary with ``service``.

    Args:
        service: Service code forwarded to ``fetch_ranking_df``.
    """
    for year, week in get_year_week_list():
        weekly_df = fetch_ranking_df(year, week, service)
        mg.to_mongo(weekly_df, 'gaon', 'all')

In [10]:
# fetch_all_ranking_and_save()

In [11]:
# df = fetch_ranking_df('2019', '13', 'ALL'); df.head()



 BeautifulSoup([your markup])

to this:

 BeautifulSoup([your markup], "lxml")

  markup_type=markup_type))


Unnamed: 0,album,album_img_src,change,distribution,gaon_score,production,ranking,singer,subject,year,week,date
0,사계 (Four Seasons),,newnew,IRIVER,69891643,SM Entertainment,1,태연 (TAEYEON),사계 (Four Seasons),2019,13,2019.03.24~2019.03.30
1,장범준 3집,,up10,카카오 M,61581453,버스커버스커,2,장범준,당신과는 천천히,2019,13,2019.03.24~2019.03.30
2,Our love is great,,down2,IRIVER,48153538,JYP Entertainment,3,백예린,그건 아마 우리의 잘못은 아닐 거야,2019,13,2019.03.24~2019.03.30
3,FLY HIGH PROJECT #2 `옥탑방`,,-,카카오 M,41879085,FNC엔터테인먼트,4,엔플라잉 (N.Flying),옥탑방,2019,13,2019.03.24~2019.03.30
4,sleepless in __________,,down3,카카오 M,38925829,OURS Co.,5,에픽하이 (EPIK HIGH),술이 달다 (Feat. 크러쉬),2019,13,2019.03.24~2019.03.30


In [13]:
# mg.to_mongo(df, 'gaon', 'all',)

Inserted rows: 200


In [12]:
# soup = get_soup('2010', '01', 'ALL')
# soup