In [2]:
import math
import re
import time
from src.utils import get_soup

# Constants

In [3]:
# URL template for one page of a movie's comment list. The three '{}' placeholders are
# filled, in order, with the movie idx, the comment type ('after' or 'before'), and the
# page number.
comments_url_form = 'http://movie.naver.com/movie/bi/mi/pointWriteFormList.nhn?code={}&type={}&onlyActualPointYn=N&order=newest&page={}' # idx, type, page

# Functions

In [7]:
def num_of_comment_pages(idx, after=True):
    """Return how many comment pages exist for a movie.

    Fetches page 1 of the comment list and reads the total comment count from it.

    Args:
        idx: Movie identifier substituted into the comment URL.
        after: If True, count 'after' comments; otherwise count 'before' comments.

    Returns:
        The total comment count divided by 5, rounded up, or 0 when the count
        cannot be read from the page.
    """
    comment_type = 'after' if after else 'before'
    first_page = get_soup(comments_url_form.format(idx, comment_type, 1))

    try:
        # The total is read from the second <em> inside the score_total block;
        # thousands separators are stripped before converting to int.
        total_node = first_page.select('div[class=score_total] em')[1]
        total = int(total_node.text.replace(',', ''))
        # NOTE(review): 5 is presumably the number of comments shown per page - confirm.
        return math.ceil(total / 5)
    except Exception:
        # Any failure (missing element, non-numeric text, ...) means "no pages".
        return 0

In [6]:
def parse_a_page(soup, after_strf):
    """Extract the comments listed on one parsed comment page.

    Args:
        soup: Parsed page object exposing ``select`` (CSS selector lookup).
        after_strf: Comment type label stored under the 'type' key of every
            returned dict (callers pass 'after' or 'before').

    Returns:
        A list of dicts with keys 'type', 'score', 'text', 'user', 'nickname',
        'written_at', 'agree' and 'disagree'. Rows that lack a field or hold a
        non-numeric score/count are skipped rather than raising.
    """
    comments = []
    for row in soup.select('div[class=score_result] li'):
        try:
            score = int(row.select('div[class=star_score] em')[0].text.strip())
            text = row.select('div[class=score_reple] p')[0].text.strip()
            # The same link carries the user id (first argument of its onclick
            # call) and the nickname (its text), so select it only once.
            author_link = row.select('a[onclick^=javascript]')[0]
            user = author_link.attrs.get('onclick', '').split('(')[1].split(',')[0]
            nickname = author_link.text.strip()
            written_at = re.search(r"\d+\.\d+\.\d+ \d+:\d+", row.text).group()
            agree = int(row.select('span[class^=sympathy]')[0].text.strip())
            disagree = int(row.select('span[class^=notSympathy]')[0].text)
        except Exception:
            # Best effort: skip malformed rows. Catching Exception (not a bare
            # except) lets KeyboardInterrupt/SystemExit still stop a long scrape.
            continue
        comments.append(
            {'type': after_strf,
             'score': score,
             'text': text,
             'user': user,
             'nickname': nickname,
             'written_at': written_at,
             'agree': agree,
             'disagree': disagree,
            })
    return comments

In [5]:
def _scrap_comments(idx, limit, after, sleep=0.05):
    """Scrape the comments of one type for a movie, page by page.

    Args:
        idx: Movie identifier substituted into the comment URL.
        limit: Maximum number of pages to fetch; a value <= 0 means no cap.
        after: If True, scrape 'after' comments; otherwise 'before' comments.
        sleep: Seconds to pause after each page request. Values that are
            falsy or not positive disable the pause.

    Returns:
        A list of comment dicts as produced by parse_a_page, or an empty list
        when there is no page to fetch.
    """
    after_strf = 'after' if after else 'before'
    max_page = num_of_comment_pages(idx, after)
    if limit > 0:
        max_page = min(limit, max_page)
    if max_page <= 0:
        return []

    comments = []
    for p in range(1, max_page + 1):
        url = comments_url_form.format(idx, after_strf, p)
        comments += parse_a_page(get_soup(url), after_strf)
        # Progress is refreshed in place (carriage return, no newline) every 20 pages.
        if p % 20 == 0:
            print('\r  movie {}, {}, {} / {} ...'.format(idx, after_strf, p, max_page), end='')
        # Honor the caller's delay between requests; previously `sleep` was
        # accepted but never used, so pages were requested back to back.
        if sleep and sleep > 0:
            time.sleep(sleep)
    print('\r  movie {}, {}, {} / {} done'.format(idx, after_strf, p, max_page))
    return comments

In [4]:
def scrap_comments(idx, limit=-1, sleep=0.05):
    """Scrape both comment types for a movie.

    Args:
        idx: Movie identifier.
        limit: Page cap forwarded to _scrap_comments for each comment type.
        sleep: Delay value forwarded to _scrap_comments.

    Returns:
        One list holding the 'after' comments followed by the 'before' comments.
    """
    collected = []
    # 'after' comments are scraped first, then 'before' comments.
    for is_after in (True, False):
        collected += _scrap_comments(idx, limit, after=is_after, sleep=sleep)
    return collected

In [None]:
def scrap(idx, directory, casting=True, bestscripts=True, comments=True, limit=3, sleep=0.05):
    """Scrape one movie and save each kind of data under ``directory``.

    Args:
        idx: Movie identifier.
        directory: Base path used to build every output path.
        casting: If True, scrape casting data and save the non-empty
            'actors', 'directors' and 'staffs' entries.
        bestscripts: If True, scrape best scripts and save them when non-empty.
        comments: If True, scrape comments and save them when non-empty.
        limit: Forwarded to scrap_bestscripts and scrap_comments.
        sleep: Forwarded to scrap_bestscripts and scrap_comments, and also
            used as the pause (in seconds) at the end of this call.
    """
    # Basic information is always scraped and written to <directory>/meta/<idx>.json.
    basic_info = scrap_basic(idx)
    save_json(basic_info, '{}/meta/{}.json'.format(directory, idx))
    print('scraped {} basic'.format(idx))

    # Casting: one output per role, skipped when the role is missing or empty.
    if casting:
        cast_info = scrap_casting(idx)
        for role in ('actors', 'directors', 'staffs'):
            if not cast_info.get(role, []):
                continue
            save_list_of_dict(cast_info[role], '{}/{}/{}'.format(directory, role, idx))
        print('scraped {} casting'.format(idx))

    # Best scripts: saved only when something was scraped.
    if bestscripts:
        best_lines = scrap_bestscripts(idx, limit, sleep)
        if best_lines:
            save_list_of_dict(best_lines, '{}/bestscripts/{}'.format(directory, idx))
        print('scraped {} best scripts'.format(idx))

    # Comments: saved only when something was scraped.
    if comments:
        comment_rows = scrap_comments(idx, limit, sleep)
        if comment_rows:
            save_list_of_dict(comment_rows, '{}/comments/{}'.format(directory, idx))
        print('scraped {} comments'.format(idx))

    # Blank line separates the output of consecutive movies; then pause.
    print('')
    time.sleep(sleep)

# Execute

In [None]:
# Scrape every movie id in turn. A failure on one movie is printed and recorded as
# (idx, message), and the loop moves on to the next movie.
# NOTE(review): idxs, directory, casting, bestscripts, comments, limit and sleep are not
# defined in the cells shown here - they must already exist in the kernel.
exceptions = []
for idx in idxs:
    try:
        scrap(idx, directory, casting, bestscripts, comments, limit, sleep)
    except Exception as err:
        print('movie id = {}'.format(idx))
        print(err)
        exceptions.append((idx, str(err)))
    # Checked on every iteration, so the running failure count is printed again after
    # each movie once at least one failure has been recorded.
    if len(exceptions) > 0:
        print('Exist {} exceptions'.format(len(exceptions)))

# Write a summary of the run to ./log (mode 'w' replaces any existing file): either a
# success line or one entry per recorded (idx, message) pair in `exceptions`.
with open('./log', 'w', encoding='utf-8') as f:
    if not exceptions:
        f.write('Information of all movies were scraped successfully.\n')
    else:
        f.write('Exist exceptions\n\n')
        for idx, e in exceptions:
            # End the id line with a newline; without it the id and the error message
            # were fused together on a single line.
            f.write('movie id = {}\n'.format(idx))
            f.write('{}\n'.format(e))
