In [26]:
import urllib.parse

import requests
from bs4 import BeautifulSoup as bs
import pandas as pd

import src.mongos as mg

pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', -1)
pd.set_option('display.max_columns', None)

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Const

In [2]:
# Const
url_first_page = 'http://www.kobis.or.kr/kobis/business/mast/thea/findScreenQuota.do'
url_basic_area = 'http://www.kobis.or.kr/kobis/business/mast/thea/findBasareaCdList.do'
url_theater = 'http://www.kobis.or.kr/kobis/business/mast/thea/findTheaCdList.do'
url_screen_quota = 'http://www.kobis.or.kr/kobis/business/mast/thea/findScreenQuota.do'

# Get Wide Areas

In [3]:
def get_wide_area_table():
    r = requests.get(url_first_page)
    soup = bs(r.text, 'lxml')

    wide_areas = soup.select('#sWideareaCd option')
    wide_area_codes = [x['value'] for x in  wide_areas if x['value'] != '']
    wide_area_names = [x.get_text() for x in  wide_areas if x.get_text() != '광역선택']
    wide_area_table = dict(zip(wide_area_codes, wide_area_names))

    return wide_area_table

# Get Basic Areas

In [4]:
def get_basic_area_table(wide_area_code):
#     wide_area_code = wide_area_codes[0]
    r2 = requests.post(url_basic_area, {'sWideareaCd': wide_area_code})
    result2 = r2.json()
    basic_areas = result2.get('basareaCdList', [])

    basic_areas
    basic_area_codes = [x['cd'] for x in basic_areas]
    basic_area_names = [x['cdNm'] for x in basic_areas]
    basic_area_table = dict(zip(basic_area_codes, basic_area_names))
    return basic_area_table

# Get Theaters

In [5]:
def get_theater_table(wide_area_code, basic_area_code):
#     basic_area_code = basic_area_codes[0]
    r3 = requests.post(url_theater, {'sWideareaCd': wide_area_code, 'sBasareaCd': basic_area_code, 'mode': 'history'})
    result3 = r3.json()

    theaters = result3.get('theaCdList', [])
    theaters

    theater_codes = [x['cd'] for x in theaters]
    theater_names = [x['cdNm'] for x in theaters]
    theater_table = dict(zip(theater_codes, theater_names))
    return theater_table

# Get Screen Quota

In [6]:
def get_screen_quota_response(year, wide_area_code, basic_area_code, theater_code):
#     theater_code = theater_codes[8]
    params = {
        'theaCd': '', 'theaArea': '', 'sTheaNm': '',
        'sYear': year, 'choice': '1', 'sWideareaCd': wide_area_code, 'sBasareaCd': basic_area_code, 'sTheaCd': theater_code,
    }
    encoded_params = urllib.parse.urlencode(params)

    # urllib.parse.urlencode(params)
#     print(encoded_params)

    r = requests.get('http://localhost:8050/render.html',
                  params={'url': url_screen_quota,
                          'http_method': 'POST',
    #                       'body': 'theaCd=&theaArea=&sYear=2019&choice=1&sWideareaCd=0105001&sBasareaCd=010600101&sTheaCd=001128&sTheaNm=',
                          'body': encoded_params,
                          'wait': 2})
    return r


year = '2019'
wide_area_code = "0105001"
basic_area_code = "010600101"
# theater_code = "001128"
theater_code = "001070"

r = get_screen_quota_response(year, wide_area_code, basic_area_code, theater_code)

# Parse theater details & quota summary

In [24]:
def parse_theater_detail(r, year, wide_area_code, wide_area_table, basic_area_code, basic_area_table, theater_code, theater_table):
    try:
        screen_tables = pd.read_html(r.text)
    except ValueError as e:
        return pd.DataFrame()
    theater_info = screen_tables[0]

    cols = theater_info[0].append(theater_info[2])
    rows = [theater_info[1].append(theater_info[3]).tolist()]
    rows
    theater_detail = pd.DataFrame(columns=cols, data=rows)
    theater_detail['wide_area_code'] = wide_area_code
    theater_detail['wide_area_name'] = wide_area_table[wide_area_code]
    theater_detail['basic_area_code'] = basic_area_code
    theater_detail['basic_area_name'] = basic_area_table[basic_area_code]
    theater_detail['theater_code'] = theater_code
    theater_detail['theater_name'] = theater_table[theater_code]

    return theater_detail

In [13]:
def parse_quota_summary(r, year, theater_code, theater_table):
    screen_tables = pd.read_html(r.text)

    try:
        quota_summary = screen_tables[1]
        quota_summary['year'] = year
        quota_summary['theater_code'] = theater_code
        quota_summary['theater_name'] = theater_table[theater_code]
        return quota_summary
    except IndexError as e:
        return pd.DataFrame()

# Parse Quota

In [20]:
def parse_quota(r, year, theater_code):
    soup = bs(r.text, 'lxml')
    tables = soup.select('table')
    quota = pd.DataFrame(columns=['상영일자', '스크린', 'year', 'theater_code'], data=[['', {}, year, theater_code]])

    try:
        quota_table = tables[2]
    except IndexError as e:
        return quota

    cols = [x.get_text() for x in quota_table.select('thead th')]
    trs = quota_table.select('tbody tr')

    rows = []
    for tr in trs:
        tds = tr.select('td')
        row = [td.get_text().strip() + '(quota)' if td.strong else td.get_text().strip() for td in tds]
        screen = dict(zip(cols[1:], row[1:]))
        rows.append([row[0], screen])

    if rows:
        quota = pd.DataFrame(columns=['상영일자', '스크린'], data=rows)
        quota['year'] = year
        quota['theater_code'] = theater_code
#     if len(quota) == 0:
#         quota = pd.DataFrame(columns=['상영일자', '스크린', 'year', 'theater_code'], data=[['', {}, year, theater_code]])
    return quota

# result = parse_quota(r, year, theater_code)
# result

# query = {'theater_code': theater_code}
# query = mg.update_mongo(quota, db, 'quota', query)

# Execute

In [17]:
db = 'screen_quota'

In [25]:
existing = mg.read_mongo(db, 'quota')
existing.head()

existing_year_theater = existing.groupby(['year', 'theater_code']).agg({'상영일자': 'max'})
# (2019, '001003') in existing_year_theater.index[0]
# (2019, '001003') in existing_year_theater.index

wide_area_table = get_wide_area_table()

for year in range(2019, 2003, -1):

    for wide_area_code in wide_area_table.keys():
        basic_area_table = get_basic_area_table(wide_area_code)

        for basic_area_code in basic_area_table.keys():
            theater_table = get_theater_table(wide_area_code, basic_area_code)

            for theater_code in theater_table.keys():
                if (year, theater_code) in existing_year_theater.index:
                    continue
                r = get_screen_quota_response(year, wide_area_code, basic_area_code, theater_code)
                theater_detail = parse_theater_detail(r, year, wide_area_code, wide_area_table,
                                                        basic_area_code, basic_area_table,
                                                        theater_code, theater_table)
                quota_summary = parse_quota_summary(r, year, theater_code, theater_table)
                quota = parse_quota(r, year, theater_code)
                mg.update_mongo(theater_detail, db, 'theater_detail', ['theater_code'])
                mg.update_mongo(quota_summary, db, 'quota_summary', ['year', 'theater_code', '스크린'])
                mg.update_mongo(quota, db, 'quota', ['theater_code', '상영일자'])

ValueError: No tables found