In [1]:
import os
from pytz import utc
import time
from datetime import datetime, timedelta
import requests
from selenium import webdriver
from bs4 import BeautifulSoup

# set phantomJS driver
DRIVER_PATH = './rsc/phantomjs-2.1.1-macosx/bin/phantomjs'
driver = webdriver.PhantomJS(DRIVER_PATH)
driver.implicitly_wait(10)

# set urls
LIST_URL = 'http://theminjoo.kr/schedule.do'
DETAIL_URL = 'http://theminjoo.kr/scheduleDetail.do?seq='
BASE_DIR = os.path.dirname('./rsc/')


def split_scheds(txt):
    scheds = []
    sched_txt = txt.strip()
    
    if '*' in txt:
        sched_txt = txt.split('*')[1].strip()
        ## print('sched_txt: ' + sched_txt)
        
    if '일정없음' not in sched_txt:
        details = []
        details = sched_txt.replace(',','.').replace('\r\n\r\n','\r\n').split('\r\n')
        
        for detail in details:
            if ':' in detail:
                detail = detail[1:].strip()
                tt = detail[:5]
                desc = detail[5:]
                loc = ''
                
                if '/' in detail:
                    desc = detail[5:].split('/')[0].strip()
                    loc = detail[5:].split('/')[1].strip()
                
                scheds.append(tt + '\t' + desc + '\t' + loc)
    
    return scheds


def parse_schedule():
    # load schedule list page
    driver.get(LIST_URL)
    html = driver.page_source
    soup = BeautifulSoup(html, 'html.parser')

    # set parse starting date
    from_dt = datetime(2016,8,27)
    today_dt = datetime.today( )
    
    # set date navigation variables
    year_diff = today_dt.year - from_dt.year
    month_diff = today_dt.month - from_dt.month   
    
    nav_year = 'year_prev'
    nav_month = 'month_prev'
    nav_max = year_diff * 12 + month_diff + 1
    
    if year_diff < 0 :
        print('Error: from date is later than today')
        return
    if month_diff < 0 :
        nav_month = 'month_next'
        month_diff = - month_diff
    
    print('From:%s, Today:%s, Year diff:%s, Month diff:%s' % (from_dt, today_dt, year_diff, month_diff))
    
    
    # navigate to starting date
    for i in range(year_diff):
        driver.find_element_by_class_name(nav_year).click()
        html = driver.page_source
        soup = BeautifulSoup(html, 'html.parser')

    for i in range(month_diff):
        driver.find_element_by_class_name(nav_month).click()
        html = driver.page_source
        soup = BeautifulSoup(html, 'html.parser')

    # parse schedules thru monthly calendars
    csv_scheds = []
    for i in range(nav_max):
        yyyymm = soup.find(id='STR_DATE').get('value')
        if yyyymm is not None:
            yyyy = yyyymm[0:4]
            mm = yyyymm[4:6]
            print('Parsing %s %s'% (yyyy,mm))
        
        start_dd = 1
        if i == 0: 
            start_dd = from_dt.day
        
        last_seq = soup.find(id='getData_seq_0').get('value')
        if last_seq is not None:
            
            for dd in range (start_dd, 31):

                # get schedule sequence
                sched_tag = soup.find(id='getDay_' + str(dd)).find('a')
                if sched_tag is not None:
                    seq = sched_tag.get('onclick')[6:-2]
                    
                    # get schedule data
                    response = requests.get(DETAIL_URL + seq)
                    json_data = response.json()
                    content = json_data['getData']['CONTENT']
                    
                    temp_scheds = []
                    temp_scheds = split_scheds(content)
                    
                    for s in temp_scheds:
                        csv_scheds.append(yyyy + '\t' + mm + '\t' + str(dd).zfill(2) + '\t' + s)
                    
                    if last_seq == seq:
                        break

        else:
            print('No schedules in %s' % yyyymm)
            
        # navigate to next month   
        driver.find_element_by_class_name('month_next').click()
        html = driver.page_source
        soup = BeautifulSoup(html, 'html.parser')

    # save as a csv file
    with open(os.path.join(BASE_DIR, 'scheds.csv'), 'w+') as f_write:
        print('Start Saving CSV')
        
        for csv_sched in csv_scheds:
            f_write.write(csv_sched + '\r\n')
        f_write.close()
        print('%d lines saved'% len(csv_sched))
        
        
parse_schedule()

From:2016-08-27 00:00:00, Today:2018-06-26 10:54:34.894800, Year diff:2, Month diff:2
Parsing 2016 08
Parsing 2016 09
Parsing 2016 10
Parsing 2016 11
Parsing 2016 12
Parsing 2017 01
Parsing 2017 02
Parsing 2017 03
Parsing 2017 04
Parsing 2017 05
Parsing 2017 06
Parsing 2017 07
Parsing 2017 08
Parsing 2017 09
Parsing 2017 10
Parsing 2017 11
Parsing 2017 12
Parsing 2018 01
Parsing 2018 02
Parsing 2018 03
Parsing 2018 04
Parsing 2018 05
Parsing 2018 06
Start Saving CSV
40 lines saved
