In [36]:
import os
import time
from datetime import datetime, timedelta
from io import StringIO

import html5lib
import pandas as pd
import requests
from pandas import Series, DataFrame

class ClosePriceFetcher:
    """Download daily TWSE price tables and append them to per-year CSV files.

    For each calendar day the MI_INDEX report page is requested, the first
    HTML table whose column count matches ``TWSE_CLOSE_PRICE_TABLE_COLUMNS``
    is taken as the price table, a date column is added, and the rows are
    appended to ``<STOCK_DATA_FOLDER_NAME>/close_price_<year>.csv``.
    """

    STOCK_DATA_FOLDER_NAME = 'stock_data'
    TWSE_PRICE_HISTORY_URL = 'http://www.tse.com.tw/exchangeReport/MI_INDEX'

    TWSE_CLOSE_PRICE_TABLE_COLUMNS = \
        ['證券代號', '證券名稱', '成交股數', '成交筆數', '成交金額', '開盤價', '最高價', '最低價', '收盤價', '漲跌(+/-)',
         '漲跌價差', '最後揭示買價', '最後揭示買量', '最後揭示賣價', '最後揭示賣量', '本益比']

    # Seconds to wait for the server before a request is abandoned, so a
    # stalled connection cannot hang the crawl forever.
    REQUEST_TIMEOUT_SECONDS = 30
    # Pause between two consecutive requests to stay polite to the server.
    REQUEST_INTERVAL_SECONDS = 10

    def __init__(self, isKeepOld=False):
        """Create the fetcher and make sure the output folder exists.

        Args:
            isKeepOld: Accepted for backward compatibility; currently unused.
        """
        self._init_data_folder()

    def _init_data_folder(self):
        """Create the root data folder if it does not exist yet."""
        # exist_ok avoids the check-then-create race of isdir() + mkdir().
        os.makedirs(self.STOCK_DATA_FOLDER_NAME, exist_ok=True)

    def _convert_datetime_str(self, datetime_obj):
        """Return ``datetime_obj`` formatted as ``YYYYMMDD``."""
        return datetime_obj.strftime('%Y%m%d')

    def _fetch_html(self, date_str):
        """Request the report page for ``date_str`` (``YYYYMMDD``).

        Returns:
            The ``requests`` response object; the HTTP status is not checked
            here.

        Raises:
            requests.RequestException: on connection failure or timeout.
        """
        query_params = {
            'date': date_str,
            'response': 'html',
            'type': 'ALLBUT0999'
        }
        return requests.get(
            self.TWSE_PRICE_HISTORY_URL,
            params=query_params,
            timeout=self.REQUEST_TIMEOUT_SECONDS,
        )

    def _fetch_html_pandas(self, date_str):
        """Let pandas download and parse the report page for ``date_str``.

        Returns:
            The list of DataFrames produced by ``pd.read_html``.
        """
        url = '{0}?response=html&date={1}&type=ALLBUT0999'.format(
            self.TWSE_PRICE_HISTORY_URL, date_str)
        return pd.read_html(url)

    def _write_csv(self, datatime_obj, close_price_dataframe):
        """Append ``close_price_dataframe`` to the CSV of ``datatime_obj.year``.

        The header row is written only when the file is created. The
        DataFrame index is written as the first column, as before.
        """
        filepath = os.path.join(
            self.STOCK_DATA_FOLDER_NAME,
            'close_price_{0}.csv'.format(datatime_obj.year),
        )
        is_new_file = not os.path.isfile(filepath)
        # Create and append through the same pandas call with an explicit
        # encoding, so appended rows can never be written in a different
        # (locale-dependent) encoding than the rows that created the file.
        close_price_dataframe.to_csv(
            filepath,
            mode='w' if is_new_file else 'a',
            header=is_new_file,
            encoding='utf-8',
        )

    def _retrieve_price_df(self, df_html):
        """Return the first table with the expected number of columns.

        Args:
            df_html: Iterable of DataFrames parsed from the report page.

        Returns:
            The matching DataFrame, or ``None`` when no table matches.
        """
        expected_width = len(self.TWSE_CLOSE_PRICE_TABLE_COLUMNS)
        for table in df_html:
            if len(table.columns) == expected_width:
                return table
        return None

    def _crawl_data_since_date(self, datetime_since):
        """Fetch and store the price table of every day from ``datetime_since`` to today.

        Days whose page yields no usable table are skipped. Request and
        parsing errors are reported and the crawl continues with the next day.

        Args:
            datetime_since: ``datetime`` of the first day to fetch (inclusive).
        """
        # Compare calendar dates: comparing full timestamps against
        # datetime.today() can practically never be equal, which made the
        # original "up to date" guard unreachable. A start date of today is
        # still crawled, as it always was.
        if datetime_since.date() > datetime.today().date():
            print('Price info already up-to date')
            return

        current_day = datetime_since
        while current_day.date() <= datetime.today().date():
            date_str = self._convert_datetime_str(current_day)
            df_price = None
            try:
                page = self._fetch_html(date_str)
                # Report HTTP errors instead of trying to parse an error page.
                page.raise_for_status()
                # StringIO: recent pandas deprecates passing literal HTML.
                df_html = pd.read_html(StringIO(page.text), header=None)
                df_price = self._retrieve_price_df(df_html)
            except ValueError:
                # Deliberate best-effort: pd.read_html raises ValueError when
                # the page has no tables -- presumably the case on
                # non-trading days -- so the day is silently skipped.
                pass
            except Exception as error:
                print('[Edward]' + str(error))

            # df_price stays None when the fetch failed or when no table of
            # the expected width was found; there is nothing to store then.
            if df_price is not None:
                df_price.columns = self.TWSE_CLOSE_PRICE_TABLE_COLUMNS
                df_price['日期'] = date_str
                self._write_csv(current_day, df_price)

            current_day += timedelta(days=1)
            # Throttle only when another request will actually follow.
            if current_day.date() <= datetime.today().date():
                time.sleep(self.REQUEST_INTERVAL_SECONDS)

In [None]:
# Crawl every day from 2015-01-01 up to today. Constructing the fetcher
# creates the output folder if it is missing.
date_since = datetime(year=2015, month=1, day=1)
fetcher = ClosePriceFetcher()
fetcher._crawl_data_since_date(date_since)