In [1]:
import requests
from bs4 import BeautifulSoup
import os
import pandas as pd
from tqdm import trange
import time as tm
import logging
from datetime import datetime

In [2]:
class Crawler:
    """Scrape one month of rows from the MOPS ``t100sb02_1`` listing page.

    The listing is requested with a form POST, every ``<tr class="even">`` /
    ``<tr class="odd">`` row is read cell by cell, any ``.pdf`` named in the
    Chinese/English file columns is downloaded to ``./file/<YYYY_MM_DD>/ch`` or
    ``./file/<YYYY_MM_DD>/en``, and the rows are returned as a DataFrame.

    Parameters
    ----------
    year : int or str
        Sent as the ``year`` form field (stringified).
        NOTE(review): the recorded output shows dates such as ``111/02/25``, so
        this is presumably an ROC calendar year -- confirm.
    month : int or str
        Sent as the ``month`` form field, zero-padded to two digits.
    co_id : str, optional
        Sent as the ``co_id`` (company code) form field. Defaults to ``" "``.
    """

    LISTING_URL = "https://mops.twse.com.tw/mops/web/t100sb02_1"
    DOWNLOAD_URL = "https://mops.twse.com.tw/server-java/FileDownLoad"

    # Output column names, in the same order as the <td> cells of a row.
    COLUMNS = ["Co_ID", "Co_name", "Date", "Time", "Form", "Message", "CH_file",
               "EN_file", "More information", "Video address", "Attention"]

    # A download whose body is exactly this many bytes is retried.
    # NOTE(review): presumably the size of the server's "try again" page -- confirm.
    PLACEHOLDER_SIZE = 496

    # Seconds to wait for the server before a request is abandoned.
    REQUEST_TIMEOUT = 30

    # Seconds to pause after every download attempt.
    RETRY_DELAY = 1

    def __init__(self, year, month, co_id = " "):
        self.year = str(year)
        self.month = str(month).zfill(2)
        self.co_id = co_id

    def _fetch_file(self, file_name, label, max_attempts = None):
        """Download one file and return its bytes, or ``None`` if attempts run out.

        A request error or a body of ``PLACEHOLDER_SIZE`` bytes triggers another
        attempt. ``label`` ("Ch" / "En") is only used in the log message.
        """
        download_payload = {
            "step": "9",
            "filePath": "/home/html/nas/STR/",
            "fileName": str(file_name),
            "functionName": "t100sb02_1"
        }

        attempts = 0
        while max_attempts is None or attempts < max_attempts:
            attempts += 1
            try:
                download_response = requests.post(self.DOWNLOAD_URL,
                                                  data = download_payload,
                                                  timeout = self.REQUEST_TIMEOUT)
            except requests.exceptions.RequestException:
                # Retry instead of falling through: there is no response to read
                # here (or only a stale one from a previous file).
                logging.error('%s file download error', label)
                tm.sleep(self.RETRY_DELAY)
                continue

            data = download_response.content
            tm.sleep(self.RETRY_DELAY)

            if len(data) != self.PLACEHOLDER_SIZE:
                return data

        logging.error('%s file download gave up after %d attempts: %s',
                      label, attempts, file_name)
        return None

    def _save_attachment(self, company_id, file_name, subdir, label, dic_name, max_attempts = None):
        """Download ``file_name`` into ``./file/<dic_name>/<subdir>/``.

        Returns the name to record in the table: ``<company_id>_<file_name>``
        when the file was written, or the unchanged ``file_name`` when the
        download was abandoned.
        """
        data = self._fetch_file(file_name, label, max_attempts)
        if data is None:
            return file_name

        # Both parts come from scraped HTML; keep only the final path component
        # so the text cannot steer the write outside the target directory.
        saved_name = os.path.basename(company_id + "_" + file_name)

        target_dir = os.path.join(".", "file", dic_name, subdir)
        os.makedirs(target_dir, exist_ok = True)

        with open(os.path.join(target_dir, saved_name), 'wb') as s:
            s.write(data)

        return saved_name

    def start(self, display = False, max_attempts = None):
        """Fetch the listing, download referenced PDFs and return the table.

        Parameters
        ----------
        display : bool, optional
            When true, print the prettified HTML of the listing response.
        max_attempts : int or None, optional
            Upper bound on download attempts per file. ``None`` (the default)
            retries without limit.

        Returns
        -------
        pandas.DataFrame
            One row per scraped table row, with the columns in ``COLUMNS``.
            Rows with fewer cells than columns are padded with ``""``.
            All "even" rows come first, followed by all "odd" rows.

        Raises
        ------
        requests.exceptions.RequestException
            If the listing request itself fails or times out.
        """
        # Create log file
        FORMAT = '%(asctime)s %(levelname)s: %(message)s'
        logging.basicConfig(level = logging.INFO, filename = 'crawler.log', filemode = 'w', format = FORMAT)
        logging.info('Updating data start')

        # Download directory is named after the day the crawl runs.
        dic_name = datetime.now().strftime("%Y_%m_%d")

        # Initialize params
        payload = {
            # listed / unlisted
            "encodeURIComponent": "1",
            "step": "1",
            "firstin": "1",
            "off": "1",
            "TYPEK": "sii",
            # year
            "year": self.year,
            # month
            "month": self.month,
            # company code
            "co_id": self.co_id
        }

        # get the response data through post method and params
        response = requests.post(self.LISTING_URL, data = payload,
                                 timeout = self.REQUEST_TIMEOUT)

        soup = BeautifulSoup(response.text, "html.parser")

        if display:
            print("Result: ")
            print(soup.prettify())

        result_total = soup.find_all("tr", class_ = "even") + soup.find_all("tr", class_ = "odd")

        width = len(self.COLUMNS)
        records = []

        for i in trange(len(result_total)):
            cells = [td.getText() for td in result_total[i].find_all("td")][:width]

            if not cells:
                # A row without any <td> contributes nothing.
                continue

            # Pad short rows so every record has one value per column.
            cells.extend([""] * (width - len(cells)))

            # Columns 6 and 7 hold the Chinese / English file names.
            for col, subdir, label in ((6, "ch", "Ch"), (7, "en", "En")):
                if ".pdf" in cells[col]:
                    cells[col] = self._save_attachment(cells[0], cells[col], subdir,
                                                       label, dic_name, max_attempts)

            # NOTE(review): replacing "." with " " also breaks URLs in this
            # column (see "https://www deltaww com" in the recorded output).
            # Kept as-is because existing consumers may rely on it -- confirm.
            cells[9] = cells[9].replace("\r", "").replace("\n", "").replace(".", " ")

            records.append(cells)

        df = pd.DataFrame(records, columns = self.COLUMNS)

        logging.info('Updating data completed')

        return df

In [3]:
# Crawl the listing for year 111, month 2 (no company filter) and keep the
# resulting table in `df`. This also downloads every referenced PDF, so the
# cell is slow: the recorded run below took about 19 minutes for 62 rows.
crawler = Crawler(111, 2)
df = crawler.start()

100%|███████████████████████████████████████████| 62/62 [19:03<00:00, 18.45s/it]


In [4]:
# Show the scraped table returned by crawler.start().
df

Unnamed: 0,Co_ID,Co_name,Date,Time,Form,Message,CH_file,EN_file,More information,Video address,Attention
0,2308,台達電,111/02/25,15:00,線上法說會,本公司召開法人說明會說明110年第四季財務報告相關資訊,內容檔案於當日會後公告於公開資訊觀測站,內容檔案於當日會後公告於公開資訊觀測站,https://www.deltaww.com/zh-TW/investors/analys...,同步收看網址：https://www deltaww com/zh-TW/investors...,無
1,4961,天鈺,111/02/25,14:00,線上法說會,(1)公布本公司2021年第4季自結數合併財務報表。\r\n(2)參加方式：請Email至本...,內容檔案於當日會後公告於公開資訊觀測站,內容檔案於當日會後公告於公開資訊觀測站,https://www.fitipower.com/investment.asp?id=12,影音資訊網址：未輸,無
2,4968,立積,111/02/24,15:30,網路直播,立積電子(4968) 2021年第四季線上法人說明會。,內容檔案於當日會後公告於公開資訊觀測站,內容檔案於當日會後公告於公開資訊觀測站,https://www.richwave.com.tw/zh/invest_sharhold...,同步收看網址：http://www zucast com/webcast/26usY6gR,因考量新型冠狀病毒疫情，法說會改為網路直播進行。
3,3036,文曄,111/02/24,14:30,線上法說會,本公司受邀參加兆豐證券於111年2月24日舉辦之線上法人說明會，向投資人說明110年第4季營...,內容檔案於當日會後公告於公開資訊觀測站,內容檔案於當日會後公告於公開資訊觀測站,https://www.wtmec.com/WT/?page_id=2016,,欲參加者請需事先線上報名，網址：http://www.wtmec.com/online/in...
4,3673,TPK-KY,111/02/24,14:30,線上法說會,TPK舉辦2021年第四季營運結果線上法人說明會,內容檔案於當日會後公告於公開資訊觀測站,內容檔案於當日會後公告於公開資訊觀測站,https://www.tpk.com/ir/,影音資訊網址：未輸,相關資料內容請至本公司網頁查詢 https://www.tpk.com/ir/
...,...,...,...,...,...,...,...,...,...,...,...
57,5269,祥碩,111/02/21 至 111/02/22,14:00,線上法說會,公告本公司111年2月21日至2月22日受邀參加摩根大通證券舉辦之Taiwan CEO-CF...,5269_526920220221M001.pdf,5269_526920220221E001.pdf,https://www.asmedia.com.tw,,無
58,2882,國泰金,111/02/21 至 111/02/22,11:00,採線上投資人會議形式進行。,本公司受邀參加摩根大通證券舉辦之線上投資人會議，說明110年第三季財務業務相關資訊。,2882_288220220218M001.pdf,2882_288220220218E001.pdf,http://www.ir-cloud.com/taiwan/2882/irwebsite_...,,無
59,8454,富邦媒,111/02/21 至 111/02/22,10:00,線上會議,本公司受邀參加 J.P. Morgan舉辦之Taiwan CEO-CFO Conferenc...,8454_845420220216M002.pdf,8454_845420220216E002.pdf,https://corp.momo.com.tw/,,無
60,2382,廣達,111/02/21 至 111/02/22,09:00,電話會議,"本公司受邀參加J.P. Morgan所舉辦之""Taiwan CEO-CFO Conferen...",2382_238220211112M001.pdf,2382_238220211112E001.pdf,http://www.quantatw.com,,詳細資料請詳公開資訊觀測站及本公司網站
