In [1]:
import pandas as pd
import requests
from datetime import datetime
from bs4 import BeautifulSoup
from utils import read_config, MySQLAgent
import time, random

In [2]:
# Read the connection settings from the local config file and build the
# MySQL helper that later cells use as `sql_agent`.
config = read_config('.env/local_conn.json')
sql_agent = MySQLAgent(config['local_mysql'])

In [3]:
# Run `show tables` through the agent and display whatever it returns.
query = """
show tables
"""

sql_agent.read_table(query=query)

Unnamed: 0,tables_in_generaldb


In [4]:
# Endpoints on mops.twse.com.tw that the report queries are POSTed to.
BalanceSheetURL = "https://mops.twse.com.tw/mops/web/ajax_t164sb03"       # balance sheet
ProfitAndLoseURL = "https://mops.twse.com.tw/mops/web/ajax_t164sb04"      # income (profit and loss) statement
CashFlowStatementURL = "https://mops.twse.com.tw/mops/web/ajax_t164sb05"  # cash flow statement

In [5]:
def get_balancesheet(stock_number, year, season, url, timeout=30):
    """Fetch one quarterly balance sheet and return it as a DataFrame.

    The query form is POSTed to ``url`` and the second ``<table>`` of the
    response is parsed: the text of its first ``<th>`` is kept as the raw
    report period, and every ``<tr>`` after the first becomes one row.

    Parameters
    ----------
    stock_number : str or int
        Company code, sent as the ``co_id`` form field (e.g. ``'2330'``).
    year : int
        Value of the ``year`` form field. The crawler in this notebook
        passes ROC years (Gregorian year - 1911).
    season : int
        Quarter, 1-4. Numeric strings such as ``'4'`` are accepted and
        normalised to ``int``.
    url : str
        Endpoint the form is POSTed to.
    timeout : float, optional
        Seconds to wait for the HTTP response before giving up, so a stalled
        connection cannot hang the crawl.

    Returns
    -------
    pandas.DataFrame
        Columns ``acct_name, this_year_amt, this_year_percent, last_year_amt,
        last_year_percent, report_name, report_time_raw, period_year, season,
        creation_date, stock``. An empty DataFrame is returned (after printing
        a message) when the request fails or the response does not contain the
        statement table.

    Raises
    ------
    ValueError
        If ``season`` is not one of 1-4, or if the parsed rows do not match
        the column layout expected for that season.
    """
    season_no = int(season)
    if season_no not in (1, 2, 3, 4):
        raise ValueError('wrong season input.')

    form_data = {
        'encodeURIComponent': 1,
        'step': 1,
        'firstin': 1,
        'off': 1,
        'co_id': stock_number,
        'year': year,
        'season': season_no,
    }

    try:
        r = requests.post(url, data=form_data, timeout=timeout)
        # Treat HTTP 4xx/5xx like any other failed request instead of parsing an error page.
        r.raise_for_status()
    except requests.RequestException as e:
        print(f'An unexpected error occurred for stock number {stock_number}: {e}')
        return pd.DataFrame()

    soup = BeautifulSoup(r.text, 'html.parser')
    tables = soup.find_all('table')
    # The statement is the second table; without it (no data for the period,
    # throttled request, ...) there is nothing to parse.
    if len(tables) < 2:
        print(
            f'No statement table returned for stock number {stock_number}, '
            f'year {year}, season {season_no}.'
        )
        return pd.DataFrame()
    table = tables[1]

    header_cells = table.find_all('th')
    report_time_raw = header_cells[0].get_text() if header_cells else ''

    rows = [
        [cell.text.strip() for cell in row.find_all('td')]
        for row in table.find_all('tr')[1:]
    ]

    # Season 4 pages have five columns; the other seasons carry two extra
    # columns (positions 3-4) that are not kept.
    if season_no == 4:
        columns = ['acct_name', 'this_year_amt', 'this_year_percent',
                   'last_year_amt', 'last_year_percent']
        keep = [0, 1, 2, 3, 4]
    else:
        columns = ['acct_name', 'this_year_amt', 'this_year_percent',
                   'last_hy_amt', 'last_hy_amt_%',
                   'last_year_amt', 'last_year_percent']
        keep = [0, 1, 2, 5, 6]

    df_temp = pd.DataFrame(rows, columns=columns)
    # The first three parsed rows are skipped, as in the original layout handling.
    df_select = df_temp.iloc[3:, keep].copy()

    return df_select.assign(
        report_name='BalanceSheet',
        report_time_raw=report_time_raw,
        period_year=year,
        season=season_no,
        creation_date=datetime.now().strftime('%Y-%m-%d'),
        stock=stock_number,
    )

In [39]:
# Example (disabled): fetch a single year/season directly with get_balancesheet.
# stock_number = '2330'
# year = 113
# season = 1
# url = BalanceSheetURL
# df_balancesheet = get_balancesheet(stock_number, year, season, url=url)

get 2330 balancesheet


In [6]:
def balancesheet_crawler(stock_number, url, years_back=5, max_season_by_max_year=1, delay_range=None):
    """Collect the balance sheets of one stock over several years.

    Years are ROC years (Gregorian year - 1911). Every season 1-4 is requested
    for each year from ``current_year - years_back`` up to the current year;
    for the current year only seasons up to ``max_season_by_max_year`` are
    requested.

    Parameters
    ----------
    stock_number : str or int
        Company code forwarded to ``get_balancesheet``.
    url : str
        Endpoint forwarded to ``get_balancesheet``.
    years_back : int, optional
        How many years before the current one to include (default 5).
    max_season_by_max_year : int, optional
        Last season to request for the current year (default 1). Raise it as
        later quarterly reports are published.
    delay_range : tuple of (float, float), optional
        When given, sleep a random number of seconds drawn uniformly from this
        range after each request. ``None`` (default) means no delay.

    Returns
    -------
    pandas.DataFrame
        All non-empty per-season frames stacked row-wise, keeping their
        original index values; an empty DataFrame if nothing was fetched.
    """
    current_year = datetime.now().year - 1911
    frames = []

    for year in range(current_year - years_back, current_year + 1):
        last_season = max_season_by_max_year if year == current_year else 4
        for season in range(1, last_season + 1):
            df_temp = get_balancesheet(stock_number, year, season, url=url)
            if df_temp.empty:
                # Report the miss instead of claiming success for an empty result.
                print(f'No balancesheet data for {stock_number} with year {year}, season {season}')
            else:
                print(f'Get {stock_number} balancesheet with year {year}, season {season}')
                frames.append(df_temp)

            if delay_range is not None:
                time.sleep(random.uniform(*delay_range))

    # Concatenate once at the end; pd.concat rejects an empty list.
    if not frames:
        return pd.DataFrame()
    return pd.concat(frames, axis=0)

In [7]:
# Crawl the balance sheets of stock 2330 from the balance-sheet endpoint.
df_balancesheet = balancesheet_crawler(stock_number='2330', url=BalanceSheetURL)

Get 2330 balancesheet with year 108, season 1
Get 2330 balancesheet with year 108, season 2
Get 2330 balancesheet with year 108, season 3
Get 2330 balancesheet with year 108, season 4
Get 2330 balancesheet with year 109, season 1
Get 2330 balancesheet with year 109, season 2
Get 2330 balancesheet with year 109, season 3
Get 2330 balancesheet with year 109, season 4
Get 2330 balancesheet with year 110, season 1
Get 2330 balancesheet with year 110, season 2
Get 2330 balancesheet with year 110, season 3
Get 2330 balancesheet with year 110, season 4
Get 2330 balancesheet with year 111, season 1
Get 2330 balancesheet with year 111, season 2
Get 2330 balancesheet with year 111, season 3
Get 2330 balancesheet with year 111, season 4
Get 2330 balancesheet with year 112, season 1
Get 2330 balancesheet with year 112, season 2
Get 2330 balancesheet with year 112, season 3
Get 2330 balancesheet with year 112, season 4
Get 2330 balancesheet with year 113, season 1


In [8]:
# Display the crawled frame for a visual check before it is written to the database.
df_balancesheet

Unnamed: 0,acct_name,this_year_amt,this_year_percent,last_year_amt,last_year_percent,report_name,report_time_raw,period_year,season,creation_date,stock
3,流動資產,,,,,BalanceSheet,民國108年第1季,108,1,2024-08-05,2330
4,現金及約當現金,645670527,29.52,577782963,28.20,BalanceSheet,民國108年第1季,108,1,2024-08-05,2330
5,透過損益按公允價值衡量之金融資產－流動,3084399,0.14,963915,0.05,BalanceSheet,民國108年第1季,108,1,2024-08-05,2330
6,透過其他綜合損益按公允價值衡量之金融資產－流動,107313205,4.91,95713446,4.67,BalanceSheet,民國108年第1季,108,1,2024-08-05,2330
7,按攤銷後成本衡量之金融資產－流動,4179386,0.19,9888741,0.48,BalanceSheet,民國108年第1季,108,1,2024-08-05,2330
...,...,...,...,...,...,...,...,...,...,...,...
67,權益總額,3665716254,63.33,3092897598,61.30,BalanceSheet,民國113年第1季,113,1,2024-08-05,2330
68,負債及權益總計,5787891082,100.00,5045844348,100.00,BalanceSheet,民國113年第1季,113,1,2024-08-05,2330
69,待註銷股本股數（單位：股）,1401750,,419466,,BalanceSheet,民國113年第1季,113,1,2024-08-05,2330
70,預收股款（權益項下）之約當發行股數（單位：股）,0,,0,,BalanceSheet,民國113年第1季,113,1,2024-08-05,2330


In [9]:
# Write the crawled frame to the 'balancesheet' table through the agent.
# NOTE(review): if_exists='replace' presumably overwrites an existing table
# (as in pandas to_sql) — confirm against utils.MySQLAgent.write_table.
sql_agent.write_table(df_balancesheet, 'balancesheet', if_exists='replace', index=False, data_type=None)