In [20]:
import pandas as pd
import requests
from datetime import datetime
from bs4 import BeautifulSoup
from utils import read_config, MySQLAgent
import time, random

In [21]:
# Load the connection settings and open an agent on the MySQL instance
# described by the CREDITREPORT / VM1_mysql_conn_info entry.
config = read_config('.env/connections.json')
mysql_conn_info = config['CREDITREPORT']['VM1_mysql_conn_info']
sql_agent = MySQLAgent(mysql_conn_info)

In [22]:
# List the tables of the connected database to see what is available.
tables_query = """
show tables
"""

sql_agent.read_table(query=tables_query)

Unnamed: 0,tables_in_crawlerdb
0,Demo
1,company
2,companyinfo01
3,companyinfo01_busi
4,companyinfo02
5,companyinfo03_detail
6,companyinfo03_detail_busi
7,companyinfo_branchoffice
8,customer_account
9,demo


In [23]:
# Read stock code, business account number and company name from
# listed_otc_company, aliasing the Chinese column headers to English names.
otc_query = """
select 公司代號 as stock_num, 營利事業統一編號 as busi_acc, 公司名稱 as company_name
from listed_otc_company
"""

otc = sql_agent.read_table(query=otc_query)
otc.head()

Unnamed: 0,stock_num,busi_acc,company_name
0,1101,11913502,臺灣水泥股份有限公司
1,1102,3244509,亞洲水泥股份有限公司
2,1103,11892801,嘉新水泥股份有限公司
3,1104,7568009,環球水泥股份有限公司
4,1108,40601248,幸福水泥股份有限公司


In [24]:
# Look up the business account number for stock 1101.
# `.iloc[0]` takes the first match by position. The previous `[0]` was a
# label-based lookup on the filtered Series, which only succeeded because the
# row for 1101 happens to carry index label 0; any other stock raised KeyError.
otc.loc[otc['stock_num'] == '1101', 'busi_acc'].iloc[0]

'11913502'

In [25]:
# Sample five rows of mops_season_report to inspect the stored report layout.
season_report_query = """
select *
from mops_season_report
limit 5
"""

fr = sql_agent.read_table(query=season_report_query)
fr

Unnamed: 0,report_name,company_id,company_name,period_year,season,acct_name,this_year_amt,this_year_percent,last_year_amt,last_year_percent,creation_date,seq
0,BalanceSheet,1101,台泥,112,1,流動資產,0.0,0.0,0.0,0.0,2024-04-10 13:45:47,1
1,BalanceSheet,1101,台泥,112,1,現金及約當現金,70957209.0,15.37,88842494.0,19.28,2024-04-10 13:45:47,2
2,BalanceSheet,1101,台泥,112,1,透過損益按公允價值衡量之金融資產－流動,648145.0,0.14,611802.0,0.13,2024-04-10 13:45:47,3
3,BalanceSheet,1101,台泥,112,1,透過其他綜合損益按公允價值衡量之金融資產－流動,6377062.0,1.38,5934753.0,1.29,2024-04-10 13:45:47,4
4,BalanceSheet,1101,台泥,112,1,按攤銷後成本衡量之金融資產－流動,30795636.0,6.67,20954299.0,4.55,2024-04-10 13:45:47,5


In [26]:
# Display the column names of the mops_season_report sample as a schema reference.
fr.columns

Index(['report_name', 'company_id', 'company_name', 'period_year', 'season',
       'acct_name', 'this_year_amt', 'this_year_percent', 'last_year_amt',
       'last_year_percent', 'creation_date', 'seq'],
      dtype='object')

In [27]:
# Load the company list from the local CSV file and preview the first rows.
# NOTE(review): business_accounting_no is parsed with pandas' default dtype
# inference; if the file stores these numbers with leading zeros they would be
# dropped here -- confirm against the CSV before joining on this column.
company_list = pd.read_csv(filepath_or_buffer='company_data.csv')
company_list.head()

Unnamed: 0,business_accounting_no,company_name,internal_id,query_name
0,700019,國昌綜合工廠有限公司,K7913,國昌綜合工廠
1,713181,華聯食品工業股份有限公司,J5984,華聯食品
2,784930,建寶食品股份有限公司,K4395,建寶食品
3,833117,高立展業有限公司,Q0717,高立展業
4,966299,財團法人台北市瑠公農業產銷基金會,K9232,台北市瑠公農業產銷基金會


In [28]:
# MOPS (mops.twse.com.tw) AJAX endpoints, one per financial statement.
# Balance sheet
BalanceSheetURL = "https://mops.twse.com.tw/mops/web/ajax_t164sb03"
# Income statement (profit and loss)
ProfitAndLoseURL = "https://mops.twse.com.tw/mops/web/ajax_t164sb04"
# Cash flow statement
CashFlowStatementURL = "https://mops.twse.com.tw/mops/web/ajax_t164sb05"

In [29]:
# Stock codes of the listed/OTC companies whose name also appears in company_list.
is_tracked_company = otc['company_name'].isin(set(company_list['company_name']))
stock_list = otc.loc[is_tracked_company, 'stock_num'].unique()

# Accumulators filled by the scraping loop: all parsed rows, and the stock
# codes that could not be retrieved or parsed.
df_frc = pd.DataFrame()
error_stock_list = []

In [30]:
# Reporting period to request.
# NOTE(review): 113 is presumably an ROC (Minguo) calendar year, i.e. 2024 --
# confirm against the MOPS query form.
year = 113
season = 1
url = BalanceSheetURL

# Give up on a request that hangs instead of blocking the whole crawl forever.
REQUEST_TIMEOUT_SECONDS = 30
# Bounds (seconds) of the random pause between consecutive requests.
MIN_DELAY_SECONDS, MAX_DELAY_SECONDS = 10, 35

# Names assigned to the seven cells of each report row. A page with a
# different number of cells per row (e.g. nine) is rejected with ValueError
# rather than guessed at, because the meaning of the extra columns is unknown.
REPORT_COLUMNS = [
    'acct_name', 'this_year_amt', 'this_year_percent',
    'last_hy_amt', 'last_hy_amt_%', 'last_year_amt', 'last_year_percent',
]


def _scrape_balance_sheet(stock_number, year, season, url):
    """Download and parse one company's balance sheet.

    Parameters
    ----------
    stock_number : stock code sent as ``co_id`` and used to look the company up in ``otc``.
    year, season : reporting period sent with the form.
    url : endpoint the form is posted to.

    Returns
    -------
    pandas.DataFrame
        One row per account with the columns acct_name, this_year_amt,
        this_year_percent, last_year_amt, last_year_percent, report_name,
        company_id, company_name, creation_date, seq and stock.

    Raises
    ------
    requests.RequestException
        On connection problems, timeouts or an HTTP error status.
    ValueError
        If the page lacks the report table or its rows do not have the
        expected seven cells.
    """
    form_data = {
        'encodeURIComponent': 1,
        'step': 1,
        'firstin': 1,
        'off': 1,
        'co_id': stock_number,
        'year': year,
        'season': season,
    }
    response = requests.post(url, data=form_data, timeout=REQUEST_TIMEOUT_SECONDS)
    # Treat 4xx/5xx answers as failures instead of parsing an error page.
    response.raise_for_status()

    soup = BeautifulSoup(response.text, 'html.parser')
    tables = soup.find_all('table')
    # The report is the second table of the page; without it there is nothing
    # to parse, so fail with a readable message rather than a bare IndexError.
    if len(tables) < 2:
        raise ValueError('response does not contain the expected report table')

    # Skip the first <tr> and keep the stripped text of every <td> in the rest.
    rows = [
        [cell.text.strip() for cell in row.find_all('td')]
        for row in tables[1].find_all('tr')[1:]
    ]
    df_temp = pd.DataFrame(rows, columns=REPORT_COLUMNS)

    # Keep the account name plus the first and third amount/percent pairs, and
    # drop the first three rows (presumably header rows -- TODO confirm).
    df_select = df_temp.iloc[:, [0, 1, 2, 5, 6]].iloc[3:].copy()

    # Single lookup of the company's row in otc (first match by position).
    company_row = otc.loc[otc['stock_num'] == str(stock_number)].iloc[0]

    df_select['report_name'] = 'BalanceSheet'
    df_select['company_id'] = company_row['busi_acc']
    df_select['company_name'] = company_row['company_name']
    df_select['creation_date'] = datetime.now().strftime('%Y-%m-%d')
    df_select['seq'] = range(1, len(df_select) + 1)
    df_select['stock'] = stock_number
    return df_select


for position, stock_number in enumerate(stock_list):
    # Pause before every request except the first, so the crawl is throttled
    # after failures as well as after successes.
    if position:
        time.sleep(random.uniform(MIN_DELAY_SECONDS, MAX_DELAY_SECONDS))

    try:
        df_select = _scrape_balance_sheet(stock_number, year, season, url)
        # Appended per stock (not batched) so that interrupting this
        # long-running cell keeps everything scraped so far in df_frc.
        df_frc = pd.concat([df_frc, df_select], axis=0)
        print(f'get {stock_number}')
    except requests.RequestException as e:
        error_stock_list.append(stock_number)
        print(f'Failed to retrieve data for stock number {stock_number}: {e}')
    except ValueError as e:
        error_stock_list.append(stock_number)
        print(f'Error parsing data for stock number {stock_number}: {e}')
    except Exception as e:
        # Last-resort guard so one bad page does not abort the whole crawl;
        # the stock is recorded in error_stock_list for a later retry.
        error_stock_list.append(stock_number)
        print(f'An unexpected error occurred for stock number {stock_number}: {e}')
        

get 1104
get 1201
get 1203
get 1210
get 1215
get 1216
get 1217
get 1218
get 1219
get 1227
get 1229
get 1231
get 1233
get 1234
get 1236
get 1303
get 1307
get 1315
get 1319
get 1321
get 1323
get 1325
get 1339
get 1402
get 1409
get 1413
get 1417
get 1419
get 1440
get 1441
get 1443
get 1444
get 1445
get 1447
get 1449
get 1452
get 1454
get 1455
get 1457
get 1459
get 1460
get 1465
get 1466
get 1467
get 1472
get 1474
get 1504
get 1515
get 1526
get 1539
get 1541
get 1558
get 1563
get 1598
Error parsing data for stock number 1605: 7 columns passed, passed data had 9 columns
get 1609
get 1615
get 1617
get 1618
get 1701
get 1707
get 1711
get 1712
get 1717
get 1720
get 1721
get 1726
get 1727
get 1730
get 1731
get 1732
get 1733
get 1734
get 1736
get 1752
get 1760
get 1776
get 1783
get 1789
get 1809
get 1810
get 1904
get 1905
get 1907
get 1909
get 2012
get 2022
get 2027
get 2049
get 2062
get 2107
get 2108
get 2114
get 2233
get 2241
get 2301
get 2303
get 2313
get 2321
get 2324
get 2329
get 2330
get 2

In [31]:
# Persist the scraped rows to fr_result.csv without the DataFrame index.
df_frc.to_csv(path_or_buf='fr_result.csv', index=False)