In [15]:
import pandas as pd
import requests
from datetime import datetime
from bs4 import BeautifulSoup
from utils import read_config, MySQLAgent 
import time, random

In [16]:
# Load connection settings from the local JSON file and build the MySQL helper
# that the read_table / write_table cells of this notebook use.
# NOTE(review): read_config and MySQLAgent come from the project-local `utils`
# module; their exact behavior is not visible from this notebook.
config = read_config('.env/local_conn.json')
sql_agent = MySQLAgent(config['local_mysql'])

In [17]:
# List the tables that currently exist in the connected database.
query = """
show tables
"""

sql_agent.read_table(query=query)

Unnamed: 0,tables_in_generaldb
0,balancesheet
1,port_activity
2,profitandlose


In [18]:
# Website connection data: one AJAX endpoint per financial statement.

# Balance sheet
BalanceSheetURL = "https://mops.twse.com.tw/mops/web/ajax_t164sb03"

# Income statement (profit and loss)
ProfitAndLoseURL = "https://mops.twse.com.tw/mops/web/ajax_t164sb04"

# Cash flow statement
CashFlowStatementURL = "https://mops.twse.com.tw/mops/web/ajax_t164sb05"

### Balance sheet

In [19]:
def get_balancesheet(stock_number, year, season, url):
    """Fetch one quarterly balance sheet and return it as a tagged DataFrame.

    Posts the query form to ``url``, parses the second ``<table>`` of the
    response, keeps the account name plus the current-period and prior-year
    amount/percentage columns, and tags every row with report metadata.

    Args:
        stock_number: Company code, sent as the ``co_id`` form field.
        year: Reporting year, sent as the ``year`` form field. The crawler
            cells in this notebook pass Gregorian year minus 1911.
        season: Quarter 1-4 (any value ``int()`` converts to 1, 2, 3 or 4).
        url: Endpoint that receives the POST request.

    Returns:
        DataFrame with columns ``acct_name``, ``this_year_amt``,
        ``this_year_percent``, ``last_year_amt``, ``last_year_percent``,
        ``report_name``, ``report_time_raw``, ``period_year``, ``season``,
        ``creation_date``, ``year_season`` and ``stock``. An empty DataFrame
        is returned when the request fails or the response does not contain
        the expected table.

    Raises:
        ValueError: If ``season`` is not one of 1, 2, 3, 4.
    """
    request_timeout_seconds = 30

    # Validate before any network traffic happens.
    try:
        season_no = int(season)
    except (TypeError, ValueError):
        season_no = None
    if season_no not in (1, 2, 3, 4):
        raise ValueError(f'wrong season input: {season!r}')

    form_data = {
        'encodeURIComponent': 1,
        'step': 1,
        'firstin': 1,
        'off': 1,
        'co_id': stock_number,
        'year': year,
        'season': season,
    }

    # The season-4 table has five columns; the other seasons have seven, of
    # which the two middle ones (named last_hy_* here) are dropped.
    if season_no == 4:
        columns = ['acct_name', 'this_year_amt', 'this_year_percent',
                   'last_year_amt', 'last_year_percent']
        keep_positions = [0, 1, 2, 3, 4]
    else:
        columns = ['acct_name', 'this_year_amt', 'this_year_percent',
                   'last_hy_amt', 'last_hy_amt_%',
                   'last_year_amt', 'last_year_percent']
        keep_positions = [0, 1, 2, 5, 6]

    try:
        r = requests.post(url, form_data, timeout=request_timeout_seconds)
        r.raise_for_status()
        soup = BeautifulSoup(r.text, 'html.parser')
        table = soup.find_all('table')[1]
        report_time_raw = table.find_all('th')[0].get_text()
        rows = [
            [cell.text.strip() for cell in table_row.find_all('td')]
            for table_row in table.find_all('tr')[1:]
        ]
        df_temp = pd.DataFrame(rows, columns=columns)
    except requests.RequestException as e:
        print(f'An unexpected error occurred for stock number {stock_number}: {e}')
        return pd.DataFrame()
    except (IndexError, ValueError) as e:
        # Raised when the page has no second table / no header cell, or when
        # the parsed rows do not match the expected column count.
        print(f'Error parsing data for stock number {stock_number}: {e}')
        return pd.DataFrame()

    # The first three parsed rows are skipped.
    # NOTE(review): presumably title/header rows - confirm against the page.
    df_select = df_temp.iloc[3:, keep_positions].copy()

    df_select['report_name'] = 'BalanceSheet'
    df_select['report_time_raw'] = report_time_raw
    df_select['period_year'] = year
    df_select['season'] = season
    df_select['creation_date'] = datetime.now().strftime('%Y-%m-%d')
    df_select['year_season'] = f'{year}-{season}'
    df_select['stock'] = stock_number

    return df_select

In [20]:
# Example: fetch a single quarter for one company. Everything in this cell is
# commented out, so running it does nothing.
# stock_number = '2330'
# year = 113
# season = 1
# url = BalanceSheetURL
# df_balancesheet = get_balancesheet(stock_number, year, season, url=url)

In [24]:
# Current year expressed as Gregorian year minus 1911.
current_year = datetime.now().year - 1911

# Six consecutive years, oldest first, ending with the current one.
year_list = list(range(current_year - 5, current_year + 1))
year_list

[108, 109, 110, 111, 112, 113]

In [34]:
def balancesheet_crawler(stock_number, url, season_list_current_year=None):
    """Crawl balance sheets for the five previous years and the current year.

    Seasons 1-4 are requested for each of the five previous years. For the
    current year (Gregorian year minus 1911) only the seasons listed in
    ``season_list_current_year`` are requested.

    NOTE: a later cell of this notebook defines ``balancesheet_crawler``
    again with the parameter order ``(stock_number,
    season_list_current_year, url)``; that later definition replaces this
    one once its cell is run.

    Args:
        stock_number: Company code forwarded to ``get_balancesheet``.
        url: Endpoint forwarded to ``get_balancesheet``.
        season_list_current_year: Seasons to request for the current year.
            Defaults to ``[1]``.

    Returns:
        All non-empty frames returned by ``get_balancesheet`` stacked
        row-wise, or an empty DataFrame when nothing was retrieved.
    """
    if season_list_current_year is None:
        season_list_current_year = [1]

    current_year = datetime.now().year - 1911
    frames = []
    for year in range(current_year - 5, current_year + 1):
        # The current year is restricted to the requested seasons instead of
        # being skipped altogether.
        if year == current_year:
            seasons = season_list_current_year
        else:
            seasons = [1, 2, 3, 4]

        for season in seasons:
            df_temp = get_balancesheet(stock_number, year, season, url=url)
            print(f'Get {stock_number} balancesheet with year {year}, season {season}')
            if not df_temp.empty:
                frames.append(df_temp)

    # Concatenate once at the end rather than re-copying the result per request.
    if not frames:
        return pd.DataFrame()
    return pd.concat(frames, axis=0)

In [None]:
def balancesheet_crawler(stock_number, season_list_current_year, url):
    """Crawl balance sheets for the five previous years and the current year.

    Seasons 1-4 are requested for each of the five previous years. For the
    current year (Gregorian year minus 1911) only the seasons listed in
    ``season_list_current_year`` are requested. Every retrieved frame is
    kept in the result.

    Args:
        stock_number: Company code forwarded to ``get_balancesheet``.
        season_list_current_year: Iterable of seasons to request for the
            current year, e.g. ``[1]``.
        url: Endpoint forwarded to ``get_balancesheet``.

    Returns:
        All non-empty frames returned by ``get_balancesheet`` stacked
        row-wise, or an empty DataFrame when nothing was retrieved.
    """
    current_year = datetime.now().year - 1911
    full_year_seasons = [1, 2, 3, 4]

    frames = []
    for year in range(current_year - 5, current_year + 1):
        seasons = full_year_seasons if year < current_year else season_list_current_year
        for season in seasons:
            df_temp = get_balancesheet(stock_number, year, season, url=url)
            print(f'Get {stock_number} balancesheet with year {year}, season {season}')
            # Collect every result so earlier years are not discarded.
            if not df_temp.empty:
                frames.append(df_temp)

    # Concatenate once at the end rather than re-copying the result per request.
    if not frames:
        return pd.DataFrame()
    return pd.concat(frames, axis=0)

In [35]:
# Seasons to request for the current year, then crawl the balance sheets of
# stock 2330.
season_list_current_year = [1]
df_balancesheet = balancesheet_crawler(stock_number='2330', season_list_current_year=season_list_current_year, url=BalanceSheetURL)

Get 2330 balancesheet with year 108, season 1
Get 2330 balancesheet with year 108, season 2
Get 2330 balancesheet with year 108, season 3
Get 2330 balancesheet with year 108, season 4
Get 2330 balancesheet with year 109, season 1
Get 2330 balancesheet with year 109, season 2
Get 2330 balancesheet with year 109, season 3
Get 2330 balancesheet with year 109, season 4
Get 2330 balancesheet with year 110, season 1
Get 2330 balancesheet with year 110, season 2
Get 2330 balancesheet with year 110, season 3
Get 2330 balancesheet with year 110, season 4
Get 2330 balancesheet with year 111, season 1
Get 2330 balancesheet with year 111, season 2
Get 2330 balancesheet with year 111, season 3
Get 2330 balancesheet with year 111, season 4
Get 2330 balancesheet with year 112, season 1
Get 2330 balancesheet with year 112, season 2
Get 2330 balancesheet with year 112, season 3
Get 2330 balancesheet with year 112, season 4
Get 2330 balancesheet with year 113, season 1


In [36]:
# Display the crawled balance-sheet rows.
df_balancesheet

Unnamed: 0,acct_name,this_year_amt,this_year_percent,last_year_amt,last_year_percent,report_name,report_time_raw,period_year,season,creation_date,year_season,stock
3,流動資產,,,,,BalanceSheet,民國108年第1季,108,1,2024-09-07,108-1,2330
4,現金及約當現金,645670527,29.52,577782963,28.20,BalanceSheet,民國108年第1季,108,1,2024-09-07,108-1,2330
5,透過損益按公允價值衡量之金融資產－流動,3084399,0.14,963915,0.05,BalanceSheet,民國108年第1季,108,1,2024-09-07,108-1,2330
6,透過其他綜合損益按公允價值衡量之金融資產－流動,107313205,4.91,95713446,4.67,BalanceSheet,民國108年第1季,108,1,2024-09-07,108-1,2330
7,按攤銷後成本衡量之金融資產－流動,4179386,0.19,9888741,0.48,BalanceSheet,民國108年第1季,108,1,2024-09-07,108-1,2330
...,...,...,...,...,...,...,...,...,...,...,...,...
67,權益總額,3665716254,63.33,3092897598,61.30,BalanceSheet,民國113年第1季,113,1,2024-09-07,113-1,2330
68,負債及權益總計,5787891082,100.00,5045844348,100.00,BalanceSheet,民國113年第1季,113,1,2024-09-07,113-1,2330
69,待註銷股本股數（單位：股）,1401750,,419466,,BalanceSheet,民國113年第1季,113,1,2024-09-07,113-1,2330
70,預收股款（權益項下）之約當發行股數（單位：股）,0,,0,,BalanceSheet,民國113年第1季,113,1,2024-09-07,113-1,2330


In [37]:
# Write the crawled rows to the `balancesheet` table.
# NOTE(review): if_exists='replace' presumably replaces the whole table, as in
# pandas.DataFrame.to_sql - confirm in utils.MySQLAgent.write_table.
sql_agent.write_table(df_balancesheet, 'balancesheet', if_exists='replace', index=False, data_type=None)

### Profit and loss

In [8]:
def get_profitloss(stock_number, year, season, url):
    """Fetch one profit-and-loss report and return it as a tagged DataFrame.

    Posts the query form to ``url``, parses the second ``<table>`` of the
    response, keeps the account name plus the first two amount/percentage
    column pairs, and tags every row with report metadata.

    Args:
        stock_number: Company code, sent as the ``co_id`` form field.
        year: Reporting year, sent as the ``year`` form field.
        season: Quarter 1-4, sent as the ``season`` form field.
        url: Endpoint that receives the POST request.

    Returns:
        DataFrame with columns ``acct_name``, ``this_year_amt``,
        ``this_year_percent``, ``last_year_amt``, ``last_year_percent``,
        ``report_name``, ``report_time_raw``, ``period_year``, ``season``,
        ``creation_date``, ``year_season`` and ``stock``. Rows of a season-4
        request carry ``'year'`` in the ``season`` column instead of ``4``.
        An empty DataFrame is returned when the request or the parsing fails;
        the failure is printed, not raised.
    """
    request_timeout_seconds = 30

    form_data = {
        'encodeURIComponent': 1,
        'step': 1,
        'firstin': 1,
        'off': 1,
        'co_id': stock_number,
        'year': year,
        'season': season,
    }

    try:
        # The timeout stops one unresponsive request from hanging a whole crawl.
        r = requests.post(url, form_data, timeout=request_timeout_seconds)
        r.raise_for_status()
        soup = BeautifulSoup(r.text, 'html.parser')
        table = soup.find_all('table')[1]
        report_time_raw = table.find_all('th')[0].get_text()
        rows = [
            [cell.text.strip() for cell in table_row.find_all('td')]
            for table_row in table.find_all('tr')[1:]
        ]

        columns = ['acct_name', 'this_year_amt', 'this_year_percent',
                   'last_year_amt', 'last_year_percent']
        if season == 2 or season == 3:
            # Season 2 and 3 tables have four extra trailing columns; they are
            # named here only so the frame can be built, then dropped.
            # NOTE(review): presumably year-to-date figures - confirm.
            columns = columns + ['this_year_y2m_amt', 'this_year_y2m_percent',
                                 'last_year_y2m_amt', 'last_year_y2m_percent']

        # The first three parsed rows are skipped.
        # NOTE(review): presumably title/header rows - confirm against the page.
        df_select = pd.DataFrame(rows, columns=columns).iloc[3:, :5].copy()

        season_label = 'year' if season == 4 else season

        df_select['report_name'] = 'ProfitAndLose'
        df_select['report_time_raw'] = report_time_raw
        df_select['period_year'] = year
        df_select['season'] = season_label
        df_select['creation_date'] = datetime.now().strftime('%Y-%m-%d')
        df_select['year_season'] = f'{year}-{season_label}'
        df_select['stock'] = stock_number

        return df_select

    except requests.RequestException as e:
        print(f'Failed to retrieve data for stock number {stock_number}: {e}')
    except ValueError as e:
        print(f'Error parsing data for stock number {stock_number}: {e}')
    except Exception as e:
        print(f'An unexpected error occurred for stock number {stock_number}: {e}')

    return pd.DataFrame()

In [9]:
def profitloss_crawler(stock_number, url, max_season_by_max_year=1):
    """Crawl profit-and-loss reports for the five previous years and the current year.

    Seasons 1-4 are requested for each of the five previous years. For the
    current year (Gregorian year minus 1911) only seasons up to
    ``max_season_by_max_year`` are requested.

    Args:
        stock_number: Company code forwarded to ``get_profitloss``.
        url: Endpoint forwarded to ``get_profitloss``.
        max_season_by_max_year: Last season to request for the current year.
            Defaults to 1.

    Returns:
        All non-empty frames returned by ``get_profitloss`` stacked row-wise,
        or an empty DataFrame when nothing was retrieved.
    """
    current_year = datetime.now().year - 1911

    frames = []
    for year in range(current_year - 5, current_year + 1):
        last_season = max_season_by_max_year if year == current_year else 4
        for season in range(1, 5):
            if season > last_season:
                break
            df_temp = get_profitloss(stock_number, year, season, url=url)
            print(f'Get {stock_number} profitloss with year {year}, season {season}')
            if not df_temp.empty:
                frames.append(df_temp)

    # Concatenate once at the end rather than re-copying the result per request.
    if not frames:
        return pd.DataFrame()
    return pd.concat(frames, axis=0)

In [10]:
# Crawl the profit-and-loss reports of stock 2330.
df_profitloss = profitloss_crawler(stock_number='2330', url=ProfitAndLoseURL)

Get 2330 balancesheet with year 108, season 1
Get 2330 balancesheet with year 108, season 2
Get 2330 balancesheet with year 108, season 3
Get 2330 balancesheet with year 108, season 4
Get 2330 balancesheet with year 109, season 1
Get 2330 balancesheet with year 109, season 2
Get 2330 balancesheet with year 109, season 3
Get 2330 balancesheet with year 109, season 4
Get 2330 balancesheet with year 110, season 1
Get 2330 balancesheet with year 110, season 2
Get 2330 balancesheet with year 110, season 3
Get 2330 balancesheet with year 110, season 4
Get 2330 balancesheet with year 111, season 1
Get 2330 balancesheet with year 111, season 2
Get 2330 balancesheet with year 111, season 3
Get 2330 balancesheet with year 111, season 4
Get 2330 balancesheet with year 112, season 1
Get 2330 balancesheet with year 112, season 2
Get 2330 balancesheet with year 112, season 3
Get 2330 balancesheet with year 112, season 4
Get 2330 balancesheet with year 113, season 1


In [11]:
def get_season_4_profitloss_data(df, stock_number):
    """Derive stand-alone season-4 amounts from full-year and season 1-3 rows.

    For every ``(acct_name, period_year)`` pair that has both a full-year row
    (``season == 'year'``) and at least one other row with a non-blank
    amount, the season-4 amount is computed as the full-year amount minus the
    sum of the other rows' amounts.

    The sum uses whatever non-'year' rows are present in ``df``; a missing
    quarter is not detected and would inflate the derived amount.

    Args:
        df: Profit-and-loss rows with at least the columns ``acct_name``,
            ``this_year_amt`` (strings, optionally with thousands commas,
            ``''`` for blank), ``period_year`` and ``season``.
        stock_number: Value written to the ``stock`` column of the result.

    Returns:
        DataFrame with columns ``acct_name``, ``this_year_amt`` (float),
        ``period_year``, ``season`` (always 4), ``year_season``,
        ``creation_date`` and ``stock``.
    """
    def _with_float_amounts(frame):
        """Drop rows whose amount is blank and parse the remaining amounts."""
        parsed = frame[frame['this_year_amt'] != ''].copy()
        parsed['this_year_amt'] = (
            parsed['this_year_amt'].str.replace(',', '', regex=False).astype(float)
        )
        return parsed

    # Both subsets come from the `df` argument, never from a notebook global.
    quarterly_rows = _with_float_amounts(df[df['season'] != 'year'])
    annual_rows = df[df['season'] == 'year']

    quarterly_sums = (
        quarterly_rows
        .groupby(['acct_name', 'period_year'])
        .agg(sum_season_123=('this_year_amt', 'sum'))
        .reset_index()
    )

    # Merge first and filter afterwards so the result keeps the merge index.
    merged = _with_float_amounts(
        pd.merge(annual_rows, quarterly_sums, how='inner',
                 on=['acct_name', 'period_year'])
    )

    df_result = pd.DataFrame({
        'acct_name': merged['acct_name'],
        'this_year_amt': merged['this_year_amt'] - merged['sum_season_123'],
        'period_year': merged['period_year'],
    })

    # Add metadata.
    df_result['season'] = 4
    df_result['year_season'] = df_result['period_year'].astype(str) + '-' + '4'
    df_result['creation_date'] = datetime.now().strftime('%Y-%m-%d')
    df_result['stock'] = stock_number

    return df_result

In [12]:
# Derive the stand-alone season-4 rows for stock 2330 and display them.
df_seanson_4_data = get_season_4_profitloss_data(df_profitloss, stock_number = '2330')
df_seanson_4_data

Unnamed: 0,acct_name,this_year_amt,period_year,season,year_season,creation_date,stock
0,營業收入合計,3.172371e+08,108,4,108-4,2024-09-07,2330
1,營業成本合計,1.579961e+08,108,4,108-4,2024-09-07,2330
2,營業毛利（毛損）,1.592410e+08,108,4,108-4,2024-09-07,2330
3,未實現銷貨（損）益,-2.312200e+04,108,4,108-4,2024-09-07,2330
4,營業毛利（毛損）淨額,1.592018e+08,108,4,108-4,2024-09-07,2330
...,...,...,...,...,...,...,...
199,非控制權益（淨利∕損）,-4.059420e+05,112,4,112-4,2024-09-07,2330
200,母公司業主（綜合損益）,2.012023e+08,112,4,112-4,2024-09-07,2330
201,非控制權益（綜合損益）,-1.223030e+05,112,4,112-4,2024-09-07,2330
203,基本每股盈餘,9.210000e+00,112,4,112-4,2024-09-07,2330


In [13]:
# Stack the derived season-4 rows under the crawled rows. The derived frame
# lacks some of the crawled frame's columns, so pd.concat fills those with NaN.
df_profitloss_with_season4 = pd.concat([df_profitloss, df_seanson_4_data], axis=0)

In [14]:
# Write the combined rows to the `profitandlose` table.
# NOTE(review): if_exists='replace' presumably replaces the whole table, as in
# pandas.DataFrame.to_sql - confirm in utils.MySQLAgent.write_table.
sql_agent.write_table(df_profitloss_with_season4, 'profitandlose', if_exists='replace', index=False, data_type=None)

In [45]:
# Scratch cell: replay the profit-and-loss request and table parsing for a
# single report at top level so the intermediate objects can be inspected.
stock_number = 2330
year = 108
season = 2
url = ProfitAndLoseURL

df = pd.DataFrame()
form_data = {
    'encodeURIComponent': 1,
    'step': 1,
    'firstin': 1,
    'off': 1,
    'co_id': stock_number,
    'year': year,
    'season': season,
}

r = requests.post(url, form_data)
soup = BeautifulSoup(r.text, 'html.parser')
table = soup.find_all('table')[1]
year_seaon = table.find_all('th')[0].get_text()

# One list of stripped cell texts per table row, skipping the first row.
rows = [
    [td.text.strip() for td in tr.find_all('td')]
    for tr in table.find_all('tr')[1:]
]

# Season 2 and 3 tables carry four extra trailing columns.
columns = ['acct_name', 'this_year_amt', 'this_year_percent', 'last_year_amt', 'last_year_percent']
if season in (2, 3):
    columns += ['this_year_y2m_amt', 'this_year_y2m_percnet', 'last_year_y2m_amt', 'last_year_y2m_percnet']

df_temp = pd.DataFrame(rows, columns=columns)
# Skip the first three parsed rows and keep the first five columns.
df_select = df_temp.iloc[3:, :5].copy()



In [46]:
# Display the parsed rows from the scratch cell above.
df_select

Unnamed: 0,acct_name,this_year_amt,this_year_percent,last_year_amt,last_year_percent
3,營業收入合計,240998475.0,100.0,233276811.0,100.0
4,營業成本合計,137325245.0,56.98,121688707.0,52.16
5,營業毛利（毛損）,103673230.0,43.02,111588104.0,47.84
6,未實現銷貨（損）益,0.0,0.0,57170.0,0.02
7,已實現銷貨（損）益,56830.0,0.02,0.0,0.0
8,營業毛利（毛損）淨額,103730060.0,43.04,111530934.0,47.81
9,營業費用,,,,
10,推銷費用,1483004.0,0.62,1477977.0,0.63
11,管理費用,4288263.0,1.78,5070594.0,2.17
12,研究發展費用,21393728.0,8.88,19891553.0,8.53
