In [19]:
import pandas as pd
import matplotlib.pyplot as plt
import requests
from io import StringIO
import time
from datetime import datetime
from tqdm import tqdm
import pdfplumber

In [20]:
def monthly_report(year, month, delay=5):
    """Download one month of the MOPS monthly-revenue page and select the target rows.

    Parameters
    ----------
    year : int
        Either a Gregorian year (any value above 1990, e.g. 2018) or a year
        already expressed in the ROC calendar (e.g. 107). Gregorian values are
        converted by subtracting 1911.
    month : int
        Month number, inserted verbatim into the report URL.
    delay : float, optional
        Seconds to sleep after the page has been downloaded and parsed
        (presumably to throttle successive requests). Defaults to 5.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(x, y)``: independent copies of the rows matched by the two
        company-code filters at the end of the function, so callers may add
        columns to them without touching a shared parent frame.

    Raises
    ------
    requests.HTTPError
        If the server responds with an error status.
    requests.Timeout
        If the server does not answer within the request timeout.
    ValueError
        Raised by pandas when the page holds no table, or no table with
        6 to 11 columns.
    """
    # Gregorian years are shifted to the ROC calendar used in the report URL.
    if year > 1990:
        year -= 1911

    # ROC years up to 98 use a URL without the trailing "_0" segment.
    if year <= 98:
        url = 'https://mops.twse.com.tw/nas/t21/sii/t21sc03_'+str(year)+'_'+str(month)+'.html'
    else:
        url = 'https://mops.twse.com.tw/nas/t21/sii/t21sc03_'+str(year)+'_'+str(month)+'_0.html'

    headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}

    # Download the page for this year/month and let pandas turn its HTML
    # tables into DataFrames. The timeout keeps a stalled connection from
    # hanging the notebook; raise_for_status surfaces HTTP errors directly
    # instead of letting an error page reach the HTML parser.
    r = requests.get(url, headers=headers, timeout=30)
    r.raise_for_status()
    r.encoding = 'big5'
    dfs = pd.read_html(StringIO(r.text), encoding='big-5')

    # Keep only the tables that are 6 to 11 columns wide and stack them.
    df = pd.concat([table for table in dfs if 5 < table.shape[1] <= 11])

    if isinstance(df.columns, pd.MultiIndex):
        # Two-level header: use the second level as the column names.
        df.columns = df.columns.get_level_values(1)
    else:
        # Flat, integer-labelled columns: the header is a data row.
        df = df[list(range(0, 10))].copy()
        # Pick the first row whose first cell is the header label. Selecting by
        # boolean mask avoids mixing index labels with positions, which matters
        # because the concatenated frame keeps each source table's own index.
        # NOTE(review): this label has no space, while the filters below use
        # '公司 代號' -- confirm the flat layout really yields the spaced name.
        header_row = df[df[0] == '公司代號'].iloc[0]
        df.columns = header_row

    # Non-numeric revenue cells become NaN and those rows are discarded.
    df['當月營收'] = pd.to_numeric(df['當月營收'], errors='coerce')
    df = df[df['當月營收'].notnull()]
    # Drop the subtotal rows.
    df = df[df['公司 代號'] != '合計']

    # NOTE(review): both filters compare against an empty string, so x and y
    # select the same rows; these look like placeholder company codes -- confirm.
    x = df[df['公司 代號'] == ''].copy()
    y = df[df['公司 代號'] == ''].copy()
    time.sleep(delay)

    return x, y

In [21]:
# Two separate, empty seed frames; they are passed to get_data, whose results
# are then bound to the same names.
x, y = pd.DataFrame(), pd.DataFrame()
def get_data(x, y, start, end):
    """Collect the monthly report rows for every month from ``start`` to ``end``.

    Parameters
    ----------
    x, y : pandas.DataFrame
        Frames to extend with the newly fetched rows (pass empty frames to
        start from scratch). They are not modified in place.
    start, end : str
        First and last month to fetch, inclusive, in ``"%Y-%m"`` form
        (e.g. ``"2018-1"``). If ``end`` precedes ``start`` nothing is fetched.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        The extended ``x`` and ``y`` frames, each with a ``'日期'`` column of
        ``"<year>-<month>"`` strings and without the columns listed in
        ``dropped_columns`` (columns that are absent are simply skipped).
    """
    start_date = datetime.strptime(start, "%Y-%m")
    end_date = datetime.strptime(end, "%Y-%m")
    months_difference = (end_date.year - start_date.year) * 12 + end_date.month - start_date.month + 1

    dropped_columns = ['公司 代號', '上月營收', '上月比較 增減(%)', '去年同月 增減(%)', '備註']

    # Gather the pieces and concatenate once at the end, rather than
    # re-copying the accumulated frame on every iteration.
    x_frames = [x]
    y_frames = [y]

    for offset in tqdm(range(months_difference)):
        # A zero-based month index makes divmod handle the year rollover.
        year_shift, month_index = divmod(start_date.month - 1 + offset, 12)
        report_year = start_date.year + year_shift
        report_month = month_index + 1

        x_data, y_data = monthly_report(report_year, report_month)
        date_value = f"{report_year}-{report_month}"
        # assign() returns new frames, so the fetched frames are left untouched.
        x_data = x_data.assign(**{'日期': date_value})
        y_data = y_data.assign(**{'日期': date_value})

        if y_data.empty:
            # No rows were selected for y this month: substitute one
            # placeholder row filled with '-'.
            y_data = pd.DataFrame({
                '公司 代號': [''],
                '公司名稱': [''],
                '當月營收': ['-'],
                '上月營收': ['-'],
                '去年當月營收': ['-'],
                '上月比較 增減(%)': ['-'],
                '去年同月 增減(%)': ['-'],
                '當月累計營收': ['-'],
                '去年累計營收': ['-'],
                '前期比較 增減(%)': ['-'],
                '備註': ['-'],
                '日期': [date_value]
            })

        x_frames.append(x_data)
        y_frames.append(y_data)

    x_all = pd.concat(x_frames, ignore_index=True)
    y_all = pd.concat(y_frames, ignore_index=True)

    # errors='ignore' keeps an empty result (no months fetched, empty seeds)
    # from raising KeyError on the missing columns.
    return (
        x_all.drop(columns=dropped_columns, errors='ignore'),
        y_all.drop(columns=dropped_columns, errors='ignore'),
    )

# Fetch every month from 2018-1 through 2024-6. Each month is one network
# request plus the pause inside monthly_report, so this cell runs for minutes.
x, y = get_data(x=x, y=y, start="2018-1", end="2024-6")

100%|██████████| 78/78 [07:11<00:00,  5.53s/it]


In [22]:
# x and y must be written to different files: sending both to '.csv' would
# leave only y's rows on disk. x keeps the path that is read back later as
# x_from_csv.
# NOTE(review): the file names look like placeholders -- confirm the intended names.
x.to_csv('.csv', index=False, encoding='utf-8-sig')
y.to_csv('y.csv', index=False, encoding='utf-8-sig')

In [None]:
# Reload the saved rows and parse the '<year>-<month>' strings into timestamps.
x_from_csv = pd.read_csv('.csv', encoding='utf-8-sig').assign(
    **{'日期': lambda frame: pd.to_datetime(frame['日期'], format='%Y-%m')}
)

# Keep the rows dated 2018-1 or later.
on_or_after_start = x_from_csv['日期'] >= '2018-1'
filtered_data = x_from_csv.loc[on_or_after_start]

# Draw revenue against date on a single axes.
fig, ax = plt.subplots(figsize=(14, 8))
ax.plot(filtered_data['日期'], filtered_data['當月營收'], marker='o', label='Revenue')
ax.set_title('')
ax.set_xlabel('Date')
ax.set_ylabel('Revenue (10 billion)')
ax.legend()
ax.grid(True)
plt.show()

In [None]:
def table_to_markdown(table):
    """Render a row-oriented table as a Markdown table string.

    Parameters
    ----------
    table : sequence of rows
        ``table[0]`` supplies the column names; the remaining rows are data.

    Returns
    -------
    str
        The Markdown text produced by ``DataFrame.to_markdown(index=False)``,
        or an empty string when ``table`` has no rows at all (there is no
        header row to build a frame from).
    """
    if not table:
        return ""
    header, *rows = table
    df = pd.DataFrame(rows, columns=header)
    return df.to_markdown(index=False)

def extract_tables_to_markdown(pdf_file):
    """Convert every table found in a PDF into a Markdown string.

    Parameters
    ----------
    pdf_file
        Whatever ``pdfplumber.open`` accepts as the PDF to read.

    Returns
    -------
    list of str
        One Markdown table per extracted table, in page order and, within a
        page, in the order ``page.extract_tables()`` yields them.
    """
    with pdfplumber.open(pdf_file) as pdf:
        return [
            table_to_markdown(raw_table)
            for page in pdf.pages
            for raw_table in page.extract_tables()
        ]

# Path of the PDF to scan for tables; replace with your own file.
pdf_file = "x_revenue_table.pdf"
markdown_tables = extract_tables_to_markdown(pdf_file)

# Print each extracted table as Markdown, numbered from 1.
for i, table in enumerate(markdown_tables):
    print(f"Table {i+1}:\n{table}\n")

# Get the total revenue for each year

In [30]:
def get_12_data(x, start, end):
    """Collect the December report rows for every year inside ``[start, end]``.

    Parameters
    ----------
    x : pandas.DataFrame
        Frame to extend with the fetched rows (pass an empty frame to start
        from scratch). It is not modified in place.
    start, end : str
        First and last month of the range, inclusive, in ``"%Y-%m"`` form.
        Only the Decembers that fall inside the range are downloaded, so the
        progress bar counts years rather than months.

    Returns
    -------
    pandas.DataFrame
        The extended frame, with a ``'日期'`` column holding the report year
        minus 1911 and without the columns listed in ``dropped_columns``
        (columns that are absent are simply skipped).
    """
    start_date = datetime.strptime(start, "%Y-%m")
    end_date = datetime.strptime(end, "%Y-%m")

    dropped_columns = ['公司 代號', '上月營收', '上月比較 增減(%)', '去年同月 增減(%)', '備註']

    # December of year Y lies in the range for every Y from the start year up
    # to the end year; the end year itself only counts when the range actually
    # ends in December. An end before the start yields an empty range.
    last_year = end_date.year if end_date.month == 12 else end_date.year - 1
    december_years = range(start_date.year, last_year + 1)

    # Gather the pieces and concatenate once at the end.
    frames = [x]
    for report_year in tqdm(december_years):
        x_data, _unused = monthly_report(report_year, 12)
        # assign() returns a new frame, leaving the fetched frame untouched.
        frames.append(x_data.assign(**{'日期': report_year - 1911}))

    combined = pd.concat(frames, ignore_index=True)

    # errors='ignore' keeps a range without any December (and an empty seed)
    # from raising KeyError on the missing columns.
    return combined.drop(columns=dropped_columns, errors='ignore')

# Empty seed frame for the year-end collection.
x_salary = pd.DataFrame()
# Rebinding `x` here replaces whatever frame the name held before this cell.
x = get_12_data(x=x_salary, start="2018-12", end="2024-6")

# Save the yearly rows; utf-8-sig writes a BOM at the start of the file.
x.to_csv('x_annual_revenue.csv', encoding='utf-8-sig', index=False)