In [1]:
import requests
import pandas as pd
import json
import numpy as np
from datetime import datetime, date
import csv
import time
import ssl
import os
import threading
# WARNING: replaces the factory used for the ssl module's default HTTPS context
# with the unverified one, so urllib-based HTTPS requests made after this line
# are not certificate-checked.
# NOTE(review): presumably a workaround for certificate errors from the data
# sources -- confirm it is still required.
ssl._create_default_https_context = ssl._create_unverified_context
# Silences pandas' chained-assignment (SettingWithCopy) warning for the whole notebook.
pd.options.mode.chained_assignment = None  # default='warn'

In [2]:
# Debug switch. NOTE(review): presumably meant to be passed as `is_debug` to
# print_log -- it is not referenced in the visible cells.
DEBUG = False
# Last date to export, as [year, month, day]; unpacked into get_time_input(yy, mm, dd).
time_data = [2024, 10, 28]
# CSV files whose first column holds the stock codes to export (OTC / TWSE).
otc_list_path = r".\db\otc.csv"
twse_list_path = r".\db\twse.csv"
# Directory that receives one "<stock>.csv" per exported stock.
# NOTE(review): absolute, machine-specific Windows path.
output_db_path = r"D:\Stock\db"

In [3]:
def get_twse_stock_db_info():
    """Download the TWSE ``BWIBBU_ALL`` open-data CSV and return it as a DataFrame."""
    return pd.read_csv(
        'http://www.twse.com.tw/exchangeReport/BWIBBU_ALL?response=open_data',
        encoding='utf_8_sig',
    )
    
def get_twse_stock_info(df, stock):
    """Look up one stock in the TWSE ratio table.

    Args:
        df: DataFrame with the columns '股票代號', '股票名稱', '本益比',
            '殖利率(%)' and '股價淨值比'.
        stock: Stock code; converted with ``int()`` before being compared
            against '股票代號'.

    Returns:
        Tuple ``(name, priceEarningRatio, yieldRatio, priceBookRatio)`` taken
        from the first matching row.

    Raises:
        IndexError: If no row matches ``stock``.
    """
    matches = df[df["股票代號"] == int(stock)]
    # Select the first matching row once instead of once per field.
    row = matches.iloc[0]
    return row['股票名稱'], row['本益比'], row['殖利率(%)'], row['股價淨值比']

def get_otc_stock_db_info():
    """Download the TPEx main-board P/E analysis table as a DataFrame.

    Returns:
        DataFrame built from the JSON records returned by the endpoint.

    Raises:
        requests.RequestException: On connection failure, timeout or a
            non-success HTTP status.
    """
    link = 'http://www.tpex.org.tw/openapi/v1/tpex_mainboard_peratio_analysis'
    # A timeout keeps a stalled connection from blocking the calling thread forever.
    response = requests.get(link, timeout=30)
    # Fail with a clear HTTP error instead of a confusing JSON decode error.
    response.raise_for_status()
    return pd.DataFrame.from_records(response.json())
    
def get_otc_stock_info(df, stock):
    """Look up one stock in the TPEx (OTC) ratio table.

    Args:
        df: DataFrame with the columns 'SecuritiesCompanyCode', 'CompanyName',
            'PriceEarningRatio', 'YieldRatio' and 'PriceBookRatio'.
        stock: Stock code, compared as-is against 'SecuritiesCompanyCode'.

    Returns:
        Tuple ``(name, priceEarningRatio, yieldRatio, priceBookRatio)`` taken
        from the first matching row.

    Raises:
        IndexError: If no row matches ``stock``.
    """
    matches = df[df['SecuritiesCompanyCode'] == stock]
    # Select the first matching row once instead of once per field.
    row = matches.iloc[0]
    return row['CompanyName'], row['PriceEarningRatio'], row['YieldRatio'], row['PriceBookRatio']

def get_time_input(yy, mm, dd, days = 100):
    """Build the list of months to download, ending at the month of (yy, mm).

    The number of months is ``days // 20`` (roughly 20 trading days per month).

    Args:
        yy: Year of the last date.
        mm: Month of the last date (1-12).
        dd: Day of the last date.
        days: Approximate number of trading days to cover.

    Returns:
        Tuple ``(time_list, last_date)`` where ``time_list`` is a list of
        ``[year, month]`` pairs in chronological order (oldest first) and
        ``last_date`` is ``[yy, mm, dd]``.
    """
    month_num = days // 20
    # Count months on one linear axis so that going back any number of years
    # works; a single "yy - 1" rollover breaks once the range spans more than
    # one year boundary.
    newest = yy * 12 + (mm - 1)
    time_list = []
    for months_back in range(month_num - 1, -1, -1):
        year, month_index = divmod(newest - months_back, 12)
        time_list.append([year, month_index + 1])
    return time_list, [yy, mm, dd]

In [4]:
def string_with_comma_to_int(x):
    """Parse an integer from a string that may contain thousands separators."""
    without_commas = x.replace(",", "")
    return int(without_commas)

def string_with_comma_to_float(x):
    """Convert a value to float, tolerating thousands separators.

    Strings have their commas removed before parsing; values that are not
    strings are passed to ``float()`` directly.

    Returns:
        The parsed float, or 0 when the value cannot be converted.
    """
    try:
        return float(x.replace(",", ""))
    except (AttributeError, TypeError, ValueError, OverflowError):
        # Not a string, or still not numeric after removing commas: try as-is.
        pass
    try:
        return float(x)
    except (TypeError, ValueError, OverflowError):
        # Only conversion failures map to 0; KeyboardInterrupt and SystemExit
        # are no longer swallowed.
        return 0
    
def string_to_float(x):
    """Convert a value to float, returning 0 when it cannot be converted."""
    try:
        return float(x)
    except (TypeError, ValueError, OverflowError):
        # Only conversion failures map to 0; KeyboardInterrupt and SystemExit
        # are no longer swallowed.
        return 0
    
def vol_for_twse(x):
    """Convert a TWSE share count into lots (shares / 1000, rounded).

    Args:
        x: Share count, either a string that may contain thousands
            separators or an already-numeric value.

    Returns:
        The rounded number of lots as an int, or 0 when the value cannot be
        converted (including NaN and infinity).
    """
    try:
        # Numeric input (for example a column pandas already parsed as a
        # number) has no .replace(); convert it directly instead of silently
        # reporting a volume of 0.
        shares = float(x.replace(",", "")) if isinstance(x, str) else float(x)
        return round(shares / 1000)
    except (TypeError, ValueError, OverflowError):
        return 0

def moving_average(x, w):
    """Return the simple moving average of ``x`` over a window of ``w`` samples.

    Only fully covered windows are produced ("valid" convolution mode).
    """
    window = np.ones(w)
    window_sums = np.convolve(x, window, mode="valid")
    return window_sums / w

def get_stock_volumn_price(yy, mm, dd, stock_tag, last_date):
    """Download one month of daily TWSE trading data for a stock.

    Args:
        yy: Year used in the request date.
        mm: Month used in the request date (zero-padded to two digits).
        dd: Unused; the request always asks for day "01" of the month.
        stock_tag: Stock code placed in the ``stockNo`` query parameter.
        last_date: Unused; kept for signature compatibility.

    Returns:
        DataFrame with the columns '日期', '成交張數', '收盤價', '最高價',
        '最低價', '開盤價', or ``None`` when the CSV cannot be downloaded or
        parsed.

    Raises:
        KeyError: If the downloaded CSV lacks one of the expected columns.
    """
    date_tag = str(yy) + str(mm).zfill(2) + "01"
    url = 'http://www.twse.com.tw/exchangeReport/STOCK_DAY?response=open_data&date=%s&stockNo=%s'%(date_tag, stock_tag)
    try:
        df = pd.read_csv(url, encoding='utf_8_sig')
    except Exception:
        # Best effort: a month that cannot be fetched or parsed is skipped by
        # the caller. Unlike a bare except, KeyboardInterrupt still propagates.
        return None
    # Source columns: 日期, 成交股數, 成交金額, 開盤價, 最高價, 最低價, 收盤價, 漲跌價差, 成交筆數
    # Work on an explicit copy so the assignments below never target a slice of df.
    df_target = df[['日期', '成交股數', '收盤價', '最高價', "最低價", "開盤價"]].copy()
    # Volume in lots = traded shares / 1000.
    df_target['成交股數'] = df_target['成交股數'].apply(vol_for_twse)
    # Close, high, low and open prices.
    for price_column in ('收盤價', '最高價', '最低價', '開盤價'):
        df_target[price_column] = df_target[price_column].apply(string_with_comma_to_float)
    return df_target.rename(columns={'成交股數': '成交張數'})

def get_otc_stock_volumn_price(yy, mm, dd, stock_tag, last_date):
    """Download one month of daily TPEx (OTC) trading data for a stock.

    Args:
        yy: Year used in the request date.
        mm: Month used in the request date (zero-padded to two digits).
        dd: Unused; the request always asks for day "01" of the month.
        stock_tag: Stock code placed in the ``code`` query parameter.
        last_date: Unused; kept for signature compatibility.

    Returns:
        DataFrame with the columns '日期', '成交張數', '收盤價', '最高價',
        '最低價', '開盤價', or ``None`` when the data cannot be fetched or does
        not have the expected layout. The ``None`` result matches
        get_stock_volumn_price, and the caller already skips such months.
    """
    date_tag = str(yy) + "%2F" + str(mm).zfill(2) + "%2F01"
    url = 'https://www.tpex.org.tw/www/zh-tw/afterTrading/tradingStock?code=%s&date=%s&id=&response=open_data'%(stock_tag, date_tag)
    columns = ['日期', '成交張數', '成交金額', '開盤價', '最高價', '最低價', '收盤價', '漲跌價差', '成交筆數']
    try:
        # A timeout keeps a stalled connection from blocking the worker thread.
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        rows = response.json()['tables'][0]['data']
        df = pd.DataFrame(rows, columns=columns)
    except Exception:
        # Network error, bad status, non-JSON body, or unexpected payload shape.
        return None
    # Work on an explicit copy so the assignments below never target a slice of df.
    df_target = df[['日期', '成交張數', '收盤價', '最高價', '最低價', "開盤價"]].copy()
    # Volume, then close, high, low and open prices.
    for numeric_column in ('成交張數', '收盤價', '最高價', '最低價', '開盤價'):
        df_target[numeric_column] = df_target[numeric_column].apply(string_with_comma_to_float)
    return df_target

def print_log(ss, is_debug):
    """Print ``ss`` only when ``is_debug`` is truthy."""
    if not is_debug:
        return
    print(ss)

In [5]:
def _row_is_after_cutoff(row_date, last_date):
    """Return True when a 'yyy/mm/dd' row date (year + 1911) is later than last_date."""
    try:
        parts = str(row_date).replace('*', "").split('/')
        row_ymd = (int(parts[0]) + 1911, int(parts[1]), int(parts[2]))
        # Compare as one (year, month, day) tuple; testing the three parts
        # independently misclassifies dates such as an earlier month with a
        # larger day number.
        return row_ymd > tuple(last_date)
    except (ValueError, IndexError, TypeError):
        # Unparseable dates are kept, as before.
        return False


def get_time_duration_stock_info(time_data, stock_tag, min_volumn = 150, ma_num = 5, isOtc = False):
    """Download several months of daily data for one stock, up to a cut-off date.

    Args:
        time_data: ``[year, month, day]`` of the last date to keep.
        stock_tag: Stock code.
        min_volumn: Currently unused (the volume clipping step is disabled).
        ma_num: Currently unused (the moving-average step is disabled).
        isOtc: True to fetch via get_otc_stock_volumn_price, False to fetch
            via get_stock_volumn_price.

    Returns:
        Tuple ``(vol_data, pri_data, pri_max_data, df_all)``: columns 1, 2 and
        3 of the combined table as numpy arrays (volume, closing price and
        highest price in the layout the fetch helpers return) followed by the
        combined DataFrame. Returns ``(None, None, None, None)`` when no rows
        remain.
    """
    time_list, last_date = get_time_input(time_data[0], time_data[1], time_data[2])
    fetch_month = get_otc_stock_volumn_price if isOtc else get_stock_volumn_price

    monthly_frames = []
    for year, month in time_list:
        df = fetch_month(year, month, "01", stock_tag, last_date)
        if df is None:
            continue
        time.sleep(0.10)  # brief pause between successful requests
        monthly_frames.append(df)

    if not monthly_frames:
        return None, None, None, None

    # Concatenate once (no quadratic growth) and renumber rows from 0.
    df_all = pd.concat(monthly_frames, axis=0, ignore_index=True)

    # Drop exactly the rows dated after the cut-off, not a count-based tail.
    keep_mask = np.array(
        [not _row_is_after_cutoff(row_date, last_date) for row_date in df_all["日期"]],
        dtype=bool,
    )
    df_all = df_all.loc[keep_mask].reset_index(drop=True)

    df_np = df_all.to_numpy().copy()
    if len(df_np) == 0:
        return None, None, None, None

    vol_data = df_np[:, 1]
    pri_data = df_np[:, 2]
    pri_max_data = df_np[:, 3]
    return vol_data, pri_data, pri_max_data, df_all

In [6]:
def export_stock_info(time_data, isOtc, target_list, thread_total = 1, thread_idx = 0):
    """Export this worker's share of the stock list to one CSV file per stock.

    The stock codes are read from the first column of ``otc_list_path`` or
    ``twse_list_path`` and split into ``thread_total`` contiguous chunks; this
    call handles chunk ``thread_idx``. A stock whose output file already
    exists is skipped.

    Args:
        time_data: ``[year, month, day]`` of the last date to export.
        isOtc: True to export OTC stocks, False to export TWSE stocks.
        target_list: Unused; kept for signature compatibility.
        thread_total: Total number of workers sharing the list.
        thread_idx: Zero-based index of this worker.
    """
    # The ratio table the original downloaded here was never used, so that
    # network request (one per worker) is dropped.
    data_path = otc_list_path if isOtc else twse_list_path

    with open(data_path, newline='', encoding='utf_8_sig') as csvfile:
        # Skip blank rows, which csv.reader yields as empty lists.
        stock_list = [line[0] for line in csv.reader(csvfile) if line]

    stock_list_len = len(stock_list)
    # Integer ceiling division keeps the slice bounds as ints.
    thread_item_num = (stock_list_len + thread_total - 1) // thread_total
    idx_start = thread_item_num * thread_idx
    if thread_idx == thread_total - 1:  # last thread takes whatever remains
        idx_end = stock_list_len
    else:
        idx_end = idx_start + thread_item_num

    for stock in stock_list[idx_start:idx_end]:
        output_file_path = os.path.join(output_db_path, "%s.csv"%(stock))
        # Check before downloading so already-exported stocks cost no requests.
        if os.path.exists(output_file_path):
            print("File exist!!!")
            continue
        vol_data, pri_data, pri_max_data, df_all = get_time_duration_stock_info(time_data, stock, min_volumn=150, ma_num=1, isOtc=isOtc)
        if vol_data is None or pri_data is None:
            continue
        df_all.to_csv(output_file_path, encoding='utf-8-sig')
    
# target_list = []
# df_all_ = export_stock_info(time_data, isOtc=False, target_list=target_list, thread_total = 1, thread_idx = 0)

In [7]:
# def export_stock_info(time_data, isOtc, target_list, thread_total = 1, thread_idx = 0):
#     if isOtc:
#         df = get_otc_stock_db_info()
#         # data_path = r".\db\otc.csv"
#         data_path = otc_list_path
#     else:
#         df = get_twse_stock_db_info()
#         # data_path = r".\db\twse.csv"
#         data_path = twse_list_path

#     stock_list = []
#     with open(data_path, newline='', encoding='utf_8_sig') as csvfile:
#         line_list = csv.reader(csvfile)
#         for line in line_list:
#             stock_list.append(line[0])
    
#     stock_list_len = len(stock_list)
#     thread_item_num = np.ceil(stock_list_len / thread_total)
#     idx_start = thread_item_num * thread_idx
#     idx_end = idx_start + thread_item_num
#     if thread_idx == thread_total - 1: # last thread
#         idx_end = stock_list_len
    
#     # target_list = []

#     stock = "6505"
#     vol_data, pri_data, pri_max_data, df_all = get_time_duration_stock_info(time_data, stock, min_volumn=150, ma_num=1, isOtc=isOtc)
#     # df_all.loc[-1] = [time_data, stock, isOtc, "", "", ""]  # adding a row
#     # df_all.index = df_all.index + 1  # shifting index
#     # df_all.sort_index(inplace=True) 
#     # header=[time_data, stock, isOtc, "", "", ""]
#     # df_all.columns=header
#     print(df_all)
#     output_file_path = os.path.join(output_db_path, "%s.csv"%(stock))
#     if os.path.exists(output_file_path):
#         print("File exist!!!")
#     else:
#         df_all.to_csv(output_file_path, encoding='utf-8-sig')
#     #     with open(output_file_path, 'a', newline='') as csvfile:
#     #         writer = csv.writer(csvfile)
#     #         writer.writerows([time_data])
#     #         # writer.writerows([pri_max_data])
#     #         writer.writerow(vol_data)
#     #         # writer.writerow([pri_data])
#     # return df_all
#     # break
    
# target_list = []
# df_all_ = export_stock_info(time_data, isOtc=False, target_list=target_list, thread_total = 1, thread_idx = 0)

In [8]:
# Export the TWSE stock list with THREAD_NUM worker threads, one list chunk per worker.
THREAD_NUM = 6
target_list = []
threads = []
for i in range(THREAD_NUM):
    worker = threading.Thread(
        target=export_stock_info,
        args=(time_data, False, target_list, THREAD_NUM, i),
    )
    worker.start()
    threads.append(worker)

# Wait for every worker before reporting completion.
for worker in threads:
    worker.join()

print("Done.")

Done.


In [9]:
# Export the OTC stock list with THREAD_NUM worker threads, one list chunk per worker.
THREAD_NUM = 6
target_list_otc = []
threads = []
for i in range(THREAD_NUM):
    worker = threading.Thread(
        target=export_stock_info,
        args=(time_data, True, target_list_otc, THREAD_NUM, i),
    )
    worker.start()
    threads.append(worker)

# Wait for every worker before reporting completion.
for worker in threads:
    worker.join()

print("Done.")

Done.
