# Organize Stock Dataset
Stock data is stored in an Excel file, but its structure isn't what we ideally want. In this notebook we aim to do the following:
1. Make two Excel files: one containing all 上市股票 (listed stocks), the other containing all 上櫃股票 (OTC stocks)
2. Separate each stock into its own sheet, ordered by time
3. Rename the columns to English so they are easier to use in the future

In [11]:
import pandas as pd
from pathlib import Path
from tqdm import tqdm

# Configuration

In [2]:
# Input: directory and file name of the raw workbook
DATA_DIR = "./bda2023_mid_dataset"
RAW_DATASET_NAME = "stock_data_2019-2023.xlsx"

# Output: directory and file name of the reorganized workbook
ORGANIZED_DATASET_DIR = "./organized_dataset"
ORGANIZED_DATASET_NAME = "stocks.xlsx"

# English column names; they are assigned positionally to the raw sheets' columns
RENAMED_COLUMNS = [
    "name",
    "date",
    "open",
    "high",
    "low",
    "close",
    "volume(k)",
    "turnover(k)",
    "transaction",
    "outstanding(k)",
    "pe",
    "pb",
]

# Full path of the raw workbook
raw_stocks_path = Path(DATA_DIR) / RAW_DATASET_NAME

# Utility functions

In [3]:
def get_raw_stocks_dfs() -> dict[str, pd.DataFrame]:
    """
    Load every data sheet of the raw stock workbook.

    Opens the workbook at ``raw_stocks_path``, skips the '摘要' sheet and
    parses each remaining sheet, treating '-' cells as missing values.

    Returns:
        A dict mapping each sheet name to the DataFrame parsed from that sheet.
    """
    # The context manager closes the workbook handle as soon as parsing is done,
    # instead of leaving it open until garbage collection.
    with pd.ExcelFile(raw_stocks_path) as excel_file:
        return {
            sheet_name: excel_file.parse(sheet_name, na_values=['-'])
            for sheet_name in excel_file.sheet_names
            if sheet_name != '摘要'
        }

# Research & Preprocessing
I simply opened the Excel file and looked around a bit. Here we count the null values in each column and check whether there are any errors in the data.

In [4]:
# Load the raw sheets once; the following cells work on raw_stock_dfs
raw_stock_dfs = get_raw_stocks_dfs()

# Report, sheet by sheet, how many null values each column holds
for sheet_name in tqdm(raw_stock_dfs):
    null_counts = raw_stock_dfs[sheet_name].isnull().sum()
    print(f"{sheet_name} null value count:")
    print(null_counts)
    print()


上市2023 null value count:
證券代碼              0
年月日               0
開盤價(元)         3813
最高價(元)         3815
最低價(元)         3815
收盤價(元)            0
成交量(千股)           0
成交值(千元)           0
成交筆數(筆)           0
流通在外股數(千股)        0
本益比-TSE       21534
股價淨值比-TSE     15178
dtype: int64

上市2022 null value count:
證券代碼               0
年月日                0
開盤價(元)         19097
最高價(元)         19099
最低價(元)         19099
收盤價(元)             0
成交量(千股)            0
成交值(千元)            0
成交筆數(筆)            0
流通在外股數(千股)         6
本益比-TSE       109169
股價淨值比-TSE      77321
dtype: int64

上市2021 null value count:
證券代碼               0
年月日                0
開盤價(元)         20242
最高價(元)         20243
最低價(元)         20243
收盤價(元)             0
成交量(千股)            0
成交值(千元)            0
成交筆數(筆)            0
流通在外股數(千股)         0
本益比-TSE       113126
股價淨值比-TSE      75544
dtype: int64

上市2020 null value count:
證券代碼              0
年月日               0
開盤價(元)            0
最高價(元)            0
最低價(元)            0
收盤價(元)        

It looks like some stocks have missing price values; we will keep them as null values and handle them later.

First, let's rename the columns to English and cast each column to the data type we want.

In [5]:
# Normalize every raw sheet in place: English column names, datetime dates,
# and stock names without the '*' character.
for sheet_name, df in raw_stock_dfs.items():
    # Rename columns (assigned positionally, so the sheet must have the same
    # number of columns as RENAMED_COLUMNS)
    df.columns = RENAMED_COLUMNS
    # Cast the date column to datetime
    df['date'] = pd.to_datetime(df['date'])
    # Replace '*' with '_' in stock names. regex=False makes this a literal
    # replacement regardless of the pandas version's default ('*' is a regex
    # metacharacter). Names are later used as Excel sheet names, where '*' is
    # presumably not accepted.
    df['name'] = df['name'].str.replace('*', '_', regex=False)

# Show the resulting dtypes of every sheet
for sheet_name, df in raw_stock_dfs.items():
    print(f"{sheet_name} data type:")
    print(df.dtypes)
    print()

上市2023 data type:
name                      object
date              datetime64[ns]
open                     float64
high                     float64
low                      float64
close                    float64
volume(k)                  int64
turnover(k)                int64
transaction                int64
outstanding(k)             int64
pe                       float64
pb                       float64
dtype: object

上市2022 data type:
name                      object
date              datetime64[ns]
open                     float64
high                     float64
low                      float64
close                    float64
volume(k)                  int64
turnover(k)                int64
transaction                int64
outstanding(k)           float64
pe                       float64
pb                       float64
dtype: object

上市2021 data type:
name                      object
date              datetime64[ns]
open                     float64
high                     

Every column's data type seems correct. Let's make each stock its own df and order it by time.

In [6]:
# make each stock a df
stock_dfs = dict()
for sheet_name, df in raw_stock_dfs.items():
    # make each stock a sheet
    for stock_id in tqdm(df['name'].unique()):
        stock_df = df[df['name'] == stock_id]
        # if stock_dfs have stock_id, stack it
        if stock_id in stock_dfs:
            stock_dfs[stock_id] = pd.concat([stock_dfs[stock_id], stock_df])
        else:
            stock_dfs[stock_id] = stock_df

In [7]:
# order by time
for stock_id, df in stock_dfs.items():
    stock_dfs[stock_id] = df.sort_values(by='date')

# preview 5 stocks head
for stock_id, df in list(stock_dfs.items())[:3]:
    display(f"{stock_id} head:")
    display(df.head())
    print()

'0050 元大台灣50 head:'

Unnamed: 0,name,date,open,high,low,close,volume(k),turnover(k),transaction,outstanding(k),pe,pb
305781,0050 元大台灣50,2021-01-04,116.7284,118.9254,116.5373,118.7821,6307,778088,5434,945500.0,,
304538,0050 元大台灣50,2021-01-05,118.6388,119.0687,118.209,119.0209,4962,616480,6179,945500.0,,
303294,0050 元大台灣50,2021-01-06,120.4537,121.4567,118.5433,120.3105,10859,1367952,9245,945500.0,,
302050,0050 元大台灣50,2021-01-07,120.8836,123.2239,120.8836,123.1284,6863,877015,6299,944000.0,,
300806,0050 元大台灣50,2021-01-08,124.7523,125.3731,123.7493,125.3254,13298,1733821,10629,922000.0,,





'0051 元大中型100 head:'

Unnamed: 0,name,date,open,high,low,close,volume(k),turnover(k),transaction,outstanding(k),pe,pb
305782,0051 元大中型100,2021-01-04,43.1954,43.3411,42.9038,43.3411,32,1424,35,10000.0,,
304539,0051 元大中型100,2021-01-05,43.3411,43.4772,43.137,43.38,175,7827,44,10000.0,,
303295,0051 元大中型100,2021-01-06,43.5355,43.8464,42.7581,43.0204,102,4556,77,10000.0,,
302051,0051 元大中型100,2021-01-07,43.1565,43.3606,43.0496,43.2828,65,2919,61,10000.0,,
300807,0051 元大中型100,2021-01-08,43.4189,43.516,43.2439,43.516,137,6127,50,10000.0,,





'0052 富邦科技 head:'

Unnamed: 0,name,date,open,high,low,close,volume(k),turnover(k),transaction,outstanding(k),pe,pb
305783,0052 富邦科技,2021-01-04,109.7031,111.3894,109.7031,111.191,1079,120653,549,36500.0,,
304540,0052 富邦科技,2021-01-05,111.191,111.9845,110.2487,111.9845,1536,172232,673,37000.0,,
303296,0052 富邦科技,2021-01-06,113.5715,114.4146,111.3894,112.6788,864,98497,743,37500.0,,
302052,0052 富邦科技,2021-01-07,114.0179,116.2,113.4723,115.8529,817,94932,810,37500.0,,
300808,0052 富邦科技,2021-01-08,118.035,118.531,117.1919,118.531,1244,148170,849,38000.0,,





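Now we build metadata for every stock. It should include row_cnt, have_null_price and missing_rows_cnt; the cell below computes row_cnt, have_null_price and stock_type (missing_rows_cnt is not computed yet).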

In [8]:
# make metadata for all stock
stock_metadata = dict()
for stock_id, df in stock_dfs.items():
    # get row count
    row_cnt = df.shape[0]
    # check if there is any null price
    have_null_price = df['close'].isnull().any()
    stock_metadata[stock_id] = {
        'row_cnt': row_cnt,
        'have_null_price': have_null_price
    }
# check type 上市 or 上櫃
for sheet_name, df in raw_stock_dfs.items():
    stock_type = "上市" if sheet_name == "上市股票" else "上櫃"
    for stock_id in df['name'].unique():
        stock_metadata[stock_id]['stock_type'] = stock_type
# convert to df
stock_metadata_df = pd.DataFrame(stock_metadata).T
stock_metadata_df.head()

Unnamed: 0,row_cnt,have_null_price,stock_type
0050 元大台灣50,539,False,上櫃
0051 元大中型100,539,False,上櫃
0052 富邦科技,539,False,上櫃
0053 元大電子,539,False,上櫃
0055 元大MSCI金融,539,False,上櫃


Save the metadata and the stock dfs to Excel; each df becomes its own sheet.

In [12]:
# save to excel
with pd.ExcelWriter(Path(ORGANIZED_DATASET_DIR, ORGANIZED_DATASET_NAME)) as writer:
    stock_metadata_df.to_excel(writer, sheet_name='metadata')
    for stock_id, df in tqdm(stock_dfs.items()):
        df.to_excel(writer, sheet_name=stock_id, index=False)


  2%|▏         | 56/2607 [00:00<00:42, 59.49it/s]Exception ignored in: <function ZipFile.__del__ at 0x106b8c540>
Traceback (most recent call last):
  File "/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/zipfile.py", line 1870, in __del__
    self.close()
  File "/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/zipfile.py", line 1887, in close
    self.fp.seek(self.start_dir)
ValueError: seek of closed file
100%|██████████| 2607/2607 [13:51<00:00,  3.13it/s]


Check that everything is correct

In [13]:
# check every thing is correct
with pd.ExcelFile(Path(ORGANIZED_DATASET_DIR, ORGANIZED_DATASET_NAME)) as excel_file:
    # get sheet names
    sheet_names = excel_file.sheet_names
    # load all sheets
    dfs = {sheet_name: excel_file.parse(sheet_name) for sheet_name in sheet_names}
    # check metadata
    metadata_df = dfs['metadata']
    display("metadata:")
    display(metadata_df.head())
    print()

    # check random 3 stocks
    for stock_id in list(dfs.keys())[1:4]:
        display(f"{stock_id} head:")
        display(dfs[stock_id].head())
        print()


'metadata:'

Unnamed: 0.1,Unnamed: 0,row_cnt,have_null_price,stock_type
0,0050 元大台灣50,539,False,上櫃
1,0051 元大中型100,539,False,上櫃
2,0052 富邦科技,539,False,上櫃
3,0053 元大電子,539,False,上櫃
4,0055 元大MSCI金融,539,False,上櫃





'0050 元大台灣50 head:'

Unnamed: 0,name,date,open,high,low,close,volume(k),turnover(k),transaction,outstanding(k),pe,pb
0,0050 元大台灣50,2021-01-04,116.7284,118.9254,116.5373,118.7821,6307,778088,5434,945500,,
1,0050 元大台灣50,2021-01-05,118.6388,119.0687,118.209,119.0209,4962,616480,6179,945500,,
2,0050 元大台灣50,2021-01-06,120.4537,121.4567,118.5433,120.3105,10859,1367952,9245,945500,,
3,0050 元大台灣50,2021-01-07,120.8836,123.2239,120.8836,123.1284,6863,877015,6299,944000,,
4,0050 元大台灣50,2021-01-08,124.7523,125.3731,123.7493,125.3254,13298,1733821,10629,922000,,





'0051 元大中型100 head:'

Unnamed: 0,name,date,open,high,low,close,volume(k),turnover(k),transaction,outstanding(k),pe,pb
0,0051 元大中型100,2021-01-04,43.1954,43.3411,42.9038,43.3411,32,1424,35,10000,,
1,0051 元大中型100,2021-01-05,43.3411,43.4772,43.137,43.38,175,7827,44,10000,,
2,0051 元大中型100,2021-01-06,43.5355,43.8464,42.7581,43.0204,102,4556,77,10000,,
3,0051 元大中型100,2021-01-07,43.1565,43.3606,43.0496,43.2828,65,2919,61,10000,,
4,0051 元大中型100,2021-01-08,43.4189,43.516,43.2439,43.516,137,6127,50,10000,,





'0052 富邦科技 head:'

Unnamed: 0,name,date,open,high,low,close,volume(k),turnover(k),transaction,outstanding(k),pe,pb
0,0052 富邦科技,2021-01-04,109.7031,111.3894,109.7031,111.191,1079,120653,549,36500,,
1,0052 富邦科技,2021-01-05,111.191,111.9845,110.2487,111.9845,1536,172232,673,37000,,
2,0052 富邦科技,2021-01-06,113.5715,114.4146,111.3894,112.6788,864,98497,743,37500,,
3,0052 富邦科技,2021-01-07,114.0179,116.2,113.4723,115.8529,817,94932,810,37500,,
4,0052 富邦科技,2021-01-08,118.035,118.531,117.1919,118.531,1244,148170,849,38000,,





Everything looks fine