## Compress using GZIP

In [5]:
!gzip -9k ../snapshot/Sample_new

In [6]:
file_path = "../snapshot/Sample_new.gz"

## Pure Python

In [10]:
from __future__ import annotations
import gzip
from pathlib import Path
from typing import Iterator

In [14]:
# with gzip.open(file_path, "rt", encoding="utf-8") as fh:
#     print(fh.read())

In [None]:
def stream_lines(path: str | Path, encoding: str = "utf-8") -> Iterator[str]:
    """
    Stream a gzip-compressed, line-separated text file.

    Args:
        path: Path to .gz file.
        encoding: Text encoding of the original file.

    Yields:
        One decoded line at a time (newline stripped).
    """
    with gzip.open(path, "rt", encoding=encoding) as fh:
        for line in fh:
            yield line.rstrip("\n")

# 用法
for idx, line in enumerate(stream_lines(file_path)):
    if idx < 3:
        print(line)
    else:
        break

0050  083004446448T   000000000000005 01995000000029019900000000200191500000000301900000000001018800000000011 020300000000020000000000000000000000000000000000000000000000000000000020241111AA
0050  083009462008T   000000000000005 01995000000027019900000000320198000000000101970000000001019500000000015 020000000000010200650000000102010000000001020150000000010203000000000420241111AA
0050  083014478574T   000000000000005 01995000000017019900000000330198000000000101970000000001019500000000015 020000000000020200650000000102010000000002020150000000010203000000000420241111AA


## Pandas

- [pandas.read_csv — pandas 2.3.1 documentation](https://pandas.pydata.org/docs/reference/api/pandas.read_csv.html)
- [IO tools (text, CSV, HDF5, …) — pandas 2.3.1 documentation](https://pandas.pydata.org/docs/user_guide/io.html#iterating-through-files-chunk-by-chunk)

In [29]:
import pandas as pd

with pd.read_csv(
    file_path,
    # ↓ 若是多欄位 CSV，就正常設定即可
    # sep="\t",            # or ","
    header=None,
    compression="gzip",
    # chunksize=1_000_000,  # 建議大檔用 iterator
    iterator=True, # TextFileReader
) as reader:
    for chunk in reader:
        print(chunk)
        break

                                                    0
0   0050  083004446448T   000000000000005 01995000...
1   0050  083009462008T   000000000000005 01995000...
2   0050  083014478574T   000000000000005 01995000...
3   0050  083019493222T   000000000000005 01995000...
4   0050  083024508770T   000000000000005 01995000...
5   0050  083029525440T   000000000000005 01995000...
6   0050  083034542076T   000000000000005 01995000...
7   0050  083039555650T   000000000000005 01995000...
8   0050  083044571226T   000000000000005 01995000...
9   0050  083049585802T   000000000000005 01995000...
10  0050  083054602448T   000000000000005 01995000...
11  0050  083059617025T   000000000000005 01995000...
12  0050  083104632642T   000000000000005 01995000...
13  0050  083109647214T   000000000000005 01995000...
14  0050  083114662367T   000000000000005 01992500...
15  0050  083119677393T   000000000000005 01990500...
16  0050  083124694034T   000000000000005 01990500...
17  0050  083129709667T   00

## Polars / Arrow

- [polars.scan_csv — Polars documentation](https://docs.pola.rs/api/python/dev/reference/api/polars.scan_csv.html)
- [polars.LazyFrame.collect — Polars documentation](https://docs.pola.rs/api/python/dev/reference/lazyframe/api/polars.LazyFrame.collect.html)

In [24]:
!uv pip install polars

[2mUsing Python 3.12.3 environment at: /home/daviddwlee84/Documents/Program/Tons/TWSE-Tick-Data/.venv[0m
[2K[2mResolved [1m1 package[0m [2min 630ms[0m[0m                                          [0m
[2K[37m⠙[0m [2mPreparing packages...[0m (0/1)                                                   
[2K[1A[37m⠙[0m [2mPreparing packages...[0m (0/1)--------------[0m[0m     0 B/33.43 MiB           [1A
[2K[1A[37m⠙[0m [2mPreparing packages...[0m (0/1)--------------[0m[0m 14.91 KiB/33.43 MiB         [1A
[2K[1A[37m⠙[0m [2mPreparing packages...[0m (0/1)--------------[0m[0m 30.91 KiB/33.43 MiB         [1A
[2K[1A[37m⠙[0m [2mPreparing packages...[0m (0/1)--------------[0m[0m 46.91 KiB/33.43 MiB         [1A
[2K[1A[37m⠙[0m [2mPreparing packages...[0m (0/1)--------------[0m[0m 62.91 KiB/33.43 MiB         [1A
[2K[1A[37m⠙[0m [2mPreparing packages...[0m (0/1)--------------[0m[0m 78.91 KiB/33.43 MiB         [1A
[2K[1A[37m⠙[0m [2mPrepar

In [28]:
import polars as pl

lazy_df = pl.scan_csv(
    file_path,
    has_header=False,
    # separator="\t",
    infer_schema_length=0,  # 全檔推斷欄位型別
    # row_count_name=None,  # 關閉自動 row index (deprecated)
    row_index_name=None,
)
result = lazy_df.collect(  # 隨便示範
    # streaming=True # (deprecated)
    engine="streaming",
)  # 真的要 materialize 時才讀
result

column_1
str
"""0050 083004446448T 00000000…"
"""0050 083009462008T 00000000…"
"""0050 083014478574T 00000000…"
"""0050 083019493222T 00000000…"
"""0050 083024508770T 00000000…"
…
"""9958 132937733590T 01845000…"
"""9958 132947767669T 01845000…"
"""9958 132952784230T 01845000…"
"""9958 132957801848T 01845000…"
