In [1]:
import polars as pl

import warnings
import os
# Run the notebook from the project root so the relative "data/..." paths
# used below resolve. The root can be overridden with the FINABYSS_ROOT
# environment variable; when it is unset the original location is used.
os.chdir(os.environ.get("FINABYSS_ROOT", '/home/denisalpino/dev/FinABYSS'))

In [2]:
warnings.simplefilter("ignore", Warning)

In [3]:
# Parse the ISO-8601 strings in "datetime" into timezone-aware (UTC) datetimes.
# The expression keeps the name of its source column, so it replaces
# "datetime" in place.
parsed_datetime = pl.col("datetime").str.to_datetime(
    format="%Y-%m-%dT%H:%M:%S%.fZ", time_zone="UTC"
)

# Lazy query: nothing is read or computed until the plan is collected.
plan = (
    pl.scan_parquet("data/articles.parquet")
    .with_columns(parsed_datetime)
    .unique()  # drop fully duplicated rows
    .sort("datetime")  # order by datetime, ascending
)

In [4]:
# The Config context manager restores the previous polars settings on exit,
# so the chunk size below only applies to this collect call.
with pl.Config() as polars_cfg:
    # process the query in chunks of 50,000 rows to save memory
    polars_cfg.set_streaming_chunk_size(50_000)
    df = plan.collect(streaming=True)

In [5]:
# Persist the collected frame as Parquet, delegating the write to pyarrow.
df.write_parquet(
    file="data/articles_preprocessed.parquet",
    use_pyarrow=True,
)

In [6]:
# In-memory footprint of the frame as reported by polars, in GB.
estimated_size = df.estimated_size(unit="gb")
# Size of the written Parquet file in bytes; converted to GB when formatted.
actual_size = os.path.getsize("data/articles_preprocessed.parquet")

size_report = "\n".join(
    [
        f"Estimated size in memory: {estimated_size:.2f} GB",
        f"Real size on disk: {actual_size / 1024**3:.2f} GB",
    ]
)
print(size_report)

Estimated size in memory: 6.40 GB
Real size on disk: 2.19 GB
