In [1]:
%load_ext watermark
%watermark -v -p numpy,pandas,polars,omegaconf --conda

Python implementation: CPython
Python version       : 3.11.8
IPython version      : 8.22.2

numpy    : 1.26.4
pandas   : 2.2.1
polars   : 0.20.18
omegaconf: 2.3.0

conda environment: torch_p11



In [2]:
# Built-in library
from pathlib import Path
import re
import json
from typing import Any, Optional, Union
import logging
import warnings

# Standard imports
import numpy as np
import numpy.typing as npt
from pprint import pprint
import pandas as pd
import polars as pl
from rich.console import Console
from rich.theme import Theme

# Rich console whose "info" / "warning" / "error" style names map to fixed hex colours.
custom_theme = Theme(
    styles=dict(info="#76FF7B", warning="#FBDDFE", error="#FF0000"),
)
console = Console(theme=custom_theme)

# Visualization
import matplotlib.pyplot as plt

# NumPy: print array values with 4 digits of precision.
np.set_printoptions(precision=4)

# Pandas: raise the display limits for rows, columns and column width.
pd.set_option("display.max_rows", 1_000)
pd.set_option("display.max_columns", 1_000)
pd.set_option("display.max_colwidth", 600)

# Polars: raise the displayed string length and the number of table columns shown.
pl.Config.set_fmt_str_lengths(1_000)
pl.Config.set_tbl_cols(1_000)

# Silence every warning for the remainder of the session.
warnings.filterwarnings(action="ignore")


# Black code formatter (Optional)
%load_ext lab_black

# auto reload imports
%load_ext autoreload
%autoreload 2

In [None]:
from glob import glob


def create_id_text_mapping(filepath: str) -> pl.DataFrame:
    """
    Create a mapping DataFrame from a Parquet file.

    The file is scanned lazily, the ``tags`` column is joined into a single
    lower-cased string, and only rows whose tags contain ``salary`` or
    ``gigworker`` (case-insensitively) are kept. The surviving rows are
    rendered as ``"<date> | <description> | <amount> \\n"`` lines and returned
    together with the file's id and label.

    Parameters
    ----------
    filepath : str
        The path to the Parquet file. The file must provide the columns
        ``tags``, ``date``, ``description``, ``amount`` and either ``id`` or
        ``analysisId`` (the latter is renamed to ``id``).

    Returns
    -------
    pl.DataFrame
        A DataFrame with the columns 'id' (str), 'text' (list of str) and
        'label' (str). It has shape (1, 3) when at least one row matches and
        shape (0, 3) when no row matches.

    Notes
    -----
    The label of a row is the sorted, de-duplicated concatenation of the
    keywords found in its tags, so a row carrying both keywords is always
    labelled ``"gigworkersalary"``.

    If the matching rows contain more than one distinct id (or label), the
    returned value is the string form of the NumPy array holding all of them,
    in order of first appearance.
    NOTE(review): the code appears to assume one id and one label per file;
    confirm against the data.
    """
    pattern: str = r"salary|gigworker"
    delimiter: str = "|"

    df: pl.DataFrame = (
        pl.scan_parquet(filepath)
        .with_columns(
            # Lower-case once so the filter and the label extraction below
            # both see the same text (mixed-case tags used to pass the filter
            # but yield an empty label).
            tags=pl.col("tags")
            .map_elements(lambda x: "".join(x), return_dtype=pl.String)
            .str.to_lowercase()
        )
        .filter(pl.col("tags").str.contains(pattern))
        .with_columns(
            label=pl.col("tags")
            .str.extract_all(pattern)
            # sorted() makes the label independent of set iteration order.
            .map_elements(
                lambda x: "".join(sorted(set(x))), return_dtype=pl.String
            )
        )
        .drop("tags")
        .collect()
    )

    # Normalise the id column name without swallowing unrelated errors.
    if "analysisId" in df.columns and "id" not in df.columns:
        df = df.rename({"analysisId": "id"})

    # No matching rows: return an empty, correctly typed frame instead of a
    # row whose id and label are the string "[]".
    if df.is_empty():
        return pl.DataFrame(
            schema={"id": pl.String, "text": pl.List(pl.String), "label": pl.String}
        )

    # maintain_order keeps the groups in order of first appearance so the
    # generated text is reproducible between runs.
    df_grpby: pl.DataFrame = df.group_by("id", maintain_order=True).agg(
        text=(pl.struct(["date", "description", "amount"]))
    )
    body: list[str] = [
        f"{entry['date']} {delimiter} {entry['description']} {delimiter} {entry['amount']} \n"
        for entries in df_grpby.get_column("text").to_list()
        for entry in entries
    ]

    id_value: str = str(
        df.select("id").unique(maintain_order=True).to_numpy().squeeze()
    )
    label: str = str(
        df.select("label").unique(maintain_order=True).to_numpy().squeeze()
    )
    data: dict[str, Any] = {"id": id_value, "text": body, "label": label}
    mapping_df: pl.DataFrame = pl.DataFrame([data])

    return mapping_df




def create_dataset(
    filepath: str = "./data/*.parquet", output_path: str | None = None
) -> pl.DataFrame:
    """
    Create a dataset by combining multiple Parquet files and save as JSONL.

    Every file matched by ``filepath`` is passed to ``create_id_text_mapping``
    and the resulting frames are stacked vertically. Files are processed in
    sorted path order so the output is reproducible between runs.

    Parameters
    ----------
    filepath : str, optional
        Glob pattern for input Parquet files, by default "./data/*.parquet"
    output_path : str | None, optional
        Path to save the output JSONL file, by default None

    Returns
    -------
    pl.DataFrame
        Combined DataFrame from all input files. When no file matches the
        pattern, an empty DataFrame without columns is returned.

    Notes
    -----
    If output_path is None, the function will use "output.jsonl" as default.
    The output file is written even when no input file matches.
    """
    # glob() returns paths in arbitrary filesystem order; sort for determinism.
    files: list[str] = sorted(glob(filepath))

    # Build every per-file frame first and concatenate once, rather than
    # growing a frame inside the loop.
    frames: list[pl.DataFrame] = [create_id_text_mapping(filepath=f) for f in files]

    # pl.concat raises on an empty list, so fall back to an empty frame.
    all_df: pl.DataFrame = (
        pl.concat(frames, how="vertical") if frames else pl.DataFrame()
    )

    # Convert the DataFrame to JSONL
    if output_path is None:
        output_path = "output.jsonl"
    all_df.write_ndjson(output_path)
    return all_df