In [1]:
import polars as pl
import numpy as np


# Function to load the smoker dataset and add derived columns
def transform_data(n_rows: int) -> pl.DataFrame:
    """
    Loads the smoker dataset from CSV and adds derived columns.

    Two columns are appended to the loaded data:
        - "bmi": weight_kg / (height_cm / 100) ** 2
        - "age_group": "Young" when age < 30, "Middle-aged" when age < 60,
          otherwise "Senior"

    Parameters:
        n_rows (int): Currently unused; kept so existing callers keep working.
            The number of rows returned is whatever the CSV file contains.

    Returns:
        pl.DataFrame: The loaded data with the "bmi" and "age_group" columns added.
    """
    # NOTE(review): machine-specific absolute path; consider making it configurable.
    csv_path = "C:/Users/gilnr/OneDrive/Ambiente de Trabalho/ITC Contract/GitHub/Dagster_Scaffold/my-dagster-project/data/SFTP/smoker_data.csv"
    df = pl.read_csv(csv_path)

    # Transformation: Adding BMI (Body Mass Index) column; height is converted
    # from centimetres to metres before squaring.
    bmi = (pl.col("weight_kg") / (pl.col("height_cm") / 100) ** 2).alias("bmi")

    # Transformation: Adding a category for age groups.
    # The labels are wrapped in pl.lit because bare strings passed to
    # then()/otherwise() are parsed as column names, not string values.
    age_group = (
        pl.when(pl.col("age") < 30)
        .then(pl.lit("Young"))
        .when(pl.col("age") < 60)
        .then(pl.lit("Middle-aged"))
        .otherwise(pl.lit("Senior"))
        .alias("age_group")
    )

    return df.with_columns(bmi, age_group)


# Function to build the lookup table that maps numeric codes to labels
def generate_relation_table() -> pl.DataFrame:
    """
    Builds the four-row lookup table pairing each code with its description.

    Returns:
        pl.DataFrame: A Polars DataFrame with a "code" column (1 to 4) and a
        "description" column ("Low", "Medium", "High", "Very High").
    """
    codes = [1, 2, 3, 4]
    descriptions = ["Low", "Medium", "High", "Very High"]

    relation_table = pl.DataFrame({"code": codes, "description": descriptions})
    return relation_table


# Function to perform transformations based on a relation table
def map_code_to_description(
    df: pl.DataFrame, relation_table: pl.DataFrame
) -> pl.DataFrame:
    """
    Assigns a random risk code to every row and joins in its description.

    A "risk_code" column is added with values drawn at random from
    [1, 2, 3, 4] (one per row of df), then relation_table is left-joined on
    risk_code == code so the matching columns of relation_table are appended.
    The result is not reproducible unless the NumPy global random state is
    seeded by the caller.

    Parameters:
        df (pl.DataFrame): The original DataFrame to be transformed.
        relation_table (pl.DataFrame): A mapping DataFrame (relation table)
            that must contain a "code" column.

    Returns:
        pl.DataFrame: df with the "risk_code" column and the joined columns
        from relation_table.
    """
    # Add a random code column to the original DataFrame.
    # np.random.choice returns a NumPy array (which has no .alias method), so
    # wrap it in a named Series before attaching it.
    random_codes = np.random.choice([1, 2, 3, 4], df.height)
    risk_code = pl.Series("risk_code", random_codes)

    # NumPy's default integer width is platform dependent; align the key dtype
    # with the relation table so the join keys always match.
    risk_code = risk_code.cast(relation_table["code"].dtype)

    df = df.with_columns(risk_code)

    # Join the relation table based on the 'risk_code' to add descriptions
    return df.join(relation_table, left_on="risk_code", right_on="code", how="left")

In [2]:
# Build the code-to-description relation table and export it to Excel.
df = generate_relation_table()
# The workbook is written relative to the current working directory.
df.write_excel("hight_metadata.xlsx")

<xlsxwriter.workbook.Workbook at 0x15196af6640>