# Postprocess Units

#### Imports

In [None]:
import numpy as np
import pandas as pd
import re
import yaml

from datetime import datetime
from pathlib import Path
from pint import UnitRegistry

### 0. Specify Data Directories

In [None]:
# The project root is the parent of the current working directory; the
# pipeline configuration lives in configs/config.yaml below it.
project_root = Path.cwd().parents[0]
with (project_root / "configs" / "config.yaml").open("r") as f:
    config = yaml.safe_load(f)

In [None]:
results_dir = project_root / config["results_dir"]

# Candidate run directories: every sub-directory whose name contains an
# underscore (no stricter timestamp validation is performed here).
subdirs = [d for d in results_dir.iterdir() if d.is_dir() and "_" in d.name]

# Fail with an explicit message instead of the opaque
# "max() arg is an empty sequence" when no run has been produced yet.
if not subdirs:
    raise FileNotFoundError(
        f"No results subdirectory containing '_' found in {results_dir}"
    )

# Pick the directory whose name sorts last.
# NOTE(review): assumes directory names sort chronologically — confirm.
latest_subdir = max(subdirs, key=lambda d: d.name)

In [None]:
# Input and output files inside the selected results directory
raw_results_file = latest_subdir.joinpath("numerical_results.csv")
postprocessed_results_file = latest_subdir.joinpath("postprocessed_results.csv")

# Unit definitions plus the lookup tables stored in the datasets directory
unit_registry_file = project_root.joinpath("configs", "default_en.txt")
currency_mapping_file = project_root.joinpath(config["datasets_dir"], "fed_rates_yearly.csv")
indicator_metadata_file = project_root.joinpath(config["datasets_dir"], "indicator_metadata.csv")

### 1. Add tCO2e, Currencies and Special Units to Pint Unit Registry

In [None]:
# Build the registry from the project's unit definition file; unit names are
# looked up case-insensitively.
ureg = UnitRegistry(unit_registry_file, case_sensitive=False)

# pint definition strings follow
#     <name> = <relation> [= <symbol>] [= <alias> ...]
# Only the FIRST expression after the name is a relation; everything after it
# is registered as symbol/alias text. Definitions below therefore carry exactly
# one relation each (earlier versions appended a second "= 1e3 * ..." relation,
# which pint stored as a meaningless symbol string).

# Map "ton" to the metric tonne
ureg.define("ton = tonne")
ureg.define("kt = ton * 1e3")
ureg.define("metric_ton = 1e3 * kilogram = t = tonne = metricton")

# CO2e emission units, each step a factor of 1e3 above the previous one
ureg.define("gCO2e = [mass]")
ureg.define("kgCO2e = 1e3 * gCO2e")
ureg.define("tCO2e = 1e3 * kgCO2e")
ureg.define("ktCO2e = 1e3 * tCO2e")
ureg.define("MtCO2e = 1e3 * ktCO2e")
ureg.define("GtCO2e = 1e3 * MtCO2e")

# Currency base unit; every monetary amount is expressed in USD
ureg.define("USD = [currency] = $ = usd")

# Energy per monetary unit, watt-hour based
ureg.define("Wh_per_USD = watt_hour / USD")
ureg.define("kWh_per_USD = 1e3 * Wh_per_USD")
ureg.define("MWh_per_USD = 1e6 * Wh_per_USD")

# Energy per monetary unit, joule based
ureg.define("joule_per_USD = joule / USD = J_per_USD")
ureg.define("kJ_per_USD = 1e3 * J_per_USD")
ureg.define("MJ_per_USD = 1e6 * J_per_USD")
ureg.define("GJ_per_USD = 1e9 * J_per_USD")

# Plain energy units used as conversion targets
ureg.define("TJ = 1e12 * joule")
ureg.define("MWh = 1e6 * watt_hour")

# Emissions per monetary unit
ureg.define("kgCO2e_per_USD = kgCO2e / USD")
ureg.define("tCO2e_per_USD = tCO2e / USD")
ureg.define("ktCO2e_per_USD = ktCO2e / USD")
ureg.define("MtCO2e_per_USD = MtCO2e / USD")
ureg.define("GtCO2e_per_USD = GtCO2e / USD")

# Volume and volume consumed per monetary unit
ureg.define("cubic_meter = meter ** 3 = m³ = m3")
ureg.define("cubic_meter_per_USD = cubic_meter / USD")

# Area
ureg.define("square_meter = meter ** 2 = m² = m2")
ureg.define("are = 100 * square_meter")
ureg.define("square_kilometer = kilometer ** 2 = km² = km2")

### 2. Load Dataframe, Set Fixed Type Units, Remove Spaces and Split Fractions by "/"

In [None]:
# Columns of the raw results file that are not needed for unit postprocessing
columns_to_exclude = [
    "srn_compliance_item_id",
    "retrieved_context",
    "source_documents",
    "page_numbers",
    "retrieval_time",
    "extraction_time",
    "num_tokens",
]

In [None]:
# Load the raw results, skipping every column listed in columns_to_exclude
df_raw = pd.read_csv(
    raw_results_file,
    usecols=lambda column_name: column_name not in columns_to_exclude,
)

In [None]:
# Load the indicator metadata; its "id" and "type_standard" columns are used
# below to assign an expected type to every data point.
df_types = pd.read_csv(indicator_metadata_file)

In [None]:
# Work on a copy so df_raw stays unchanged
df = df_raw.copy()

In [None]:
# Types whose unit is discarded
units_to_drop_list = ["integer", "decimal", "date"]

# Types whose value is converted to float
float_list = ["integer", "decimal", "percent"]

# Types for which a value without a unit is rejected
not_dimensionless_list = [
    "years",
    "USD",
    "tCO2e",
    "MWh",
    "energy consumed per monetary unit (USD)",
    "CO2 equivalent emissions per monetary unit (USD)",
    "USD (with indication of whether the amount is increased (+) or decreased (-))",
    "tonnes",
    "m3",
    "volume consumed per monetary unit (USD)",
    "hectares",
    "hours",
]

In [None]:
def is_valid_date(date):
    """Return True when ``str(date)`` parses with the format ``%Y-%m-%d``.

    Any input is accepted; it is converted to a string first, so values such
    as ``None`` or NaN simply fail to parse and yield False.
    """
    try:
        datetime.strptime(str(date), "%Y-%m-%d")
    except ValueError:
        return False
    return True
    
def update_date_row(row):
    """Invalidate a row whose "value" is not a valid date.

    Rows with a parseable date are returned untouched; otherwise "value" is
    set to None and "model_output_valid" to False. The (possibly modified)
    row is returned.
    """
    if is_valid_date(row["value"]):
        return row
    row["model_output_valid"] = False
    row["value"] = None
    return row

def transform_to_float(row):
    """Convert ``row["value"]`` to float in place and return the row.

    When the conversion raises ValueError or TypeError the value becomes NaN
    and "model_output_valid" is set to False.
    """
    raw_value = row["value"]
    try:
        row["value"] = float(raw_value)
    except (ValueError, TypeError):
        # Not a number: blank the value and flag the row
        row["value"], row["model_output_valid"] = np.nan, False
    return row

In [None]:
# Step 1: Create a mapping from `id` to `type_standard` for faster lookups
id_to_type = df_types.set_index("id")["type_standard"]

# Step 2: Map `data_point_id` to their corresponding `type_standard` in the main DataFrame
df["type_standard"] = df["data_point_id"].map(id_to_type)

# Step 3: Types listed in `units_to_drop_list` never carry a unit
df.loc[df["type_standard"].isin(units_to_drop_list), "unit"] = None

# Step 4: Reject rows which should have a unit but don't have one
df.loc[df["type_standard"].isin(not_dimensionless_list) & (df["value"].notna()) & (df["unit"].isna()), ["value", "model_output_valid"]] = None, False

# Step 5: Convert to float, otherwise set value to NaN and valid to False
float_mask = df["type_standard"].isin(float_list)
df.loc[float_mask] = df.loc[float_mask].apply(transform_to_float, axis=1)

# Step 6: Handle rows where `type_standard` is "percent"
percent_mask = df["type_standard"] == "percent"
# A percent value without any unit is scaled by 100
df.loc[percent_mask & (df["unit"].isna()), "value"] *= 100
# Missing units and the common spellings are normalised to "percent"
df.loc[percent_mask & (df["unit"].isna() | df["unit"].str.lower().isin(["%", "percent", "per cent"])), "unit"] = "percent"
# Stringify only the values that are present: converting NaN to the string
# "nan" would hide missing values from the null check in Step 8.
percent_value_mask = percent_mask & df["value"].notna()
df.loc[percent_value_mask, "value"] = df.loc[percent_value_mask, "value"].astype(str)

# Step 7: Handle rows where `type_standard` is date
date_mask = df["type_standard"] == "date"
df.loc[date_mask] = df.loc[date_mask].apply(update_date_row, axis=1)

# Step 8: A row without a value must not keep a unit
df.loc[df["value"].isnull(), "unit"] = None

# Step 9: Set aside the rows whose type is in `units_to_drop_list`.
# Despite its name this is a DataFrame, not a boolean mask. It is copied
# because new columns are added to it afterwards.
dimensionless_mask = df.loc[df["type_standard"].isin(units_to_drop_list)].copy()

In [None]:
# Rows set aside as dimensionless need no conversion: their final value and
# unit are the cleaned ones. `assign` builds a new DataFrame, which avoids
# writing into what may be a slice of `df`.
dimensionless_mask = dimensionless_mask.assign(
    value_final=dimensionless_mask["value"],
    unit_final=dimensionless_mask["unit"],
)

In [None]:
# Keep only the rows whose type is not in `units_to_drop_list`; the others
# were set aside earlier and are concatenated back at the end.
# The explicit copy makes `df` an independent DataFrame, because the
# following cells add columns to it.
df = df.loc[~df["type_standard"].isin(units_to_drop_list)].copy()

In [None]:
# Strip all whitespace from the unit string, then split it on "/" so that
# the numerator and denominator parts can be handled separately
whitespace_free_units = df["unit"].str.replace(r'\s+', '', regex=True)
df["unit_preprocessed"] = whitespace_free_units.str.split("/")

In [None]:
def _fraction_part(parts, position):
    """Return ``parts[position]`` if ``parts`` is a list that long, else None."""
    if isinstance(parts, list) and len(parts) > position:
        return parts[position]
    return None


# First split element is the numerator, second (if any) the denominator
df['numerator'] = df['unit_preprocessed'].apply(_fraction_part, args=(0,))
df['denominator'] = df['unit_preprocessed'].apply(_fraction_part, args=(1,))

### 3. Identify Multipliers with Regex

("billion", "million", etc.)

In [None]:
def extract_multiplier_and_unit(unit):
    """Split a written-out magnitude word off a unit string.

    Looks for "hundred", "thousand", "million", "billion" or "trillion"
    (optionally followed by "s" or "sof") anywhere in ``unit``, ignoring case.

    Returns a ``(multiplier, base_unit)`` tuple. For the first magnitude word
    found, ``multiplier`` is its numeric factor and ``base_unit`` is ``unit``
    with every occurrence of that word removed and surrounding whitespace
    stripped. Without a match, or when ``unit`` is not a string, the result
    is ``(1, unit)``.
    """
    if not isinstance(unit, str):
        return (1, unit)

    magnitudes = (
        ('hundred', 1e2),
        ('thousand', 1e3),
        ('million', 1e6),
        ('billion', 1e9),
        ('trillion', 1e12),
    )
    # Longest spelling first so that e.g. "millionsof" wins over "million"
    endings = ('sof', 's', '')

    lowered = unit.lower()
    for stem, factor in magnitudes:
        for ending in endings:
            word = stem + ending
            if word in lowered:
                remainder = re.sub(word, '', unit, flags=re.IGNORECASE).strip()
                return (factor, remainder)

    return (1, unit)

In [None]:
# Split magnitude words off the numerator and the denominator. Each side gets
# a "multiplier_<side>" and a "unit_<side>" column.
for fraction_side in ('numerator', 'denominator'):
    multiplier_unit_pairs = df[fraction_side].apply(extract_multiplier_and_unit)
    df[f'multiplier_{fraction_side}'] = multiplier_unit_pairs.apply(lambda pair: pair[0])
    df[f'unit_{fraction_side}'] = multiplier_unit_pairs.apply(lambda pair: pair[1])

### 4. Unify Strings (including suffix-multipliers for currencies)

("tonnes of CO2 equivalents", "€m", "eurM" etc.)

In [None]:
# "CO2" (also written with ² or ₂), an optional "e"/"eq"/"equivalent" suffix
# with optional plural "s", and an optional trailing dot.
_CO2_SUFFIX = r'CO(?:2|²|₂)(?:e(?:q(?:uivalent)?)?s?)?\.?\b'


def _co2e_mass_pattern(spelled_out, abbreviation):
    """Compile a pattern for one tonne-scale CO2e unit.

    Matches either the spelled-out mass word (optionally preceded by "metric"
    and followed by "of") or the bare abbreviation, directly followed by the
    CO2 suffix.
    """
    return re.compile(
        rf'\b(?:(?:metric\s?)?{spelled_out}\s?(?:of\s?)?|{abbreviation}){_CO2_SUFFIX}',
        re.IGNORECASE,
    )


# Checked in this order; the first matching pattern decides the unit.
_CO2E_UNIT_PATTERNS = (
    ('GtCO2e', _co2e_mass_pattern(r'gigaton(?:ne)?s?', 'Gt')),
    ('MtCO2e', _co2e_mass_pattern(r'megaton(?:ne)?s?', 'Mt')),
    ('ktCO2e', _co2e_mass_pattern(r'kiloton(?:ne)?s?', 'kt')),
    ('tCO2e', _co2e_mass_pattern(r'ton(?:ne)?s?', 't')),
    (
        'kgCO2e',
        re.compile(
            rf'\b(?:kg|kilograms?|kilos?)\s?(?:of\s?)?{_CO2_SUFFIX}',
            re.IGNORECASE,
        ),
    ),
)

# Magnitude abbreviations that may precede or follow a currency
_CURRENCY_MULTIPLIERS = {
    'k': 1e3,
    'm': 1e6,
    'mio': 1e6,
    'mio.': 1e6,
    'mn': 1e6,
    'b': 1e9,
    'bn': 1e9,
    't': 1e12,
    'tn': 1e12,
}

# Whole-string match: optional prefix, currency, optional suffix, no digits
_CURRENCY_PATTERN = re.compile(
    r'^(?!.*\d)(k|m|mio|mio\.|mn|b|bn|t|tn)?(EUR|€|USD|\$|GBP|£|BRL|CAD|CNY|DKK|NOK|SEK|SGD|CHF|PLN)(k|m|mio|mio\.|mn|b|bn|t|tn)?$',
    re.IGNORECASE,
)

_CURRENCY_SYMBOLS = {
    '€': 'EUR',
    '$': 'USD',
    '£': 'GBP',
}


def unify_units(unit):
    """Map a free-text unit onto a canonical unit name plus a multiplier.

    Recognised inputs, checked in this order:

    * CO2-equivalent masses (giga-, mega-, kilo-tonnes, tonnes, kilograms in
      spelled-out or abbreviated form) -> ``GtCO2e`` ... ``kgCO2e``. The
      kilogram spellings accept the same "eq"/"eq." suffixes as the
      tonne-scale ones (e.g. "kgCO2eq").
    * cubic / square metre spellings -> ``cubic_meter`` / ``square_meter``
    * a currency code or symbol with an optional magnitude prefix and/or
      suffix (e.g. "€m", "kEUR", "USDbn") -> upper-case currency code and the
      product of both magnitude factors

    Returns:
        ``(unified_unit, multiplier)``. The multiplier is 1 except for
        currencies with a magnitude prefix/suffix. Non-string input and
        unrecognised units are returned unchanged with multiplier 1.
    """
    if not isinstance(unit, str):
        return unit, 1

    for canonical_unit, pattern in _CO2E_UNIT_PATTERNS:
        if pattern.search(unit):
            return canonical_unit, 1

    lowered = unit.lower()
    if lowered in ("m3", "m³", "cubicmeter", "cubicmeters"):
        return 'cubic_meter', 1
    if lowered in ("m2", "m²", "squaremeter", "squaremeters"):
        return 'square_meter', 1

    match = _CURRENCY_PATTERN.search(unit)
    if match is None:
        # Not a known pattern: hand the unit back untouched
        return unit, 1

    prefix, currency, suffix = match.groups()
    # Symbols become ISO codes; codes are upper-cased
    currency = _CURRENCY_SYMBOLS.get(currency, currency).upper()
    # A missing prefix/suffix looks up "" and falls back to factor 1
    prefix_multiplier = _CURRENCY_MULTIPLIERS.get((prefix or "").lower(), 1)
    suffix_multiplier = _CURRENCY_MULTIPLIERS.get((suffix or "").lower(), 1)
    return currency, prefix_multiplier * suffix_multiplier

In [None]:
# Unify the numerator and denominator units. Each side gets the unified unit
# name ("unit_<side>_regex") and the multiplier returned by unify_units
# ("suffix_multiplier_<side>").
for fraction_side in ('numerator', 'denominator'):
    unified_pairs = df[f'unit_{fraction_side}'].apply(unify_units)
    df[f'unit_{fraction_side}_regex'] = unified_pairs.apply(lambda pair: pair[0])
    df[f'suffix_multiplier_{fraction_side}'] = unified_pairs.apply(lambda pair: pair[1])

#### Drop Redundant Columns

In [None]:
# Intermediate parsing columns are no longer needed once the unified
# "*_regex" and multiplier columns exist
redundant_columns = [
    'unit_preprocessed',
    'numerator',
    'denominator',
    'unit_numerator',
    'unit_denominator',
]
df.drop(columns=redundant_columns, inplace=True)

### 5. Transform into Pint Quantities and Apply Multipliers

#### Convert to USD

In [None]:
# Load the semicolon-separated currency conversion table
df_currency = pd.read_csv(currency_mapping_file, sep=";")

In [None]:
# "Time Period" becomes "year"; every other header loses its "/USD" part so
# that only the remaining text (e.g. the currency code) is left
df_currency.rename(
    columns=lambda header: "year" if header == "Time Period" else header.replace("/USD", ""),
    inplace=True,
)

In [None]:
def convert_to_usd(row):
    """Re-express currency units of a row in USD.

    For the numerator and the denominator alike: when the unified unit is one
    of the currency columns of ``df_currency``, the matching multiplier is
    multiplied by that currency's value for ``row["year"]`` and the unit is
    replaced by ``"USD"``.

    The ``"year"`` column of ``df_currency`` is never treated as a currency.
    The denominator check previously included it, so a unit such as
    "MWh/year" was scaled by the year number and relabelled as USD.

    Raises:
        IndexError: if ``df_currency`` has no entry for ``row["year"]`` and
            a conversion is required.

    Returns:
        The (possibly modified) row.
    """
    year = row["year"]
    currency_columns = [col for col in df_currency.columns if col != "year"]

    for side in ("numerator", "denominator"):
        unit_key = f"unit_{side}_regex"
        multiplier_key = f"multiplier_{side}"
        currency = row[unit_key]
        if currency in currency_columns:
            rate = df_currency[currency].loc[df_currency["year"] == year].values[0]
            row[multiplier_key] *= rate
            row[unit_key] = 'USD'

    return row

In [None]:
# Convert currency units to USD row by row; the result is a new DataFrame
df_converted = df.apply(convert_to_usd, axis=1)

#### Concatenate numerator and denominator

In [None]:
def combine_units(row):
    """Build the combined unit name used for the "unit_regex" column.

    Joins the unified numerator and denominator as ``<num>_per_<den>``. With
    only a numerator the numerator is returned, with only a denominator
    ``per_<den>``, and with neither ``None``.
    """
    numerator = row['unit_numerator_regex']
    denominator = row['unit_denominator_regex']
    has_numerator = not pd.isna(numerator)
    has_denominator = not pd.isna(denominator)

    if has_numerator and has_denominator:
        return f"{numerator}_per_{denominator}"
    if has_numerator:
        return numerator
    if has_denominator:
        return f"per_{denominator}"
    return None

In [None]:
# Combined unit name per row (numerator and denominator joined by "_per_")
df_converted['unit_regex'] = df_converted.apply(combine_units, axis=1)

In [None]:
def transform_into_pint(unit):
    """Parse a unit name with the unit registry.

    Missing values (as judged by ``pd.isna``) are returned unchanged. If
    calling ``ureg`` on the unit raises any exception, the sentinel string
    ``"unification_error"`` is returned instead.
    """
    if pd.isna(unit):
        return unit
    try:
        parsed = ureg(unit)
    except Exception:
        return "unification_error"
    return parsed

In [None]:
# Parse every combined unit name with the registry; names that cannot be
# parsed yield the string "unification_error"
df_converted['unit_pint'] = df_converted['unit_regex'].apply(transform_into_pint)

In [None]:
# Rows whose unit could not be parsed carry the sentinel string
# "unification_error": clear their unit and mark the model output as invalid
error_mask = df_converted["unit_pint"] == "unification_error"
df_converted.loc[error_mask, "unit_pint"] = None
df_converted.loc[error_mask, "model_output_valid"] = False

### 6. Convert Value + Unit Pairs to Standard EFRAG Units

- If pint unit: convert with pint
- If NOT pint unit: set to NaN and set model_output_valid to False

#### Convert Value to Float except for Dates

In [None]:
# Mask of rows whose type_standard is not 'date'
not_date_mask = df_converted['type_standard'] != 'date'

# Remember which values were already missing, so that they are not reported
# as conversion errors below
original_nan_mask = df_converted['value'].isna()

# Convert 'value' to a number for non-date rows; anything unparsable becomes NaN
df_converted.loc[not_date_mask, 'value_numeric'] = pd.to_numeric(df_converted.loc[not_date_mask, 'value'], errors='coerce')

# A conversion error is a non-date row whose value was present but did not
# parse; only those rows are marked invalid
conversion_error_mask = df_converted['value_numeric'].isna() & ~original_nan_mask & not_date_mask
df_converted.loc[conversion_error_mask, 'model_output_valid'] = False

In [None]:
# Scale the numeric value of every non-date row: multiply by both numerator
# multipliers, then divide by both denominator multipliers (in this order)
non_date_rows = df_converted.loc[not_date_mask]
scaled_value = non_date_rows['value_numeric']
for factor_column in ('multiplier_numerator', 'suffix_multiplier_numerator'):
    scaled_value = scaled_value * non_date_rows[factor_column]
for divisor_column in ('multiplier_denominator', 'suffix_multiplier_denominator'):
    scaled_value = scaled_value / non_date_rows[divisor_column]
df_converted.loc[not_date_mask, 'value_final'] = scaled_value

In [None]:
# The multipliers are folded into value_final, so their columns can go
multiplier_columns = [
    'multiplier_numerator',
    'suffix_multiplier_numerator',
    'multiplier_denominator',
    'suffix_multiplier_denominator',
]
df_converted.drop(columns=multiplier_columns, inplace=True)

In [None]:
# Rows eligible for a pint quantity: unit_pint is present and the type is not 'date'
valid_rows = df_converted['unit_pint'].notna() & (df_converted['type_standard'] != 'date')
df_filtered = df_converted[valid_rows].copy()  # independent copy of the eligible rows

# Multiply the scaled value with the parsed unit, element by element
# (unit_pint holds the objects returned by the registry call)
df_filtered["quantity_pint"] = df_filtered['value_final'] * df_filtered['unit_pint']

# Write the quantities back; rows outside valid_rows get no quantity_pint value
df_converted.loc[valid_rows, "quantity_pint"] = df_filtered["quantity_pint"]

In [None]:
# Target unit for every indicator type; None marks the types that have no
# target unit
type_unit_dict = {
    **dict.fromkeys(('date', 'integer', 'decimal')),
    'years': ureg.year,
    'percent': ureg.percent,
    'USD': ureg.USD,
    'USD (with indication of whether the amount is increased (+) or decreased (-))': ureg.USD,
    'tCO2e': ureg.tCO2e,
    'MWh': ureg.MWh,
    'tonnes': ureg.metric_ton,
    'm3': ureg.cubic_meter,
    'hectares': ureg.hectares,
    'hours': ureg.hours,
    'energy consumed per monetary unit (USD)': ureg.MWh_per_USD,
    'CO2 equivalent emissions per monetary unit (USD)': ureg.tCO2e_per_USD,
    'volume consumed per monetary unit (USD)': ureg.cubic_meter_per_USD,
}

In [None]:
def convert_to_target(row):
    """Convert a row's quantity to the target unit of its type.

    Returns a ``(quantity, model_output_valid)`` tuple:

    * missing quantity or type 'date': the quantity and the row's current
      "model_output_valid" flag, both unchanged
    * successful conversion: the converted quantity and ``True``
    * any exception (unknown type, failing conversion, ...): the error is
      printed and ``(None, False)`` is returned

    NOTE(review): a successful conversion returns True even if the row was
    flagged invalid earlier — confirm that resetting the flag is intended.
    """
    quantity = row['quantity_pint']
    nothing_to_convert = pd.isna(quantity) or row['type_standard'] == 'date'

    if not nothing_to_convert:
        try:
            target_unit = type_unit_dict[row['type_standard']]
            return quantity.to(target_unit), True
        except Exception as e:
            print(f"Conversion error for row {row.name}: {e}")
            return None, False

    return quantity, row["model_output_valid"]

In [None]:
# Convert every row to its target unit and store the converted quantity
# together with the resulting validity flag
conversion_results = df_converted.apply(convert_to_target, axis=1).tolist()
df_converted[['quantity_converted', 'model_output_valid']] = pd.DataFrame(
    conversion_results,
    index=df_converted.index,
)

In [None]:
def extract_value_and_unit(quantity):
    """Split a converted quantity into a ``(value, unit)`` pair.

    * registry Quantity -> ``(magnitude, str(units))``
    * string            -> ``(string, None)``
    * missing value     -> ``(None, None)``

    Raises:
        ValueError: for any other input type.
    """
    if isinstance(quantity, ureg.Quantity):
        value_and_unit = (quantity.magnitude, str(quantity.units))
    elif isinstance(quantity, str):
        value_and_unit = (quantity, None)
    elif pd.isna(quantity):
        value_and_unit = (None, None)
    else:
        raise ValueError(f"Unexpected type in quantity_pint: {type(quantity)}")
    return value_and_unit

In [None]:
# Split every converted quantity into its value and unit, then expand the
# resulting pairs into the two final columns
value_unit_pairs = df_converted['quantity_converted'].apply(extract_value_and_unit)
df_converted[['value_final', 'unit_final']] = value_unit_pairs.apply(pd.Series)

#### Concatenate dimensionless mask and converted values

In [None]:
# Put the rows set aside as dimensionless back next to the converted rows and
# sort by index to restore the original row order
df_final = pd.concat([dimensionless_mask, df_converted]).sort_index()

In [None]:
# Rows with an invalid model output keep neither a final value nor a unit
invalid_output_rows = df_final["model_output_valid"] == False
df_final.loc[invalid_output_rows, ["value_final", "unit_final"]] = None

# Replace NaN with None for value
value_is_present = pd.notna(df_final["value_final"])
df_final["value_final"] = df_final["value_final"].where(value_is_present, None)

In [None]:
# Remove the raw inputs and all working columns; the final value and unit
# columns stay
working_columns = [
    'value',
    'unit',
    'unit_numerator_regex',
    'unit_denominator_regex',
    'unit_regex',
    'unit_pint',
    'value_numeric',
    'quantity_pint',
    'quantity_converted',
]
df_final.drop(columns=working_columns, inplace=True)

### 7. Save Postprocessed Results

In [None]:
# Preview the first rows of the postprocessed results
df_final.head()

In [None]:
# Write the postprocessed results next to the raw results, without the index column
df_final.to_csv(postprocessed_results_file, index=False)