## Concrete Rule-based Keyword Identification

In [None]:
## Original

import pandas as pd
import re

# ------------------------------------------------------------------------------
# 1) Pattern Definitions
# ------------------------------------------------------------------------------
# The patterns in parts A-C are lowercase regular expressions. The helpers that
# consume them (contains_whole_word, matches_any, name_has_aggregates) all call
# re.search with re.IGNORECASE, so they match text of any case.

# A) "cement" must appear in both name & desc to force "Cement".
# The \b anchors restrict the match to the standalone word, so a longer word
# such as "cementitious" does not count.
CEMENT_WORD = r"\bcement\b"

# B) Numeric-based Concrete patterns
# Capture cases like "32c12/15", "36c30/37", "101c35/45", etc.
CONCRETE_PATTERNS = [
    r"\brck\d+\b",  # e.g. rck30, rck50 ("rck" followed by at least one digit)
    # Optional leading digits, "c", optional whitespace, then two 1-2 digit
    # numbers separated by "/": e.g. c20/25, 32c12/15, 101c35/45
    r"\b\d*c\s*\d{1,2}/\d{1,2}\b",
    # Two-letter codes with an optional single digit, so the bare code (e.g.
    # "xc") matches as well as "xc2" / "xc3".
    # NOTE(review): presumably concrete exposure-class designations - confirm.
    r"\bxc\d?\b",  # e.g. xc2, xc3
    r"\bxd\d?\b",
    r"\bxs\d?\b",
    r"\bxf\d?\b",
    r"\bxa\d?\b",
    r"\bsf2\b",  # slump-flow class
    r"\bcls\b",  # e.g., for "CLS" used in some Italian notations
]

# C) Aggregates keywords - used to trigger the Aggregates category.
# In the code visible here these are only read by name_has_aggregates().
AGGREGATES_KEYWORDS = [
    r"\baggregates?\b",  # "aggregate" or "aggregates"
    r"\brubble\b",
    r"\bfill\b",
]

# D) Patterns for CEM references (fallback for Cement).
# Applied case-insensitively (matches_any passes re.IGNORECASE).
CEM_PATTERNS = [
    # "CEM" + optional space + a Roman numeral made of I/V, e.g. CEM I, CEM II,
    # CEM IV/B-LL 32,5R, CEMIII/A.
    # The character class must be [iv], not the range [i-v]: the range also
    # accepts j..u, which made "cem plant" or brand names such as "Cemix"
    # count as cement. The negative lookahead rejects a numeral that runs
    # straight into another letter (e.g. "cem ivory") while still allowing a
    # digit, "/", "-", space or end of text after it.
    r"\bcem\s?[iv]+(?![a-z])",
    # "CEM" + optional space + digits, then an optional "/", optional letter,
    # optional "-", and at least one letter, e.g. CEM 2/B-LL, CEM42R.
    r"\bcem\s?\d+\/?[a-zA-Z]?[\-]?[a-zA-Z]+",
]

# E) Keywords to detect steel or similar products (for negative filtering).
# is_steel_accessory() searches the product name for these; classify_product()
# uses that result to return "Other" for a name that contains "concrete" but
# also one of these words.
STEEL_KEYWORDS = [
    r"\bsteel\b",
    r"\brebar\b",
    r"\bmesh\b",
    r"\bprofile\b",
    r"\bpipe\b",
    r"\bpex\b",
]

# F) Precast / prefabricated concrete keywords. classify_product() matches
# these against the combined name + description text.
# Note that r"\bpre-?cast\b" already matches everything the longer
# "pre-?cast concrete" entry matches, so that entry is redundant but harmless.
PRECAST_KEYWORDS = [
    r"\breinforced concrete\b",
    # r"\breinforcing\b",            # give false positives for now
    r"\bpre-?fabricated concrete\b", # matches "prefabricated concrete" or "pre-fabricated concrete"
    r"\bpre-?cast concrete\b",       # matches "precast concrete" or "pre-cast concrete"
    r"\bpre-?cast\b",                # matches "precast" or "pre-cast"
    r"\bpre-?stressed\b"             # matches "prestressed" or "pre-stressed"
]

# ------------------------------------------------------------------------------
# 2) Helper Functions
# ------------------------------------------------------------------------------


def contains_whole_word(word_regex, text):
    """Report whether 'word_regex' matches anywhere in 'text', ignoring case.

    The "whole word" guarantee comes from the pattern itself (callers pass
    regexes wrapped in word-boundary anchors); this function only performs a
    case-insensitive search and converts the outcome to a bool.
    """
    hit = re.compile(word_regex, re.IGNORECASE).search(text)
    return hit is not None


def matches_any(pattern_list, text):
    """Tell whether at least one regex from 'pattern_list' is found in 'text'.

    Each pattern is searched case-insensitively; evaluation stops at the first
    hit. An empty 'pattern_list' yields False.
    """
    return any(re.search(regex, text, re.IGNORECASE) for regex in pattern_list)


def in_both_name_desc(word_regex, name_text, desc_text):
    """Tell whether 'word_regex' is found in the name AND in the description.

    Both checks go through contains_whole_word, so they are case-insensitive.
    The description is only inspected when the name already matched.
    """
    return all(
        contains_whole_word(word_regex, candidate)
        for candidate in (name_text, desc_text)
    )


def name_has_aggregates(name_text):
    """Tell whether the product name contains any of the AGGREGATES_KEYWORDS.

    The keywords are regular expressions and are searched case-insensitively.
    """
    return any(
        re.search(keyword, name_text, re.IGNORECASE)
        for keyword in AGGREGATES_KEYWORDS
    )


def is_steel_accessory(name_lower):
    """Return True if the product name indicates a steel accessory (e.g. rebar, pipe).

    The name is checked against every regex in STEEL_KEYWORDS.

    The search is case-insensitive, like the other helpers in this module.
    Callers are still expected to pass a lower-cased name (hence the parameter
    name), but a name in its original case is classified the same way instead
    of silently never matching.
    """
    return any(
        re.search(keyword, name_lower, re.IGNORECASE) for keyword in STEEL_KEYWORDS
    )


# ------------------------------------------------------------------------------
# 3) Main Classification Function
# ------------------------------------------------------------------------------


def classify_product(
    product_name: str,
    product_desc: str,
    product_app: str,
    flow_property_name: str = None,
    flow_property_mean_value: str = None,
    flow_property_reference_unit: str = None,
) -> str:
    """
    Classify a product with an ordered list of keyword rules; the first rule
    that matches decides the result.

    Rules, in evaluation order:

      1) The product name contains ("admixture(s)" or "additive(s)") AND
         "concrete" ->
         "Mineral building products > Mortar and Concrete > Concrete additive".

      2) The product name contains a measurement in mm (e.g. "10 mm" or
         "10/20 mm") and either "aggregate", or the misspelling "aggregte"
         together with "concrete" ->
         "Mineral building products > Concrete aggregates".

      3) All three flow property values are given, they describe
         Volume / "1" or "1.0" / "m3", and either a CONCRETE_PATTERNS regex
         matches name + description or the name contains "concrete" ->
         "Mineral building products > Mortar and Concrete > Ready mixed concrete".

      4) The whole word "cement" appears in both name and description -> "Cement".

      5) Any CEM_PATTERNS regex matches name + description -> "Cement".

      6) The name or the applicability text contains "ready mix" /
         "ready-mix" / "ready mixed" (and similar spellings) ->
         "Mineral building products > Mortar and Concrete > Ready mixed concrete".

      7) Any PRECAST_KEYWORDS regex matches name + description ->
         "Mineral building products > Asphalt" when the word "asphalt" occurs
         in the name or description, otherwise
         "Mineral building products > Bricks, blocks and elements > Precast concrete elements and goods".

      8) The name contains the word "concrete" -> "Other" if the name also
         looks like a steel accessory, otherwise "Concrete".

      9) Otherwise -> "Other".

    Args:
        product_name: Product name; None is treated as an empty string.
        product_desc: Technology description; None is treated as empty.
        product_app: Technological applicability text; None is treated as empty.
        flow_property_name: Optional flow property name (rule 3).
        flow_property_mean_value: Optional flow property mean value, as text.
        flow_property_reference_unit: Optional flow property unit, as text.

    Returns:
        The category label chosen by the first matching rule.
    """
    ready_mixed_label = (
        "Mineral building products > Mortar and Concrete > Ready mixed concrete"
    )

    # Lower-case once so the literal patterns below need no flags.
    name_lower = (product_name or "").lower()
    desc_lower = (product_desc or "").lower()
    app_lower = (product_app or "").lower()
    combined_text = name_lower + " " + desc_lower

    # --- Rule 1: concrete admixtures / additives ---
    # The additive word and "concrete" are BOTH required. Written without
    # parentheses, "a or b and c" parses as "a or (b and c)", which let any
    # name containing "admixture" through regardless of "concrete".
    if re.search(r"\b(?:admixtures?|additives?)\b", name_lower) and re.search(
        r"\bconcrete\b", name_lower
    ):
        return "Mineral building products > Mortar and Concrete > Concrete additive"

    # --- Rule 2: aggregates, based on a mm measurement plus a keyword ---
    # NOTE(review): the original comment described this as "aggregate (or a
    # close variant) or concrete", but the code has always evaluated
    # "aggregate or (aggregte and concrete)". That behaviour is kept, with the
    # grouping made explicit, because accepting a bare "concrete" here would
    # capture names that carry a mm size before the ready-mix rules below get
    # a chance. Confirm the intended rule.
    has_mm_measurement = re.search(r"\b\d+\s*(?:/\s*\d+)?\s*mm\b", name_lower)
    if has_mm_measurement and (
        "aggregate" in name_lower
        or ("aggregte" in name_lower and "concrete" in name_lower)
    ):
        return "Mineral building products > Concrete aggregates"

    # --- Rule 3: flow property describes exactly 1 m3 of volume ---
    flow_values = (
        flow_property_name,
        flow_property_mean_value,
        flow_property_reference_unit,
    )
    if all(value is not None for value in flow_values):
        is_one_cubic_metre = (
            flow_property_name.strip().lower() == "volume"
            and flow_property_reference_unit.strip().lower() == "m3"
            and flow_property_mean_value.strip() in {"1", "1.0"}
        )
        if is_one_cubic_metre and (
            matches_any(CONCRETE_PATTERNS, combined_text) or "concrete" in name_lower
        ):
            return ready_mixed_label

    # --- Rule 4: explicit "cement" in both name and description ---
    if in_both_name_desc(CEMENT_WORD, name_lower, desc_lower):
        return "Cement"

    # --- Rule 5: CEM designations as a fallback for cement ---
    if matches_any(CEM_PATTERNS, combined_text):
        return "Cement"

    # --- Rule 6: "ready mix" wording in the name or the applicability text ---
    ready_mix_regex = r"ready[\s-]*mix(?:[\s-]*ed)?"
    if re.search(ready_mix_regex, name_lower) or re.search(ready_mix_regex, app_lower):
        return ready_mixed_label

    # NOTE: CONCRETE_PATTERNS is only consulted by rule 3. A standalone
    # "numeric concrete pattern" rule used to sit here and is disabled.

    # --- Rule 7: precast / prefabricated concrete ---
    if matches_any(PRECAST_KEYWORDS, combined_text):
        # combined_text is name + " " + description, so one search covers both.
        if re.search(r"\basphalt\b", combined_text):
            return "Mineral building products > Asphalt"
        return "Mineral building products > Bricks, blocks and elements > Precast concrete elements and goods"

    # --- Rule 8: the word "concrete" in the name (but avoid steel accessories) ---
    if re.search(r"\bconcrete\b", name_lower):
        if is_steel_accessory(name_lower):
            return "Other"  # or a dedicated "Steel/Reinforcement" category if desired
        return "Concrete"

    # --- Default ---
    return "Other"


# ------------------------------------------------------------------------------
# 4) Bulk Classification for a CSV
# ------------------------------------------------------------------------------


def classify_dataset(
    input_csv: str,
    output_csv: str,
    name_col: str = "Product Name",
    desc_col: str = "Technology Description",
    app_col: str = "Technological Applicability",
    flow_prop_name_col: str = "Flow Property Name",
    flow_prop_mean_col: str = "Flow Property Mean Value",
    flow_prop_unit_col: str = "Flow Property Reference Unit",
):
    """
    Read a CSV, classify each row using classify_product, and write a new CSV
    with the results in an added "RegEx Classification" column.

    Missing cells (NaN) are passed to the classifier as empty text, or as None
    for the flow property values, rather than as the literal string "nan" that
    str() would produce.

    Args:
        input_csv: Path of the CSV file to read.
        output_csv: Path of the CSV file to write (all input columns plus the
            classification column, without the index).
        name_col: Column holding the product name.
        desc_col: Column holding the technology description.
        app_col: Column holding the applicability text.
        flow_prop_name_col: Column holding the flow property name.
        flow_prop_mean_col: Column holding the flow property mean value.
        flow_prop_unit_col: Column holding the flow property reference unit.

    A text column that is absent from the file is treated as empty for every
    row; an absent flow property column is passed as None.
    """
    df = pd.read_csv(input_csv)

    def text_cell(row, column):
        # Text for the classifier: absent column or missing value -> "".
        value = row.get(column, "")
        return "" if pd.isna(value) else str(value)

    def optional_cell(row, column):
        # Flow property text: None when the column is absent or the cell is
        # empty, so classify_product skips the flow property rule.
        if column not in row:
            return None
        value = row[column]
        return None if pd.isna(value) else str(value)

    df["RegEx Classification"] = [
        classify_product(
            text_cell(row, name_col),
            text_cell(row, desc_col),
            text_cell(row, app_col),
            optional_cell(row, flow_prop_name_col),
            optional_cell(row, flow_prop_mean_col),
            optional_cell(row, flow_prop_unit_col),
        )
        for _, row in df.iterrows()
    ]

    df.to_csv(output_csv, index=False)
    print(f"Classification completed. Output saved to '{output_csv}'")


# ------------------------------------------------------------------------------
# 5) Example Usage
# ------------------------------------------------------------------------------

if __name__ == "__main__":
    # Example usage: adjust file paths and column names as needed.
    # 'suffix' tags this run and becomes part of the output file name below.
    suffix = "classified_all_05"
    # The column names passed here equal classify_dataset's defaults; app_col
    # is not passed, so its default ("Technological Applicability") applies.
    classify_dataset(
        input_csv="../../data/pipeline2/sql/filtered_epd_data02.csv",
        output_csv=f"../../data/pipeline2/sql/regex_classified/filtered_epd_data02_{suffix}.csv",
        name_col="Product Name",
        desc_col="Technology Description",
        flow_prop_name_col="Flow Property Name",
        flow_prop_mean_col="Flow Property Mean Value",
        flow_prop_unit_col="Flow Property Reference Unit",
    )

In [None]:
# Isolate concrete

def filter_concrete_entries(input_csv: str, output_csv: str):
    """Copy the rows classified as some kind of concrete into a new CSV.

    Reads 'input_csv' (the output of the classification step), keeps every row
    whose "RegEx Classification" value contains "concrete" in any letter case
    (rows with a missing value are dropped), and writes the remaining rows to
    'output_csv' without the index.
    """
    table = pd.read_csv(input_csv)

    # Boolean mask: True where the label mentions "concrete"; NaN counts as False.
    mentions_concrete = table["RegEx Classification"].str.contains(
        "concrete", case=False, na=False
    )

    table.loc[mentions_concrete].to_csv(output_csv, index=False)
    print(f"Filtered CSV saved to '{output_csv}'")

# Set file paths
# The input path is the file the classification cell writes when its suffix is
# "classified_all_05"; the output is a sibling file holding only the rows kept
# by filter_concrete_entries.
input_csv = "../../data/pipeline2/sql/regex_classified/filtered_epd_data02_classified_all_05.csv"
output_csv = "../../data/pipeline2/sql/regex_classified/filtered_epd_data02_classified_concrete05.csv"

# Run the filtering function (reads input_csv, writes output_csv, prints a confirmation)
filter_concrete_entries(input_csv, output_csv)
