## Concrete Rule-based Keyword Identification

In [None]:
import pandas as pd
import re

# ------------------------------------------------------------------------------
# 1) Define lists and patterns for classification
# ------------------------------------------------------------------------------

# Regex patterns that strongly indicate "Concrete".
# Every pattern is lowercase: classify_product() lowercases the text first and
# the patterns are applied with re.search() without re.IGNORECASE.
CONCRETE_PATTERNS = [
    r"\bconcrete\b",        # whole word "concrete"
    r"\brck\d+\b",          # e.g. rck30, rck50
    r"\bc\d+/\d+\b",        # e.g. c20/25, c35/45
    r"\bxc\d?\b",           # e.g. xc2, xc3
    r"\bxd\d?\b",           # e.g. xd3
    r"\bxs\d?\b",
    r"\bxf\d?\b",
    r"\bxa\d?\b",
    r"\bs[345]\b",          # e.g. s3, s4, s5
    r"\bsf2\b",             # slump-flow class
    r"\bready[- ]mix\b",    # ready-mix or ready mix
    r"\baggregate\b",       # presence of aggregates is typical
    r"\bbatching\b",
]

# Regex patterns that strongly indicate "Cement"
CEMENT_PATTERNS = [
    r"\bcement\b",
    r"\bclinker\b",
    r"\bgypsum\b",
    r"\bpozzolan(?:a)?\b",   # pozzolan or pozzolana
    r"\bportland\b",
    r"\bbinder\b",
    # Patterns for CEM I, II, III, IV, V.
    # The class is [iv] (the two roman-numeral letters), not the range [i-v],
    # which would also accept j, k, l ... u and match e.g. "cemplus".
    r"\bcem\s?[iv]",        # e.g. CEM I, CEM II, CEM III
    # Some cements look like "CEM II/B-LL" or "CEM IV/A (P)"; this variant
    # covers designations written with digits after "CEM".
    r"\bcem\s?\d+\/?[a-zA-Z]?[\-]?[a-zA-Z]+",
]

# (Optional) Additional keyword sets for your domain can be added here.

# ------------------------------------------------------------------------------
# 2) Helper Functions to Classify a Single Product
# ------------------------------------------------------------------------------

def is_concrete(text: str) -> bool:
    """Report whether at least one entry of CONCRETE_PATTERNS is found in ``text``.

    Patterns are applied with ``re.search`` and no flags, so matching is
    case-sensitive; scanning stops at the first hit.
    """
    return any(re.search(concrete_regex, text) for concrete_regex in CONCRETE_PATTERNS)

def is_cement(text: str) -> bool:
    """Report whether at least one entry of CEMENT_PATTERNS is found in ``text``.

    Patterns are applied with ``re.search`` and no flags, so matching is
    case-sensitive; scanning stops at the first hit.
    """
    return any(re.search(cement_regex, text) for cement_regex in CEMENT_PATTERNS)

def classify_product(product_name: str, product_desc: str) -> str:
    """
    Label a product as Concrete, Cement, or 'Other'.

    Name and description are joined with a space and lowercased, then tested
    with is_concrete() and is_cement(). A concrete match takes precedence over
    a cement match; the returned labels are the full category paths.
    """
    haystack = " ".join((product_name, product_desc)).lower()

    # Concrete wins whenever it matches, even if cement terms are present too.
    if is_concrete(haystack):
        return "Mineral-based Building Materials > Mortar and Concrete > Concrete"

    if is_cement(haystack):
        return "Mineral-based Building Materials > Binders > Cement"

    return "Other"

# ------------------------------------------------------------------------------
# 3) Main Script to Read CSV, Classify, and Output
# ------------------------------------------------------------------------------

def classify_dataset(
    input_csv: str,
    output_csv: str,
    name_col: str = "Product Name",
    desc_col: str = "Technology Description",
    max_rows: "int | None" = 100,
):
    """
    Read a CSV, classify its rows, and write a new CSV with 'Classification Result'.

    Args:
        input_csv: Path of the CSV file to read.
        output_csv: Path of the CSV file to write. It contains the processed
            rows plus the added 'Classification Result' column.
        name_col: Column that holds the product name.
        desc_col: Column that holds the product description.
        max_rows: Number of leading rows to process. Defaults to 100, the limit
            that used to be hard-coded; pass None to process the whole file.

    A missing column or an empty (NaN) cell is treated as an empty string.
    """
    df = pd.read_csv(input_csv)
    if max_rows is not None:
        # Copy so the new column is added to an independent frame, not a slice.
        df = df.head(max_rows).copy()

    def cell_text(row, column: str) -> str:
        # str(NaN) would inject the literal text "nan" into the matched string.
        value = row.get(column, "")
        return "" if pd.isna(value) else str(value)

    df["Classification Result"] = [
        classify_product(cell_text(row, name_col), cell_text(row, desc_col))
        for _, row in df.iterrows()
    ]

    # Write out to CSV
    df.to_csv(output_csv, index=False)
    print(f"Classification completed. Results saved to {output_csv}")

# ------------------------------------------------------------------------------
# 4) Example Usage
# ------------------------------------------------------------------------------
if __name__ == "__main__":
    # Classify the input CSV and write the labelled copy alongside it.
    # The name and description are read from the two columns named below.
    classify_dataset(
        "../../data/pipeline2/sql/filtered_epd_data_lc.csv",
        "../../data/pipeline2/sql/filtered_epd_data_lc_classified.csv",
        name_col="Product Name",
        desc_col="Technology Description",
    )



Classification completed. Results saved to ../../data/pipeline2/sql/filtered_epd_data_lc_classified.csv


## Iteration 2

In [1]:
import pandas as pd
import re

# ------------------------------------------------------------------------------
# 1) Define Regex Patterns and Category Overrides
# ------------------------------------------------------------------------------

# (A) Patterns that strongly indicate "Concrete" when in the Product Name.
# All patterns are lowercase; they are applied to lowercased text.
CONCRETE_PATTERNS = [
    r"\bconcrete\b",            # whole word "concrete"
    r"\brck\d+\b",              # e.g. rck30, rck50
    # Matches full strength classes only (C20/25, C35/45, etc.)
    r"\bc\d{1,2}/\d{1,2}\b",    # e.g. C20/25, C35/45
    r"\bxc\d?\b",               # e.g. xc2, xc3
    r"\bxd\d?\b",
    r"\bxs\d?\b",
    r"\bxf\d?\b",
    r"\bxa\d?\b",
    r"\bs[345]\b",              # e.g. s3, s4, s5
    r"\bsf2\b",                 # slump-flow class
]

# (B) Patterns for "CEM" references
CEM_PATTERNS = [
    # [iv] = roman-numeral letters only; the range [i-v] would also accept
    # j, k, l ... u and match unrelated words such as "cemplus".
    r"\bcem\s?[iv]",                                  # e.g. CEM I, CEM II, CEM III
    r"\bcem\s?\d+\/?[a-zA-Z]?[\-]?[a-zA-Z]+",         # "CEM" followed by digits and letters
]

# (C) Override Keywords/Patterns
MORTAR_PATTERNS = [r"\bmortar\b", r"\bscreed\b", r"\brender\b"]
PLASTERBOARD_PATTERNS = [r"\bplasterboard\b", r"\bdrywall\b", r"\brigips\b", r"\bgyproc\b"]
# "aggregates?" accepts both the singular and the plural form.
AGGREGATES_PATTERNS = [r"\baggregates?\b", r"\brubble\b", r"\bdrainage\b", r"\bfill\b"]
INSULATION_PATTERNS = [r"\bmineral wool\b", r"\binsulation\b", r"\bfiberglass\b"]

# ------------------------------------------------------------------------------
# 2) Helper Functions for Pattern Matching
# ------------------------------------------------------------------------------

def matches_any(pattern_list, text):
    """Tell whether ``text`` is matched by at least one regex of ``pattern_list``.

    Uses ``re.search`` without flags (case-sensitive) and stops at the first
    hit; an empty list yields False.
    """
    return any(re.search(regex, text) for regex in pattern_list)

# ------------------------------------------------------------------------------
# 3) Classification Logic with Priority Overrides
# ------------------------------------------------------------------------------

def classify_product(product_name: str, product_desc: str) -> str:
    """
    Classify a product with priority-ordered override rules.

    Name and description are lowercased; rules are evaluated top to bottom and
    the first match wins:
      1) mortar keyword in name or description       -> "Mortar"
      2) plasterboard keyword in name or description
         -> "Mineral-based Building Materials > Bricks and Elements > Gypsum Boards"
      3) aggregates keyword in the product name
         -> "Mineral-based Building Materials > Aggregates"
      4) insulation keyword in name or description   -> "Insulation Materials"
      5) concrete pattern in the product name        -> "Concrete"
      6) CEM pattern in name or description          -> "Cement"
      7) anything else                               -> "Other"
    """

    # Convert to lowercase for simpler matching
    name_lower = product_name.lower()
    desc_lower = product_desc.lower()
    combined_text = name_lower + " " + desc_lower

    # 1) Mortar override
    if matches_any(MORTAR_PATTERNS, combined_text):
        return "Mortar"

    # 2) Plasterboard override
    if matches_any(PLASTERBOARD_PATTERNS, combined_text):
        return "Mineral-based Building Materials > Bricks and Elements > Gypsum Boards"

    # 3) Aggregates override: only the Product Name is inspected here.
    if matches_any(AGGREGATES_PATTERNS, name_lower):
        return "Mineral-based Building Materials > Aggregates"

    # 4) Insulation override
    if matches_any(INSULATION_PATTERNS, combined_text):
        return "Insulation Materials"

    # 5) A concrete pattern in the Product Name is sufficient for "Concrete".
    #    (A former "CEM + concrete pattern" branch could never run, because it
    #    required the very condition tested here, and has been removed.)
    if matches_any(CONCRETE_PATTERNS, name_lower):
        return "Concrete"

    # 6) A "CEM" reference (in name or description) without a concrete
    #    pattern in the name => "Cement"
    if matches_any(CEM_PATTERNS, combined_text):
        return "Cement"

    # 7) Default fallback => 'Other'
    return "Other"

# ------------------------------------------------------------------------------
# 4) Main Script to Read & Write CSV
# ------------------------------------------------------------------------------

def classify_dataset(
    input_csv: str,
    output_csv: str,
    name_col: str = "Product Name",
    desc_col: str = "Technology Description",
    max_rows: "int | None" = 100,
):
    """
    Read a CSV, classify its rows, and write a new CSV with 'Classification Result'.

    Args:
        input_csv: Path of the CSV file to read.
        output_csv: Path of the CSV file to write. It contains the processed
            rows plus the added 'Classification Result' column.
        name_col: Column that holds the product name.
        desc_col: Column that holds the product description.
        max_rows: Number of leading rows to process. Defaults to 100, the limit
            that used to be hard-coded; pass None to process the whole file.

    A missing column or an empty (NaN) cell is treated as an empty string.
    """
    df = pd.read_csv(input_csv)
    if max_rows is not None:
        # Copy so the new column is added to an independent frame, not a slice.
        df = df.head(max_rows).copy()

    def cell_text(row, column: str) -> str:
        # str(NaN) would inject the literal text "nan" into the matched string.
        value = row.get(column, "")
        return "" if pd.isna(value) else str(value)

    df["Classification Result"] = [
        classify_product(cell_text(row, name_col), cell_text(row, desc_col))
        for _, row in df.iterrows()
    ]
    df.to_csv(output_csv, index=False)
    print(f"Classification completed. Results saved to {output_csv}")

# ------------------------------------------------------------------------------
# 5) Example Usage
# ------------------------------------------------------------------------------
if __name__ == "__main__":
    # Run the iteration-2 classifier on the input CSV; the labelled copy is
    # written to the "_02" output file.
    classify_dataset(
        "../../data/pipeline2/sql/filtered_epd_data_lc.csv",
        "../../data/pipeline2/sql/filtered_epd_data_lc_classified_02.csv",
        name_col="Product Name",
        desc_col="Technology Description",
    )


Classification completed. Results saved to ../../data/pipeline2/sql/filtered_epd_data_lc_classified_02.csv


## Iteration 3

In [5]:
import pandas as pd
import re

# ------------------------------------------------------------------------------
# 1) Define Regex / Keyword Patterns
# ------------------------------------------------------------------------------
# Lowercase patterns; they are applied to lowercased text.
CONCRETE_PATTERNS = [
    r"\bconcrete\b",            # whole word "concrete"
    r"\brck\d+\b",              # e.g., rck30, rck50
    r"\bc\d{1,2}/\d{1,2}\b",    # e.g., C20/25, C35/45
    r"\bxc\d?\b",               # e.g., xc2, xc3
    r"\bxd\d?\b",
    r"\bxs\d?\b",
    r"\bxf\d?\b",
    r"\bxa\d?\b",
    r"\bs[345]\b",              # e.g., s3, s4, s5
    r"\bsf2\b",                 # slump-flow class
]

CEM_PATTERNS = [
    # [iv] = roman-numeral letters only; the range [i-v] would also accept
    # j, k, l ... u and match unrelated words such as "cemplus".
    r"\bcem\s?[iv]",               # e.g., CEM I, CEM II
    r"\bcem\s?\d+\/?[a-zA-Z]?[\-]?[a-zA-Z]+",  # "CEM" followed by digits and letters
]

# 2) For the new classification rules
CEMENT_WORD = "cement"
SCREED_WORD = "screed"

# 3) Additional patterns/keywords for categories
PLASTERBOARD_KEYWORDS = [r"\bplaster\b", r"\bgypsum\b", r"\brigips\b", r"\bgyproc\b"]
# "aggregates?" accepts both the singular and the plural form.
AGGREGATES_KEYWORDS = [r"\baggregates?\b", r"\brubble\b", r"\bfill\b"]

# ------------------------------------------------------------------------------
# Helper function: matches_any()
# ------------------------------------------------------------------------------

def matches_any(pattern_list, text):
    """
    Tell whether ``text`` is matched by at least one regex of ``pattern_list``.

    Uses ``re.search`` without flags (case-sensitive) and stops at the first
    hit; an empty list yields False.
    """
    return any(re.search(regex, text) for regex in pattern_list)

def contains_word(word, text):
    """
    Tell whether ``word`` occurs in ``text`` as a whole word, ignoring case.

    ``word`` is taken literally (it is regex-escaped) and wrapped in
    word-boundary anchors before searching.
    """
    boundary = r"\b"
    whole_word_regex = boundary + re.escape(word) + boundary
    return re.search(whole_word_regex, text, flags=re.IGNORECASE) is not None

# ------------------------------------------------------------------------------
# 4) Main Classification Logic
# ------------------------------------------------------------------------------

def classify_product(product_name: str, product_desc: str) -> str:
    """
    Classify a product; rules are evaluated top to bottom, first match wins:
      1) 'cement' as a word in BOTH name and description -> "Cement"
      2) 'screed' as a word in BOTH name and description -> "Dry Screed"
      3) plaster/gypsum/rigips/gyproc in name or description
         -> "Mineral-based Building Materials > Bricks and Elements > Gypsum Boards"
      4) aggregates keyword in name or description
         -> "Mineral-based Building Materials > Aggregates"
      5) concrete pattern in the product name -> "Concrete"
      6) CEM pattern in name or description   -> "Cement"
      7) anything else                        -> "Other"
    """
    name_lower = product_name.lower()
    desc_lower = product_desc.lower()
    combined_text = name_lower + " " + desc_lower

    # 1) "Cement" in both name & description
    if contains_word(CEMENT_WORD, name_lower) and contains_word(CEMENT_WORD, desc_lower):
        return "Cement"

    # 2) "Screed" in both name & description => "Dry Screed"
    if contains_word(SCREED_WORD, name_lower) and contains_word(SCREED_WORD, desc_lower):
        return "Dry Screed"

    # 3) Gypsum Boards check
    if matches_any(PLASTERBOARD_KEYWORDS, combined_text):
        return "Mineral-based Building Materials > Bricks and Elements > Gypsum Boards"

    # 4) Aggregates check
    if matches_any(AGGREGATES_KEYWORDS, combined_text):
        return "Mineral-based Building Materials > Aggregates"

    # 5) A concrete pattern in the name is sufficient for "Concrete".
    #    (A former "CEM + concrete pattern" branch could never run, because it
    #    required the very condition tested here, and has been removed.)
    if matches_any(CONCRETE_PATTERNS, name_lower):
        return "Concrete"

    # 6) Fallback for "CEM" references in name or description => "Cement"
    if matches_any(CEM_PATTERNS, combined_text):
        return "Cement"

    # 7) Default => Other
    return "Other"

# ------------------------------------------------------------------------------
# 5) Script to Classify an Entire CSV
# ------------------------------------------------------------------------------

def classify_dataset(
    input_csv: str,
    output_csv: str,
    name_col: str = "Product Name",
    desc_col: str = "Technology Description",
    max_rows: "int | None" = 100,
):
    """
    Read a CSV, classify its rows, and write a new CSV with 'Classification Result'.

    Args:
        input_csv: Path of the CSV file to read.
        output_csv: Path of the CSV file to write. It contains the processed
            rows plus the added 'Classification Result' column.
        name_col: Column that holds the product name.
        desc_col: Column that holds the product description.
        max_rows: Number of leading rows to process. Defaults to 100, the limit
            that used to be hard-coded; pass None to process the whole file.

    A missing column or an empty (NaN) cell is treated as an empty string.
    """
    df = pd.read_csv(input_csv)
    if max_rows is not None:
        # Copy so the new column is added to an independent frame, not a slice.
        df = df.head(max_rows).copy()

    def cell_text(row, column: str) -> str:
        # str(NaN) would inject the literal text "nan" into the matched string.
        value = row.get(column, "")
        return "" if pd.isna(value) else str(value)

    df["Classification Result"] = [
        classify_product(cell_text(row, name_col), cell_text(row, desc_col))
        for _, row in df.iterrows()
    ]
    df.to_csv(output_csv, index=False)
    print(f"Classification completed. Results saved to {output_csv}")

# ------------------------------------------------------------------------------
# 6) Example Usage
# ------------------------------------------------------------------------------
if __name__ == "__main__":
    # Run the iteration-3 classifier on the input CSV; the labelled copy is
    # written to the "_03" output file.
    classify_dataset(
        "../../data/pipeline2/sql/filtered_epd_data_lc.csv",
        "../../data/pipeline2/sql/filtered_epd_data_lc_classified_03.csv",
        name_col="Product Name",
        desc_col="Technology Description",
    )


Classification completed. Results saved to ../../data/pipeline2/sql/filtered_epd_data_lc_classified_03.csv


## Iteration 4

In [4]:
import pandas as pd
import re

# ------------------------------------------------------------------------------
# 1) Regex Patterns
# ------------------------------------------------------------------------------

# Concrete-related patterns that MUST appear in the Product Name to classify as "Concrete".
# All patterns are lowercase; they are applied to lowercased text.
CONCRETE_PATTERNS_NAME = [
    r"\brck\d+\b",              # e.g. rck30, rck50
    r"\bc\d{1,2}/\d{1,2}\b",    # e.g. C20/25, C35/45
    r"\bxc\d?\b",               # e.g. xc2, xc3
    r"\bxd\d?\b",
    r"\bxs\d?\b",
    r"\bxf\d?\b",
    r"\bxa\d?\b",
    r"\bs[345]\b",              # e.g. s3, s4, s5
    r"\bsf2\b",                 # slump-flow class
    r"\bconcrete\b",            # explicit "concrete" in the name
]

# Cement patterns (anywhere in name or description) if no "Concrete Patterns" in the name.
CEM_PATTERNS = [
    # [iv] = roman-numeral letters only; the range [i-v] would also accept
    # j, k, l ... u and match unrelated words such as "cemplus".
    r"\bcem\s?[iv]",              # e.g. CEM I, CEM II
    r"\bcem\s?\d+\/?[a-zA-Z]?[\-]?[a-zA-Z]+",  # "CEM" followed by digits and letters
]

# Aggregates triggers (must be found in both name & description if not already "Concrete" or "Cement").
# "aggregates?" accepts both the singular and the plural form.
AGGREGATES_KEYWORDS = [r"\baggregates?\b", r"\brubble\b", r"\bfill\b"]

# ------------------------------------------------------------------------------
# 2) Helper Functions
# ------------------------------------------------------------------------------

def matches_any(pattern_list, text):
    """Tell whether at least one regex of ``pattern_list`` is found in ``text`` (case-sensitive)."""
    return any(re.search(regex, text) for regex in pattern_list)

def contains_any_aggregates(text):
    """Tell whether ``text`` matches at least one regex of AGGREGATES_KEYWORDS (case-sensitive)."""
    return any(re.search(keyword_regex, text) for keyword_regex in AGGREGATES_KEYWORDS)

# ------------------------------------------------------------------------------
# 3) Main Classification Function
# ------------------------------------------------------------------------------

def classify_product(product_name: str, product_desc: str) -> str:
    """
    Label a product; the first matching rule wins:
      1) a concrete pattern in the product name          => "Concrete"
      2) a CEM pattern in name + description             => "Cement"
      3) an aggregates keyword in BOTH name and description
         => "Mineral-based Building Materials > Aggregates"
      4) nothing matched                                 => "Other"

    Both inputs are lowercased before matching.
    """
    name_lc, desc_lc = product_name.lower(), product_desc.lower()

    # Rule 1 looks at the name only.
    if matches_any(CONCRETE_PATTERNS_NAME, name_lc):
        return "Concrete"

    # Rule 2 looks at name and description joined by a space.
    if matches_any(CEM_PATTERNS, f"{name_lc} {desc_lc}"):
        return "Cement"

    # Rule 3 requires a keyword hit on each side independently.
    keyword_in_name = contains_any_aggregates(name_lc)
    if keyword_in_name and contains_any_aggregates(desc_lc):
        return "Mineral-based Building Materials > Aggregates"

    return "Other"

# ------------------------------------------------------------------------------
# 4) CSV Classification Script
# ------------------------------------------------------------------------------

def classify_dataset(
    input_csv: str,
    output_csv: str,
    name_col: str = "Product Name",
    desc_col: str = "Technology Description",
    max_rows: "int | None" = 100,
):
    """
    Read a CSV, classify its rows, and write a new CSV with 'Classification Result'.

    Args:
        input_csv: Path of the CSV file to read.
        output_csv: Path of the CSV file to write. It contains the processed
            rows plus the added 'Classification Result' column.
        name_col: Column that holds the product name.
        desc_col: Column that holds the product description.
        max_rows: Number of leading rows to process. Defaults to 100, the limit
            that used to be hard-coded; pass None to process the whole file.

    A missing column or an empty (NaN) cell is treated as an empty string.
    """
    df = pd.read_csv(input_csv)
    if max_rows is not None:
        # Copy so the new column is added to an independent frame, not a slice.
        df = df.head(max_rows).copy()

    def cell_text(row, column: str) -> str:
        # str(NaN) would inject the literal text "nan" into the matched string.
        value = row.get(column, "")
        return "" if pd.isna(value) else str(value)

    df["Classification Result"] = [
        classify_product(cell_text(row, name_col), cell_text(row, desc_col))
        for _, row in df.iterrows()
    ]
    df.to_csv(output_csv, index=False)
    print(f"Classification completed. Output saved: {output_csv}")

# ------------------------------------------------------------------------------
# 5) Example Usage
# ------------------------------------------------------------------------------
if __name__ == "__main__":
    # Run the iteration-4 classifier on the input CSV; the labelled copy is
    # written to the "_04" output file.
    classify_dataset(
        "../../data/pipeline2/sql/filtered_epd_data_lc.csv",
        "../../data/pipeline2/sql/filtered_epd_data_lc_classified_04.csv",
        name_col="Product Name",
        desc_col="Technology Description",
    )


Classification completed. Output saved: ../../data/pipeline2/sql/filtered_epd_data_lc_classified_04.csv


## Iteration 5

In [7]:
import pandas as pd
import re

# ------------------------------------------------------------------------------
# 1) Pattern Definitions
# ------------------------------------------------------------------------------

# A) Cement detection: word "cement" must appear in both name & description
CEMENT_WORD = r"\bcement\b"

# B) Concrete patterns: if any appear in the entire text => "Concrete"
CONCRETE_PATTERNS = [
    r"\brck\d+\b",              # e.g. rck30, rck50
    r"\bc\d{1,2}/\d{1,2}\b",    # e.g. C20/25, C35/45
    r"\bxc\d?\b",               # e.g. xc2, xc3
    r"\bxd\d?\b",
    r"\bxs\d?\b",
    r"\bxf\d?\b",
    r"\bxa\d?\b",
    r"\bs[345]\b",              # e.g. s3, s4, s5
    r"\bsf2\b",                 # slump-flow class
]
# Note: intentionally NOT including r"\bconcrete\b" to avoid misclassification
# if the numeric patterns (Rck, Cxx/yy, etc.) are absent.

# CEM designations: checked against name + description, after the concrete
# patterns above have been ruled out for the same text.
CEM_PATTERNS = [
    # [iv] = roman-numeral letters only; the range [i-v] would also accept
    # j, k, l ... u and match unrelated words such as "cemplus".
    r"\bcem\s?[iv]",              # e.g. CEM I, CEM II
    r"\bcem\s?\d+\/?[a-zA-Z]?[\-]?[a-zA-Z]+",  # "CEM" followed by digits and letters
]

# C) Aggregates keywords: if in the product name (and not already cement/concrete) => Aggregates
AGGREGATES_KEYWORDS = [r"\baggregates?\b", r"\brubble\b", r"\bfill\b"]

# ------------------------------------------------------------------------------
# 2) Helper Functions
# ------------------------------------------------------------------------------

def contains_whole_word(word_pattern, text):
    """Tell whether the regex ``word_pattern`` is found in ``text``, ignoring case."""
    hit = re.search(word_pattern, text, flags=re.IGNORECASE)
    return hit is not None

def matches_any(pattern_list, text):
    """Tell whether at least one regex of ``pattern_list`` is found in ``text``, ignoring case."""
    return any(re.search(regex, text, re.IGNORECASE) for regex in pattern_list)

def name_has_aggregates(name_text):
    """Tell whether ``name_text`` matches any regex of AGGREGATES_KEYWORDS, ignoring case."""
    return any(
        re.search(keyword_regex, name_text, re.IGNORECASE)
        for keyword_regex in AGGREGATES_KEYWORDS
    )

# ------------------------------------------------------------------------------
# 3) Main Classification
# ------------------------------------------------------------------------------

def classify_product(product_name: str, product_desc: str) -> str:
    """
    Label a product as "Concrete", "Cement",
    "Mineral-based Building Materials > Aggregates", or "Other".

    Rules are evaluated in this order; the first match wins:
      1) Concrete: a numeric-based pattern (rck, c20/25, xc2, ...) anywhere in
         name + description.
      2) Cement: the word "cement" in both name and description, or a CEM
         pattern anywhere in name + description.
      3) Aggregates: an aggregates keyword in the product name.
      4) Other: fallback.
    """
    name_lc = product_name.lower()
    desc_lc = product_desc.lower()
    full_text = f"{name_lc} {desc_lc}"

    # Concrete is tested before either cement rule.
    if matches_any(CONCRETE_PATTERNS, full_text):
        return "Concrete"

    # Both cement rules lead to the same label, so they share one branch.
    cement_in_both = (
        contains_whole_word(CEMENT_WORD, name_lc)
        and contains_whole_word(CEMENT_WORD, desc_lc)
    )
    if cement_in_both or matches_any(CEM_PATTERNS, full_text):
        return "Cement"

    if name_has_aggregates(name_lc):
        return "Mineral-based Building Materials > Aggregates"

    return "Other"

# ------------------------------------------------------------------------------
# 4) Bulk Classification for a CSV
# ------------------------------------------------------------------------------

def classify_dataset(
    input_csv: str,
    output_csv: str,
    name_col: str = "Product Name",
    desc_col: str = "Technology Description",
    max_rows: "int | None" = 100,
):
    """
    Read a CSV, classify its rows with 'classify_product', and write a new CSV
    with 'Classification Result'.

    Args:
        input_csv: Path of the CSV file to read.
        output_csv: Path of the CSV file to write. It contains the processed
            rows plus the added 'Classification Result' column.
        name_col: Column that holds the product name.
        desc_col: Column that holds the product description.
        max_rows: Number of leading rows to process. Defaults to 100, the limit
            that used to be hard-coded; pass None to process the whole file.

    A missing column or an empty (NaN) cell is treated as an empty string.
    """
    df = pd.read_csv(input_csv)
    if max_rows is not None:
        # Copy so the new column is added to an independent frame, not a slice.
        df = df.head(max_rows).copy()

    def cell_text(row, column: str) -> str:
        # str(NaN) would inject the literal text "nan" into the matched string.
        value = row.get(column, "")
        return "" if pd.isna(value) else str(value)

    df["Classification Result"] = [
        classify_product(cell_text(row, name_col), cell_text(row, desc_col))
        for _, row in df.iterrows()
    ]
    df.to_csv(output_csv, index=False)
    print(f"Classification completed. Output saved to '{output_csv}'")

# ------------------------------------------------------------------------------
# 5) Example Usage
# ------------------------------------------------------------------------------

if __name__ == "__main__":
    # Run the iteration-5 classifier on the input CSV; the labelled copy is
    # written to the "_05" output file.
    classify_dataset(
        "../../data/pipeline2/sql/filtered_epd_data_lc.csv",
        "../../data/pipeline2/sql/filtered_epd_data_lc_classified_05.csv",
        name_col="Product Name",
        desc_col="Technology Description",
    )


Classification completed. Output saved to '../../data/pipeline2/sql/filtered_epd_data_lc_classified_05.csv'


## Iteration 6

In [9]:
import pandas as pd
import re

# ------------------------------------------------------------------------------
# 1) Pattern Definitions
# ------------------------------------------------------------------------------

# Regex for the whole word "cement"; the Cement rule requires it in both the
# name and the description.
CEMENT_WORD = r"\bcement\b"

# Numeric-based Concrete patterns, searched in name + description.
CONCRETE_PATTERNS = [
    r"\brck\d+\b",              # rck followed by digits, e.g. rck30
    r"\bc\d{1,2}/\d{1,2}\b",    # strength class, e.g. C20/25
    # xc, xd, xs, xf, xa with an optional single digit, e.g. xc2, xd3
    *[rf"\bx{letter}\d?\b" for letter in "cdsfa"],
    r"\bs[345]\b",              # s3, s4, s5
    r"\bsf2\b",                 # slump-flow class
]

# Aggregates keywords, searched in the product name only.
AGGREGATES_KEYWORDS = [
    r"\baggregates?\b",
    r"\brubble\b",
    r"\bfill\b",
]

# ------------------------------------------------------------------------------
# 2) Helper Functions
# ------------------------------------------------------------------------------

def contains_whole_word(word_regex, text):
    """
    Tell whether the regex 'word_regex' is found in 'text', ignoring case.
    Whole-word matching relies on the boundaries contained in the regex itself.
    """
    hit = re.search(word_regex, text, flags=re.IGNORECASE)
    return hit is not None

def matches_any(pattern_list, text):
    """
    Tell whether at least one regex of 'pattern_list' is found in 'text',
    ignoring case. An empty list yields False.
    """
    return any(re.search(regex, text, re.IGNORECASE) for regex in pattern_list)

def has_aggregates(name_text):
    """
    Tell whether 'name_text' matches any regex of AGGREGATES_KEYWORDS,
    ignoring case.
    """
    return any(
        re.search(keyword_regex, name_text, re.IGNORECASE)
        for keyword_regex in AGGREGATES_KEYWORDS
    )

def is_cement(name_lower, desc_lower):
    """
    Tell whether CEMENT_WORD is found in the name AND in the description.
    The description is only inspected when the name already matched.
    """
    if not contains_whole_word(CEMENT_WORD, name_lower):
        return False
    return contains_whole_word(CEMENT_WORD, desc_lower)

# ------------------------------------------------------------------------------
# 3) Main Classification
# ------------------------------------------------------------------------------

def classify_product(product_name: str, product_desc: str) -> str:
    """
    Label a product; rules are evaluated in order and the first match wins:
      1) 'cement' in both name & description       -> "Cement"
      2) numeric-based concrete pattern in name + description -> "Concrete"
      3) aggregates keyword in the name
         -> "Mineral-based Building Materials > Aggregates"
      4) the word 'concrete' in the name           -> "Concrete"
      5) nothing matched                           -> "Other"
    """
    name_lc = product_name.lower()
    desc_lc = product_desc.lower()
    full_text = " ".join((name_lc, desc_lc))

    if is_cement(name_lc, desc_lc):
        label = "Cement"
    elif matches_any(CONCRETE_PATTERNS, full_text):
        label = "Concrete"
    elif has_aggregates(name_lc):
        # Tested before the plain word 'concrete', so an aggregates keyword
        # in the name takes precedence over it.
        label = "Mineral-based Building Materials > Aggregates"
    elif re.search(r"\bconcrete\b", name_lc):
        label = "Concrete"
    else:
        label = "Other"
    return label

# ------------------------------------------------------------------------------
# 4) Bulk Classification for a CSV
# ------------------------------------------------------------------------------

def classify_dataset(
    input_csv: str,
    output_csv: str,
    name_col: str = "Product Name",
    desc_col: str = "Technology Description",
    max_rows: "int | None" = 100,
):
    """
    Read a CSV, classify its rows with 'classify_product', and write a new CSV
    with 'Classification Result'.

    Args:
        input_csv: Path of the CSV file to read.
        output_csv: Path of the CSV file to write. It contains the processed
            rows plus the added 'Classification Result' column.
        name_col: Column that holds the product name.
        desc_col: Column that holds the product description.
        max_rows: Number of leading rows to process. Defaults to 100, the limit
            that used to be hard-coded; pass None to process the whole file.

    A missing column or an empty (NaN) cell is treated as an empty string.
    """
    df = pd.read_csv(input_csv)
    if max_rows is not None:
        # Copy so the new column is added to an independent frame, not a slice.
        df = df.head(max_rows).copy()

    def cell_text(row, column: str) -> str:
        # str(NaN) would inject the literal text "nan" into the matched string.
        value = row.get(column, "")
        return "" if pd.isna(value) else str(value)

    df["Classification Result"] = [
        classify_product(cell_text(row, name_col), cell_text(row, desc_col))
        for _, row in df.iterrows()
    ]
    df.to_csv(output_csv, index=False)
    print(f"Classification completed. Output saved to '{output_csv}'")

# ------------------------------------------------------------------------------
# 5) Example Usage
# ------------------------------------------------------------------------------

if __name__ == "__main__":
    # Run the iteration-6 classifier on the input CSV; the labelled copy is
    # written to the "_06" output file.
    classify_dataset(
        "../../data/pipeline2/sql/filtered_epd_data_lc.csv",
        "../../data/pipeline2/sql/filtered_epd_data_lc_classified_06.csv",
        name_col="Product Name",
        desc_col="Technology Description",
    )


Classification completed. Output saved to '../../data/pipeline2/sql/filtered_epd_data_lc_classified_06.csv'


## Iteration 7

In [11]:
import pandas as pd
import re

# ------------------------------------------------------------------------------
# 1) Pattern Definitions
# ------------------------------------------------------------------------------

# A) "cement" must appear in both name & desc to force "Cement".
CEMENT_WORD = r"\bcement\b"

# B) Numeric-based Concrete patterns:
#    The strength-class pattern allows leading digits so that strings like
#    "32c12/15", "36c30/37", "101c35/45" are matched as well.
CONCRETE_PATTERNS = [
    r"\brck\d+\b",               # e.g. rck30, rck50
    r"\b\d*c\d{1,2}/\d{1,2}\b",  # e.g. c20/25, 32c12/15, 101c35/45
    r"\bxc\d?\b",                # e.g. xc2, xc3
    r"\bxd\d?\b",
    r"\bxs\d?\b",
    r"\bxf\d?\b",
    r"\bxa\d?\b",
    r"\bs[345]\b",               # e.g. s3, s4, s5
    r"\bsf2\b",                  # slump-flow class
]

# C) Aggregates keywords: if found in Product Name => "Aggregates" (unless already concrete/cement)
AGGREGATES_KEYWORDS = [r"\baggregates?\b", r"\brubble\b", r"\bfill\b"]

# D) Patterns indicating "CEM" (for a fallback Cement classification)
CEM_PATTERNS = [
    # [iv] = roman-numeral letters only; the range [i-v] would also accept
    # j, k, l ... u and match unrelated words such as "cemplus".
    r"\bcem\s?[iv]",                # e.g. CEM I, CEM II
    r"\bcem\s?\d+\/?[a-zA-Z]?[\-]?[a-zA-Z]+",  # "CEM" followed by digits and letters
]

# ------------------------------------------------------------------------------
# 2) Helper Functions
# ------------------------------------------------------------------------------

def contains_whole_word(word_regex, text):
    """
    Tell whether 'word_regex' occurs anywhere in 'text', ignoring case.

    Whole-word behaviour comes from the regex itself (callers pass patterns
    wrapped in \\b anchors); this helper only runs the search.
    """
    hit = re.search(word_regex, text, flags=re.IGNORECASE)
    return hit is not None

def matches_any(pattern_list, text):
    """
    Report whether at least one regex from 'pattern_list' is found in 'text'.

    Matching ignores case and stops at the first pattern that hits.
    An empty 'pattern_list' yields False.
    """
    return any(re.search(rx, text, re.IGNORECASE) for rx in pattern_list)

def in_both_name_desc(word_regex, name_text, desc_text):
    """
    Return True only when 'word_regex' is found in the name and also in the
    description (both checks are case-insensitive via contains_whole_word).
    """
    # Guard clause: no need to look at the description if the name already fails
    if not contains_whole_word(word_regex, name_text):
        return False
    return contains_whole_word(word_regex, desc_text)

def name_has_aggregates(name_text):
    """
    Return True if the product name matches any regex in AGGREGATES_KEYWORDS
    ('aggregate(s)', 'rubble', 'fill'), ignoring case.
    """
    return any(
        re.search(keyword, name_text, re.IGNORECASE)
        for keyword in AGGREGATES_KEYWORDS
    )

# ------------------------------------------------------------------------------
# 3) Main Classification
# ------------------------------------------------------------------------------

def classify_product(product_name: str, product_desc: str) -> str:
    """
    Assign a category label to one product; the first rule that fires wins.

    Rules, in evaluation order:
      1) "Cement"   - the word 'cement' occurs in the name AND the description
      2) "Concrete" - any CONCRETE_PATTERNS regex matches name + description
      3) "Mineral-based Building Materials > Aggregates"
                    - any AGGREGATES_KEYWORDS regex matches the name
      4) "Concrete" - the word 'concrete' occurs in the name
      5) "Cement"   - any CEM_PATTERNS regex matches name + description
      6) "Other"    - nothing above matched
    """
    name_lc = product_name.lower()
    desc_lc = product_desc.lower()
    full_text = name_lc + " " + desc_lc

    # Ordered rule table: each entry pairs a lazily evaluated check with the
    # label returned when that check succeeds.
    rules = (
        (lambda: in_both_name_desc(CEMENT_WORD, name_lc, desc_lc), "Cement"),
        (lambda: matches_any(CONCRETE_PATTERNS, full_text), "Concrete"),
        (lambda: name_has_aggregates(name_lc),
         "Mineral-based Building Materials > Aggregates"),
        (lambda: re.search(r"\bconcrete\b", name_lc, re.IGNORECASE), "Concrete"),
        (lambda: matches_any(CEM_PATTERNS, full_text), "Cement"),
    )
    for rule_fires, label in rules:
        if rule_fires():
            return label

    # No rule matched
    return "Other"

# ------------------------------------------------------------------------------
# 4) Bulk Classification for a CSV
# ------------------------------------------------------------------------------

def classify_dataset(
    input_csv: str,
    output_csv: str,
    name_col: str = "Product Name",
    desc_col: str = "Technology Description"
):
    """
    Load 'input_csv', label every row with classify_product, and save the
    table (plus a 'Classification Result' column) to 'output_csv'.

    A column that is absent from the CSV is treated as an empty string for
    every row; cell values are converted with str() before classification.
    """
    frame = pd.read_csv(input_csv)

    # One label per row, in row order
    frame["Classification Result"] = [
        classify_product(
            str(record.get(name_col, "")),
            str(record.get(desc_col, "")),
        )
        for _, record in frame.iterrows()
    ]

    frame.to_csv(output_csv, index=False)
    print(f"Classification completed. Output saved to '{output_csv}'")

# ------------------------------------------------------------------------------
# 5) Example Usage
# ------------------------------------------------------------------------------

if __name__ == "__main__":
    # The input CSV is expected to carry the two columns named below:
    # "Product Name" and "Technology Description".
    classify_dataset(
        name_col="Product Name",
        desc_col="Technology Description",
        input_csv="../../data/pipeline2/sql/filtered_epd_data_lc.csv",
        output_csv="../../data/pipeline2/sql/filtered_epd_data_lc_classified_07.csv",
    )


Classification completed. Output saved to '../../data/pipeline2/sql/filtered_epd_data_lc_classified_07.csv'


## Iteration 8

In [2]:
import pandas as pd
import re

# ------------------------------------------------------------------------------
# 1) Pattern Definitions
# ------------------------------------------------------------------------------

# A) "cement" must appear in both name & desc to force "Cement".
CEMENT_WORD = r"\bcement\b"

# B) Numeric-based Concrete patterns:
#    The c<nn>/<nn> pattern tolerates leading digits so that strings like
#    "32c12/15", "36c30/37", "101c35/45" are matched as well.
CONCRETE_PATTERNS = [
    r"\brck\d+\b",               # e.g. rck30, rck50
    r"\b\d*c\d{1,2}/\d{1,2}\b",  # e.g. c20/25, 32c12/15, 101c35/45
    r"\bxc\d?\b",                # e.g. xc2, xc3
    r"\bxd\d?\b",
    r"\bxs\d?\b",
    r"\bxf\d?\b",
    r"\bxa\d?\b",
    # r"\bs[345]\b",             # e.g. s3, s4, s5 (disabled in this iteration)
    r"\bsf2\b",                  # slump-flow class
    r"\bcls\b"                   # "calcestruzzo" (Italian for "concrete")
]

# C) Aggregates keywords: if found in Product Name => "Aggregates" (unless already concrete/cement)
AGGREGATES_KEYWORDS = [r"\baggregates?\b", r"\brubble\b", r"\bfill\b"]

# D) Patterns indicating "CEM" (for a fallback Cement classification)
CEM_PATTERNS = [
    # Roman-numeral form, e.g. CEM I, CEM II/B-LL, CEM IV/A (P), CEM VI.
    # One to three numeral letters (i / v), an optional a/b/c suffix, and then
    # no further letter. The previous class [i-v] was a character RANGE
    # (i, j, k, ... v), so ordinary words such as "cem plant" or "cem in situ"
    # were mistaken for a cement designation.
    r"\bcem\s?[iv]{1,3}[abc]?(?![a-z])",
    # Arabic-numeral form, e.g. CEM 2/B-LL
    r"\bcem\s?\d+\/?[a-zA-Z]?[\-]?[a-zA-Z]+",
]

# E) Keywords in a product name that point to a steel item or accessory
#    rather than concrete itself. The list may need further refinement.
STEEL_KEYWORDS = [r"\bsteel\b", r"\brebar\b", r"\bmesh\b", r"\bprofile\b", r"\bpipe\b", r"\bpex\b"]

# ------------------------------------------------------------------------------
# 2) Helper Functions
# ------------------------------------------------------------------------------

def contains_whole_word(word_regex, text):
    """
    Case-insensitive search for 'word_regex' in 'text'; True when it is found.

    The pattern is expected to carry its own \\b anchors when a whole-word
    match is wanted; nothing is added to it here.
    """
    return re.search(word_regex, text, re.IGNORECASE) is not None

def matches_any(pattern_list, text):
    """
    True when some regex in 'pattern_list' is found in 'text' (case ignored),
    False when none is, including when the list is empty.
    """
    # Lazy generator: patterns are tried in order and evaluation stops at the first hit
    hits = (re.search(rx, text, re.IGNORECASE) for rx in pattern_list)
    return any(hits)

def in_both_name_desc(word_regex, name_text, desc_text):
    """
    True when 'word_regex' is found in the product name and in the description
    alike; the description is not examined if the name does not match.
    """
    return all(
        contains_whole_word(word_regex, field)
        for field in (name_text, desc_text)
    )

def name_has_aggregates(name_text):
    """
    Check the product name against AGGREGATES_KEYWORDS
    ('aggregate(s)', 'rubble', 'fill'); case is ignored.
    """
    # Same "any pattern, case-insensitive" search that matches_any performs
    return matches_any(AGGREGATES_KEYWORDS, name_text)

def is_steel_accessory(name_lower):
    """
    Return True if the product name matches any regex in STEEL_KEYWORDS.

    The search ignores case, like the other keyword helpers in this cell, so
    the result no longer depends on the caller lower-casing the name first.
    For an already lower-cased name the outcome is the same as before.
    """
    return any(
        re.search(keyword, name_lower, re.IGNORECASE)
        for keyword in STEEL_KEYWORDS
    )

# ------------------------------------------------------------------------------
# 3) Main Classification
# ------------------------------------------------------------------------------

def classify_product(product_name: str, product_desc: str) -> str:
    """
    Priority-based classification; the first matching rule decides the label.

      1) Cement - 'cement' word in both name & desc
      2) Concrete (numeric) - name or desc matches any CONCRETE_PATTERNS entry
      3) Aggregates - name has an aggregates keyword
      4) Word 'concrete' in name:
           - "Other" when the name also has a steel/accessory keyword
           - "Concrete" otherwise
      5) Concrete - desc mentions "prefabricated concrete" or "precast concrete"
      6) Cement (CEM patterns) - name or desc matches any CEM_PATTERNS entry
      7) Other - fallback

    Args:
        product_name: Product name text.
        product_desc: Technology description text.

    Returns:
        One of "Cement", "Concrete",
        "Mineral-based Building Materials > Aggregates" or "Other".
    """
    name_lower = product_name.lower()
    desc_lower = product_desc.lower()
    combined_text = name_lower + " " + desc_lower

    # 1) "cement" in both name & desc => Cement
    if in_both_name_desc(CEMENT_WORD, name_lower, desc_lower):
        return "Cement"

    # 2) Numeric-based Concrete patterns => Concrete
    if matches_any(CONCRETE_PATTERNS, combined_text):
        return "Concrete"

    # 3) Aggregates keywords in name => Aggregates
    if name_has_aggregates(name_lower):
        return "Mineral-based Building Materials > Aggregates"

    # 4) Name has "concrete" => Concrete, unless the name also points to a
    #    steel item/accessory (steel, rebar, mesh, pipe, ...) => Other.
    #    A former separate check for "reinforced concrete" in the name was
    #    placed after this rule and could never run: every such name already
    #    matches \bconcrete\b here. It has been dropped; results are unchanged.
    if re.search(r"\bconcrete\b", name_lower, re.IGNORECASE):
        if is_steel_accessory(name_lower):
            return "Other"  # or "Steel/Reinforcement"
        return "Concrete"

    # 5) Description mentions prefabricated or precast concrete => Concrete
    if re.search(r"\b(?:prefabricated|precast) concrete\b", desc_lower, re.IGNORECASE):
        return "Concrete"

    # 6) "CEM" patterns => Cement
    if matches_any(CEM_PATTERNS, combined_text):
        return "Cement"

    # 7) Default => Other
    return "Other"

# ------------------------------------------------------------------------------
# 4) Bulk Classification for a CSV
# ------------------------------------------------------------------------------

def classify_dataset(
    input_csv: str,
    output_csv: str,
    name_col: str = "Product Name",
    desc_col: str = "Technology Description"
):
    """
    Classify every row of a CSV file and write the labelled table back out.

    Reads 'input_csv', runs classify_product on the str()-converted name and
    description of each row (a column missing from the file counts as ""),
    stores the labels in a 'Classification Result' column and saves the
    result to 'output_csv' without the index.
    """
    table = pd.read_csv(input_csv)

    def _label(record):
        # Pull both text fields from one row and classify them
        name_value = str(record.get(name_col, ""))
        desc_value = str(record.get(desc_col, ""))
        return classify_product(name_value, desc_value)

    table["Classification Result"] = [_label(record) for _, record in table.iterrows()]
    table.to_csv(output_csv, index=False)
    print(f"Classification completed. Output saved to '{output_csv}'")

# ------------------------------------------------------------------------------
# 5) Example Usage
# ------------------------------------------------------------------------------

if __name__ == "__main__":
    # The input CSV is expected to carry the two columns named below:
    # "Product Name" and "Technology Description".
    classify_dataset(
        name_col="Product Name",
        desc_col="Technology Description",
        input_csv="../../data/pipeline2/sql/filtered_epd_data_lc.csv",
        output_csv="../../data/pipeline2/sql/filtered_epd_data_lc_classified_all_01.csv",
    )


Classification completed. Output saved to '../../data/pipeline2/sql/filtered_epd_data_lc_classified_all_01.csv'


# Add RegEx Categories to Testing Dataset

In [2]:
import csv
import json

# Source CSV (it must have a header row) and destination JSON file
input_csv = "../../data/pipeline2/sql/filtered_epd_data_lc_classified_08.csv"
output_json = "../../data/pipeline2/json/100_tech_sum_one_sentence02_regex.json"

# Read the CSV and build one JSON record per data row, numbered from 0
with open(input_csv, mode="r", newline='', encoding="utf-8") as csvfile:
    reader = csv.DictReader(csvfile)
    json_data = [
        {
            "row_index": row_index,
            "Product Name": row["Product Name"],
            "Classification": row["Classification"],
            # The raw "Technology Description" text is stored as the summary;
            # no further processing is applied to it here.
            "Technology Description Summary": row["Technology Description"],
            # CSV column "Classification Result" is exposed as "Classification Suggestion"
            "Classification Suggestion": row["Classification Result"],
        }
        for row_index, row in enumerate(reader)
    ]

# Dump all records as one pretty-printed JSON array, keeping non-ASCII characters
with open(output_json, mode="w", encoding="utf-8") as jsonfile:
    json.dump(json_data, jsonfile, indent=2, ensure_ascii=False)

print(f"JSON data has been written to {output_json}")


JSON data has been written to ../../data/pipeline2/json/100_tech_sum_one_sentence02_regex.json
