## Setup

This model was developed in a Python 3.11.9 environment and requires the following packages:

- spacy
- spacy-model-en_core_web_sm
- pandas
- openpyxl

In [None]:
import pandas as pd
from spacy.matcher import Matcher
import spacy

!pip install openpyxl

## Load and Pre-process Data

In [None]:
# Load the datasets
# Each source file under ./data is read into its own module-level DataFrame.
# The .xlsx workbooks go through pd.read_excel (which needs the openpyxl
# package installed above); the .csv files go through pd.read_csv.
ASPECTS_ELEMENTS_MATERIALS_DF = pd.read_excel('./data/FCBS_Aspects-Elements-Materials_MachineReadable.xlsx')
BUILDUPS_DETAILS_DF = pd.read_excel('./data/FCBS_Build Ups-Details_MachineReadable.xlsx')
SECTORS_DF = pd.read_excel('./data/FCBS_Sectors-Subsectors_MachineReadable.xlsx')
ICE_DB_DF = pd.read_csv('./data/ICE DB_Cleaned.csv')
CLF_EMBODIED_CARBON_DF = pd.read_csv('./data/CLF Embodied Carbon_Cleaned.csv')
RIBA_TARGETS_DF = pd.read_excel('./data/RIBA 2030-Targets_MachineReadable.xlsx')

# Fill missing values
def fill_missing_values(df):
    """Return a copy of ``df`` with missing values in numeric columns set to 0.

    Only numeric columns are filled. Text (object/string) columns keep their
    missing values, because writing the integer 0 into them would turn
    "missing" into the bogus term "0" once the column is converted to
    strings, and would defeat the ``pd.notna`` / ``dropna`` filtering applied
    when the term vocabularies are built from these columns.

    Args:
        df: The DataFrame to clean. It is not modified.

    Returns:
        A new DataFrame with NaN replaced by 0 in every numeric column.
    """
    numeric_columns = df.select_dtypes(include='number').columns
    # A per-column mapping restricts the fill to the numeric columns; an empty
    # mapping simply yields an unchanged copy.
    return df.fillna({column: 0 for column in numeric_columns})

# Run every loaded dataset through fill_missing_values and rebind each
# module-level name to the cleaned result.
(
    ASPECTS_ELEMENTS_MATERIALS_DF,
    BUILDUPS_DETAILS_DF,
    SECTORS_DF,
    ICE_DB_DF,
    CLF_EMBODIED_CARBON_DF,
    RIBA_TARGETS_DF,
) = [
    fill_missing_values(frame)
    for frame in (
        ASPECTS_ELEMENTS_MATERIALS_DF,
        BUILDUPS_DETAILS_DF,
        SECTORS_DF,
        ICE_DB_DF,
        CLF_EMBODIED_CARBON_DF,
        RIBA_TARGETS_DF,
    )
]


## Model Training with Text Featurization

In [24]:
# Load the small English spaCy pipeline used for tokenisation and NER
nlp = spacy.load('en_core_web_sm')

# Material vocabulary: distinct values of the ICE database's 'Material' and
# 'Sub-material' columns, de-duplicated, then stripped and lower-cased.
unique_materials = ICE_DB_DF['Material'].unique()
unique_sub_materials = ICE_DB_DF['Sub-material'].unique()
all_materials = [
    str(name).strip().lower()
    for name in {*unique_materials, *unique_sub_materials}
    if pd.notna(name)
]

# Building-type vocabulary, part 1: types and uses from the CLF dataset
unique_building_types_clf = CLF_EMBODIED_CARBON_DF['Building Type'].unique()
unique_building_uses_clf = CLF_EMBODIED_CARBON_DF['Building Use'].unique()
# Building-type vocabulary, part 2: sectors, typologies and sub-sectors
unique_sectors = SECTORS_DF['Sector'].unique()
unique_building_typologies = SECTORS_DF['Building Typology'].unique()
unique_sub_sectors = SECTORS_DF['Sub-sector'].unique()
# Merge the five sources into one de-duplicated, normalised list
all_building_types = [
    str(label).strip().lower()
    for label in {
        *unique_building_types_clf,
        *unique_building_uses_clf,
        *unique_sectors,
        *unique_building_typologies,
        *unique_sub_sectors,
    }
    if pd.notna(label)
]

# Element build-up vocabulary: for every row, join the non-missing
# 'Building Aspect', 'Element' and 'Material' values with single spaces,
# keep the distinct phrases, then strip and lower-case them.
unique_element_buildups = [
    str(phrase).strip().lower()
    for phrase in (
        ASPECTS_ELEMENTS_MATERIALS_DF[['Building Aspect', 'Element', 'Material']]
        .apply(lambda row: ' '.join(row.dropna().astype(str)), axis=1)
        .unique()
    )
    if pd.notna(phrase)
]




# Create a matcher for custom entities.
# The Matcher is built on the loaded pipeline's vocabulary (nlp.vocab) so that
# it can be applied to the Doc objects produced by `nlp`.
matcher = Matcher(nlp.vocab)

# Function to create patterns from a list of terms
def create_patterns(terms):
    """Build spaCy ``Matcher`` token patterns, one pattern per term.

    In a Matcher pattern every dict describes exactly ONE token, so a
    multi-word term such as "aerated concrete" must become a sequence of
    per-word dicts. A single ``{"LOWER": "aerated concrete"}`` dict can never
    match, because no token contains whitespace.

    Args:
        terms: Iterable of term strings (single- or multi-word).

    Returns:
        A list of patterns. Each pattern is a list of ``{"LOWER": word}``
        dicts, one per whitespace-separated word of the term. Terms that are
        empty or whitespace-only are skipped, since a zero-token pattern is
        not a valid Matcher pattern.

    Note:
        Words are split on whitespace only. Terms containing punctuation that
        spaCy tokenises separately (e.g. hyphens) may still need patterns
        derived from the tokenizer itself.
    """
    # "LOWER" is compared against the lower-cased token text, so the pattern
    # value must itself be lower-case to be able to match.
    word_lists = (term.lower().split() for term in terms)
    return [[{"LOWER": word} for word in words] for words in word_lists if words]

# Turn each vocabulary (materials, building types, element build-ups) into its
# own list of matcher patterns.
material_patterns, building_type_patterns, element_buildup_patterns = (
    create_patterns(vocabulary)
    for vocabulary in (all_materials, all_building_types, unique_element_buildups)
)

# Register each pattern list with the matcher under its entity label.
matcher.add("MATERIAL", material_patterns)
matcher.add("BUILDING_TYPE", building_type_patterns)
matcher.add("ELEMENT_BUILDUP", element_buildup_patterns)




def parse_description(description):
    """Extract structured building information from a free-text description.

    The text is run through the module-level spaCy pipeline ``nlp``. Custom
    terms are found with the module-level ``matcher`` (labels MATERIAL,
    BUILDING_TYPE and ELEMENT_BUILDUP), and spaCy's named entities supply the
    size, the location and fallbacks for building type and material.

    Args:
        description: Free-text description of the building project.

    Returns:
        A dict with the keys:
            'size': number taken from the last parseable QUANTITY entity
                (0 if there is none), converted to square metres when its
                unit is recognised as square feet.
            'materials': lower-cased MATERIAL matches; if the matcher found
                none, the first PRODUCT entity is used instead.
            'building_types': lower-cased BUILDING_TYPE matches; if the
                matcher found none, the first ORG/FAC entity is used instead.
            'element_buildups': lower-cased ELEMENT_BUILDUP matches.
            'location': lower-cased text of the last GPE entity, or
                'unknown'.
    """
    doc = nlp(description)
    size = 0
    size_unit = 'square meters'
    materials = []
    building_types = []
    element_buildups = []
    location = 'unknown'

    # Route every matcher hit to the list that belongs to its label.
    # ELEMENT_BUILDUP patterns are registered on the matcher, so their hits
    # are collected as well instead of being dropped.
    hits_by_label = {
        'MATERIAL': materials,
        'BUILDING_TYPE': building_types,
        'ELEMENT_BUILDUP': element_buildups,
    }
    for match_id, start, end in matcher(doc):
        match_label = nlp.vocab.strings[match_id]  # match ID -> label string
        hits = hits_by_label.get(match_label)
        if hits is not None:
            hits.append(doc[start:end].text.lower())

    # Use spaCy's NER for quantities and locations, and as a fallback for
    # building types/materials the matcher did not capture.
    for ent in doc.ents:
        if ent.label_ == 'QUANTITY':
            number_parts = ent.text.replace(',', '').split()
            try:
                parsed_size = float(number_parts[0])
            except (IndexError, ValueError):
                # The entity does not start with a plain number (e.g.
                # "about 2000 sq ft", "two thousand square metres"); skip it
                # rather than crash the whole parse.
                continue
            size = parsed_size
            size_unit = ' '.join(ent.text.split()[1:])
        elif ent.label_ == 'GPE':  # Geopolitical entity (locations)
            location = ent.text.lower()
        elif ent.label_ in ('ORG', 'FAC'):  # Organizations and facilities
            if not building_types:
                building_types.append(ent.text.lower())
        elif ent.label_ == 'PRODUCT':  # Products (could include materials)
            if not materials:
                materials.append(ent.text.lower())

    # Convert size to square metres if the unit is square feet. The check is
    # case-insensitive and accepts the common spellings.
    square_feet_markers = ('square feet', 'square foot', 'sq ft', 'sq. ft', 'sqft')
    normalised_unit = size_unit.lower()
    if any(marker in normalised_unit for marker in square_feet_markers):
        size *= 0.092903  # square feet -> square metres

    return {
        'size': size,
        'materials': materials,
        'building_types': building_types,
        'element_buildups': element_buildups,
        'location': location,
    }

# Example usage: parse one sample project description and print the dict
# returned by parse_description.
description = "I am constructing a school building in New York that is 2000 square metres, primarily made of timber and steel."
parsed_info = parse_description(description)
print(parsed_info)


{'size': 2000.0, 'materials': ['timber', 'steel'], 'building_types': [], 'element_buildups': [], 'location': 'new york'}
