## Setup

The model was developed in a Python 3.10.4 environment. The following packages are required:

- spacy
- spacy-model-en_core_web_lg
- scikit-learn (imported as `sklearn`)
- pandas
- openpyxl

In [None]:
import pandas as pd

from spacy.matcher import Matcher
import spacy

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

!pip install openpyxl

## Load and Pre-process Data

In [None]:
# Load the datasets, grouped by file format.

# Excel workbooks
ASPECTS_ELEMENTS_MATERIALS_DF, BUILDUPS_DETAILS_DF, SECTORS_DF, RIBA_TARGETS_DF = (
    pd.read_excel(workbook_path)
    for workbook_path in (
        './data/FCBS_Aspects-Elements-Materials_MachineReadable.xlsx',
        './data/FCBS_Build Ups-Details_MachineReadable.xlsx',
        './data/FCBS_Sectors-Subsectors_MachineReadable.xlsx',
        './data/RIBA 2030-Targets_MachineReadable.xlsx',
    )
)

# CSV exports
ICE_DB_DF, CLF_EMBODIED_CARBON_DF = (
    pd.read_csv(csv_path)
    for csv_path in (
        './data/ICE DB_Cleaned.csv',
        './data/CLF Embodied Carbon_Cleaned.csv',
    )
)

# Fill missing values
def fill_missing_values(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` with missing values in numeric columns set to 0.

    Non-numeric columns (e.g. material or building-type names) are left
    untouched. Filling them with the number 0 would turn every empty text cell
    into the bogus term ``"0"`` once the column is converted to strings, and
    would make any later ``pd.notna`` filtering ineffective.

    Args:
        df: The frame to clean. It is not modified.

    Returns:
        A new DataFrame with the same columns; NaN in numeric columns is
        replaced by 0, missing values elsewhere are preserved.
    """
    numeric_columns = df.select_dtypes(include='number').columns
    # A per-column fill mapping restricts the replacement to numeric columns.
    return df.fillna({column: 0 for column in numeric_columns})

# Run every loaded dataset through the missing-value fill, rebinding each name
# to the cleaned frame.
ASPECTS_ELEMENTS_MATERIALS_DF = ASPECTS_ELEMENTS_MATERIALS_DF.pipe(fill_missing_values)
BUILDUPS_DETAILS_DF = BUILDUPS_DETAILS_DF.pipe(fill_missing_values)
SECTORS_DF = SECTORS_DF.pipe(fill_missing_values)
ICE_DB_DF = ICE_DB_DF.pipe(fill_missing_values)
CLF_EMBODIED_CARBON_DF = CLF_EMBODIED_CARBON_DF.pipe(fill_missing_values)
RIBA_TARGETS_DF = RIBA_TARGETS_DF.pipe(fill_missing_values)


## Model Training with Text Featurization

In [None]:
# Use spacy to create patterns for identifying materials, building types, and element buildups

# Load spaCy model
nlp = spacy.load('en_core_web_lg')


def _normalise_terms(values):
    """Turn raw column values into a sorted list of unique, lower-cased terms.

    Missing entries are skipped, whether they are still NaN or were replaced by
    a numeric 0 placeholder during pre-processing. Values that are blank after
    stripping are skipped as well. Without this, the string "0" (or an empty
    string) would end up in the vocabulary used for matching.

    Args:
        values: Iterable of scalar cell values (strings, numbers, NaN).

    Returns:
        Sorted list of distinct, stripped, lower-cased strings.
    """
    terms = set()
    for value in values:
        if pd.isna(value) or value == 0:
            continue
        term = str(value).strip().lower()
        if term:
            terms.add(term)
    # Sorting gives a stable order across runs (plain set order is arbitrary).
    return sorted(terms)


# Extract unique materials, building types, and element buildups
unique_materials = ICE_DB_DF['Material'].unique()
unique_sub_materials = ICE_DB_DF['Sub-material'].unique()
all_materials = _normalise_terms([*unique_materials, *unique_sub_materials])

unique_building_types_clf = CLF_EMBODIED_CARBON_DF['Building Type'].unique()
unique_building_uses_clf = CLF_EMBODIED_CARBON_DF['Building Use'].unique()
unique_sectors = SECTORS_DF['Sector'].unique()
unique_building_typologies = SECTORS_DF['Building Typology'].unique()
unique_sub_sectors = SECTORS_DF['Sub-sector'].unique()
all_building_types = _normalise_terms([
    *unique_building_types_clf,
    *unique_building_uses_clf,
    *unique_sectors,
    *unique_building_typologies,
    *unique_sub_sectors,
])

unique_building_aspects = ASPECTS_ELEMENTS_MATERIALS_DF['Building Aspect'].unique()
unique_elements = ASPECTS_ELEMENTS_MATERIALS_DF['Element'].unique()
unique_element_materials = ASPECTS_ELEMENTS_MATERIALS_DF['Material'].unique()
all_element_buildups = _normalise_terms([
    *unique_building_aspects,
    *unique_elements,
    *unique_element_materials,
])

# Create a matcher for custom entities.
# It shares the loaded pipeline's vocabulary; the MATERIAL, BUILDING_TYPE and
# ELEMENT_BUILDUP pattern sets are registered on it further down in this cell.
matcher = Matcher(nlp.vocab)

# Function to create patterns from a list of terms
def create_patterns(terms):
    """Build spaCy ``Matcher`` token patterns from a list of vocabulary terms.

    Each term becomes one pattern holding one ``{"LOWER": word}`` spec per
    whitespace-separated word, so multi-word terms such as "pile foundations"
    match two consecutive tokens. A single spec containing the whole phrase
    can never match, because a token does not contain spaces.

    Blank terms are skipped: they would yield a zero-token pattern, which the
    Matcher rejects.

    NOTE(review): words are split on whitespace only. Terms containing
    punctuation that spaCy tokenises separately (hyphens, slashes, commas) may
    still fail to match; tokenising terms with the pipeline's tokenizer would
    cover those.

    Args:
        terms: Iterable of term strings.

    Returns:
        List of patterns, each a list of ``{"LOWER": word}`` dicts.
    """
    patterns = []
    for term in terms:
        # LOWER compares against the lower-cased token text, so lower-case here.
        words = str(term).lower().split()
        if words:
            patterns.append([{"LOWER": word} for word in words])
    return patterns

# Build one pattern list per vocabulary: materials, building types, element buildups.
material_patterns, building_type_patterns, element_buildup_patterns = (
    create_patterns(vocabulary)
    for vocabulary in (all_materials, all_building_types, all_element_buildups)
)

# Register each pattern list on the matcher under its entity label.
for _label, _label_patterns in (
    ("MATERIAL", material_patterns),
    ("BUILDING_TYPE", building_type_patterns),
    ("ELEMENT_BUILDUP", element_buildup_patterns),
):
    matcher.add(_label, _label_patterns)


In [None]:
#parse input description

# Substrings that mark a quantity's unit as an area (as opposed to e.g. "mm").
_AREA_UNIT_HINTS = ('square', 'sq', 'm2', 'm²', 'ft2', 'ft²')
# Spellings of "square feet" that trigger conversion to square metres.
_SQUARE_FEET_MARKERS = ('square feet', 'square foot', 'sq ft', 'sq. ft', 'sqft', 'ft2', 'ft²')
# Square metres per square foot.
_SQ_FT_TO_SQ_M = 0.092903


def _parse_quantity(text):
    """Split a quantity string such as "2,000 sq ft" into ``(value, unit)``.

    Returns ``None`` when the text contains no digits (e.g. a spelled-out
    number), so the caller can skip it rather than crash.
    """
    import re  # local import: `re` is not among the notebook's top-level imports

    try:
        # Common case: the first word is the number, the rest is the unit.
        value = float(text.replace(',', '').split()[0])
        return value, ' '.join(text.split()[1:])
    except (ValueError, IndexError):
        pass

    # Fallback for text like "about 2000 square metres" or "2000m2".
    cleaned = text.replace(',', '')
    number = re.search(r'\d+(?:\.\d+)?', cleaned)
    if number is None:
        return None
    return float(number.group()), ' '.join(cleaned[number.end():].split())


def parse_description(description):
    """Extract building features from a free-text project description.

    Runs the spaCy pipeline and the custom matcher over ``description`` and
    collects:

    * matcher hits labelled MATERIAL / BUILDING_TYPE / ELEMENT_BUILDUP;
    * the size from QUANTITY entities. Quantities with an area-like unit take
      priority over other quantities, so a later "200 mm" does not overwrite
      the floor area. Within the same priority the last one wins;
    * the location from GPE entities (the last one wins);
    * as a fallback, ORG / FAC / PRODUCT entities whose vector similarity to
      any known building type or material exceeds 0.75. This is only used
      while the corresponding list is still empty.

    Args:
        description: Free-text description of the building.

    Returns:
        dict with keys ``size`` (0 if none was found; converted to square
        metres when the unit is square feet, otherwise left as given),
        ``materials``, ``building_types``, ``element_buildups`` (lists of
        lower-cased strings) and ``location`` (lower-cased string, 'unknown'
        if none was found).
    """
    doc = nlp(description)
    size = 0
    size_unit = 'square meters'
    size_is_area = False
    materials = []
    building_types = []
    element_buildups = []
    location = 'unknown'

    buckets_by_label = {
        'MATERIAL': materials,
        'BUILDING_TYPE': building_types,
        'ELEMENT_BUILDUP': element_buildups,
    }
    for match_id, start, end in matcher(doc):
        bucket = buckets_by_label.get(nlp.vocab.strings[match_id])
        if bucket is not None:
            bucket.append(doc[start:end].text.lower())

    for ent in doc.ents:
        if ent.label_ == 'QUANTITY':
            parsed = _parse_quantity(ent.text)
            if parsed is None:
                # No usable number (e.g. "two thousand square metres"): skip it.
                continue
            value, unit = parsed
            unit_lower = unit.lower()
            is_area = any(hint in unit_lower for hint in _AREA_UNIT_HINTS)
            # Never let a non-area quantity replace an area already found.
            if is_area or not size_is_area:
                size, size_unit = value, unit
                size_is_area = size_is_area or is_area
        elif ent.label_ == 'GPE':
            location = ent.text.lower()
        elif ent.label_ in ('ORG', 'FAC', 'PRODUCT'):
            if building_types and materials:
                continue
            token_text = ent.text.lower()
            # Parse the entity text once instead of once per vocabulary term.
            # NOTE(review): each vocabulary term is still re-parsed on every
            # call; pre-computing those docs once would speed this up further.
            ent_doc = nlp(token_text)
            if not building_types and any(
                ent_doc.similarity(nlp(b_type)) > 0.75 for b_type in all_building_types
            ):
                building_types.append(token_text)
            if not materials and any(
                ent_doc.similarity(nlp(m)) > 0.75 for m in all_materials
            ):
                materials.append(token_text)

    final_unit = size_unit.lower()
    if any(marker in final_unit for marker in _SQUARE_FEET_MARKERS):
        size *= _SQ_FT_TO_SQ_M

    return {
        'size': size,
        'materials': materials,
        'building_types': building_types,
        'element_buildups': element_buildups,
        'location': location,
    }

# Example usage: parse a sample description and print the extracted feature dict
# (keys: size, materials, building_types, element_buildups, location).
description = "I am constructing a school building in New York that is 2000 square metres, primarily made of timber and steel. It will have pile foundations"
parsed_info = parse_description(description)
print(parsed_info)


## Model Training

In [None]:
# Example: Assume you have a dataframe `df` with features and target variable `carbon_impact`
# df = pd.DataFrame(parsed_features)
# X = df.drop(columns=['carbon_impact'])
# y = df['carbon_impact']
# FIXME: `X` and `y` are only defined in the commented-out lines above, so this
# cell raises NameError on a fresh kernel until a feature table is built and
# those lines are enabled.

# Split the data (fixed seed so the split is reproducible)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the model. The forest is seeded as well; otherwise the fitted trees,
# and therefore the reported error, change from run to run.
model = RandomForestRegressor(random_state=42)
model.fit(X_train, y_train)

# Evaluate the model on the held-out 20%
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print(f'Mean Squared Error: {mse}')