# Data preprocessing script - CLF

Author: [Andreas Sørensen](https://www.linkedin.com/in/a-soerensen) - Source: https://doi.org/10.5281/zenodo.5895051

In [28]:
# SETUP
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import time

In [29]:
# set data frame printing settings
pd.set_option("display.max_rows", None, "display.max_columns", None)

In [None]:
print("Pre-processing CLF data...")

# Import data

In [30]:
# IMPORT

#File name/path
filename = '00_data/0_data_input_raw/CLF_data_input_raw.csv'

#Load csv
df_CLF = pd.read_csv(filename, delimiter = ',')

# Clean data

In [31]:
#Delete columns without header and data
df_CLF = df_CLF.loc[:,~df_CLF.columns.str.match("Unnamed")]

#Drop all rows which have a missing value (NaN)
df_CLF.dropna(how='any',inplace=True)

# Cull irrelevant data

Remove all data outside Europe

In [32]:
df_CLF = df_CLF.loc[df_CLF['BLDG_LOC_REGION']=='Europe']

# Rename directly transferable data

Description: Rename column headers of data columns that are ready to be exported so they fit with Data Collection Template

In [33]:
#Translate "LCA_REFPERIOD" to "lca_RSP"

#Rename column header
df_CLF.rename(columns = {"LCA_REFPERIOD":"lca_RSP"}, inplace = True)

In [34]:
#Translate "BLDG_LOC_REGION" to "site_country"

#Rename column header
df_CLF.rename(columns = {"BLDG_LOC_REGION":"site_country"}, inplace = True)

# Translate and regroup/rename data

Description: Translate data entries that need to be rephrased to fit the categories and formats of the Data Collection Template. Rename column headers so they fit with Data Collection Template.

In [35]:
#Translate "BLDG_TYP" to "bldg_use_type"

try:
    #Display unique entries in "usage_principal" which need to be replaced
    #print(df_CLF.BLDG_TYP.unique())

    #Replace entries with correct category label
    df_CLF.BLDG_TYP.replace('Commercial','Non-residential',inplace=True)
    
    #Rename column header
    df_CLF.rename(columns = {"BLDG_TYP":"bldg_use_type"}, inplace = True)

except:
    print('usage_principal already processed')

['Commercial' 'Residential']


In [36]:
#Translate "BLDG_US" to "bldg_use_subtype"

try:
    #Display unique entries in "usage_principal" which need to be replaced
    #print(df_CLF.BLDG_US.unique())

    #Replace entries with correct category label
    df_CLF.BLDG_US.replace('Multi-family','Multi-family house',inplace=True)
    df_CLF.BLDG_US.replace('Lodging','Other',inplace=True)
    df_CLF.BLDG_US.replace('Public Assembly','Other',inplace=True)
    
    #Rename column header
    df_CLF.rename(columns = {"BLDG_US":"bldg_use_subtype"}, inplace = True)

except:
    print('usage_principal already processed')

['Office' 'Mixed use' 'Lodging' 'Multi-family' 'Public Assembly' 'Other']


In [37]:
#Translate "BLDG_NEW_REN" to "bldg_project_status"

try:
    #Display unique entries in "usage_principal" which need to be replaced
    #print(df_CLF.BLDG_NEW_REN.unique())

    #Replace entries with correct category label
    df_CLF.BLDG_NEW_REN.replace('New','New Built',inplace=True)
    df_CLF.BLDG_NEW_REN.replace('nan','NaN',inplace=True)
    
    #Rename column header
    df_CLF.rename(columns = {"BLDG_NEW_REN":"bldg_project_status"}, inplace = True)

except:
    print('usage_principal already processed')

['New']


In [38]:
#Translate "$BLDG_AREA_M2" to "bldg_area_interval"

df_CLF['BLDG_AREA_M2'] = df_CLF['$BLDG_AREA_M2']

try:
    #Display unique entries in "usage_principal" which need to be replaced
    #print(df_CLF.BLDG_AREA_M2.unique())

    #Replace entries with correct category label
    df_CLF.BLDG_AREA_M2.replace('46452 to 92903','45001-95000',inplace=True)
    df_CLF.BLDG_AREA_M2.replace('Over 92903','>90001',inplace=True)
    df_CLF.BLDG_AREA_M2.replace('9291 to 18580','5001-20000',inplace=True)
    df_CLF.BLDG_AREA_M2.replace('18581 to 46451','15001-50000',inplace=True)
    df_CLF.BLDG_AREA_M2.replace('4646 to 9290','0-10000',inplace=True)
    df_CLF.BLDG_AREA_M2.replace('2324 to 4645','0-5000',inplace=True)
    
    #Rename column header
    df_CLF.rename(columns = {"BLDG_AREA_M2":"bldg_area_interval"}, inplace = True)

except:
    print('usage_principal already processed')

['46452 to 92903' 'Over 92903' '9291 to 18580' '18581 to 46451'
 '4646 to 9290' '2324 to 4645']


In [39]:
#Translate "$BLDG_STOR_A" to "bldg_floors_ag"

df_CLF['BLDG_STOR_A'] = df_CLF['$BLDG_STOR_A']

try:
    #Display unique entries in "usage_principal" which need to be replaced
    #print(df_CLF.BLDG_STOR_A.unique())

    #Replace entries with correct category label
    df_CLF.BLDG_STOR_A.replace('1 to 6','1-6',inplace=True)
    df_CLF.BLDG_STOR_A.replace('7 to 14','7-14',inplace=True)
    df_CLF.BLDG_STOR_A.replace('15 to 25','15-25',inplace=True)
    df_CLF.BLDG_STOR_A.replace('More than 25','>25',inplace=True)
    
    #Rename column header
    df_CLF.rename(columns = {"BLDG_STOR_A":"bldg_floors_ag"}, inplace = True)

except:
    print('usage_principal already processed')

['More than 25' '7 to 14' '15 to 25' '1 to 6']


# Transform and derive data through inference

In [40]:
#Transform LCA_STAGES into viable columns of yes/no

df_CLF["scope_LCS_A"] = ["Yes" if i == "A" or "AB" or "ABC" or "ABCD" else "No" for i in df_CLF["LCA_STAGES"]]
#scope_LCS_A123 is "Yes" if LCA_STAGES contains A or AB, else it is "No"
df_CLF["scope_LCS_A123"] = ["Yes" if i == "A" or "AB" or "ABC" or "ABCD" else "No" for i in df_CLF["LCA_STAGES"]]
#Note that there is no differenciation between A1-3 and A1-5 in the CLF dataset, thus we can only be certain A1-3 is included
#We therefore put no data in A4 and A5 (the same applies for module B)
df_CLF["scope_LCS_A4"] = 'No data'
df_CLF["scope_LCS_A5"] = 'No data'

df_CLF["scope_LCS_B"] = ["Yes" if i == "B" or "AB" or "ABC" or "ABCD" else "No" for i in df_CLF["LCA_STAGES"]]
df_CLF["scope_LCS_B1"] = 'No data'
df_CLF["scope_LCS_B2"] = 'No data'
df_CLF["scope_LCS_B3"] = 'No data'
df_CLF["scope_LCS_B4"] = 'No data'
df_CLF["scope_LCS_B5"] = 'No data'
df_CLF["scope_LCS_B6"] = 'No data'
df_CLF["scope_LCS_B7"] = 'No data'
df_CLF["scope_LCS_B8"] = 'No data'

df_CLF["scope_LCS_C"] = 'No'
df_CLF["scope_LCS_C1"] = 'No'
df_CLF["scope_LCS_C2"] = 'No'
df_CLF["scope_LCS_C3"] = 'No'
df_CLF["scope_LCS_C4"] = 'No'

df_CLF["scope_LCS_D"] = 'No'
df_CLF["scope_handling_D"] = 'not in scope'

In [41]:
#Transform LCA_BLDG_SCOPE into viable columns of yes/no

#scope_parts_1_ground is "Yes" if LCA_BLDG_SCOPE contains F, else it is "No"
df_CLF["scope_parts_1_ground"] = ["Yes" if i == "F" or "SF" else "No" for i in df_CLF["LCA_BLDG_SCOPE"]]
#scope_parts_2_structure is "Yes" if LCA_BLDG_SCOPE contains S, else it is "No"
df_CLF["scope_parts_2_structure"] = ["Yes" if i == "S" or "SF" else "No" for i in df_CLF["LCA_BLDG_SCOPE"]]
#The remaining parts are out of scope
df_CLF["scope_parts_3_secondary"] = 'No'
df_CLF["scope_parts_4_finishes"] = 'No'
df_CLF["scope_parts_5_mechanical"] = 'No'
df_CLF["scope_parts_6_electrical"] = 'No'
df_CLF["scope_parts_6+_renewables"] = 'No'
df_CLF["scope_parts_7_facilities"] = 'No'
df_CLF["scope_parts_8_fittings"] = 'No'
df_CLF["scope_parts_9_external"] = 'No'

In [42]:
#Transform EC_LCAA_PERM2 into GHG_A123_m2a

#Need to divide EC_LCAA_PERM2 (kg CO2/m2) by the reference study period (year) to get the correct number/unit (kg CO2/m2/year)
df_CLF['GHG_A123_m2a'] = df_CLF['EC_LCAA_PERM2']/df_CLF['lca_RSP']

In [43]:
#Transform EC_WB_EX_OPER into GHG_sum_em

#Unit is in ton CO2 must change to kg CO2
df_CLF['GHG_sum_em'] = df_CLF['EC_WB_EX_OPER']*1000

In [44]:
#Infer admin_project_code
df_CLF['admin_project_code'] = range(1,1+len(df_CLF))
df_CLF['admin_project_code'] = 'CLF'+df_CLF['admin_project_code'].astype(str)

#Infer admin_project_contact
df_CLF['admin_project_contact'] = 'CLF'

# Add empty data columns

In [45]:
#Add empty columns
#(i.e. add columns from Data Collection sheet that aren't represented in CSTB for completeness)

#Time data (maybe we could infer this using time of the conduction of the study? -> date_etude_rsenv)
df_CLF['bldg_year_permit'] = 'n/a'
df_CLF['bldg_year_complete'] = 'n/a'
df_CLF['bldg_year_complete_interval'] = 'No data'

#Project data status (Could be available in dataset in "phase" and "phase_exacte" but there are two and can't discerne between them)
df_CLF['bldg_QTO_type'] = 'n/a'

#Building area definition
df_CLF['bldg_area_definition'] = 'n/a'

#Gross floor area and heated floor area
df_CLF['bldg_area_gfa'] = 'n/a'
df_CLF['bldg_area_hfa'] = 'n/a'

#Building users
df_CLF['bldg_users_total'] = 'n/a'

#Floors below ground
df_CLF['bldg_floors_bg'] = 'n/a'

#Structure type
df_CLF['bldg_struct_type'] = 'n/a'

#Roof type
df_CLF['bldg_roof_type'] = 'n/a'

df_CLF['bldg_floors_bg'] = 'n/a'
df_CLF['bldg_floors_bg'] = 'n/a'

#Energy performance class (could be infered using the available energy class)
df_CLF['bldg_energy_class_general'] = 'n/a'

#Energy class according to country
df_CLF['bldg_energy_class_country'] = 'n/a'

#Sustainability certification (Niveau Carbone? Is it a certification scheme?)
df_CLF['bldg_certification'] = 'n/a'

#Energy consumption
df_CLF['inv_energy_consumption'] = 'n/a'

#Total mass of the building
df_CLF['inv_mat_mass_total'] = 'n/a'

#Top 5 most used materials
df_CLF['inv_mat_1_type'] = 'n/a'
df_CLF['inv_mat_1_mass'] = 'n/a'
df_CLF['inv_mat_2_type'] = 'n/a'
df_CLF['inv_mat_2_mass'] = 'n/a'
df_CLF['inv_mat_3_type'] = 'n/a'
df_CLF['inv_mat_3_mass'] = 'n/a'
df_CLF['inv_mat_4_type'] = 'n/a'
df_CLF['inv_mat_4_mass'] = 'n/a'
df_CLF['inv_mat_5_type'] = 'n/a'
df_CLF['inv_mat_5_mass'] = 'n/a'

#LCA software (could be acquired from enquiry with data providers or infered from "RSEnv version")
df_CLF['lca_software'] = 'n/a'

#LCA database (could be acquired from enquiry with data providers or infered from "RSEnv version")
df_CLF['lca_database'] = 'n/a'

#Future decarbonisation scenarios considered
df_CLF['lca_scenarios_decarbonisation'] = 'No data'

#Building parts included
#df_CLF['scope_parts_1_ground'] = 'No data'
#df_CLF['scope_parts_2_structure'] = 'No data'
#df_CLF['scope_parts_3_secondary'] = 'No data'
#df_CLF['scope_parts_4_finishes'] = 'No data'
#df_CLF['scope_parts_5_mechanical'] = 'No data'
#df_CLF['scope_parts_6_electrical'] = 'No data'
#df_CLF['scope_parts_6+_renewables'] = 'No data'
#df_CLF['scope_parts_7_facilities'] = 'No data'
#df_CLF['scope_parts_8_fittings'] = 'No data'
#df_CLF['scope_parts_9_external'] = 'No data'

#Life cycle stages considered in the study
#df_CLF['scope_LCS_A123'] = 'No data'
#df_CLF['scope_LCS_A4'] = 'No data'
#df_CLF['scope_LCS_A5'] = 'No data'
#df_CLF['scope_LCS_B1'] = 'No data'
#df_CLF['scope_LCS_B2'] = 'No data'
#df_CLF['scope_LCS_B3'] = 'No data'
#df_CLF['scope_LCS_B4'] = 'No data'
#df_CLF['scope_LCS_B5'] = 'No data'
#df_CLF['scope_LCS_B6'] = 'No data'
#df_CLF['scope_LCS_B7'] = 'No data'
#df_CLF['scope_LCS_B8'] = 'No data'
#df_CLF['scope_LCS_C1'] = 'No data'
#df_CLF['scope_LCS_C2'] = 'No data'
#df_CLF['scope_LCS_C3'] = 'No data'
#df_CLF['scope_LCS_C4'] = 'No data'
#df_CLF['scope_LCS_D'] = 'No data'
#df_CLF['scope_handling_D'] = 'No data'



#Results totals
#df_CSTB['GHG_sum_em'] = 'n/a'
df_CLF['GHG_sum_op'] = 'n/a'
df_CLF['GHG_sum_em_m2a'] = 'n/a'
df_CLF['GHG_sum_op_m2a'] = 'n/a'

#Results individual modules [kgCO2]
df_CLF['GHG_A1'] = 'n/a'
df_CLF['GHG_A2'] = 'n/a'
df_CLF['GHG_A3'] = 'n/a'
df_CLF['GHG_A4'] = 'n/a'
df_CLF['GHG_A5'] = 'n/a'
df_CLF['GHG_B1'] = 'n/a'
df_CLF['GHG_B2'] = 'n/a'
df_CLF['GHG_B3'] = 'n/a'
df_CLF['GHG_B4'] = 'n/a'
df_CLF['GHG_B5'] = 'n/a'
df_CLF['GHG_B6'] = 'n/a'
df_CLF['GHG_B7'] = 'n/a'
df_CLF['GHG_C1'] = 'n/a'
df_CLF['GHG_C2'] = 'n/a'
df_CLF['GHG_C3'] = 'n/a'
df_CLF['GHG_C4'] = 'n/a'
df_CLF['GHG_D'] = 'n/a'

#Results individual modules [kgCO2/m2/y]
df_CLF['GHG_A1_m2a'] = 'n/a'
df_CLF['GHG_A2_m2a'] = 'n/a'
df_CLF['GHG_A3_m2a'] = 'n/a'
df_CLF['GHG_A4_m2a'] = 'n/a'
df_CLF['GHG_A5_m2a'] = 'n/a'
df_CLF['GHG_B1_m2a'] = 'n/a'
df_CLF['GHG_B2_m2a'] = 'n/a'
df_CLF['GHG_B3_m2a'] = 'n/a'
df_CLF['GHG_B4_m2a'] = 'n/a'
df_CLF['GHG_B5_m2a'] = 'n/a'
df_CLF['GHG_B6_m2a'] = 'n/a'
df_CLF['GHG_B7_m2a'] = 'n/a'
df_CLF['GHG_C1_m2a'] = 'n/a'
df_CLF['GHG_C2_m2a'] = 'n/a'
df_CLF['GHG_C3_m2a'] = 'n/a'
df_CLF['GHG_C4_m2a'] = 'n/a'
df_CLF['GHG_D_m2a'] = 'n/a'

#Results aggregated modules [kgCO2]
df_CLF["GHG_A123"] = 'n/a'
df_CLF["GHG_A45"] = 'n/a'
df_CLF["GHG_A12345"] = 'n/a'
df_CLF["GHG_B1234"] = 'n/a'
df_CLF["GHG_B12345"] = 'n/a'
df_CLF["GHG_B67"] = 'n/a'
df_CLF["GHG_B1234567"] = 'n/a'
df_CLF["GHG_C12"] = 'n/a'
df_CLF["GHG_C34"] = 'n/a'
df_CLF["GHG_C1234"] = 'n/a'
df_CLF["GHG_C34_D"] = 'n/a'
df_CLF["GHG_C1234_D"] = 'n/a'

#Results aggregated modules [kgCO2/m2/y]
df_CLF["GHG_A123_m2a"] = 'n/a'
df_CLF["GHG_A45_m2a"] = 'n/a'
df_CLF["GHG_A12345_m2a"] = 'n/a'
df_CLF["GHG_B1234_m2a"] = 'n/a'
df_CLF["GHG_B12345_m2a"] = 'n/a'
df_CLF["GHG_B67_m2a"] = 'n/a'
df_CLF["GHG_B1234567_m2a"] = 'n/a'
df_CLF["GHG_C12_m2a"] = 'n/a'
df_CLF["GHG_C34_m2a"] = 'n/a'
df_CLF["GHG_C1234_m2a"] = 'n/a'
df_CLF["GHG_C34_D_m2a"] = 'n/a'
df_CLF["GHG_C1234_D_m2a"] = 'n/a'

#Results building parts
#1 Ground
df_CLF["GHG_P1_sum_m2a"] = 'n/a'
df_CLF["GHG_P1_A123_m2a"] = 'n/a'
df_CLF["GHG_P1_A45_m2a"] = 'n/a'
df_CLF["GHG_P1_B1234_m2a"] = 'n/a'
df_CLF["GHG_P1_B5_m2a"] = 'n/a'
df_CLF["GHG_P1_C12_m2a"] = 'n/a'
df_CLF["GHG_P1_C34_m2a"] = 'n/a'
df_CLF["GHG_P1_D_m2a"] = 'n/a'

#2 Structure
df_CLF["GHG_P2_sum_m2a"] = 'n/a'
df_CLF["GHG_P2_A123_m2a"] = 'n/a'
df_CLF["GHG_P2_A45_m2a"] = 'n/a'
df_CLF["GHG_P2_B1234_m2a"] = 'n/a'
df_CLF["GHG_P2_B5_m2a"] = 'n/a'
df_CLF["GHG_P2_C12_m2a"] = 'n/a'
df_CLF["GHG_P2_C34_m2a"] = 'n/a'
df_CLF["GHG_P2_D_m2a"] = 'n/a'

#3-4 Envelope
df_CLF["GHG_P34_sum_m2a"] = 'n/a'
df_CLF["GHG_P34_A123_m2a"] = 'n/a'
df_CLF["GHG_P34_A45_m2a"] = 'n/a'
df_CLF["GHG_P34_B1234_m2a"] = 'n/a'
df_CLF["GHG_P34_B5_m2a"] = 'n/a'
df_CLF["GHG_P34_C12_m2a"] = 'n/a'
df_CLF["GHG_P34_C34_m2a"] = 'n/a'
df_CLF["GHG_P34_D_m2a"] = 'n/a'

#4 Internal
df_CLF["GHG_P4_sum_m2a"] = 'n/a'
df_CLF["GHG_P4_A123_m2a"] = 'n/a'
df_CLF["GHG_P4_A45_m2a"] = 'n/a'
df_CLF["GHG_P4_B1234_m2a"] = 'n/a'
df_CLF["GHG_P4_B5_m2a"] = 'n/a'
df_CLF["GHG_P4_C12_m2a"] = 'n/a'
df_CLF["GHG_P4_C34_m2a"] = 'n/a'
df_CLF["GHG_P4_D_m2a"] = 'n/a'

#5-6 Services
df_CLF["GHG_P56_sum_m2a"] = 'n/a'
df_CLF["GHG_P56_A123_m2a"] = 'n/a'
df_CLF["GHG_P56_A45_m2a"] = 'n/a'
df_CLF["GHG_P56_B1234_m2a"] = 'n/a'
df_CLF["GHG_P56_B5_m2a"] = 'n/a'
df_CLF["GHG_P56_C12_m2a"] = 'n/a'
df_CLF["GHG_P56_C34_m2a"] = 'n/a'
df_CLF["GHG_P56_D_m2a"] = 'n/a'

#7-8 Apppliances
df_CLF["GHG_P78_sum_m2a"] = 'n/a'
df_CLF["GHG_P78_A123_m2a"] = 'n/a'
df_CLF["GHG_P78_A45_m2a"] = 'n/a'
df_CLF["GHG_P78_B1234_m2a"] = 'n/a'
df_CLF["GHG_P78_B5_m2a"] = 'n/a'
df_CLF["GHG_P78_C12_m2a"] = 'n/a'
df_CLF["GHG_P78_C34_m2a"] = 'n/a'
df_CLF["GHG_P78_D_m2a"] = 'n/a'

# Create pre-processed dataframe

In [46]:
#Reorder columns in CLF dataframe using double brackets (columns that aren't called are dropped):
df_CLF_processed = df_CLF[[
    'admin_project_code',
    'admin_project_contact',
    'bldg_use_type',
    'bldg_use_subtype',
    'bldg_project_status',
    'site_country',
    'bldg_year_permit',
    'bldg_year_complete',
    'bldg_year_complete_interval',
    'bldg_QTO_type',
    'bldg_area_definition',
    'bldg_area_gfa','bldg_area_hfa',
    'bldg_area_interval',
    'bldg_users_total',
    'bldg_floors_ag',
    'bldg_floors_bg',
    'bldg_struct_type',
    'bldg_roof_type',
    'bldg_energy_class_general',
    'bldg_energy_class_country',
    'bldg_certification',
    
    'inv_energy_consumption',
    'inv_mat_mass_total',
    'inv_mat_1_type',
    'inv_mat_1_mass',
    'inv_mat_2_type',
    'inv_mat_2_mass',
    'inv_mat_3_type',
    'inv_mat_3_mass',
    'inv_mat_4_type',
    'inv_mat_4_mass',
    'inv_mat_5_type',
    'inv_mat_5_mass',
    
    'lca_RSP',
    'lca_software',
    'lca_database',
    'lca_scenarios_decarbonisation',
    'scope_parts_1_ground',
    'scope_parts_2_structure',
    'scope_parts_3_secondary',
    'scope_parts_4_finishes',
    'scope_parts_5_mechanical',
    'scope_parts_6_electrical',
    'scope_parts_6+_renewables',
    'scope_parts_7_facilities',
    'scope_parts_8_fittings',
    'scope_LCS_A123',
    'scope_LCS_A4',
    'scope_LCS_A5',
    'scope_LCS_B1',
    'scope_LCS_B2',
    'scope_LCS_B3',
    'scope_LCS_B4',
    'scope_LCS_B5',
    'scope_LCS_B6',
    'scope_LCS_B7',
    'scope_LCS_B8',
    'scope_LCS_C1',
    'scope_LCS_C2',
    'scope_LCS_C3',
    'scope_LCS_C4',
    'scope_LCS_D',
    'scope_handling_D',
    
    'GHG_sum_em',
    'GHG_sum_op',
    'GHG_sum_em_m2a',
    'GHG_sum_op_m2a',
    
    'GHG_A1',
    'GHG_A2',
    'GHG_A3',
    'GHG_A4',
    'GHG_A5',
    'GHG_B1',
    'GHG_B2',
    'GHG_B3',
    'GHG_B4',
    'GHG_B5',
    'GHG_B6',
    'GHG_B7',
    'GHG_C1',
    'GHG_C2',
    'GHG_C3',
    'GHG_C4',
    'GHG_D',
    
    'GHG_A1_m2a',
    'GHG_A2_m2a',
    'GHG_A3_m2a',
    'GHG_A4_m2a',
    'GHG_A5_m2a',
    'GHG_B1_m2a',
    'GHG_B2_m2a',
    'GHG_B3_m2a',
    'GHG_B4_m2a',
    'GHG_B5_m2a',
    'GHG_B6_m2a',
    'GHG_B7_m2a',
    'GHG_C1_m2a',
    'GHG_C2_m2a',
    'GHG_C3_m2a',
    'GHG_C4_m2a',
    'GHG_D_m2a',
    
    'GHG_A123',
    'GHG_A45',
    'GHG_A12345',
    'GHG_B1234',
    'GHG_B12345',
    'GHG_B67',
    'GHG_B1234567',
    'GHG_C12',
    'GHG_C34',
    'GHG_C1234',
    'GHG_C34_D',
    'GHG_C1234_D',  
    
    'GHG_A123_m2a',
    'GHG_A45_m2a',
    'GHG_A12345_m2a',
    'GHG_B1234_m2a',
    'GHG_B12345_m2a',
    'GHG_B67_m2a',
    'GHG_B1234567_m2a',
    'GHG_C12_m2a',
    'GHG_C34_m2a',
    'GHG_C1234_m2a',
    'GHG_C34_D_m2a',
    'GHG_C1234_D_m2a',
    
    'GHG_P1_sum_m2a',
    'GHG_P1_A123_m2a',
    'GHG_P1_A45_m2a',
    'GHG_P1_B1234_m2a',
    'GHG_P1_B5_m2a',
    'GHG_P1_C12_m2a',
    'GHG_P1_C34_m2a',
    'GHG_P1_D_m2a',
    'GHG_P2_sum_m2a',
    'GHG_P2_A123_m2a',
    'GHG_P2_A45_m2a',
    'GHG_P2_B1234_m2a',
    'GHG_P2_B5_m2a',
    'GHG_P2_C12_m2a',
    'GHG_P2_C34_m2a',
    'GHG_P2_D_m2a',
    'GHG_P34_sum_m2a',
    'GHG_P34_A123_m2a',
    'GHG_P34_A45_m2a',
    'GHG_P34_B1234_m2a',
    'GHG_P34_B5_m2a',
    'GHG_P34_C12_m2a',
    'GHG_P34_C34_m2a',
    'GHG_P34_D_m2a',
    'GHG_P4_sum_m2a',
    'GHG_P4_A123_m2a',
    'GHG_P4_A45_m2a',
    'GHG_P4_B1234_m2a',
    'GHG_P4_B5_m2a',
    'GHG_P4_C12_m2a',
    'GHG_P4_C34_m2a',
    'GHG_P4_D_m2a',
    'GHG_P56_sum_m2a',
    'GHG_P56_A123_m2a',
    'GHG_P56_A45_m2a',
    'GHG_P56_B1234_m2a',
    'GHG_P56_B5_m2a',
    'GHG_P56_C12_m2a',
    'GHG_P56_C34_m2a',
    'GHG_P56_D_m2a',
    'GHG_P78_sum_m2a',
    'GHG_P78_A123_m2a',
    'GHG_P78_A45_m2a',
    'GHG_P78_B1234_m2a',
    'GHG_P78_B5_m2a',
    'GHG_P78_C12_m2a',
    'GHG_P78_C34_m2a',
    'GHG_P78_D_m2a']]

In [47]:
df_CLF_processed

Unnamed: 0,admin_project_code,admin_project_contact,bldg_use_type,bldg_use_subtype,bldg_project_status,site_country,bldg_year_permit,bldg_year_complete,bldg_year_complete_interval,bldg_QTO_type,bldg_area_definition,bldg_area_gfa,bldg_area_hfa,bldg_area_interval,bldg_users_total,bldg_floors_ag,bldg_floors_bg,bldg_struct_type,bldg_roof_type,bldg_energy_class_general,bldg_energy_class_country,bldg_certification,inv_energy_consumption,inv_mat_mass_total,inv_mat_1_type,inv_mat_1_mass,inv_mat_2_type,inv_mat_2_mass,inv_mat_3_type,inv_mat_3_mass,inv_mat_4_type,inv_mat_4_mass,inv_mat_5_type,inv_mat_5_mass,lca_RSP,lca_software,lca_database,lca_scenarios_decarbonisation,scope_parts_1_ground,scope_parts_2_structure,scope_parts_3_secondary,scope_parts_4_finishes,scope_parts_5_mechanical,scope_parts_6_electrical,scope_parts_6+_renewables,scope_parts_7_facilities,scope_parts_8_fittings,scope_LCS_A123,scope_LCS_A4,scope_LCS_A5,scope_LCS_B1,scope_LCS_B2,scope_LCS_B3,scope_LCS_B4,scope_LCS_B5,scope_LCS_B6,scope_LCS_B7,scope_LCS_B8,scope_LCS_C1,scope_LCS_C2,scope_LCS_C3,scope_LCS_C4,scope_LCS_D,scope_handling_D,GHG_sum_em,GHG_sum_op,GHG_sum_em_m2a,GHG_sum_op_m2a,GHG_A1,GHG_A2,GHG_A3,GHG_A4,GHG_A5,GHG_B1,GHG_B2,GHG_B3,GHG_B4,GHG_B5,GHG_B6,GHG_B7,GHG_C1,GHG_C2,GHG_C3,GHG_C4,GHG_D,GHG_A1_m2a,GHG_A2_m2a,GHG_A3_m2a,GHG_A4_m2a,GHG_A5_m2a,GHG_B1_m2a,GHG_B2_m2a,GHG_B3_m2a,GHG_B4_m2a,GHG_B5_m2a,GHG_B6_m2a,GHG_B7_m2a,GHG_C1_m2a,GHG_C2_m2a,GHG_C3_m2a,GHG_C4_m2a,GHG_D_m2a,GHG_A123,GHG_A45,GHG_A12345,GHG_B1234,GHG_B12345,GHG_B67,GHG_B1234567,GHG_C12,GHG_C34,GHG_C1234,GHG_C34_D,GHG_C1234_D,GHG_A123_m2a,GHG_A45_m2a,GHG_A12345_m2a,GHG_B1234_m2a,GHG_B12345_m2a,GHG_B67_m2a,GHG_B1234567_m2a,GHG_C12_m2a,GHG_C34_m2a,GHG_C1234_m2a,GHG_C34_D_m2a,GHG_C1234_D_m2a,GHG_P1_sum_m2a,GHG_P1_A123_m2a,GHG_P1_A45_m2a,GHG_P1_B1234_m2a,GHG_P1_B5_m2a,GHG_P1_C12_m2a,GHG_P1_C34_m2a,GHG_P1_D_m2a,GHG_P2_sum_m2a,GHG_P2_A123_m2a,GHG_P2_A45_m2a,GHG_P2_B1234_m2a,GHG_P2_B5_m2a,GHG_P2_C12_m2a,GHG_P2_C34_m2a,GHG_P2_D_m2a,GHG_P34_sum_m2a,GHG_P34_A123_m2a,GHG_P34_A45_m2a,GHG_P34_B1234_m2a,GHG_P34_B5_m2a,GHG_P34_C12_m2a,GHG_P34_C34_m2a,GHG_P34_D_m2a,GHG_P4_sum_m2a,GHG_P4_A123_m2a,GHG_P4_A45_m2a,GHG_P4_B1234_m2a,GHG_P4_B5_m2a,GHG_P4_C12_m2a,GHG_P4_C34_m2a,GHG_P4_D_m2a,GHG_P56_sum_m2a,GHG_P56_A123_m2a,GHG_P56_A45_m2a,GHG_P56_B1234_m2a,GHG_P56_B5_m2a,GHG_P56_C12_m2a,GHG_P56_C34_m2a,GHG_P56_D_m2a,GHG_P78_sum_m2a,GHG_P78_A123_m2a,GHG_P78_A45_m2a,GHG_P78_B1234_m2a,GHG_P78_B5_m2a,GHG_P78_C12_m2a,GHG_P78_C34_m2a,GHG_P78_D_m2a
2,CLF1,CLF,Non-residential,Office,New Built,Europe,,,No data,,,,,45001-95000,,>25,,,,,,,,,,,,,,,,,,,40.0,,,No data,Yes,Yes,No,No,No,No,No,No,No,Yes,No data,No data,No data,No data,No data,No data,No data,No data,No data,No data,No,No,No,No,No,not in scope,1303390.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
24,CLF2,CLF,Non-residential,Mixed use,New Built,Europe,,,No data,,,,,>90001,,>25,,,,,,,,,,,,,,,,,,,40.0,,,No data,Yes,Yes,No,No,No,No,No,No,No,Yes,No data,No data,No data,No data,No data,No data,No data,No data,No data,No data,No,No,No,No,No,not in scope,726440.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
37,CLF3,CLF,Non-residential,Office,New Built,Europe,,,No data,,,,,5001-20000,,7-14,,,,,,,,,,,,,,,,,,,40.0,,,No data,Yes,Yes,No,No,No,No,No,No,No,Yes,No data,No data,No data,No data,No data,No data,No data,No data,No data,No data,No,No,No,No,No,not in scope,713190.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
51,CLF4,CLF,Non-residential,Office,New Built,Europe,,,No data,,,,,45001-95000,,>25,,,,,,,,,,,,,,,,,,,40.0,,,No data,Yes,Yes,No,No,No,No,No,No,No,Yes,No data,No data,No data,No data,No data,No data,No data,No data,No data,No data,No,No,No,No,No,not in scope,649630.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
55,CLF5,CLF,Non-residential,Mixed use,New Built,Europe,,,No data,,,,,45001-95000,,>25,,,,,,,,,,,,,,,,,,,40.0,,,No data,Yes,Yes,No,No,No,No,No,No,No,Yes,No data,No data,No data,No data,No data,No data,No data,No data,No data,No data,No,No,No,No,No,not in scope,653810.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
84,CLF6,CLF,Non-residential,Office,New Built,Europe,,,No data,,,,,5001-20000,,7-14,,,,,,,,,,,,,,,,,,,40.0,,,No data,Yes,Yes,No,No,No,No,No,No,No,Yes,No data,No data,No data,No data,No data,No data,No data,No data,No data,No data,No,No,No,No,No,not in scope,504790.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
100,CLF7,CLF,Non-residential,Mixed use,New Built,Europe,,,No data,,,,,15001-50000,,7-14,,,,,,,,,,,,,,,,,,,40.0,,,No data,Yes,Yes,No,No,No,No,No,No,No,Yes,No data,No data,No data,No data,No data,No data,No data,No data,No data,No data,No,No,No,No,No,not in scope,521480.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
115,CLF8,CLF,Non-residential,Office,New Built,Europe,,,No data,,,,,15001-50000,,7-14,,,,,,,,,,,,,,,,,,,40.0,,,No data,Yes,Yes,No,No,No,No,No,No,No,Yes,No data,No data,No data,No data,No data,No data,No data,No data,No data,No data,No,No,No,No,No,not in scope,582730.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
127,CLF9,CLF,Non-residential,Office,New Built,Europe,,,No data,,,,,5001-20000,,7-14,,,,,,,,,,,,,,,,,,,40.0,,,No data,Yes,Yes,No,No,No,No,No,No,No,Yes,No data,No data,No data,No data,No data,No data,No data,No data,No data,No data,No,No,No,No,No,not in scope,440550.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
132,CLF10,CLF,Non-residential,Other,New Built,Europe,,,No data,,,,,45001-95000,,>25,,,,,,,,,,,,,,,,,,,40.0,,,No data,Yes,Yes,No,No,No,No,No,No,No,Yes,No data,No data,No data,No data,No data,No data,No data,No data,No data,No data,No,No,No,No,No,not in scope,418300.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [48]:
#Check data types before export
df_CLF_processed.dtypes

admin_project_code                object
admin_project_contact             object
bldg_use_type                     object
bldg_use_subtype                  object
bldg_project_status               object
site_country                      object
bldg_year_permit                  object
bldg_year_complete                object
bldg_year_complete_interval       object
bldg_QTO_type                     object
bldg_area_definition              object
bldg_area_gfa                     object
bldg_area_hfa                     object
bldg_area_interval                object
bldg_users_total                  object
bldg_floors_ag                    object
bldg_floors_bg                    object
bldg_struct_type                  object
bldg_roof_type                    object
bldg_energy_class_general         object
bldg_energy_class_country         object
bldg_certification                object
inv_energy_consumption            object
inv_mat_mass_total                object
inv_mat_1_type  

# Export

In [50]:
#Create CSV
filename = '00_data/1_data_pre_processed/CLF_processed.csv'
df_CLF_processed.to_csv(filename, index=False)

In [None]:
print("CLF data pre-processed.")