# Building a Cleaned ('Flipped' & Filtered) Dataset

In [None]:
# Importing libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
from scipy.stats import chi2_contingency
import re

Note: the following code block now reads in combined_clean.csv instead of pca_section_1&4... so that all property types are taken into account 

TODO: rename the remaining "retail" names (in comments, headings, and commented-out cells) to "combined" for clarity.

In [None]:
# Load the full long-format dataset (one row per project_id / document_spot pair)
combined = pd.read_csv("combined_clean.csv")

# Show the loaded frame
combined

Unnamed: 0,project_id,document_spot,the_data,property_type
0,623951,553b_othrsysdesc,Emergency lighting was typically provided by c...,Retail
1,621112,pca_reserve_yn_18a,yes,Retail
2,623951,pca_reserve_yn_35a,no,Retail
3,441727,551_1a_pca,no,Retail
4,314749,441_identify_party_who,an outside vendor,Retail
...,...,...,...,...
1556727,610122,1o_summary_primroof1,"Flat, mechanically-fastened, single-ply thermo...",Industrial
1556728,610122,1s_summary_domrep_pipe,Copper,Industrial
1556729,602151,01w_rm_title,Principal,Industrial
1556730,610122,1y_summary_ada_parking,Two,Industrial


### Specifying relevant columns from the raw dataset

In [None]:
# Keep only the columns needed downstream: project_id, document_spot, the_data and property_type
keep_columns = ['project_id', 'document_spot', 'the_data', 'property_type']
combined_simple = combined.loc[:, keep_columns].copy()

# Number of distinct values per retained column
combined_simple.nunique()

project_id         7508
document_spot       737
the_data         157939
property_type         4
dtype: int64

### Specifying valid "document_spots" (i.e. "categories") from the raw dataset

In [None]:
# Document spots (i.e. "categories") that feed the flipped dataset
valid_document_spots = [
    '1f_summary_gross_building',
    '01e_project_zip',
    '42a_vertconst',
    '42b_horzconst',
    '1d_summary_floor_num',
    '1z_summary_tenant_spaces',
    'pca_01_eff_age',
    '1c_summary_building_num',
    'roof_framing_42',
]

# Select the rows of combined_simple whose document_spot is one of the valid spots
spot_is_valid = combined_simple['document_spot'].isin(valid_document_spots)
filtered_rows = combined_simple.loc[spot_is_valid]

# Only replace combined_simple when something matched; otherwise leave it untouched and say so
if filtered_rows.empty:
    print("No rows found matching the specified criteria.")
else:
    combined_simple = filtered_rows.copy()

combined_simple

Unnamed: 0,project_id,document_spot,the_data,property_type
13,451165,42b_horzconst,"[""No upper floors present""]",Retail
26,350106,42b_horzconst,"[""Wood-framing with wood decking""]",Retail
45,384179,42a_vertconst,"[""Steel-framed""]",Retail
48,402102,01e_project_zip,06902,Retail
71,417214,1d_summary_floor_num,1,Retail
...,...,...,...,...
1556673,610115,01e_project_zip,30040,Industrial
1556679,609627,42a_vertconst,"[""Masonry wythe bearing walls, steel columns, ...",Industrial
1556703,610136,01e_project_zip,75220,Industrial
1556725,610131,1z_summary_tenant_spaces,1,Industrial


## Creating the "flipped" dataset

Essentially taking the raw dataset, dropping irrelevant categories, and flipping orientation such that every row is a building, and every column is one of its attributes. 

At the same time, we populate some custom binary variables based on whether certain phrases are found in the raw dataset (e.g. "has_wood_frame", a new variable which returns whether the phrase "wood..." + "...frame" is found in the raw dataset at the_data).

These extra variables serve to make correlation calculations less tedious later on.

In [None]:
# Creating flipped dataset: one row per project_id, one column per extracted attribute
flipped_columns = ['project_id', 'property_type', 'sqft', 'zip', 'has_wood_frame', 'has_vert_wood_frame', 'has_horz_wood_frame', 'wf_vertconst', 'wf_horzconst', 'has_concrete', 'has_glu', 'num_floors', 'num_tenants', 'age']

# First standalone integer in the text. A comma-grouped number such as "12,808" is taken
# as a whole; matching only \d+ would stop at the comma and yield 12.
_SQFT_PATTERN = re.compile(r'\b\d{1,3}(?:,\d{3})+\b|\b\d+\b')
# First standalone run of exactly five digits.
_ZIP_PATTERN = re.compile(r'\b\d{5}\b')


def _parse_sqft(raw_value):
    """Return the first integer found in raw_value (thousands separators allowed), or None."""
    match = _SQFT_PATTERN.search(str(raw_value))
    if match is None:
        return None
    return int(match.group(0).replace(',', ''))


def _parse_zip(raw_value):
    """Return the first standalone 5-digit string found in raw_value, or None."""
    match = _ZIP_PATTERN.search(str(raw_value))
    return match.group(0) if match else None


def _summarize_project(project_id, project_rows):
    """Collapse the long-format rows of one project into a single attribute record.

    Parameters
    ----------
    project_id : scalar
        Identifier shared by every row in ``project_rows``.
    project_rows : pandas.DataFrame
        Rows of ``combined_simple`` for this project; the ``document_spot``,
        ``the_data`` and ``property_type`` columns are read.

    Returns
    -------
    dict
        One value per entry of ``flipped_columns``. When a document_spot occurs
        more than once for the project, the last occurrence wins for the
        value-type attributes; the binary ``has_*`` flags are 1 if any relevant
        row matched.
    """
    sqft = None
    zip_code = None
    wood_frame_present = False
    wood_frame_present_vert = False
    wood_frame_present_horz = False
    concrete_present = False
    glu_present = False
    wf_vertconst = None
    wf_horzconst = None
    num_floors = None
    num_tenants = None
    age = None

    for spot, data in zip(project_rows['document_spot'], project_rows['the_data']):
        text = str(data).lower()

        if spot == '1f_summary_gross_building':
            # Gross building area -> sqft (rows without a number leave the previous value)
            parsed_sqft = _parse_sqft(data)
            if parsed_sqft is not None:
                sqft = parsed_sqft

        elif spot == '01e_project_zip':
            parsed_zip = _parse_zip(data)
            if parsed_zip is not None:
                zip_code = parsed_zip

        elif spot in ('42a_vertconst', '42b_horzconst'):
            mentions_wood = 'wood' in text

            # Overall wood-frame flag needs both "wood" and "fram" in the same entry
            wood_frame_present = wood_frame_present or (mentions_wood and 'fram' in text)
            concrete_present = concrete_present or 'concrete' in text or 'cmu' in text
            glu_present = glu_present or 'glu' in text

            # 5/4/24: the vert/horz-specific flags no longer require the "fram" phrase
            if spot == '42a_vertconst':
                wood_frame_present_vert = wood_frame_present_vert or mentions_wood
                wf_vertconst = data
            else:
                wood_frame_present_horz = wood_frame_present_horz or mentions_wood
                wf_horzconst = data

        elif spot == '1d_summary_floor_num':
            # Number of floors (raw text; not yet standardized)
            num_floors = data

        elif spot == '1z_summary_tenant_spaces':
            # Number of tenant spaces (raw text)
            num_tenants = data

        elif spot == 'pca_01_eff_age':
            # Effective age (raw text)
            age = data

        elif spot == 'roof_framing_42':
            # Roof framing entries are also accepted as evidence of glu-lam
            glu_present = glu_present or 'glu' in text

    return {
        'project_id': project_id,
        # Property type is taken from the project's first row
        'property_type': project_rows['property_type'].iloc[0],
        'sqft': sqft,
        'zip': zip_code,
        'has_wood_frame': int(wood_frame_present),
        'has_vert_wood_frame': int(wood_frame_present_vert),
        'has_horz_wood_frame': int(wood_frame_present_horz),
        'wf_vertconst': wf_vertconst,
        'wf_horzconst': wf_horzconst,
        'has_concrete': int(concrete_present),
        'has_glu': int(glu_present),
        'num_floors': num_floors,
        'num_tenants': num_tenants,
        'age': age,
    }


# Group once instead of re-filtering the whole frame per project; sort=False keeps
# projects in order of first appearance, matching iteration over .unique().
project_records = [
    _summarize_project(project_id, project_rows)
    for project_id, project_rows in combined_simple.groupby('project_id', sort=False)
]

# Build the frame in one go rather than appending row by row. dtype=object keeps the
# extracted values un-coerced (e.g. sqft stays integer even when some projects have None).
combined_flipped = pd.DataFrame(project_records, columns=flipped_columns, dtype=object)

combined_flipped

Unnamed: 0,project_id,property_type,sqft,zip,has_wood_frame,has_vert_wood_frame,has_horz_wood_frame,wf_vertconst,wf_horzconst,has_concrete,has_glu,num_floors,num_tenants,age
0,451165,Retail,12808,46123,0,0,0,"[""Conventional steel framing, isolated CMU, st...","[""No upper floors present""]",1,0,One,4,
1,350106,Retail,10958,43055,1,0,1,"[""Concrete masonry unit load bearing walls wit...","[""Wood-framing with wood decking""]",1,0,1,1,
2,384179,Retail,13,46360,0,0,0,"[""Steel-framed""]","[""Open-web steel joists with steel decking""]",0,0,1,one,
3,402102,Retail,82,06902,0,0,0,"[""Steel-framed""]","[""Steel-framed with steel decking and concrete...",1,0,1,One,
4,417214,Retail,11204,44062,0,0,0,"[""CMU load bearing walls, steel columns, steel...","[""No upper floors present""]",1,0,1,1,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7503,618913,Office,59710,27517,0,0,0,"[""Concrete masonry unit load bearing walls wit...","[""Steel-framed with steel decking"", ""Steel-fra...",1,0,Three,10,
7504,615825,Office,5650,89523,0,0,0,"[""Concrete tilt-up perimeter walls""]","[""No upper floors present""]",1,0,One,3,
7505,618912,Office,59178,27517,0,0,0,"[""Concrete masonry unit load bearing walls wit...","[""Steel-framed with steel decking"", ""Steel-fra...",1,0,Three,12,36
7506,558027,Industrial,10500,95370,0,0,0,"[""Pre-engineered steel structure""]","[""No upper floors present""]",0,0,One,0,


Initial observations about the variable extraction:

- has_glu is mostly 0

- num_floors has a very inconsistent response convention (will standardize later before calculations)

- age is mostly null - only ~650 buildings had pca_01_eff_age filled out at all 

### Filter out null values in vert and horz construction

This ensures that a missing value in "h/v construction" doesn't erroneously result in a has_wood_frame of 0. 

In [None]:
# Remove buildings missing either the vertical or the horizontal construction description
construction_columns = ['wf_vertconst', 'wf_horzconst']
combined_flipped = combined_flipped.dropna(subset=construction_columns, how='any')

### Filter out properties that have more than one building

Commented out as combined_clean.csv comes with multi-bldg properties already dropped.

In [None]:
# # Store all properties with only one building
# single_building_projects = set()

# for _, row in retail.iterrows():
#     project_id = row['project_id']
#     document_spot = row['document_spot']
#     the_data = row['the_data']

#     # Check if the document_spot is "1c_summary_building_num" (number of bldgs)
#     if document_spot == "1c_summary_building_num":
#         building_num_match = re.search(r'\b(\d+)\b', str(the_data))
#         if building_num_match:
#             building_num = int(building_num_match.group(0))
#             if building_num == 1:
#                 single_building_projects.add(project_id)

# # Update retail_flipped to only have single building properties
# retail_flipped = retail_flipped[retail_flipped['project_id'].isin(single_building_projects)]

# ## check
# #  retail_flipped

### Adding "roof framing" as a column to retail_flipped

Commented out as it doesn't run with the new combined_clean.csv approach. Not urgent as roof framing was only used for glu_lam, which we are no longer using. 

In [None]:
# roof_framing = {}

# for _, row in retail.iterrows():
#     project_id = row['project_id']
#     document_spot = row['document_spot']
#     the_data = row['the_data']

#     if document_spot == "roof_framing_42":
#         roof_framing[project_id] = str(the_data)

# # Add new "roof_framing" column to retail_flipped
# retail_flipped['roof_framing'] = retail_flipped['project_id'].map(roof_framing)

# # Finally update the "has_glu" column based on the "roof_framing" column
# retail_flipped['has_glu'] = retail_flipped['has_glu'].astype(int) | (retail_flipped['roof_framing'].str.contains('glu', case=False, na=False)).astype(int)

# ## 
# retail_flipped

When roof framing was added to the flipped dataset (the cell above, now commented out), there was just one instance of glu-lamination within roof_framing.

### Adding "region" attribute to the newly flipped dataset

In [None]:
# Load the zip -> city/state lookup used to derive each building's region.
# zipcode is read as str so it is compared as text rather than parsed as a number.
zip_to_state = pd.read_csv("zip_to_state.csv", dtype={'zipcode': str})

# Preview the first rows of the lookup
zip_to_state.head()

Unnamed: 0,zipcode,city,state
0,601,Adjuntas,PR
1,602,Aguada,PR
2,603,Aguadilla,PR
3,606,Maricao,PR
4,610,Anasco,PR


In [None]:
# Pull out the rows of the full dataset that hold each project's zip code
is_zip_row = combined['document_spot'].eq('01e_project_zip')
zip_code = combined.loc[is_zip_row].copy()

zip_code.head(10)

Unnamed: 0,project_id,document_spot,the_data,property_type
48,402102,01e_project_zip,6902,Retail
106,409030,01e_project_zip,33624,Retail
199,376933,01e_project_zip,92672,Retail
312,507879,01e_project_zip,78207,Retail
412,579418,01e_project_zip,54745,Retail
494,579415,01e_project_zip,53910,Retail
594,323502,01e_project_zip,55025,Retail
2050,456831,01e_project_zip,75061,Retail
2056,341563,01e_project_zip,23224,Retail
2068,351547,01e_project_zip,85295,Retail


In [None]:
# Based on this article: https://www.ackerheatcool.com/how-hvac-systems-differ-by-region/#:~:text=The%20local%20climate%20plays%20the,others%20across%20the%20United%20States.
# Region sources: https://www.chino.k12.ca.us/cms/lib/CA01902308/Centricity/domain/50/ngss%20resources%20for%20documents/ngss%20tablecloth/3rd%20grade/Grade%203%20US%20Regions%20and%20weather.pdf

# Converting states -> region
northeast = ['ME', 'NH', 'VT', 'MA', 'RI', 'CT', 'NY', 'NJ', 'PA', 'DE', 'MD']
midwest = ['IL', 'IN', 'MI', 'OH', 'WI', 'MN', 'IA', 'MO', 'ND', 'SD', 'NE', 'KS']
southeast = ['VA', 'WV', 'NC', 'SC', 'GA', 'FL', 'KY', 'TN', 'AL', 'MS', 'AR', 'LA']
southwest = ['AZ', 'NM', 'OK', 'TX']
# AK and HI are listed explicitly so they keep the 'W' label they previously
# received through the catch-all fallback.
west = ['WA', 'OR', 'MT', 'ID', 'WY', 'CO', 'UT', 'NV', 'CA', 'AK', 'HI']

# Flat state -> region-label lookup built from the lists above
_REGION_BY_STATE = {
    state: label
    for label, states in (
        ('NE', northeast),
        ('MW', midwest),
        ('SE', southeast),
        ('SW', southwest),
        ('W', west),
    )
    for state in states
}


# Function for region conversion
def regions(x):
    """Map a two-letter state abbreviation to its region label.

    Parameters
    ----------
    x : str or missing value
        State abbreviation; surrounding whitespace and letter case are ignored.

    Returns
    -------
    str or None
        'NE', 'MW', 'SE', 'SW' or 'W' for a listed state. None when ``x`` is not
        a string (e.g. NaN from an unmatched zip code) or is a code not present
        in any list (e.g. 'PR', 'DC'), so unknown locations are no longer
        silently labelled as West.
    """
    if not isinstance(x, str):
        return None
    return _REGION_BY_STATE.get(x.strip().upper())

In [None]:
# Attach city/state to every zip row (left join keeps rows whose zip has no match),
# then translate the state into its region label
region_df = (
    zip_code
    .merge(zip_to_state, how='left', left_on='the_data', right_on='zipcode')
    .assign(region=lambda merged: merged['state'].apply(regions))
)

region_df

Unnamed: 0,project_id,document_spot,the_data,property_type,zipcode,city,state,region
0,402102,01e_project_zip,06902,Retail,06902,Stamford,CT,NE
1,409030,01e_project_zip,33624,Retail,33624,Tampa,FL,SE
2,376933,01e_project_zip,92672,Retail,92672,San Clemente,CA,W
3,507879,01e_project_zip,78207,Retail,78207,San Antonio,TX,SW
4,579418,01e_project_zip,54745,Retail,54745,Holcombe,WI,MW
...,...,...,...,...,...,...,...,...
7503,610116,01e_project_zip,30096,Industrial,30096,Duluth,GA,SE
7504,610122,01e_project_zip,30084,Industrial,30084,Tucker,GA,SE
7505,608965,01e_project_zip,60540,Industrial,60540,Naperville,IL,MW
7506,610115,01e_project_zip,30040,Industrial,30040,Cumming,GA,SE


In [None]:
# Adding region as attribute to flipped df
# Only project_id and region are brought over: merging all of region_df would add an
# overlapping property_type column (suffixed _x/_y) plus unused zip/city/state columns.
# Keeping one region row per project guarantees the left join cannot duplicate a building.
project_regions = region_df[['project_id', 'region']].drop_duplicates(subset='project_id')
combined_flipped = pd.merge(combined_flipped, project_regions, on='project_id', how='left')

In [None]:
# Display the flipped frame as it stands after the region merge
combined_flipped

Unnamed: 0,project_id,property_type_x,sqft,zip,has_wood_frame,has_vert_wood_frame,has_horz_wood_frame,wf_vertconst,wf_horzconst,has_concrete,...,num_floors,num_tenants,age,document_spot,the_data,property_type_y,zipcode,city,state,region
0,451165,Retail,12808,46123,0,0,0,"[""Conventional steel framing, isolated CMU, st...","[""No upper floors present""]",1,...,One,4,,01e_project_zip,46123,Retail,46123,Avon,IN,MW
1,350106,Retail,10958,43055,1,0,1,"[""Concrete masonry unit load bearing walls wit...","[""Wood-framing with wood decking""]",1,...,1,1,,01e_project_zip,43055,Retail,43055,Newark,OH,MW
2,384179,Retail,13,46360,0,0,0,"[""Steel-framed""]","[""Open-web steel joists with steel decking""]",0,...,1,one,,01e_project_zip,46360,Retail,46360,Michigan City,IN,MW
3,402102,Retail,82,06902,0,0,0,"[""Steel-framed""]","[""Steel-framed with steel decking and concrete...",1,...,1,One,,01e_project_zip,06902,Retail,06902,Stamford,CT,NE
4,417214,Retail,11204,44062,0,0,0,"[""CMU load bearing walls, steel columns, steel...","[""No upper floors present""]",1,...,1,1,,01e_project_zip,44062,Retail,44062,Middlefield,OH,MW
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5970,618913,Office,59710,27517,0,0,0,"[""Concrete masonry unit load bearing walls wit...","[""Steel-framed with steel decking"", ""Steel-fra...",1,...,Three,10,,01e_project_zip,27517,Office,27517,Chapel Hill,NC,SE
5971,615825,Office,5650,89523,0,0,0,"[""Concrete tilt-up perimeter walls""]","[""No upper floors present""]",1,...,One,3,,01e_project_zip,89523,Office,89523,Reno,NV,W
5972,618912,Office,59178,27517,0,0,0,"[""Concrete masonry unit load bearing walls wit...","[""Steel-framed with steel decking"", ""Steel-fra...",1,...,Three,12,36,01e_project_zip,27517,Office,27517,Chapel Hill,NC,SE
5973,558027,Industrial,10500,95370,0,0,0,"[""Pre-engineered steel structure""]","[""No upper floors present""]",0,...,One,0,,01e_project_zip,95370,Industrial,95370,Sonora,CA,W


In [None]:
# Final column layout for the flipped df (region sits right after zip)
final_columns = [
    'project_id', 'property_type', 'sqft', 'zip', 'region',
    'has_wood_frame', 'has_vert_wood_frame', 'has_horz_wood_frame',
    'wf_vertconst', 'wf_horzconst', 'has_concrete', 'has_glu',
    'num_floors', 'num_tenants', 'age',
]

# Restore the property_type name if the merge suffixed it, then keep only the final columns
combined_flipped = (
    combined_flipped
    .rename(columns={'property_type_x': 'property_type'})
    .loc[:, final_columns]
)

combined_flipped

Unnamed: 0,project_id,property_type,sqft,zip,region,has_wood_frame,has_vert_wood_frame,has_horz_wood_frame,wf_vertconst,wf_horzconst,has_concrete,has_glu,num_floors,num_tenants,age
0,451165,Retail,12808,46123,MW,0,0,0,"[""Conventional steel framing, isolated CMU, st...","[""No upper floors present""]",1,0,One,4,
1,350106,Retail,10958,43055,MW,1,0,1,"[""Concrete masonry unit load bearing walls wit...","[""Wood-framing with wood decking""]",1,0,1,1,
2,384179,Retail,13,46360,MW,0,0,0,"[""Steel-framed""]","[""Open-web steel joists with steel decking""]",0,0,1,one,
3,402102,Retail,82,06902,NE,0,0,0,"[""Steel-framed""]","[""Steel-framed with steel decking and concrete...",1,0,1,One,
4,417214,Retail,11204,44062,MW,0,0,0,"[""CMU load bearing walls, steel columns, steel...","[""No upper floors present""]",1,0,1,1,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5970,618913,Office,59710,27517,SE,0,0,0,"[""Concrete masonry unit load bearing walls wit...","[""Steel-framed with steel decking"", ""Steel-fra...",1,0,Three,10,
5971,615825,Office,5650,89523,W,0,0,0,"[""Concrete tilt-up perimeter walls""]","[""No upper floors present""]",1,0,One,3,
5972,618912,Office,59178,27517,SE,0,0,0,"[""Concrete masonry unit load bearing walls wit...","[""Steel-framed with steel decking"", ""Steel-fra...",1,0,Three,12,36
5973,558027,Industrial,10500,95370,W,0,0,0,"[""Pre-engineered steel structure""]","[""No upper floors present""]",0,0,One,0,


### Filtering combined_flipped down to avoid NaNs and unrealistic values

In [None]:
# Keep only buildings with sqft of at least 10; anything smaller is likely a mistake/processing error
has_plausible_sqft = combined_flipped['sqft'] >= 10
combined_flipped = combined_flipped.loc[has_plausible_sqft]

combined_flipped

Unnamed: 0,project_id,property_type,sqft,zip,region,has_wood_frame,has_vert_wood_frame,has_horz_wood_frame,wf_vertconst,wf_horzconst,has_concrete,has_glu,num_floors,num_tenants,age
0,451165,Retail,12808,46123,MW,0,0,0,"[""Conventional steel framing, isolated CMU, st...","[""No upper floors present""]",1,0,One,4,
1,350106,Retail,10958,43055,MW,1,0,1,"[""Concrete masonry unit load bearing walls wit...","[""Wood-framing with wood decking""]",1,0,1,1,
2,384179,Retail,13,46360,MW,0,0,0,"[""Steel-framed""]","[""Open-web steel joists with steel decking""]",0,0,1,one,
3,402102,Retail,82,06902,NE,0,0,0,"[""Steel-framed""]","[""Steel-framed with steel decking and concrete...",1,0,1,One,
4,417214,Retail,11204,44062,MW,0,0,0,"[""CMU load bearing walls, steel columns, steel...","[""No upper floors present""]",1,0,1,1,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5970,618913,Office,59710,27517,SE,0,0,0,"[""Concrete masonry unit load bearing walls wit...","[""Steel-framed with steel decking"", ""Steel-fra...",1,0,Three,10,
5971,615825,Office,5650,89523,W,0,0,0,"[""Concrete tilt-up perimeter walls""]","[""No upper floors present""]",1,0,One,3,
5972,618912,Office,59178,27517,SE,0,0,0,"[""Concrete masonry unit load bearing walls wit...","[""Steel-framed with steel decking"", ""Steel-fra...",1,0,Three,12,36
5973,558027,Industrial,10500,95370,W,0,0,0,"[""Pre-engineered steel structure""]","[""No upper floors present""]",0,0,One,0,


In [None]:
# Write the cleaned, flipped dataset to disk (the row index is written as the first column)
combined_flipped.to_csv(path_or_buf='combined_df.csv')

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=aa72ad5b-4609-409b-ae7e-8a5fd2e73142' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>