In [3]:
from pathlib import Path
import pandas as pd

def loading_data():
    target_folder="src"
    current = Path.cwd()
    while current.name != target_folder:
        if current.parent == current:
            raise FileNotFoundError(f"Could not find folder named '{target_folder}' above {Path.cwd()}")
        current = current.parent

    path = current / "data" / "chicago_energy_benchmarking"
    print(path)

    dfs = []
    for file in path.rglob("*.csv"):
        print("Reading:", file)
        dfs.append(pd.read_csv(file))
    full_df = pd.concat(dfs, axis=0, join='outer', ignore_index=True).sort_values(by="Data Year", ascending=True)
    
    full_df['Property Name'] = full_df['Property Name'].astype(str)
    full_df['ZIP Code'] = full_df['ZIP Code'].astype(str)
    full_df['Community Area'] = full_df['Community Area'].astype(str)
    full_df['Primary Property Type'] = full_df['Primary Property Type'].astype(str)
    full_df['Gross Floor Area - Buildings (sq ft)'] = full_df['Gross Floor Area - Buildings (sq ft)'].str.replace(',', '').astype(float)
    full_df['Electricity Use (kBtu)'] = full_df['Electricity Use (kBtu)'].str.replace(',', '').astype(float)
    full_df['Natural Gas Use (kBtu)'] = full_df['Natural Gas Use (kBtu)'].str.replace(',', '').astype(float)
    full_df['District Steam Use (kBtu)'] = full_df['District Steam Use (kBtu)'].str.replace(',', '').astype(float)
    full_df['District Chilled Water Use (kBtu)'] = full_df['District Chilled Water Use (kBtu)'].str.replace(',', '').astype(float)
    full_df['All Other Fuel Use (kBtu)'] = full_df['All Other Fuel Use (kBtu)'].str.replace(',', '').astype(float)
    full_df['Site EUI (kBtu/sq ft)'] = full_df['Site EUI (kBtu/sq ft)'].astype(str).str.replace(',', '').astype(float)
    full_df['Source EUI (kBtu/sq ft)'] = full_df['Source EUI (kBtu/sq ft)'].astype(str).str.replace(',', '').astype(float)
    full_df['Weather Normalized Site EUI (kBtu/sq ft)'] = full_df['Weather Normalized Site EUI (kBtu/sq ft)'].astype(str).str.replace(',', '').astype(float)
    full_df['Weather Normalized Source EUI (kBtu/sq ft)'] = full_df['Weather Normalized Source EUI (kBtu/sq ft)'].astype(str).str.replace(',', '').astype(float)
    full_df['Total GHG Emissions (Metric Tons CO2e)'] = full_df['Total GHG Emissions (Metric Tons CO2e)'].astype(str).str.replace(',', '').astype(float)
    full_df['Water Use (kGal)'] = full_df['Water Use (kGal)'].astype(str).str.replace(',', '').astype(float)

    full_df['Location'] = full_df['Location'].astype(str)
    full_df['Reporting Status'] = full_df['Reporting Status'].astype(str)
    full_df['Exempt From Chicago Energy Rating'] = full_df['Exempt From Chicago Energy Rating'].astype(str)
    full_df['Row_ID'] = full_df['Row_ID'].astype(str)

    return full_df

full_df = loading_data()
print("Merged shape:", full_df.shape)
#print(Path.cwd())
#full_df.head()
full_df

/project/src/data/chicago_energy_benchmarking
Reading: /project/src/data/chicago_energy_benchmarking/Chicago_Energy_Benchmarking_-_2017_Data_Reported_in_2018_20251002.csv
Reading: /project/src/data/chicago_energy_benchmarking/Chicago_Energy_Benchmarking_-_2020_Data_Reported_in_2021_20251002.csv
Reading: /project/src/data/chicago_energy_benchmarking/Chicago_Energy_Benchmarking_-_2022_Data_Reported_in_2023_20251002.csv
Reading: /project/src/data/chicago_energy_benchmarking/Chicago_Energy_Benchmarking_-_2016_Data_Reported_in_2017_20251007.csv
Reading: /project/src/data/chicago_energy_benchmarking/Chicago_Energy_Benchmarking_-_2021_Data_Reported_in_2022_20251002.csv
Reading: /project/src/data/chicago_energy_benchmarking/Chicago_Energy_Benchmarking_-_2014_Data_Reported_in_2015_20251002.csv
Reading: /project/src/data/chicago_energy_benchmarking/Chicago_Energy_Benchmarking_-_2023_Data_Reported_in_2024_20251002.csv
Reading: /project/src/data/chicago_energy_benchmarking/Chicago_Energy_Benchmark

Unnamed: 0,Data Year,ID,Property Name,Address,ZIP Code,Community Area,Primary Property Type,Gross Floor Area - Buildings (sq ft),Year Built,# of Buildings,...,Total GHG Emissions (Metric Tons CO2e),GHG Intensity (kg CO2e/sq ft),Latitude,Longitude,Location,Reporting Status,Chicago Energy Rating,Exempt From Chicago Energy Rating,Water Use (kGal),Row_ID
16458,2014,101508,Northeastern Illinois University,5500 N. Saint Louis Avenue,60625,NORTH PARK,College/University,1258489.0,1961.0,19.0,...,15478.0,12.30,41.980399,-87.720266,POINT (-87.72026611 41.98039877),Submitted,,,,2014-101508
16413,2014,101923,311 West Monroe,311 West Monroe,60606,LOOP,Office,411672.0,1969.0,1.0,...,11363.0,27.60,41.880305,-87.635701,POINT (-87.63570062 41.88030523),Submitted,,,,2014-101923
16414,2014,100851,600 W Chicago,600 W Chicago,60654,NEAR NORTH SIDE,Office,1434236.0,1908.0,1.0,...,25200.0,17.57,41.896501,-87.642880,POINT (-87.64288 41.896501),Submitted,,,,2014-100851
16415,2014,103632,NMH Feinberg Pavilion and Galter Pavilion,251 E Huron St.,60654,NEAR NORTH SIDE,Hospital (General Medical & Surgical),2200000.0,1999.0,1.0,...,51047.0,23.20,41.894587,-87.621528,POINT (-87.62152834 41.89458666),Submitted,,,,2014-103632
16416,2014,103608,329 W 18th Street,329 W 18th Street,60616,LOWER WEST SIDE,Office,406780.0,1911.0,1.0,...,2652.0,6.52,41.857685,-87.636254,POINT (-87.636254 41.857685),Submitted,,,,2014-103608
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18047,2023,256569,McCrory Senior Housing,1659 W. Washington Boulevard,60612,Near West Side,Multifamily Housing,62940.0,2018.0,1.0,...,315.2,5.00,41.975260,-87.744214,POINT (-87.744214 41.97526),Submitted,2.5,,,2023-256569
18048,2023,254364,HYP-Loop-Chicago,28 N. Franklin St.,60606,Loop,Hotel,142321.0,2015.0,1.0,...,1198.8,8.40,41.786328,-87.705136,POINT (-87.705136 41.786328),Submitted,4.0,,6281.0,2023-254364
18049,2023,251657,Locke J,2828 N Oak Park Ave,60634,Montclare,K-12 School,108400.0,1927.0,4.0,...,564.0,5.20,41.780404,-87.621837,POINT (-87.621837 41.780404),Submitted,2.5,,,2023-251657
18073,2023,252286,Delmar Apartments,5042 N. Winthrop,60640,Uptown,Multifamily Housing,58772.0,1920.0,1.0,...,410.2,7.00,41.876427,-87.669177,POINT (-87.669177 41.876427),Submitted,4.0,,9810.0,2023-252286


In [2]:
Path.cwd()

PosixPath('/project/src/notebooks')

In [191]:
full_df['Location'] = full_df['Location'].astype(str)
full_df['Reporting Status'] = full_df['Reporting Status'].astype(str)
full_df['Exempt From Chicago Energy Rating'] = full_df['Exempt From Chicago Energy Rating'].astype(str)
full_df['Row_ID'] = full_df['Row_ID'].astype(str)

# Get the set of unique data types in the 'ID' column
count = 0
count2= 0
for col in full_df.columns:
    count2 += 1
    unique_types = set(full_df[col].map(type))

    if len(unique_types) == 1:
        count += 1
        print(f"{col}:     {unique_types}")
print(count2)
print(count)

Data Year:     {<class 'int'>}
ID:     {<class 'int'>}
Property Name:     {<class 'str'>}
Address:     {<class 'str'>}
ZIP Code:     {<class 'str'>}
Community Area:     {<class 'str'>}
Primary Property Type:     {<class 'str'>}
Gross Floor Area - Buildings (sq ft):     {<class 'float'>}
Year Built:     {<class 'float'>}
# of Buildings:     {<class 'float'>}
ENERGY STAR Score:     {<class 'float'>}
Electricity Use (kBtu):     {<class 'float'>}
Natural Gas Use (kBtu):     {<class 'float'>}
District Steam Use (kBtu):     {<class 'float'>}
District Chilled Water Use (kBtu):     {<class 'float'>}
All Other Fuel Use (kBtu):     {<class 'float'>}
Site EUI (kBtu/sq ft):     {<class 'float'>}
Source EUI (kBtu/sq ft):     {<class 'float'>}
Weather Normalized Site EUI (kBtu/sq ft):     {<class 'float'>}
Weather Normalized Source EUI (kBtu/sq ft):     {<class 'float'>}
Total GHG Emissions (Metric Tons CO2e):     {<class 'float'>}
GHG Intensity (kg CO2e/sq ft):     {<class 'float'>}
Latitude:     {

In [162]:
for col in full_df.columns[:15]:
    unique_types = set(full_df[col].map(type))

    # Remove float from the set
    filtered_types = {t for t in unique_types if t is not float}

    # Show only if there's more than one non-float typ
    print(f"{col}:     {filtered_types}")

full_df.columns[15:]

Data Year:     {<class 'int'>}
ID:     {<class 'int'>}
Property Name:     {<class 'str'>}
Address:     {<class 'str'>}
ZIP Code:     {<class 'int'>, <class 'str'>}
Community Area:     {<class 'str'>}
Primary Property Type:     {<class 'str'>}
Gross Floor Area - Buildings (sq ft):     {<class 'str'>}
Year Built:     set()
# of Buildings:     set()
ENERGY STAR Score:     set()
Electricity Use (kBtu):     {<class 'str'>}
Natural Gas Use (kBtu):     {<class 'str'>}
District Steam Use (kBtu):     {<class 'str'>}
District Chilled Water Use (kBtu):     {<class 'str'>}


Index(['All Other Fuel Use (kBtu)', 'Site EUI (kBtu/sq ft)',
       'Source EUI (kBtu/sq ft)', 'Weather Normalized Site EUI (kBtu/sq ft)',
       'Weather Normalized Source EUI (kBtu/sq ft)',
       'Total GHG Emissions (Metric Tons CO2e)',
       'GHG Intensity (kg CO2e/sq ft)', 'Latitude', 'Longitude', 'Location',
       'Reporting Status', 'Chicago Energy Rating',
       'Exempt From Chicago Energy Rating', 'Water Use (kGal)', 'Row_ID'],
      dtype='object')

In [None]:
full_df['Property Name'] = full_df['Property Name'].astype(str)
full_df['ZIP Code'] = full_df['ZIP Code'].astype(str)
full_df['Community Area'] = full_df['Community Area'].astype(str)
full_df['Primary Property Type'] = full_df['Primary Property Type'].astype(int)
full_df['Gross Floor Area - Buildings (sq ft)'] = full_df['Gross Floor Area - Buildings (sq ft)'].str.replace(',', '').astype(float)
full_df['Electricity Use (kBtu)'] = full_df['Electricity Use (kBtu)'].str.replace(',', '').astype(float)
full_df['Natural Gas Use (kBtu)'] = full_df['Natural Gas Use (kBtu)'].str.replace(',', '').astype(float)
full_df['District Steam Use (kBtu)'] = full_df['District Steam Use (kBtu)'].str.replace(',', '').astype(float)
full_df['District Chilled Water Use (kBtu)'] = full_df['District Chilled Water Use (kBtu)'].str.replace(',', '').astype(float)
full_df['All Other Fuel Use (kBtu)'] = full_df['All Other Fuel Use (kBtu)'].str.replace(',', '').astype(float)
full_df['Site EUI (kBtu/sq ft)'] = full_df['Site EUI (kBtu/sq ft)'].astype(str).str.replace(',', '').astype(float)
full_df['Source EUI (kBtu/sq ft)'] = full_df['Source EUI (kBtu/sq ft)'].astype(str).str.replace(',', '').astype(float)
full_df['Weather Normalized Site EUI (kBtu/sq ft)'] = full_df['Weather Normalized Site EUI (kBtu/sq ft)'].astype(str).str.replace(',', '').astype(float)
full_df['Weather Normalized Source EUI (kBtu/sq ft)'] = full_df['Weather Normalized Source EUI (kBtu/sq ft)'].astype(str).str.replace(',', '').astype(float)
full_df['Total GHG Emissions (Metric Tons CO2e)'] = full_df['Total GHG Emissions (Metric Tons CO2e)'].astype(str).str.replace(',', '').astype(float)
full_df['Water Use (kGal)'] = full_df['Water Use (kGal)'].astype(str).str.replace(',', '').astype(float)
full_df[''] = full_df[''].astype(str).str.replace(',', '').astype(float)
full_df[''] = full_df[''].astype(str).str.replace(',', '').astype(float)
full_df[''] = full_df[''].astype(str).str.replace(',', '').astype(float)
full_df[''] = full_df[''].astype(str).str.replace(',', '').astype(float)
full_df[''] = full_df[''].astype(str).str.replace(',', '').astype(float)
full_df[''] = full_df[''].astype(str)

In [153]:
col = "Water Use (kGal)"

full_df[col] = full_df[col].astype(str).str.replace(',', '').astype(float)
examples_by_type = {}

# Iterate through the column
for val in full_df[col]:
    val_type = type(val)
    if val_type not in examples_by_type:
        examples_by_type[val_type] = val

# Display the results
for dtype, example in examples_by_type.items():
    print(f"{dtype.__name__}: {example}")
#19,288,620 6,662,892.3

float: nan


In [154]:
full_df[col]

16458       NaN
16413       NaN
16414       NaN
16415       NaN
16416       NaN
          ...  
18047       NaN
18048    6281.0
18049       NaN
18073    9810.0
19518       NaN
Name: Water Use (kGal), Length: 28329, dtype: float64

In [84]:
#full_df[col] = full_df[col].str.replace(',', '').astype(float)
full_df[col] = full_df[col].str.replace(',', '')
full_df[[col]]

Unnamed: 0,Weather Normalized Site EUI (kBtu/sq ft)
16458,
16413,
16414,
16415,
16416,
...,...
18047,
18048,
18049,
18073,
