# 0 Configuration

In [620]:
# Feature flags for the notebook: every leaf is a boolean read by one of the
# `if config[...]` guards further down. Plot flags only gate figures; the other
# flags gate steps that modify `data`.
config = {
    "overview": {
        # Bar chart of the number of values per column on the raw data.
        "overview_plot": False
    },
    "cleaning": {
        "emptyness": {
            # Same bar chart, drawn after the sparse columns have been handled.
            "overview_plot": False,
            # Drop the columns listed in `cols_to_remove`.
            "remove_empty": True
        },
        "types": {
            # Convert "TaxParcelIdentificationNumber" with pd.to_numeric.
            "convert": True
        },
        "categoricals": {
            # NOTE(review): no cell in this notebook appears to read this flag.
            "overview_plot": False,
            "usability": {
                # Drop the "City" and "State" columns.
                "remove": True
            },
            "inconsistencies": {
                # Strip and lower-case every categorical column in place.
                "strip_and_lower": True
            }
        },
        "numericals": {
            # Box plots of every numerical column.
            "overview_plot": False,
            "usability": {
                # Drop the "DataYear" column.
                "remove": True
            },
            "redundancy": {
                # Value-count bar chart restricted to the numerical columns.
                "overview_plot": False,
                # Distribution + correlation figure for each candidate pair.
                "distrib_corr_plot": False,
                # Drop one column of each redundant pair.
                "remove": True
            },
            "outliers": {
                # Drop the rows that hold the maximum of more than 3 numerical columns.
                "remove": False,
                # Replace values outside the IQR fences (Latitude/Longitude excluded).
                "iqr_remove": False,
                # Drop the rows whose "Outlier" column is filled in.
                "supplied_remove": True,
                # Box plots after outlier handling.
                "plot": False,
            },
            "scaling": {
                # Histograms before scaling.
                "distrib_plot_pre": False,
                # Min-max scale the numerical columns to the 0-100 range.
                "scale": True,
                # Histograms after scaling.
                "distrib_plot_post": False,
            }
        }
    },
    "correlations": {
        "numericals": {
            "overview_heatmap_plot": False,
            # Drop the numerical features listed in the "Removing" cell.
            "removing": False,
            "result_heatmap_plot": False
        },
        "categoricals": {
            # Build `data_enhanced` (category codes joined with the targets).
            # The two categorical heatmap flags below use `data_enhanced`,
            # which only exists when this flag is True.
            "labelisation": True,
            "overview_heatmap_plot": False,
            # Drop the categorical features listed in the "Removing" cell.
            "removing": False,
            "result_heatmap_plot": False
        }
    },
    "final": {
        # Value-count bar chart of the cleaned data.
        "overview_plot": False
    }
}

***
# 1 Dependency import

In [621]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from fuzzywuzzy import process
import chardet

from math import ceil

from sklearn.preprocessing import minmax_scale, power_transform

pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)

np.random.seed(0)

In [622]:
def dataframe_distribution_overview(data, figsize=(10, 3)):
    """Draw a bar chart of ``data.count()`` for every column of ``data``.

    Args:
        data: DataFrame whose per-column value counts are plotted.
        figsize: Figure size handed to ``plt.figure``.
    """
    plt.figure(figsize=figsize)

    column_names = data.columns
    values_per_column = data.count()
    sns.barplot(x=column_names, y=values_per_column)

    # Title, axis label and tick cosmetics.
    plt.title("Number of values per column", size=20)
    plt.ylabel("Number values", size=16)
    plt.xticks(rotation=45, size=16, ha="right")
    plt.yticks(size=16)

    plt.show()

In [623]:
def index_lth(data, percentage: int):
    """Return the labels of the columns filled below ``percentage`` percent.

    Args:
        data: DataFrame to inspect.
        percentage: Threshold on a 0-100 scale.

    Returns:
        Index of the columns whose ``count()`` is strictly lower than
        ``percentage`` percent of the number of rows.
    """
    ratio = percentage / 100
    filled = data.count()
    below_threshold = filled < data.shape[0] * ratio
    return below_threshold[below_threshold].index

In [624]:
def multi_plot_numerical(data, features, kind="hist", n_cols=8, figsize=(30, 10), wspace=0.35, hspace=0.35):
    """Plot each feature of ``features`` in its own cell of a subplot grid.

    Args:
        data: DataFrame holding the columns to plot.
        features: Sequence of column labels (a list or a pandas Index).
        kind: ``"box"`` draws a box plot; ``"hist"`` and any other value draw
            a histogram with a KDE curve.
        n_cols: Number of columns of the grid; the number of rows is derived
            from the number of features.
        figsize: Figure size handed to ``plt.subplots``.
        wspace: Horizontal spacing handed to ``plt.subplots_adjust``.
        hspace: Vertical spacing handed to ``plt.subplots_adjust``.

    Returns:
        None. Nothing is drawn when ``features`` is empty.
    """
    feature_nb = len(features)
    if feature_nb == 0:
        # A grid with zero rows cannot be created, so there is nothing to draw.
        return

    n_rows = ceil(feature_nb / n_cols)

    plt.subplots(n_rows, n_cols, figsize=figsize)
    plt.subplots_adjust(wspace=wspace, hspace=hspace)

    # Grid cells are filled row by row; subplot positions are 1-based.
    for position, feature in enumerate(features, start=1):
        plt.subplot(n_rows, n_cols, position)

        if kind == "box":
            plot = sns.boxplot(y=data[feature])
        else:
            # "hist" and any unrecognised kind share the histogram rendering.
            plot = sns.histplot(data=data[feature], kde=True)

        plot.set_xlabel(feature, fontsize=12)
        plot.set_ylabel(None)

    plt.show()

In [625]:
def create_subplot(dataset, feature, n_rows, index, is_numeric):
    """Draw ``feature`` of ``dataset`` in slot ``index`` of a two-column grid.

    Numeric features get a box plot. Other features get a count plot when
    they have between 1 and 20 distinct values, and otherwise a bar plot
    built from the number of distinct values and the remaining row count.

    Args:
        dataset: DataFrame holding the feature.
        feature: Column label to draw.
        n_rows: Number of rows of the subplot grid.
        index: 1-based position of the subplot in the grid.
        is_numeric: Whether the feature is drawn as a numeric one.
    """
    plt.subplot(n_rows, 2, index)
    column = dataset[feature]
    distinct = column.unique()

    if is_numeric:
        plot = sns.boxplot(y=column)
    elif 0 < distinct.size <= 20:
        plot = sns.countplot(x=column)
        plt.xticks(rotation=45, size=8, ha="right")
    else:
        counts = [[distinct.size, dataset.shape[0] - distinct.size]]
        dist = pd.DataFrame(data=counts, columns=["uniques", "not_uniques"])
        plot = sns.barplot(data=dist)

    plot.set_xlabel(None)
    plot.set_ylabel(feature, fontsize=14)


def dataset_diff_analysis(data1, data2, exclude=None, figsize=(15, 200)):
    """Plot every feature of two datasets side by side, one row per feature.

    The left column of the grid shows ``data1`` and the right column shows
    ``data2``; a cell is left empty when the feature is missing from that
    dataset.

    Args:
        data1: First DataFrame (left column).
        data2: Second DataFrame (right column).
        exclude: Column labels to leave out; ``None`` excludes nothing.
        figsize: Figure size handed to ``plt.subplots``.

    Returns:
        None. The compared feature labels are printed in every case.
    """
    # A fresh list per call avoids sharing one mutable default between calls.
    exclude = [] if exclude is None else exclude
    features = data1.columns.union(data2.columns).difference(exclude)
    n_cols = 2
    n_rows = len(features)

    if n_rows == 0:
        # A grid with zero rows cannot be created, so there is nothing to draw.
        print(f"features: {features}")
        return

    plt.subplots(n_rows, n_cols, figsize=figsize)
    plt.subplots_adjust(wspace=0.35, hspace=0.5)

    for row, f in enumerate(features):
        in_data1 = f in data1.columns
        in_data2 = f in data2.columns

        # The dtype is read from data1 when it has the feature, else from data2.
        f_type = data1[f].dtype if in_data1 else data2[f].dtype
        is_numeric = f_type in ["int64", "float64"]

        # Subplot positions are 1-based: odd slots are data1, even slots data2.
        left_slot = 2 * row + 1
        if in_data1:
            create_subplot(data1, f, n_rows, left_slot, is_numeric)
        if in_data2:
            create_subplot(data2, f, n_rows, left_slot + 1, is_numeric)

    print(f"features: {features}")
    plt.show()

In [626]:
def decribe_several(feature, *df):
    """Describe the same feature taken from several DataFrames.

    Args:
        feature: Column label present in every DataFrame.
        *df: DataFrames to compare.

    Returns:
        ``describe()`` of a DataFrame with one column per input, named
        ``<feature>_1``, ``<feature>_2``, ... in argument order.
    """
    columns = {
        feature + "_" + str(position): frame[feature]
        for position, frame in enumerate(df, start=1)
    }
    return pd.DataFrame(columns).describe()

In [627]:
def head_several(feature, nb, *df):
    """Show the first rows of the same feature taken from several DataFrames.

    Args:
        feature: Column label present in every DataFrame.
        nb: Number of rows to return.
        *df: DataFrames to compare.

    Returns:
        The first ``nb`` rows of a DataFrame with one column per input, named
        ``<feature>_1``, ``<feature>_2``, ... in argument order.
    """
    columns = {
        feature + "_" + str(position): frame[feature]
        for position, frame in enumerate(df, start=1)
    }
    return pd.DataFrame(columns).head(nb)

In [628]:
class OutlierProcessor():
    """Count and replace the values lying outside a [lower, upper] range.

    A value is an outlier when it is strictly below ``lower_trig`` or strictly
    above ``upper_trig``. Values equal to a bound and missing values are not
    outliers; ``analyse`` and ``replace`` share this definition.
    """

    def __init__(self, data, features, lower_trig, upper_trig):
        """Store the data and the bounds.

        Args:
            data: DataFrame to process; modified by ``replace(inplace=True)``.
            features: Column label, or list of column labels, to process.
            lower_trig: Lower bound; a scalar, or per-column values that
                pandas can align with ``data[features]``.
            upper_trig: Upper bound, same shape as ``lower_trig``.
        """
        self.data = data
        self.features = features
        self.lower_trig = lower_trig
        self.upper_trig = upper_trig
        self.__above = 0
        self.__below = 0
        self.__total = 0

    def __print(self):
        """Print the bounds and the counts computed by ``analyse``."""
        print(f"lower_trig: {self.lower_trig}")
        print(f"upper_trig: {self.upper_trig}")
        print(f"below: {self.__below}")
        print(f"above: {self.__above}")
        print(f"total: {self.__total}")

    def analyse(self):
        """Count the values below and above the bounds, then print the result."""
        values = self.data[self.features]
        # Comparisons involving NaN are False, so missing values are not counted.
        self.__below = (values < self.lower_trig).sum()
        self.__above = (values > self.upper_trig).sum()
        self.__total = self.__below + self.__above
        self.__print()

    def replace(self, replace_by=np.nan, inplace=False):
        """Replace the outliers of the processed features.

        Args:
            replace_by: Value written in place of each outlier.
            inplace: When True, write the result back into ``self.data`` and
                return None; otherwise return the result.

        Returns:
            The processed features with outliers replaced, or None when
            ``inplace`` is True.
        """
        values = self.data.loc[:, self.features]
        # Same definition as analyse(): bound values and NaN are kept, so a
        # degenerate range (lower == upper) does not wipe the whole column.
        outliers = (values < self.lower_trig) | (values > self.upper_trig)
        result = values.where(~outliers, other=replace_by)
        if inplace:
            self.data[self.features] = result
        else:
            return result



In [629]:
class OutlierIqrProcessor(OutlierProcessor):
    """OutlierProcessor whose bounds are derived from the interquartile range.

    For every processed feature the bounds are ``Q1 - 1.5 * IQR`` and
    ``Q3 + 1.5 * IQR``, where Q1 and Q3 are the 0.25 and 0.75 quantiles.
    """

    def __init__(self, data, features, exclude=None):
        """Compute the per-feature bounds and initialise the base processor.

        Args:
            data: DataFrame to process.
            features: Iterable of column labels to consider.
            exclude: Column labels to leave out of ``features``; ``None``
                excludes nothing.
        """
        # `exclude` defaults to None, which does not support the `in` operator.
        excluded = () if exclude is None else exclude
        self.features = [feature for feature in features if feature not in excluded]
        self.__q1 = data[self.features].quantile(0.25)
        self.__q3 = data[self.features].quantile(0.75)
        self.__iqr = self.__q3 - self.__q1
        upper_trig = self.__q3 + (1.5 * self.__iqr)
        lower_trig = self.__q1 - (1.5 * self.__iqr)
        super().__init__(data, self.features, lower_trig, upper_trig)


In [630]:
def correlation_heatmap(dataset, figsize=(30, 20)):
    """Draw the lower triangle of the correlation matrix of ``dataset``.

    Args:
        dataset: DataFrame whose ``corr()`` matrix is plotted.
        figsize: Figure size handed to ``plt.figure``.
    """
    plt.figure(figsize=figsize)

    correlation = dataset.corr()
    # The matrix is symmetric: mask the upper triangle, diagonal included.
    upper_triangle = np.triu(np.ones_like(correlation, dtype=bool))

    sns.heatmap(
        data=correlation,
        mask=upper_triangle,
        annot=True,
        vmax=.75,
        center=0,
        square=True,
        linewidths=.5,
        cbar_kws={"shrink": .5},
    )

    plt.title("Correlation heatmap", size=20)
    plt.yticks(size=16)
    plt.xticks(rotation=45, size=16, ha="right")
    plt.show()

In [631]:
def unique_several(dataset, features, take=20):
    """List the distinct values of several features side by side.

    Features with exactly one or two distinct values, and features with more
    than ``take`` distinct values, are only reported through ``print``.

    Args:
        dataset: DataFrame holding the features.
        features: Iterable of column labels to inspect.
        take: Maximum number of distinct values for a feature to be listed.

    Returns:
        DataFrame with one column per listed feature, holding the values
        returned by ``unique()`` for that feature.
    """
    listed = {}
    single_valued = []
    two_valued = []
    oversized = []

    for f in features:
        distinct = dataset[f].unique()
        count = distinct.size

        if count > take:
            oversized.append((f, count))
        elif count == 1:
            single_valued.append(f)
        elif count == 2:
            two_valued.append(f)
        else:
            listed[f] = pd.Series(data=distinct, name=f, dtype="object")

    print(f"Only one unique: {single_valued}")
    print(f"Only two uniques: {two_valued}")
    print(f"Too many uniques: {oversized}")

    return pd.DataFrame(data=listed)

In [632]:
def strip_and_lower(dataset, features, to_strip=None, inplace=False):
    """Strip and lower-case the string values of several features.

    Args:
        dataset: DataFrame holding the features.
        features: Column labels to clean.
        to_strip: Characters handed to ``str.strip``.
        inplace: When True, write the cleaned columns back into ``dataset``
            and return None; otherwise return them.

    Returns:
        DataFrame of the cleaned features, or None when ``inplace`` is True.
    """
    cleaned = {}
    for f in features:
        stripped = dataset[f].str.strip(to_strip=to_strip)
        as_object = pd.Series(data=stripped, name=f, dtype="object")
        cleaned[f] = as_object.str.lower()

    result = pd.DataFrame(data=cleaned)
    if not inplace:
        return result
    dataset[features] = result

In [633]:
def fuzzy_matching_several(dataset, fuzzy, limit=10):
    """Collect the closest fuzzy matches of a string for several features.

    Args:
        dataset: DataFrame holding the features.
        fuzzy: Iterable of ``(feature, match)`` pairs; ``match`` is the string
            looked up among the values of ``dataset[feature]``.
        limit: Maximum number of matches handed to ``process.extract``.

    Returns:
        DataFrame with one column per feature, each cell holding one entry
        returned by ``process.extract``.
    """
    # Only `process` is imported at notebook level and the bare name
    # `fuzzywuzzy` is never bound, so both helpers are imported here.
    from fuzzywuzzy import fuzz, process

    fuzzy_data = {}
    for feature, match in fuzzy:
        fuzzy_matches = process.extract(match, dataset[feature], limit=limit, scorer=fuzz.token_sort_ratio)
        fuzzy_data[feature] = pd.Series(data=fuzzy_matches, name=feature, dtype="object")
    return pd.DataFrame(data=fuzzy_data)

In [634]:
def feature_distribution_multivar(data, features, figsize=(10, 3), wspace=0.2, hspace=0.2, corr_scale=(0.75, 0), xlim=(None, None), ylim=(None, None)):
    """Plot the densities of several features next to their correlation matrix.

    The left subplot overlays one KDE curve per feature; the right subplot is
    a heatmap of the lower triangle of their correlation matrix.

    Args:
        data: DataFrame holding the features.
        features: Column labels to compare.
        figsize: Figure size handed to ``plt.subplots``.
        wspace: Horizontal spacing handed to ``plt.subplots_adjust``.
        hspace: Vertical spacing handed to ``plt.subplots_adjust``.
        corr_scale: ``(vmax, center)`` pair handed to ``sns.heatmap``.
        xlim: ``(min, max)`` x-axis limits of the density subplot.
        ylim: ``(min, max)`` y-axis limits of the density subplot.
    """
    plt.subplots(1, 2, figsize=figsize)
    plt.subplots_adjust(wspace=wspace, hspace=hspace)

    # Left: overlaid densities.
    plt.subplot(1, 2, 1)
    for f in features:
        # `fill` is the current name of the deprecated `shade` keyword.
        sns.kdeplot(data=data[f], label=f, fill=True)

    plt.title("Distribution", size=20)
    plt.xticks(size=16)
    plt.yticks(size=16)
    plt.xlim(xlim[0], xlim[1])
    plt.ylim(ylim[0], ylim[1])
    plt.xlabel(None)
    plt.ylabel("Density", size=16)
    plt.legend()

    # Right: correlation heatmap; the upper triangle (diagonal included) is masked.
    plt.subplot(1, 2, 2)
    correlation = data.loc[:, features].corr()
    mask = np.triu(np.ones_like(correlation, dtype=bool))

    sns.heatmap(data=correlation, mask=mask, annot=True, vmax=corr_scale[0], center=corr_scale[1],
                square=True, linewidths=.5, cbar_kws={"shrink": .5})

    plt.title("Correlation", size=20)
    plt.xticks(rotation=45, size=12, ha="right")
    plt.yticks(rotation=0, size=12, va="center")

    plt.show()


***
# 2 Data loading

In [635]:
data = pd.read_csv("data/2016-building-energy-benchmarking.csv", delimiter=",")

***
# 3 Overview

In [636]:
if config["overview"]["overview_plot"]:
    dataframe_distribution_overview(data, figsize=(30, 3))

In [637]:
targets = ["SiteEnergyUse(kBtu)", "GHGEmissionsIntensity"]

***
# 4 Cleaning

***
## 4.1 Emptiness

In [638]:
# Columns filled below 10% of the rows (see index_lth). "Outlier" is taken out
# of the removal list because the outlier-removal step further down reads it.
cols_to_remove = index_lth(data, 10).difference(["Outlier"])

In [639]:
if config["cleaning"]["emptyness"]["remove_empty"]:
    data.drop(columns=cols_to_remove, inplace=True)

In [640]:
if config["cleaning"]["emptyness"]["overview_plot"]:
    dataframe_distribution_overview(data, figsize=(30, 3))

***
## 4.2 Types

In [641]:
data.describe(include="object")

Unnamed: 0,BuildingType,PrimaryPropertyType,PropertyName,Address,City,State,TaxParcelIdentificationNumber,Neighborhood,ListOfAllPropertyUseTypes,LargestPropertyUseType,SecondLargestPropertyUseType,ThirdLargestPropertyUseType,ComplianceStatus,Outlier
count,3376,3376,3376,3376,3376,3376,3376,3376,3367,3356,1679,596,3376,32
unique,8,24,3362,3354,1,1,3268,19,466,56,50,44,4,2
top,NonResidential,Low-Rise Multifamily,Northgate Plaza,2600 SW Barton St,Seattle,WA,1625049001,DOWNTOWN,Multifamily Housing,Multifamily Housing,Parking,Retail Store,Compliant,Low outlier
freq,1460,987,3,4,3376,3376,8,573,866,1667,976,110,3211,23


In [642]:
data.describe()

Unnamed: 0,OSEBuildingID,DataYear,ZipCode,CouncilDistrictCode,Latitude,Longitude,YearBuilt,NumberofBuildings,NumberofFloors,PropertyGFATotal,PropertyGFAParking,PropertyGFABuilding(s),LargestPropertyUseTypeGFA,SecondLargestPropertyUseTypeGFA,ThirdLargestPropertyUseTypeGFA,ENERGYSTARScore,SiteEUI(kBtu/sf),SiteEUIWN(kBtu/sf),SourceEUI(kBtu/sf),SourceEUIWN(kBtu/sf),SiteEnergyUse(kBtu),SiteEnergyUseWN(kBtu),SteamUse(kBtu),Electricity(kWh),Electricity(kBtu),NaturalGas(therms),NaturalGas(kBtu),TotalGHGEmissions,GHGEmissionsIntensity
count,3376.0,3376.0,3360.0,3376.0,3376.0,3376.0,3376.0,3368.0,3376.0,3376.0,3376.0,3376.0,3356.0,1679.0,596.0,2533.0,3369.0,3370.0,3367.0,3367.0,3371.0,3370.0,3367.0,3367.0,3367.0,3367.0,3367.0,3367.0,3367.0
mean,21208.991114,2016.0,98116.949107,4.439277,47.624033,-122.334795,1968.573164,1.106888,4.709123,94833.54,8001.526066,86832.01,79177.64,28444.075817,11738.675166,67.918674,54.732116,57.033798,134.232848,137.783932,5403667.0,5276726.0,274595.9,1086639.0,3707612.0,13685.05,1368505.0,119.723971,1.175916
std,12223.757015,0.0,18.615205,2.120625,0.047758,0.027203,33.088156,2.108402,5.494465,218837.6,32326.723928,207939.8,201703.4,54392.917928,29331.199286,26.873271,56.273124,57.16333,139.287554,139.109807,21610630.0,15938790.0,3912173.0,4352478.0,14850660.0,67097.81,6709781.0,538.832227,1.821452
min,1.0,2016.0,98006.0,1.0,47.49917,-122.41425,1900.0,0.0,0.0,11285.0,0.0,3636.0,5656.0,0.0,0.0,1.0,0.0,0.0,0.0,-2.1,0.0,0.0,0.0,-33826.8,-115417.0,0.0,0.0,-0.8,-0.02
25%,19990.75,2016.0,98105.0,3.0,47.59986,-122.350662,1948.0,1.0,2.0,28487.0,0.0,27756.0,25094.75,5000.0,2239.0,53.0,27.9,29.4,74.699997,78.400002,925128.6,970182.2,0.0,187422.9,639487.0,0.0,0.0,9.495,0.21
50%,23112.0,2016.0,98115.0,4.0,47.618675,-122.332495,1975.0,1.0,4.0,44175.0,0.0,43216.0,39894.0,10664.0,5043.0,75.0,38.599998,40.900002,96.199997,101.099998,1803753.0,1904452.0,0.0,345129.9,1177583.0,3237.538,323754.0,33.92,0.61
75%,25994.25,2016.0,98122.0,7.0,47.657115,-122.319407,1997.0,1.0,5.0,90992.0,0.0,84276.25,76200.25,26640.0,10138.75,90.0,60.400002,64.275002,143.899994,148.349998,4222455.0,4381429.0,0.0,829317.8,2829632.0,11890.33,1189034.0,93.94,1.37
max,50226.0,2016.0,98272.0,7.0,47.73387,-122.220966,2015.0,111.0,99.0,9320156.0,512608.0,9320156.0,9320156.0,686750.0,459748.0,100.0,834.400024,834.400024,2620.0,2620.0,873923700.0,471613900.0,134943500.0,192577500.0,657074400.0,2979090.0,297909000.0,16870.98,34.09


In [643]:
if config["cleaning"]["types"]["convert"]:
    data["TaxParcelIdentificationNumber"] = pd.to_numeric(data["TaxParcelIdentificationNumber"], errors="coerce")
    data["TaxParcelIdentificationNumber"].describe()

***
## 4.3 Categoricals

### 4.3.1 Overview

In [644]:
categorical_data = data.select_dtypes("object").columns

In [645]:
data[categorical_data].describe()

Unnamed: 0,BuildingType,PrimaryPropertyType,PropertyName,Address,City,State,Neighborhood,ListOfAllPropertyUseTypes,LargestPropertyUseType,SecondLargestPropertyUseType,ThirdLargestPropertyUseType,ComplianceStatus,Outlier
count,3376,3376,3376,3376,3376,3376,3376,3367,3356,1679,596,3376,32
unique,8,24,3362,3354,1,1,19,466,56,50,44,4,2
top,NonResidential,Low-Rise Multifamily,Northgate Plaza,2600 SW Barton St,Seattle,WA,DOWNTOWN,Multifamily Housing,Multifamily Housing,Parking,Retail Store,Compliant,Low outlier
freq,1460,987,3,4,3376,3376,573,866,1667,976,110,3211,23


***
### 4.3.2 Usability

The features "City" and "State" each hold a single value across all rows, so they carry no useful information.

In [646]:
if config["cleaning"]["categoricals"]["usability"]["remove"]:
    features_to_delete = ["City", "State"]
    data.drop(columns=features_to_delete, inplace=True)
    categorical_data = categorical_data.drop(features_to_delete)

***
### 4.3.3 Inconsistencies

In [647]:
unique_several(data, categorical_data, take=50)

Only one unique: []
Only two uniques: []
Too many uniques: [('PropertyName', 3362), ('Address', 3354), ('ListOfAllPropertyUseTypes', 467), ('LargestPropertyUseType', 57), ('SecondLargestPropertyUseType', 51)]


Unnamed: 0,BuildingType,PrimaryPropertyType,Neighborhood,ThirdLargestPropertyUseType,ComplianceStatus,Outlier
0,NonResidential,Hotel,DOWNTOWN,,Compliant,
1,Nonresidential COS,Other,SOUTHEAST,Restaurant,Error - Correct Default Data,High outlier
2,Multifamily MR (5-9),Mid-Rise Multifamily,NORTHEAST,Swimming Pool,Missing Data,Low outlier
3,SPS-District K-12,Mixed Use Property,EAST,Data Center,Non-Compliant,
4,Campus,K-12 School,Central,Office,,
5,Multifamily LR (1-4),University,NORTH,Retail Store,,
6,Multifamily HR (10+),Small- and Mid-Sized Office,MAGNOLIA / QUEEN ANNE,Other - Entertainment/Public Assembly,,
7,Nonresidential WA,Self-Storage Facility,LAKE UNION,Non-Refrigerated Warehouse,,
8,,Warehouse,GREATER DUWAMISH,Other,,
9,,Large Office,BALLARD,Distribution Center,,


In [648]:
if config["cleaning"]["categoricals"]["inconsistencies"]["strip_and_lower"]:
    strip_and_lower(data, categorical_data, inplace=True)
    unique_several(data, categorical_data, take=50)

Only one unique: []
Only two uniques: []
Too many uniques: [('PropertyName', 3360), ('Address', 3353), ('ListOfAllPropertyUseTypes', 467), ('LargestPropertyUseType', 57), ('SecondLargestPropertyUseType', 51)]


In [649]:
# fuzzy = [
#     ("Neighborhood_2016", "Ballard"),
# ]
# fuzzy_matching_several(data)

***
## 4.4 Numericals

### 4.4.1 Overview

In [650]:
numerical_data = data.select_dtypes(["int64", "float64"]).columns

In [651]:
data[numerical_data].describe()

Unnamed: 0,OSEBuildingID,DataYear,ZipCode,TaxParcelIdentificationNumber,CouncilDistrictCode,Latitude,Longitude,YearBuilt,NumberofBuildings,NumberofFloors,PropertyGFATotal,PropertyGFAParking,PropertyGFABuilding(s),LargestPropertyUseTypeGFA,SecondLargestPropertyUseTypeGFA,ThirdLargestPropertyUseTypeGFA,ENERGYSTARScore,SiteEUI(kBtu/sf),SiteEUIWN(kBtu/sf),SourceEUI(kBtu/sf),SourceEUIWN(kBtu/sf),SiteEnergyUse(kBtu),SiteEnergyUseWN(kBtu),SteamUse(kBtu),Electricity(kWh),Electricity(kBtu),NaturalGas(therms),NaturalGas(kBtu),TotalGHGEmissions,GHGEmissionsIntensity
count,3376.0,3376.0,3360.0,3373.0,3376.0,3376.0,3376.0,3376.0,3368.0,3376.0,3376.0,3376.0,3376.0,3356.0,1679.0,596.0,2533.0,3369.0,3370.0,3367.0,3367.0,3371.0,3370.0,3367.0,3367.0,3367.0,3367.0,3367.0,3367.0,3367.0
mean,21208.991114,2016.0,98116.949107,4297012000.0,4.439277,47.624033,-122.334795,1968.573164,1.106888,4.709123,94833.54,8001.526066,86832.01,79177.64,28444.075817,11738.675166,67.918674,54.732116,57.033798,134.232848,137.783932,5403667.0,5276726.0,274595.9,1086639.0,3707612.0,13685.05,1368505.0,119.723971,1.175916
std,12223.757015,0.0,18.615205,2960847000.0,2.120625,0.047758,0.027203,33.088156,2.108402,5.494465,218837.6,32326.723928,207939.8,201703.4,54392.917928,29331.199286,26.873271,56.273124,57.16333,139.287554,139.109807,21610630.0,15938790.0,3912173.0,4352478.0,14850660.0,67097.81,6709781.0,538.832227,1.821452
min,1.0,2016.0,98006.0,0.0,1.0,47.49917,-122.41425,1900.0,0.0,0.0,11285.0,0.0,3636.0,5656.0,0.0,0.0,1.0,0.0,0.0,0.0,-2.1,0.0,0.0,0.0,-33826.8,-115417.0,0.0,0.0,-0.8,-0.02
25%,19990.75,2016.0,98105.0,1975701000.0,3.0,47.59986,-122.350662,1948.0,1.0,2.0,28487.0,0.0,27756.0,25094.75,5000.0,2239.0,53.0,27.9,29.4,74.699997,78.400002,925128.6,970182.2,0.0,187422.9,639487.0,0.0,0.0,9.495,0.21
50%,23112.0,2016.0,98115.0,3524039000.0,4.0,47.618675,-122.332495,1975.0,1.0,4.0,44175.0,0.0,43216.0,39894.0,10664.0,5043.0,75.0,38.599998,40.900002,96.199997,101.099998,1803753.0,1904452.0,0.0,345129.9,1177583.0,3237.538,323754.0,33.92,0.61
75%,25994.25,2016.0,98122.0,7129302000.0,7.0,47.657115,-122.319407,1997.0,1.0,5.0,90992.0,0.0,84276.25,76200.25,26640.0,10138.75,90.0,60.400002,64.275002,143.899994,148.349998,4222455.0,4381429.0,0.0,829317.8,2829632.0,11890.33,1189034.0,93.94,1.37
max,50226.0,2016.0,98272.0,55696400000.0,7.0,47.73387,-122.220966,2015.0,111.0,99.0,9320156.0,512608.0,9320156.0,9320156.0,686750.0,459748.0,100.0,834.400024,834.400024,2620.0,2620.0,873923700.0,471613900.0,134943500.0,192577500.0,657074400.0,2979090.0,297909000.0,16870.98,34.09


In [652]:
if config["cleaning"]["numericals"]["overview_plot"]:
    multi_plot_numerical(data, numerical_data, kind="box", n_cols=6, hspace=0.4, wspace=0.30, figsize=(30, 25))

***
### 4.4.2 Usability

In [653]:
data[numerical_data].describe()

Unnamed: 0,OSEBuildingID,DataYear,ZipCode,TaxParcelIdentificationNumber,CouncilDistrictCode,Latitude,Longitude,YearBuilt,NumberofBuildings,NumberofFloors,PropertyGFATotal,PropertyGFAParking,PropertyGFABuilding(s),LargestPropertyUseTypeGFA,SecondLargestPropertyUseTypeGFA,ThirdLargestPropertyUseTypeGFA,ENERGYSTARScore,SiteEUI(kBtu/sf),SiteEUIWN(kBtu/sf),SourceEUI(kBtu/sf),SourceEUIWN(kBtu/sf),SiteEnergyUse(kBtu),SiteEnergyUseWN(kBtu),SteamUse(kBtu),Electricity(kWh),Electricity(kBtu),NaturalGas(therms),NaturalGas(kBtu),TotalGHGEmissions,GHGEmissionsIntensity
count,3376.0,3376.0,3360.0,3373.0,3376.0,3376.0,3376.0,3376.0,3368.0,3376.0,3376.0,3376.0,3376.0,3356.0,1679.0,596.0,2533.0,3369.0,3370.0,3367.0,3367.0,3371.0,3370.0,3367.0,3367.0,3367.0,3367.0,3367.0,3367.0,3367.0
mean,21208.991114,2016.0,98116.949107,4297012000.0,4.439277,47.624033,-122.334795,1968.573164,1.106888,4.709123,94833.54,8001.526066,86832.01,79177.64,28444.075817,11738.675166,67.918674,54.732116,57.033798,134.232848,137.783932,5403667.0,5276726.0,274595.9,1086639.0,3707612.0,13685.05,1368505.0,119.723971,1.175916
std,12223.757015,0.0,18.615205,2960847000.0,2.120625,0.047758,0.027203,33.088156,2.108402,5.494465,218837.6,32326.723928,207939.8,201703.4,54392.917928,29331.199286,26.873271,56.273124,57.16333,139.287554,139.109807,21610630.0,15938790.0,3912173.0,4352478.0,14850660.0,67097.81,6709781.0,538.832227,1.821452
min,1.0,2016.0,98006.0,0.0,1.0,47.49917,-122.41425,1900.0,0.0,0.0,11285.0,0.0,3636.0,5656.0,0.0,0.0,1.0,0.0,0.0,0.0,-2.1,0.0,0.0,0.0,-33826.8,-115417.0,0.0,0.0,-0.8,-0.02
25%,19990.75,2016.0,98105.0,1975701000.0,3.0,47.59986,-122.350662,1948.0,1.0,2.0,28487.0,0.0,27756.0,25094.75,5000.0,2239.0,53.0,27.9,29.4,74.699997,78.400002,925128.6,970182.2,0.0,187422.9,639487.0,0.0,0.0,9.495,0.21
50%,23112.0,2016.0,98115.0,3524039000.0,4.0,47.618675,-122.332495,1975.0,1.0,4.0,44175.0,0.0,43216.0,39894.0,10664.0,5043.0,75.0,38.599998,40.900002,96.199997,101.099998,1803753.0,1904452.0,0.0,345129.9,1177583.0,3237.538,323754.0,33.92,0.61
75%,25994.25,2016.0,98122.0,7129302000.0,7.0,47.657115,-122.319407,1997.0,1.0,5.0,90992.0,0.0,84276.25,76200.25,26640.0,10138.75,90.0,60.400002,64.275002,143.899994,148.349998,4222455.0,4381429.0,0.0,829317.8,2829632.0,11890.33,1189034.0,93.94,1.37
max,50226.0,2016.0,98272.0,55696400000.0,7.0,47.73387,-122.220966,2015.0,111.0,99.0,9320156.0,512608.0,9320156.0,9320156.0,686750.0,459748.0,100.0,834.400024,834.400024,2620.0,2620.0,873923700.0,471613900.0,134943500.0,192577500.0,657074400.0,2979090.0,297909000.0,16870.98,34.09


In [654]:
if config["cleaning"]["numericals"]["usability"]["remove"]:
    features_to_delete = ["DataYear"]
    data.drop(columns=features_to_delete, inplace=True)
    numerical_data = numerical_data.drop(features_to_delete)

***
### 4.4.2 Redundancy

In [655]:
if config["cleaning"]["numericals"]["redundancy"]["overview_plot"]:
    dataframe_distribution_overview(data[numerical_data], figsize=(25, 3))

***
#### 4.4.2.1 SiteEUI(kBtu/sf), SiteEUIWN(kBtu/sf)

In [656]:
features = ["SiteEUI(kBtu/sf)", "SiteEUIWN(kBtu/sf)"]

In [657]:
if config["cleaning"]["numericals"]["redundancy"]["distrib_corr_plot"]:
    feature_distribution_multivar(data, features, figsize=(15, 5), hspace=0.2, wspace=0.75, corr_scale=(1, 0), xlim=(-50, 300))

In [658]:
if config["cleaning"]["numericals"]["redundancy"]["remove"]:
    features_to_delete = ["SiteEUIWN(kBtu/sf)"]
    data.drop(columns=features_to_delete, inplace=True)
    numerical_data = numerical_data.drop(features_to_delete)

***
#### 4.4.2.2 SourceEUI(kBtu/sf), SourceEUIWN(kBtu/sf)

In [659]:
features = ["SourceEUI(kBtu/sf)", "SourceEUIWN(kBtu/sf)"]

In [660]:
if config["cleaning"]["numericals"]["redundancy"]["distrib_corr_plot"]:
    feature_distribution_multivar(data, features, figsize=(15, 5), hspace=0.2, wspace=0.75, corr_scale=(1, 0), xlim=(-100, 800))

In [661]:
if config["cleaning"]["numericals"]["redundancy"]["remove"]:
    features_to_delete = ["SourceEUIWN(kBtu/sf)"]
    data.drop(columns=features_to_delete, inplace=True)
    numerical_data = numerical_data.drop(features_to_delete)

***
#### 4.4.2.3 SiteEnergyUse(kBtu), SiteEnergyUseWN(kBtu)

In [662]:
features = ["SiteEnergyUse(kBtu)", "SiteEnergyUseWN(kBtu)"]

In [663]:
if config["cleaning"]["numericals"]["redundancy"]["distrib_corr_plot"]:
    feature_distribution_multivar(data, features, figsize=(15, 5), hspace=0.2, wspace=0.75, corr_scale=(1, 0), xlim=(-0.15e8, 0.35e8))

In [664]:
if config["cleaning"]["numericals"]["redundancy"]["remove"]:
    features_to_delete = ["SiteEnergyUseWN(kBtu)"]
    data.drop(columns=features_to_delete, inplace=True)
    numerical_data = numerical_data.drop(features_to_delete)

***
#### 4.4.2.4 Electricity(kWh), Electricity(kBtu)

In [665]:
features = ["Electricity(kWh)", "Electricity(kBtu)"]

In [666]:
if config["cleaning"]["numericals"]["redundancy"]["distrib_corr_plot"]:
    feature_distribution_multivar(data, features, figsize=(15, 5), hspace=0.2, wspace=0.75, corr_scale=(1, 0), xlim=(-1e7, 1.5e7))

In [667]:
if config["cleaning"]["numericals"]["redundancy"]["remove"]:
    features_to_delete = ["Electricity(kBtu)"]
    data.drop(columns=features_to_delete, inplace=True)
    numerical_data = numerical_data.drop(features_to_delete)

***
#### 4.4.2.5 NaturalGas(therms), NaturalGas(kBtu)

In [668]:
features = ["NaturalGas(therms)", "NaturalGas(kBtu)"]

In [669]:
if config["cleaning"]["numericals"]["redundancy"]["distrib_corr_plot"]:
    feature_distribution_multivar(data, features, figsize=(15, 5), hspace=0.2, wspace=0.75, corr_scale=(1, 0), xlim=(-2, 5))

In [670]:
data[features].describe()

Unnamed: 0,NaturalGas(therms),NaturalGas(kBtu)
count,3367.0,3367.0
mean,13685.05,1368505.0
std,67097.81,6709781.0
min,0.0,0.0
25%,0.0,0.0
50%,3237.538,323754.0
75%,11890.33,1189034.0
max,2979090.0,297909000.0


In [671]:
if config["cleaning"]["numericals"]["redundancy"]["remove"]:
    features_to_delete = ["NaturalGas(therms)"]
    data.drop(columns=features_to_delete, inplace=True)
    numerical_data = numerical_data.drop(features_to_delete)

***
### 4.4.3 Outliers removal

In [672]:
data[numerical_data].describe()

Unnamed: 0,OSEBuildingID,ZipCode,TaxParcelIdentificationNumber,CouncilDistrictCode,Latitude,Longitude,YearBuilt,NumberofBuildings,NumberofFloors,PropertyGFATotal,PropertyGFAParking,PropertyGFABuilding(s),LargestPropertyUseTypeGFA,SecondLargestPropertyUseTypeGFA,ThirdLargestPropertyUseTypeGFA,ENERGYSTARScore,SiteEUI(kBtu/sf),SourceEUI(kBtu/sf),SiteEnergyUse(kBtu),SteamUse(kBtu),Electricity(kWh),NaturalGas(kBtu),TotalGHGEmissions,GHGEmissionsIntensity
count,3376.0,3360.0,3373.0,3376.0,3376.0,3376.0,3376.0,3368.0,3376.0,3376.0,3376.0,3376.0,3356.0,1679.0,596.0,2533.0,3369.0,3367.0,3371.0,3367.0,3367.0,3367.0,3367.0,3367.0
mean,21208.991114,98116.949107,4297012000.0,4.439277,47.624033,-122.334795,1968.573164,1.106888,4.709123,94833.54,8001.526066,86832.01,79177.64,28444.075817,11738.675166,67.918674,54.732116,134.232848,5403667.0,274595.9,1086639.0,1368505.0,119.723971,1.175916
std,12223.757015,18.615205,2960847000.0,2.120625,0.047758,0.027203,33.088156,2.108402,5.494465,218837.6,32326.723928,207939.8,201703.4,54392.917928,29331.199286,26.873271,56.273124,139.287554,21610630.0,3912173.0,4352478.0,6709781.0,538.832227,1.821452
min,1.0,98006.0,0.0,1.0,47.49917,-122.41425,1900.0,0.0,0.0,11285.0,0.0,3636.0,5656.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,-33826.8,0.0,-0.8,-0.02
25%,19990.75,98105.0,1975701000.0,3.0,47.59986,-122.350662,1948.0,1.0,2.0,28487.0,0.0,27756.0,25094.75,5000.0,2239.0,53.0,27.9,74.699997,925128.6,0.0,187422.9,0.0,9.495,0.21
50%,23112.0,98115.0,3524039000.0,4.0,47.618675,-122.332495,1975.0,1.0,4.0,44175.0,0.0,43216.0,39894.0,10664.0,5043.0,75.0,38.599998,96.199997,1803753.0,0.0,345129.9,323754.0,33.92,0.61
75%,25994.25,98122.0,7129302000.0,7.0,47.657115,-122.319407,1997.0,1.0,5.0,90992.0,0.0,84276.25,76200.25,26640.0,10138.75,90.0,60.400002,143.899994,4222455.0,0.0,829317.8,1189034.0,93.94,1.37
max,50226.0,98272.0,55696400000.0,7.0,47.73387,-122.220966,2015.0,111.0,99.0,9320156.0,512608.0,9320156.0,9320156.0,686750.0,459748.0,100.0,834.400024,2620.0,873923700.0,134943500.0,192577500.0,297909000.0,16870.98,34.09


In [673]:
# idxmax() gives, for every numerical column, the row label holding its maximum;
# value_counts() then tallies how many columns each of those rows is the maximum of.
values = data[numerical_data].idxmax().value_counts()
# Display the rows that hold the maximum of more than 3 numerical columns.
data.loc[values[values > 3].index]

Unnamed: 0,OSEBuildingID,BuildingType,PrimaryPropertyType,PropertyName,Address,ZipCode,TaxParcelIdentificationNumber,CouncilDistrictCode,Neighborhood,Latitude,Longitude,YearBuilt,NumberofBuildings,NumberofFloors,PropertyGFATotal,PropertyGFAParking,PropertyGFABuilding(s),ListOfAllPropertyUseTypes,LargestPropertyUseType,LargestPropertyUseTypeGFA,SecondLargestPropertyUseType,SecondLargestPropertyUseTypeGFA,ThirdLargestPropertyUseType,ThirdLargestPropertyUseTypeGFA,ENERGYSTARScore,SiteEUI(kBtu/sf),SourceEUI(kBtu/sf),SiteEnergyUse(kBtu),SteamUse(kBtu),Electricity(kWh),NaturalGas(kBtu),DefaultData,ComplianceStatus,Outlier,TotalGHGEmissions,GHGEmissionsIntensity
3274,49967,campus,university,university of washington - seattle campus,ne 45th st & brooklyn ave,98105.0,1625049000.0,4,northeast,47.65644,-122.31041,1900,111.0,0,9320156,0,9320156,college/university,college/university,9320156.0,,,,,,93.800003,246.600006,873923712.0,84985240.0,192577488.0,0.0,False,compliant,,11140.56,1.2
35,43,campus,mixed use property,plant 2 site,7755 e marginal way s,98108.0,22000000.0,2,greater duwamish,47.53493,-122.31788,1992,1.0,3,494835,0,494835,"energy/power station, laboratory, manufacturin...",office,757027.0,laboratory,639931.0,non-refrigerated warehouse,459748.0,8.0,221.699997,388.200012,448385312.0,0.0,44102076.0,297909000.0,False,compliant,,16870.98,34.09


In [674]:
if config["cleaning"]["numericals"]["outliers"]["remove"]:
    data.drop(index=values[values > 3].index, inplace=True)

In [675]:
if config["cleaning"]["numericals"]["outliers"]["iqr_remove"]:
    # Replace, directly in `data`, the values outside the IQR-based bounds of
    # every numerical column except the coordinates.
    o_proc = OutlierIqrProcessor(data, numerical_data, exclude=["Latitude", "Longitude"])
    # With inplace=True, replace() returns None, so there is nothing to keep.
    o_proc.replace(inplace=True)


In [676]:
if config["cleaning"]["numericals"]["outliers"]["supplied_remove"]:
    # Drop every row whose "Outlier" column is filled in.
    outliers_index = data[data["Outlier"].notna()].index
    data.drop(index=outliers_index, inplace=True)

# The "Outlier" column itself is removed whether or not the rows above were
# dropped, and the list of categorical columns is kept in sync.
features_to_delete = ["Outlier"]
data.drop(columns=features_to_delete, inplace=True)
categorical_data = categorical_data.drop(features_to_delete)

In [677]:
if config["cleaning"]["numericals"]["outliers"]["plot"]:
    multi_plot_numerical(data, numerical_data, kind="box", n_cols=6, hspace=0.4, wspace=0.30, figsize=(30, 25))

***
### 4.4.4 Scaling

In [678]:
if config["cleaning"]["numericals"]["scaling"]["distrib_plot_pre"]:
    multi_plot_numerical(data, numerical_data, n_cols=5, hspace=0.4, wspace=0.2, figsize=(30, 50))

In [679]:
if config["cleaning"]["numericals"]["scaling"]["scale"]:
    # minmax_scale returns a bare array. The frame built from it must reuse
    # data's own index: rows have been dropped earlier, so a default RangeIndex
    # would misalign the scaled values on assignment and introduce NaN.
    data[numerical_data] = pd.DataFrame(
        data=minmax_scale(data[numerical_data], feature_range=(0, 100)),
        columns=numerical_data,
        index=data.index,
    )
    # data[numerical_data] = pd.DataFrame(data=power_transform(data[numerical_data]), columns=numerical_data, index=data.index)
    data[numerical_data].head()

In [680]:
if config["cleaning"]["numericals"]["scaling"]["distrib_plot_post"]:
    multi_plot_numerical(data, numerical_data, n_cols=5, hspace=0.4, wspace=0.2, figsize=(30, 50))

***
## 4.5 Targets

### 4.5.1 NaN removal

In [681]:
data[targets].isna().any()

SiteEnergyUse(kBtu)      True
GHGEmissionsIntensity    True
dtype: bool

In [682]:
data.dropna(subset=targets, inplace=True)

In [683]:
data[targets].isna().any()

SiteEnergyUse(kBtu)      False
GHGEmissionsIntensity    False
dtype: bool

***
### 4.5.2 Normalization

In [684]:
# targets

In [685]:
# data[targets].describe()

In [686]:
# feature = data.ZipCode.array.reshape(-1, 1)
# target_transformed = pd.DataFrame(data=power_transform(data[targets], method="box-cox"), columns=data[targets].columns)

In [687]:
# target_transformed.head()

In [688]:
# target_transformed.describe()

In [689]:
# data[targets] = target_transformed

***
# 5 Correlations

## 5.1 Numerical features

### 5.1.1 Overview

In [690]:
targets

['SiteEnergyUse(kBtu)', 'GHGEmissionsIntensity']

In [691]:
if config["correlations"]["numericals"]["overview_heatmap_plot"]:
    correlation_heatmap(data[numerical_data], figsize=(15, 15))

***
### 5.1.2 Removing

In [692]:
if config["correlations"]["numericals"]["removing"]:
    features_to_delete = []
    data.drop(columns=features_to_delete, inplace=True)
    numerical_data = numerical_data.drop(features_to_delete)

***
### 5.1.3 Result

In [693]:
if config["correlations"]["numericals"]["result_heatmap_plot"]:
    correlation_heatmap(data[numerical_data], figsize=(15, 15))

***
## 5.2 Categorical features

### 5.2.1 Labelisation

In [694]:
if config["correlations"]["categoricals"]["labelisation"]:
    # Encode every categorical column as its integer category codes and tag
    # the encoded columns with a "_CATEG" suffix.
    categs_to_nums = (
        data[categorical_data]
        .apply(lambda feature: feature.astype("category").cat.codes)
        .rename(columns=lambda name: name + "_CATEG")
    )

    # Put the targets next to the encoded columns for the correlation heatmaps.
    data_enhanced = categs_to_nums.join(data[targets])

***
### 5.2.2 Overview

In [695]:
if config["correlations"]["categoricals"]["overview_heatmap_plot"]:
    correlation_heatmap(data_enhanced, figsize=(15, 15))

***
### 5.2.3 Removing

In [696]:
if config["correlations"]["categoricals"]["removing"]:
    features_to_delete = []
    data.drop(columns=features_to_delete, inplace=True)
    categorical_data = categorical_data.drop(features_to_delete)

***
### 5.2.4 Result

In [697]:
if config["correlations"]["categoricals"]["result_heatmap_plot"]:
    correlation_heatmap(data_enhanced, figsize=(15, 15))

***
# 6 Final state

In [698]:
if config["final"]["overview_plot"]:
    dataframe_distribution_overview(data, figsize=(30, 3))

***
# 7 Saving

In [699]:
data.to_csv("data/data-cleaned.csv", sep=",")