# 0 Configuration

In [None]:
# Boolean switches for the notebook. The cells below test them with
# `if config[...]` guards to enable or skip individual plots and cleaning steps.
config = {
    "overview": {
        "overview_plot": False
    },
    "cleaning": {
        "emptyness": {
            "overview_plot": False,
            # Drop the columns selected by index_lth (except "Outlier").
            "remove_empty": True
        },
        "types": {
            # Convert "TaxParcelIdentificationNumber" with pd.to_numeric.
            "convert": True
        },
        "categoricals": {
            # NOTE(review): no visible cell reads this flag.
            "overview_plot": False,
            "usability": {
                # Drop the "City" and "State" columns.
                "remove": True
            },
            "inconsistencies": {
                "strip_and_lower": True
            }
        },
        "numericals": {
            "overview_plot": False,
            "usability": {
                # Drop the "DataYear" column.
                "remove": True
            },
            "redundancy": {
                "overview_plot": False,
                "distrib_corr_plot": False,
                # Drop one column of each redundant pair.
                "remove": True
            },
            "outliers": {
                # Drop the rows that hold the maximum of more than 3 numerical columns.
                "remove": False,
                # Replace out-of-range values through OutlierIqrProcessor.
                "iqr_remove": False,
                # Drop the rows whose "Outlier" column is not null.
                "supplied_remove": True,
                "plot": False,
            },
            "scaling": {
                "distrib_plot_pre": False,
                # Min-max scale the numerical columns to the range (0, 100).
                "scale": True,
                "distrib_plot_post": False,
            }
        }
    },
    "correlations": {
        "numericals": {
            "overview_heatmap_plot": True,
            "removing": False,
            "result_heatmap_plot": True
        },
        "categoricals": {
            # Encode the categorical columns as integer category codes.
            "labelisation": True,
            "overview_heatmap_plot": True,
            "removing": False,
            "result_heatmap_plot": True
        }
    },
    "final": {
        "overview_plot": False
    }
}

***
# 1 Dependency import

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from fuzzywuzzy import process
import chardet

from math import ceil

from sklearn.preprocessing import minmax_scale

pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)

np.random.seed(0)

In [None]:
def dataframe_distribution_overview(data, figsize=(10, 3)):
    """Show a bar chart of the number of non-null values in each column.

    Args:
        data: DataFrame whose columns are counted with ``data.count()``.
        figsize: Figure size forwarded to ``plt.figure``.
    """
    non_null_counts = data.count()
    label_size = 16

    plt.figure(figsize=figsize)
    sns.barplot(x=data.columns, y=non_null_counts)

    plt.title("Number of values per column", size=20)
    plt.ylabel("Number values", size=label_size)
    plt.xticks(rotation=45, size=label_size, ha="right")
    plt.yticks(size=label_size)
    plt.show()

In [None]:
def index_lth(data, percentage: int):
    """Return the labels of the columns filled at less than ``percentage`` %.

    Args:
        data: DataFrame to inspect.
        percentage: Threshold expressed in percent of the row count.

    Returns:
        Index of the columns whose non-null count is strictly below
        ``percentage`` percent of ``data.shape[0]``.
    """
    fraction = percentage / 100
    non_null_counts = data.count()
    too_sparse = non_null_counts < data.shape[0] * fraction
    # Boolean self-indexing keeps only the entries that are True.
    return too_sparse[too_sparse].index

In [None]:
def multi_plot_numerical(data, features, kind="hist", n_cols=8, figsize=(30, 10), wspace=0.35, hspace=0.35):
    """Draw one plot per feature on a grid of ``n_cols`` columns.

    Args:
        data: DataFrame holding the features.
        features: Sequence of column labels, indexed by position.
        kind: ``"box"`` draws a box plot; any other value draws a histogram
            with a KDE curve.
        n_cols: Number of grid columns; the row count is derived from it.
        figsize: Figure size forwarded to ``plt.subplots``.
        wspace: Horizontal spacing forwarded to ``plt.subplots_adjust``.
        hspace: Vertical spacing forwarded to ``plt.subplots_adjust``.
    """
    feature_nb = len(features)
    n_rows = ceil(feature_nb / n_cols)

    plt.subplots(n_rows, n_cols, figsize=figsize)
    plt.subplots_adjust(wspace=wspace, hspace=hspace)

    # Grid cells are filled in order; cells past the last feature stay empty.
    for position in range(feature_nb):
        feature = features[position]
        plt.subplot(n_rows, n_cols, position + 1)

        if kind == "box":
            axis = sns.boxplot(y=data[feature])
        else:
            axis = sns.histplot(data=data[feature], kde=True)

        axis.set_xlabel(feature, fontsize=12)
        axis.set_ylabel(None)

    plt.show()

In [None]:
def create_subplot(dataset, feature, n_rows, index, is_numeric):
    """Draw one feature of ``dataset`` in cell ``index`` of an ``n_rows`` x 2 grid.

    Numeric features get a box plot. Other features get a count plot when they
    have between 1 and 20 distinct values, otherwise a two-bar chart comparing
    the number of distinct values with the remaining row count.

    Args:
        dataset: DataFrame holding the feature.
        feature: Column label to plot; also used as the y-axis label.
        n_rows: Number of rows of the subplot grid.
        index: 1-based position of the cell in the grid.
        is_numeric: Whether the feature is treated as numeric.
    """
    plt.subplot(n_rows, 2, index)
    column = dataset[feature]
    n_uniques = column.unique().size

    if is_numeric:
        axis = sns.boxplot(y=column)
    elif 0 < n_uniques <= 20:
        axis = sns.countplot(x=column)
        plt.xticks(rotation=45, size=8, ha="right")
    else:
        summary = pd.DataFrame(
            data=[[n_uniques, dataset.shape[0] - n_uniques]],
            columns=["uniques", "not_uniques"],
        )
        axis = sns.barplot(data=summary)

    axis.set_xlabel(None)
    axis.set_ylabel(feature, fontsize=14)


def dataset_diff_analysis(data1, data2, exclude=None, figsize=(15, 200)):
    """Plot every feature of two datasets side by side, one row per feature.

    The left column of the grid shows ``data1`` and the right column shows
    ``data2``; a cell stays empty when the dataset lacks the feature.

    Args:
        data1: First DataFrame (left column).
        data2: Second DataFrame (right column).
        exclude: Column labels to leave out; ``None`` excludes nothing.
        figsize: Figure size forwarded to ``plt.subplots``.
    """
    # A None default avoids sharing one mutable list between calls.
    excluded = [] if exclude is None else exclude
    features = data1.columns.union(data2.columns).difference(excluded)
    n_cols = 2
    n_rows = len(features)

    plt.subplots(n_rows, n_cols, figsize=figsize)
    plt.subplots_adjust(wspace=0.35, hspace=0.5)

    for row, f in enumerate(features):
        in_data1 = f in data1.columns
        in_data2 = f in data2.columns

        # The dtype is taken from data1 when it has the feature, else from data2.
        f_type = data1[f].dtype if in_data1 else data2[f].dtype
        is_numeric = f_type in ["int64", "float64"]

        # Subplot positions are 1-based: row r uses cells 2r+1 and 2r+2.
        left_cell = 2 * row + 1
        if in_data1:
            create_subplot(data1, f, n_rows, left_cell, is_numeric)
        if in_data2:
            create_subplot(data2, f, n_rows, left_cell + 1, is_numeric)

    print(f"features: {features}")
    plt.show()

In [None]:
def decribe_several(feature, *df):
    """Describe the same feature taken from several DataFrames.

    Args:
        feature: Column label present in every frame.
        *df: DataFrames to compare.

    Returns:
        ``describe()`` of a frame with one column per input frame, named
        ``<feature>_1``, ``<feature>_2``, ... in argument order.
    """
    columns = {
        feature + "_" + str(position): frame[feature]
        for position, frame in enumerate(df, start=1)
    }
    return pd.DataFrame(columns).describe()

In [None]:
def head_several(feature, nb, *df):
    """Show the first rows of the same feature taken from several DataFrames.

    Args:
        feature: Column label present in every frame.
        nb: Number of rows to return.
        *df: DataFrames to compare.

    Returns:
        The first ``nb`` rows of a frame with one column per input frame,
        named ``<feature>_1``, ``<feature>_2``, ... in argument order.
    """
    columns = {
        feature + "_" + str(position): frame[feature]
        for position, frame in enumerate(df, start=1)
    }
    return pd.DataFrame(columns).head(nb)

In [None]:
class OutlierProcessor():
    """Count and replace the values of selected columns that fall outside a range.

    A value is an outlier when it is strictly below ``lower_trig`` or strictly
    above ``upper_trig``; values equal to a bound are kept.
    """

    def __init__(self, data, features, lower_trig, upper_trig):
        """Store the data and the bounds.

        Args:
            data: DataFrame to inspect; modified only by ``replace(inplace=True)``.
            features: Column label(s) to process.
            lower_trig: Lower bound, compared against the selected columns.
            upper_trig: Upper bound, compared against the selected columns.
        """
        self.data = data
        self.features = features
        self.lower_trig = lower_trig
        self.upper_trig = upper_trig
        self.__above = 0
        self.__below = 0
        self.__total = 0

    def __print(self):
        """Print the bounds and the counts computed by the last ``analyse`` call."""
        print(f"lower_trig: {self.lower_trig}")
        print(f"upper_trig: {self.upper_trig}")
        print(f"below: {self.__below}")
        print(f"above: {self.__above}")
        print(f"total: {self.__total}")

    def analyse(self):
        """Count the values below and above the bounds, then print the result."""
        self.__below = self.data[self.data[self.features] < self.lower_trig][self.features].count()
        self.__above = self.data[self.data[self.features] > self.upper_trig][self.features].count()
        self.__total = self.__below + self.__above
        self.__print()

    def replace(self, replace_by=np.nan, inplace=False):
        """Replace the values lying outside the bounds.

        Missing values compare as False against the bounds, so they are
        replaced by ``replace_by`` as well.

        Args:
            replace_by: Replacement value for out-of-range entries.
            inplace: When True, write the result back into ``self.data``.

        Returns:
            The processed columns, or None when ``inplace`` is True.
        """
        selected = self.data.loc[:, self.features]
        # Inclusive bounds match analyse(), which only counts values strictly
        # outside the range. They also keep a column whose bounds coincide
        # (e.g. a zero IQR) from being wiped entirely.
        within_bounds = (selected >= self.lower_trig) & (selected <= self.upper_trig)
        result = selected.where(cond=within_bounds, other=replace_by)
        if inplace:
            self.data[self.features] = result
            return None
        return result



In [None]:
class OutlierIqrProcessor(OutlierProcessor):
    """OutlierProcessor whose bounds are Q1 - 1.5 * IQR and Q3 + 1.5 * IQR."""

    def __init__(self, data, features, exclude=None):
        """Compute the per-feature bounds from the quartiles of ``data``.

        Args:
            data: DataFrame to inspect.
            features: Iterable of column labels to process.
            exclude: Column labels to leave out of ``features``; ``None``
                excludes nothing.
        """
        # The default None must not reach the `in` test, which would raise TypeError.
        excluded = () if exclude is None else exclude
        self.features = [feature for feature in features if feature not in excluded]
        self.__q1 = data[self.features].quantile(0.25)
        self.__q3 = data[self.features].quantile(0.75)
        self.__iqr = self.__q3 - self.__q1
        upper_trig = self.__q3 + (1.5 * self.__iqr)
        lower_trig = self.__q1 - (1.5 * self.__iqr)
        super().__init__(data, self.features, lower_trig, upper_trig)


In [None]:
def correlation_heatmap(dataset, figsize=(30, 20)):
    """Show the lower triangle of the correlation matrix of ``dataset`` as a heatmap.

    Args:
        dataset: DataFrame whose ``corr()`` matrix is plotted.
        figsize: Figure size forwarded to ``plt.figure``.
    """
    plt.figure(figsize=figsize)

    corr_matrix = dataset.corr()
    # Mask the upper triangle, diagonal included, so each pair appears once.
    upper_triangle = np.triu(np.ones_like(corr_matrix, dtype=bool))

    heatmap_options = {
        "mask": upper_triangle,
        "annot": True,
        "vmax": .75,
        "center": 0,
        "square": True,
        "linewidths": .5,
        "cbar_kws": {"shrink": .5},
    }
    sns.heatmap(data=corr_matrix, **heatmap_options)

    plt.title("Correlation heatmap", size=20)
    plt.yticks(size=16)
    plt.xticks(rotation=45, size=16, ha="right")
    plt.show()

In [None]:
def unique_several(dataset, features, take=20):
    """List the distinct values of several features side by side.

    Features with exactly one or two distinct values, and features with more
    than ``take`` distinct values, are only reported by name through ``print``.

    Args:
        dataset: DataFrame holding the features.
        features: Iterable of column labels to inspect.
        take: Maximum number of distinct values for a feature to be listed.

    Returns:
        DataFrame with one column per listed feature containing its distinct
        values.
    """
    listed = {}
    single_valued = []
    two_valued = []
    oversized = []

    for name in features:
        values = dataset[name].unique()
        n_values = values.size

        if n_values > take:
            oversized.append((name, n_values))
        elif n_values == 1:
            single_valued.append(name)
        elif n_values == 2:
            two_valued.append(name)
        else:
            listed[name] = pd.Series(data=values, name=name, dtype="object")

    print(f"Only one unique: {single_valued}")
    print(f"Only two uniques: {two_valued}")
    print(f"Too many uniques: {oversized}")

    return pd.DataFrame(data=listed)

In [None]:
def strip_and_lower(dataset, features, to_strip=None, inplace=False):
    """Strip and lower-case the string values of several features.

    Args:
        dataset: DataFrame holding the features.
        features: Column labels to normalise.
        to_strip: Forwarded to ``Series.str.strip``.
        inplace: When True, write the result back into ``dataset``.

    Returns:
        DataFrame of the normalised columns, or None when ``inplace`` is True.
    """
    cleaned = {}
    for name in features:
        stripped = dataset[name].str.strip(to_strip=to_strip)
        as_object = pd.Series(data=stripped, name=name, dtype="object")
        cleaned[name] = as_object.str.lower()

    result = pd.DataFrame(data=cleaned)
    if not inplace:
        return result
    dataset[features] = result

In [None]:
def fuzzy_matching_several(dataset, fuzzy, limit=10):
    """Collect the closest fuzzy matches for several (feature, query) pairs.

    Args:
        dataset: DataFrame holding the features.
        fuzzy: Iterable of ``(feature, match)`` pairs, where ``match`` is the
            string searched for in column ``feature``.
        limit: Maximum number of matches kept per feature.

    Returns:
        DataFrame with one column per feature holding the entries returned by
        ``process.extract`` for that feature.
    """
    # Only `process` is imported at file level; the bare name `fuzzywuzzy`
    # was never bound, so the scorer has to be imported explicitly.
    from fuzzywuzzy import fuzz, process

    fuzzy_data = {}
    for feature, match in fuzzy:
        fuzzy_matches = process.extract(match, dataset[feature], limit=limit, scorer=fuzz.token_sort_ratio)
        fuzzy_data[feature] = pd.Series(data=fuzzy_matches, name=feature, dtype="object")
    return pd.DataFrame(data=fuzzy_data)

In [None]:
def feature_distribution_multivar(data, features, figsize=(10, 3), wspace=0.2, hspace=0.2, corr_scale=(0.75, 0), xlim=(None, None), ylim=(None, None)):
    """Show the density of several features next to their correlation heatmap.

    Args:
        data: DataFrame holding the features.
        features: Column labels to compare.
        figsize: Figure size forwarded to ``plt.subplots``.
        wspace: Horizontal spacing forwarded to ``plt.subplots_adjust``.
        hspace: Vertical spacing forwarded to ``plt.subplots_adjust``.
        corr_scale: ``(vmax, center)`` pair forwarded to ``sns.heatmap``.
        xlim: ``(left, right)`` limits of the density plot's x axis.
        ylim: ``(bottom, top)`` limits of the density plot's y axis.
    """
    plt.subplots(1, 2, figsize=figsize)
    plt.subplots_adjust(wspace=wspace, hspace=hspace)

    # Left panel: one filled KDE curve per feature.
    plt.subplot(1, 2, 1)
    for f in features:
        # `fill` is the current name of the deprecated `shade` parameter.
        sns.kdeplot(data=data[f], label=f, fill=True)

    plt.title("Distribution", size=20)
    plt.xticks(size=16)
    plt.yticks(size=16)
    plt.xlim(xlim[0], xlim[1])
    plt.ylim(ylim[0], ylim[1])
    plt.xlabel(None)
    plt.ylabel("Density", size=16)
    plt.legend()

    # Right panel: lower triangle of the correlation matrix of the features.
    plt.subplot(1, 2, 2)
    correlation = data.loc[:, features].corr()
    mask = np.triu(np.ones_like(correlation, dtype=bool))

    sns.heatmap(data=correlation, mask=mask, annot=True, vmax=corr_scale[0], center=corr_scale[1],
                square=True, linewidths=.5, cbar_kws={"shrink": .5})

    plt.title("Correlation", size=20)
    plt.xticks(rotation=45, size=12, ha="right")
    plt.yticks(rotation=0, size=12, va="center")

    plt.show()


***
# 2 Data loading

In [None]:
data = pd.read_csv("data/2016-building-energy-benchmarking.csv", delimiter=",")

***
# 3 Overview

In [None]:
if config["overview"]["overview_plot"]:
    dataframe_distribution_overview(data, figsize=(30, 3))

In [None]:
targets = ["SiteEnergyUse(kBtu)", "GHGEmissionsIntensity"]

***
# 4 Cleaning

***
## 4.1 Emptiness

In [None]:
cols_to_remove = index_lth(data, 10).difference(["Outlier"])

In [None]:
if config["cleaning"]["emptyness"]["remove_empty"]:
    data.drop(columns=cols_to_remove, inplace=True)

In [None]:
if config["cleaning"]["emptyness"]["overview_plot"]:
    dataframe_distribution_overview(data, figsize=(30, 3))

***
## 4.2 Types

In [None]:
data.describe(include="object")

In [None]:
data.describe()

In [None]:
if config["cleaning"]["types"]["convert"]:
    data["TaxParcelIdentificationNumber"] = pd.to_numeric(data["TaxParcelIdentificationNumber"], errors="coerce")
    data["TaxParcelIdentificationNumber"].describe()

***
## 4.3 Categoricals

### 4.3.1 Overview

In [None]:
categorical_data = data.select_dtypes("object").columns

In [None]:
data[categorical_data].describe()

***
### 4.3.2 Usability

The features "City" and "State" always have the same value. They bring no useful information.

In [None]:
if config["cleaning"]["categoricals"]["usability"]["remove"]:
    features_to_delete = ["City", "State"]
    data.drop(columns=features_to_delete, inplace=True)
    categorical_data = categorical_data.drop(features_to_delete)

***
### 4.3.3 Inconsistencies

In [None]:
unique_several(data, categorical_data, take=50)

In [None]:
if config["cleaning"]["categoricals"]["inconsistencies"]["strip_and_lower"]:
    strip_and_lower(data, categorical_data, inplace=True)
    unique_several(data, categorical_data, take=50)

In [None]:
# fuzzy = [
#     ("Neighborhood_2016", "Ballard"),
# ]
# fuzzy_matching_several(data)

***
## 4.4 Numericals

### 4.4.1 Overview

In [None]:
numerical_data = data.select_dtypes(["int64", "float64"]).columns

In [None]:
data[numerical_data].describe()

In [None]:
if config["cleaning"]["numericals"]["overview_plot"]:
    multi_plot_numerical(data, numerical_data, kind="box", n_cols=6, hspace=0.4, wspace=0.30, figsize=(30, 25))

***
### 4.4.2 Usability

In [None]:
data[numerical_data].describe()

In [None]:
if config["cleaning"]["numericals"]["usability"]["remove"]:
    features_to_delete = ["DataYear"]
    data.drop(columns=features_to_delete, inplace=True)
    numerical_data = numerical_data.drop(features_to_delete)

***
### 4.4.2 Redundancy

In [None]:
if config["cleaning"]["numericals"]["redundancy"]["overview_plot"]:
    dataframe_distribution_overview(data[numerical_data], figsize=(25, 3))

***
#### 4.4.2.1 SiteEUI(kBtu/sf), SiteEUIWN(kBtu/sf)

In [None]:
features = ["SiteEUI(kBtu/sf)", "SiteEUIWN(kBtu/sf)"]

In [None]:
if config["cleaning"]["numericals"]["redundancy"]["distrib_corr_plot"]:
    feature_distribution_multivar(data, features, figsize=(15, 5), hspace=0.2, wspace=0.75, corr_scale=(1, 0), xlim=(-50, 300))

In [None]:
if config["cleaning"]["numericals"]["redundancy"]["remove"]:
    features_to_delete = ["SiteEUIWN(kBtu/sf)"]
    data.drop(columns=features_to_delete, inplace=True)
    numerical_data = numerical_data.drop(features_to_delete)

***
#### 4.4.2.2 SourceEUI(kBtu/sf), SourceEUIWN(kBtu/sf)

In [None]:
features = ["SourceEUI(kBtu/sf)", "SourceEUIWN(kBtu/sf)"]

In [None]:
if config["cleaning"]["numericals"]["redundancy"]["distrib_corr_plot"]:
    feature_distribution_multivar(data, features, figsize=(15, 5), hspace=0.2, wspace=0.75, corr_scale=(1, 0), xlim=(-100, 800))

In [None]:
if config["cleaning"]["numericals"]["redundancy"]["remove"]:
    features_to_delete = ["SourceEUIWN(kBtu/sf)"]
    data.drop(columns=features_to_delete, inplace=True)
    numerical_data = numerical_data.drop(features_to_delete)

***
#### 4.4.2.3 SiteEnergyUse(kBtu), SiteEnergyUseWN(kBtu)

In [None]:
features = ["SiteEnergyUse(kBtu)", "SiteEnergyUseWN(kBtu)"]

In [None]:
if config["cleaning"]["numericals"]["redundancy"]["distrib_corr_plot"]:
    feature_distribution_multivar(data, features, figsize=(15, 5), hspace=0.2, wspace=0.75, corr_scale=(1, 0), xlim=(-0.15e8, 0.35e8))

In [None]:
if config["cleaning"]["numericals"]["redundancy"]["remove"]:
    features_to_delete = ["SiteEnergyUseWN(kBtu)"]
    data.drop(columns=features_to_delete, inplace=True)
    numerical_data = numerical_data.drop(features_to_delete)

***
#### 4.4.2.4 Electricity(kWh), Electricity(kBtu)

In [None]:
features = ["Electricity(kWh)", "Electricity(kBtu)"]

In [None]:
if config["cleaning"]["numericals"]["redundancy"]["distrib_corr_plot"]:
    feature_distribution_multivar(data, features, figsize=(15, 5), hspace=0.2, wspace=0.75, corr_scale=(1, 0), xlim=(-1e7, 1.5e7))

In [None]:
if config["cleaning"]["numericals"]["redundancy"]["remove"]:
    features_to_delete = ["Electricity(kBtu)"]
    data.drop(columns=features_to_delete, inplace=True)
    numerical_data = numerical_data.drop(features_to_delete)

***
#### 4.4.2.5 NaturalGas(therms), NaturalGas(kBtu)

In [None]:
features = ["NaturalGas(therms)", "NaturalGas(kBtu)"]

In [None]:
if config["cleaning"]["numericals"]["redundancy"]["distrib_corr_plot"]:
    feature_distribution_multivar(data, features, figsize=(15, 5), hspace=0.2, wspace=0.75, corr_scale=(1, 0), xlim=(-2, 5))

In [None]:
data[features].describe()

In [None]:
if config["cleaning"]["numericals"]["redundancy"]["remove"]:
    features_to_delete = ["NaturalGas(therms)"]
    data.drop(columns=features_to_delete, inplace=True)
    numerical_data = numerical_data.drop(features_to_delete)

***
### 4.4.3 Outliers removal

In [None]:
data[numerical_data].describe()

In [None]:
values = data[numerical_data].idxmax().value_counts()
data.loc[values[values > 3].index]

In [None]:
if config["cleaning"]["numericals"]["outliers"]["remove"]:
    data.drop(index=values[values > 3].index, inplace=True)

In [None]:
if config["cleaning"]["numericals"]["outliers"]["iqr_remove"]:
    # Replace, directly in `data`, the values that OutlierIqrProcessor treats
    # as out of range (NaN is the default replacement). "Latitude" and
    # "Longitude" are left out of the processed columns.
    o_proc = OutlierIqrProcessor(data, numerical_data, exclude=["Latitude", "Longitude"])
    # NOTE: replace(inplace=True) returns nothing, so `temp` is always None.
    temp = o_proc.replace(inplace=True)


In [None]:
# Drop the rows flagged by the dataset's own "Outlier" column (any non-null value).
if config["cleaning"]["numericals"]["outliers"]["supplied_remove"]:
    outliers_index = data[data["Outlier"].notna()].index
    data.drop(index=outliers_index, inplace=True)

# The "Outlier" column itself is removed whether or not the flagged rows were
# dropped, and it is taken out of the categorical feature list as well.
features_to_delete = ["Outlier"]
data.drop(columns=features_to_delete, inplace=True)
categorical_data = categorical_data.drop(features_to_delete)

In [None]:
if config["cleaning"]["numericals"]["outliers"]["plot"]:
    multi_plot_numerical(data, numerical_data, kind="box", n_cols=6, hspace=0.4, wspace=0.30, figsize=(30, 25))

***
### 4.4.4 Scaling

In [None]:
if config["cleaning"]["numericals"]["scaling"]["distrib_plot_pre"]:
    multi_plot_numerical(data, numerical_data, n_cols=5, hspace=0.4, wspace=0.2, figsize=(30, 50))

In [None]:
if config["cleaning"]["numericals"]["scaling"]["scale"]:
    # minmax_scale returns a bare array. The frame built from it must reuse
    # data.index: rows may have been dropped earlier, and assigning a frame
    # with a fresh 0..n-1 index would align on labels, misplacing values and
    # introducing NaN.
    data[numerical_data] = pd.DataFrame(data=minmax_scale(data[numerical_data], feature_range=(0, 100)), columns=numerical_data, index=data.index)
    data[numerical_data].head()

In [None]:
if config["cleaning"]["numericals"]["scaling"]["distrib_plot_post"]:
    multi_plot_numerical(data, numerical_data, n_cols=5, hspace=0.4, wspace=0.2, figsize=(30, 50))

***
## 4.5 Targets

In [None]:
data[targets].isna().any()

In [None]:
data.dropna(subset=targets, inplace=True)

In [None]:
data[targets].isna().any()

***
# 5 Correlations

## 5.1 Numerical features

### 5.1.1 Overview

In [None]:
targets

In [None]:
if config["correlations"]["numericals"]["overview_heatmap_plot"]:
    correlation_heatmap(data[numerical_data], figsize=(15, 15))

***
### 5.1.2 Removing

In [None]:
if config["correlations"]["numericals"]["removing"]:
    features_to_delete = []
    data.drop(columns=features_to_delete, inplace=True)
    numerical_data = numerical_data.drop(features_to_delete)

***
### 5.1.3 Result

In [None]:
if config["correlations"]["numericals"]["result_heatmap_plot"]:
    correlation_heatmap(data[numerical_data], figsize=(15, 15))

***
## 5.2 Categorical features

### 5.2.1 Labelisation

In [None]:
if config["correlations"]["categoricals"]["labelisation"]:
    # Encode every categorical column as integer category codes
    # (pandas assigns the code -1 to missing values).
    categs_to_nums = data[categorical_data].apply(lambda feature: feature.astype("category").cat.codes)

    # Build the whole mapping first and rename once, so no column can be
    # renamed twice and the frame is not rewritten once per column.
    categs_to_nums.rename(columns={col: col + "_CATEG" for col in categs_to_nums.columns}, inplace=True)

    # Put the targets next to the encoded features for the correlation heatmap.
    data_enhanced = categs_to_nums.join(data[targets])

***
### 5.2.2 Overview

In [None]:
if config["correlations"]["categoricals"]["overview_heatmap_plot"]:
    correlation_heatmap(data_enhanced, figsize=(15, 15))

***
### 5.2.3 Removing

In [None]:
if config["correlations"]["categoricals"]["removing"]:
    features_to_delete = []
    data.drop(columns=features_to_delete, inplace=True)
    categorical_data = categorical_data.drop(features_to_delete)

***
### 5.2.4 Result

In [None]:
if config["correlations"]["categoricals"]["result_heatmap_plot"]:
    correlation_heatmap(data_enhanced, figsize=(15, 15))

***
# 6 Final state

In [None]:
if config["final"]["overview_plot"]:
    dataframe_distribution_overview(data, figsize=(30, 3))

***
# 7 Saving

In [None]:
data.to_csv("data/data-cleaned.csv", sep=",")