# 1 Dependency import

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

***
# 2 Data loading

In [None]:
# Read the raw 2015 benchmarking export (comma-separated) into a DataFrame.
data = pd.read_csv(
    "data/2015-building-energy-benchmarking.csv",
    sep=",",
)

***
# 3 Overview

In [None]:
# Bar chart of the number of non-null values in every column, which makes
# sparsely filled columns easy to spot.
non_null_counts = data.count()

plt.figure(figsize=(30, 3))
sns.barplot(x=data.columns, y=non_null_counts)

plt.title("Number of values per column", size=20)
plt.ylabel("Number values", size=16)
# Column names are long: tilt them and anchor on the right edge of each tick.
plt.xticks(rotation=45, size=16, ha="right")
plt.yticks(size=16)
plt.show()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric
# columns; shown as the cell output.
data.describe()

In [None]:
# Summary of the object-dtype (text) columns: count, number of unique values,
# most frequent value and its frequency.
data.describe(include="object")

In [None]:
# Print each column's dtype and non-null count plus the frame's memory usage.
data.info()

***
# 4 Correlations

In [None]:
# Pairwise correlation between the numeric features of the raw data.
# The numeric/boolean columns are selected explicitly: since pandas 2.0,
# DataFrame.corr() no longer drops non-numeric columns on its own and raises
# as soon as the frame contains a text column.
numeric_data = data.select_dtypes(include=["number", "bool"])
correlation = numeric_data.corr()

# The matrix is symmetric, so hide the upper triangle (diagonal included).
mask = np.triu(np.ones_like(correlation, dtype=bool))

plt.figure(figsize=(30, 20))

sns.heatmap(data=correlation, mask=mask, annot=True, vmax=.75, center=0,
            square=True, linewidths=.5, cbar_kws={"shrink": .5})

plt.title("Correlation heatmap", size=20)
plt.xticks(rotation=45, size=16, ha="right")
plt.yticks(size=16)
plt.show()

In [None]:
# Keep only the object-dtype (text) columns of the dataset.
categs = data.select_dtypes(include=["object"])

In [None]:
def categorize(feature):
    """Encode a categorical column as numeric category codes.

    The column is cast to the pandas ``category`` dtype and every value is
    replaced by the integer code of its category.

    Missing values, which pandas encodes with the sentinel code ``-1``, are
    returned as NaN. Downstream statistics (e.g. correlations) then skip
    them instead of treating "missing" as a real category ranked below all
    the others.

    Args:
        feature: pandas Series to encode.

    Returns:
        A Series with the same index holding the category codes, with NaN
        wherever the input value was missing (the result is floating point
        in that case).
    """
    codes = feature.astype("category").cat.codes
    # -1 is the "not in any category" sentinel; blank it out.
    return codes.where(codes != -1)

In [None]:
# Encode every text column with categorize(), one column at a time.
categs_to_nums = categs.apply(categorize)

In [None]:
# Tag every encoded column with a "_CATEG" suffix so its name cannot clash
# with the original column when the two frames are joined.
# A single rename with a callable maps each label exactly once, so a column
# whose name equals another column's suffixed name is never renamed twice,
# and the labels are scanned only once.
categs_to_nums.rename(columns=lambda name: name + "_CATEG", inplace=True)

In [None]:
# Append the encoded "_CATEG" columns to the original frame, aligned on the
# row index.
data_enhanced = data.join(other=[categs_to_nums], how="left")

In [None]:
# Pairwise correlation including the encoded categorical ("_CATEG") columns.
# data_enhanced still carries the original text columns, so the
# numeric/boolean columns are selected explicitly: since pandas 2.0,
# DataFrame.corr() raises on non-numeric columns instead of dropping them.
numeric_data_enhanced = data_enhanced.select_dtypes(include=["number", "bool"])
correlation = numeric_data_enhanced.corr()

# The matrix is symmetric, so hide the upper triangle (diagonal included).
mask = np.triu(np.ones_like(correlation, dtype=bool))

plt.figure(figsize=(30, 20))

sns.heatmap(data=correlation, mask=mask, annot=True, vmax=.75, center=0,
            square=True, linewidths=.5, cbar_kws={"shrink": .5})

plt.title("Correlation heatmap", size=20)
plt.xticks(rotation=45, size=16, ha="right")
plt.yticks(size=16)
plt.show()

***
# 5 Saving

In [None]:
# Persist the DataFrame as comma-separated text.
# NOTE(review): this writes `data`, not `data_enhanced`, and to_csv's default
# index=True also stores the row index as an unnamed first column; confirm
# both are intended.
data.to_csv(path_or_buf="data/2015-cleaned.csv", sep=",")

***
# 6 ...