# 2. Preprocessing

## Preparation of the dataset for machine learning

- Handling NaNs (hint: **imputation**)
- Converting categorical data into numeric features (hint: **one-hot encoding**)
- Rescaling numeric features (hint: **standardization**)

## Handling NaNs with imputation

In [None]:
# Import libraries
import pandas as pd
import numpy as np
import csv

# Load the cleaned property dataset; the path is relative to the
# notebook's working directory.
df = pd.read_csv("../data/cleaned_properties.csv")

# Preview the first rows to sanity-check the load
df.head()

Mean/Median Imputation

In [None]:
# Columns whose missing values will be imputed
impute_columns = [
    "total_area_sqm",
    "surface_land_sqm",
    "nbr_frontages",
    "nbr_bedrooms",
    "terrace_sqm",
    "garden_sqm",
    "primary_energy_consumption_sqm",
    "cadastral_income",
]

# Restrict the frame to exactly those columns
impute_df = df.loc[:, impute_columns]

In [None]:
# Preview only the float-typed columns; nothing is assigned here, df is unchanged.
df.select_dtypes(include=float)

In [None]:
# NOTE(review): this rebinds `df` to its float-only subset, so every later
# cell that reads `df` sees only the float columns and the raw frame is no
# longer reachable without re-reading the CSV. Consider a separate name.
df = df.select_dtypes(include=float)

# mean imputation: fill each column's NaNs with that column's mean
mean_values = df.mean()
mean_imputation = df.fillna(mean_values)
mean_imputation.head()

In [None]:
# Median imputation, limited to the columns named in impute_columns:
# each column's NaNs are replaced by that column's median.
median_values = df.loc[:, impute_columns].median()
median_imputation = df.loc[:, impute_columns].fillna(value=median_values)
median_imputation.head()

Random Sample Imputation

In [None]:
def random_sample_imputation(dfcopy, random_state=0):
    """Fill missing values with random draws from each column's observed values.

    For every column that contains at least one missing value, as many
    observed (non-missing) values as there are gaps are sampled with
    replacement and written into the missing positions.

    Parameters
    ----------
    dfcopy : pandas.DataFrame
        Frame to impute. It is modified IN PLACE, so pass a copy if the
        original must stay untouched.
    random_state : int, optional
        Seed forwarded to ``Series.sample`` for every column (default 0, the
        value that was previously hard-coded), which keeps results reproducible.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, after imputation. Columns that
        contain no observed value at all are left unchanged.
    """
    cols_with_missing_values = dfcopy.columns[dfcopy.isna().any()].tolist()

    for var in cols_with_missing_values:
        missing_mask = dfcopy[var].isna()
        observed = dfcopy[var].dropna()

        # A column without any observed value offers nothing to sample from;
        # skip it instead of letting ``sample`` raise on an empty series.
        if observed.empty:
            continue

        # extract a random sample, one draw per missing entry
        random_sample = observed.sample(
            int(missing_mask.sum()), random_state=random_state, replace=True
        )

        # replace the NA positionally: handing over a plain array avoids any
        # index-label alignment, so the draws land on the masked rows in order
        # even when the frame's index contains duplicate labels.
        dfcopy.loc[missing_mask, var] = random_sample.to_numpy()

    return dfcopy

In [None]:
# random_sample_imputation writes into the frame it receives, so hand it a
# copy to keep df itself unmodified.
dfcopy = df.copy()
random_sample_imp_df = random_sample_imputation(dfcopy)
random_sample_imp_df.head()

Plotting distribution

In [None]:
# Duplicate the imputed garden_sqm column of each frame under a name that
# identifies the imputation method, so the series can be told apart when plotted.
mean_imputation["garden_sqm Mean Imp"] = mean_imputation["garden_sqm"]
median_imputation["garden_sqm Median Imp"] = median_imputation["garden_sqm"]
random_sample_imp_df["garden_sqm Random Imp"] = random_sample_imp_df["garden_sqm"]

In [None]:
import matplotlib.pyplot as plt

# Compare the density of garden_sqm before imputation with its density
# after each imputation strategy, all drawn on one shared axes.
fig, ax = plt.subplots(figsize=(12, 8))

df["garden_sqm"].plot(kind='kde', color='blue', ax=ax)
mean_imputation["garden_sqm Mean Imp"].plot(kind='kde', color='yellow', ax=ax)
median_imputation["garden_sqm Median Imp"].plot(kind='kde', color='red', ax=ax)
# The random-sample series was prepared alongside the other two but was
# previously missing from the figure.
random_sample_imp_df["garden_sqm Random Imp"].plot(kind='kde', color='green', ax=ax)

ax.set_title("garden_sqm: density before and after imputation")
ax.set_xlabel("garden_sqm")
# Legend entries come from the series names set above
ax.legend();

## Convert categorical data into numerical features

The categorical flag columns (`fl_*`) are already stored as the numeric values 1 and 0. The cell below nevertheless demonstrates one-hot encoding on two of them.

https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OneHotEncoder.html

https://www.geeksforgeeks.org/ml-one-hot-encoding-of-datasets-in-python/

In [None]:
# One hot encoding

# Flag columns selected for conversion
category_columns = [
    "fl_furnished",
    "fl_open_fire",
    "fl_terrace",
    "fl_garden",
    "fl_swimming_pool",
    "fl_floodzone",
    "fl_double_glazing",
]

In [None]:
# importing libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder

# Retrieve data
data = pd.read_csv("../data/properties.csv")

# Convert type of columns to category
data["fl_furnished"] = data["fl_furnished"].astype("category")
data["fl_open_fire"] = data["fl_open_fire"].astype("category")

# Assign numerical values and store in another column
# (pandas category codes use -1 for missing values)
data["fur_new"] = data["fl_furnished"].cat.codes
data["fire_new"] = data["fl_open_fire"].cat.codes

# Create instance of One-hot-encoder
enc = OneHotEncoder()

# Pass encoded columns. The encoder returns a sparse matrix without labels,
# so rebuild a DataFrame that carries descriptive column names
# (e.g. "fur_new_0") and the same row index as `data`; otherwise the new
# columns would just be called 0, 1, 2, ... and the join below would depend
# on `data` having a default RangeIndex.
encoded_columns = ["fur_new", "fire_new"]
enc_data = pd.DataFrame(
    enc.fit_transform(data[encoded_columns]).toarray(),
    columns=enc.get_feature_names_out(),
    index=data.index,
)

# Merge with main (index-aligned join)
new_df = data.join(enc_data)

display(new_df)

## Encoding categorical features

https://scikit-learn.org/stable/modules/preprocessing.html#encoding-categorical-features

## Rescale numeric features with standardization

In [None]:
from sklearn.preprocessing import StandardScaler

# define data: every column of df that is not of object dtype
data = df.select_dtypes(exclude='object')
display(data.head())

# define standard scaler
scaler = StandardScaler()

# transform data (by default the result is a bare array without row/column labels)
scaled_data = scaler.fit_transform(data)
display(scaled_data)

# Wrap the scaled values in a DataFrame again, restoring the column names and
# the original row index so scaled rows stay aligned with `data` even when
# its index is not a default 0..n-1 range.
scaled_df = pd.DataFrame(scaled_data, columns=data.columns, index=data.index)
display(scaled_df)

In [None]:
# histograms of the variables: one subplot per column of df.
# NOTE(review): df was narrowed to its float columns in the mean-imputation
# cell, so only those columns appear here — confirm this is intended.
from matplotlib import pyplot
df.hist()
pyplot.show()