# 1. Data preprocessing

Some notes about the clean dataset in **data/properties.csv** :

- There are about 76 000 properties, roughly equally spread across houses and apartments
- Each property has a unique identifier **id**
- The target variable is **price**
- Variables prefixed with **fl_** are dummy variables (1/0)
- Variables suffixed with **_sqm** indicate the measurement is in square meters
- All missing categories for the categorical variables are encoded as **MISSING**

## Preparation of the dataset for machine learning

- Handling NaNs (hint: **imputation**)
- Converting categorical data into numeric features (hint: **one-hot encoding**)
- Rescaling numeric features (hint: **standardization**)

## Exploring the dataset


In [None]:
# Import libraries
import pandas as pd
import numpy as np
import csv

In [None]:
# Read the csv file
df = pd.read_csv("../data/properties.csv")

In [None]:
# Display the head
df.head()

In [None]:
print("There are {} rows of data".format(len(df)))

In [None]:
# (rows,columns)
df.shape

In [None]:
# Describe index
df.index 

In [None]:
# Describe df columns
df.columns

In [None]:
# Info on df
df.info()

In [None]:
# Number of non-NA values
df.count()

In [None]:
# Descriptive statistics
df.describe()

In [None]:
# descriptive statistics for all columns in df, including both numeric and non-numeric (categorical) columns

df.describe(include="all").T  # Transpose the data frame so that it fits in a cell

In [None]:
# check for missing (NaN or null) values in each column, 
# count the number of missing values per column, 
# and then sort the results in descending order

df.isna().sum().sort_values(ascending=False)

In [None]:
# Check whether any fully duplicated rows exist (False means there are none)
df.duplicated().any()

In [None]:
# Dropping the "id" column
df_drop_id = df.drop(["id"], axis=1)

## Cleaning the data

In [None]:
# Replace the "MISSING" placeholder with NaN in the categorical columns only.
# np.nan is used because the np.NAN alias was removed in NumPy 2.0.
missing_column = ["region", "province", "locality", "equipped_kitchen", "state_building", "epc", "heating_type"]
df_missing_col = df[missing_column].replace('MISSING', np.nan)
display(df_missing_col)

# Replace the "MISSING" placeholder with NaN across the whole dataframe
# (replace returns a new dataframe; df itself is left untouched)
df_missing = df.replace('MISSING', np.nan)
display(df_missing)


In [None]:
df_missing.isna().sum().sort_values(ascending=False)


In [None]:
df_missing.describe(include="all").T

In [None]:
df_missing.dtypes

In [None]:
# Print the distinct values found in every column of the dataframe
for column, series in df.items():
    unique_values = series.unique()
    print(f"Unique values in column '{column}' is : {unique_values}")

In [None]:
# Distinct values of a single column
unique_value = df["subproperty_type"].unique()
print(unique_value)

# Distinct values of several categorical columns at once:
# unique() is applied to each selected column separately
unique_values_multi = df[['property_type', 'subproperty_type', 'region', 'province', 'locality', 'equipped_kitchen', 'state_building', 'epc', 'heating_type']].apply(lambda x: x.unique())
print("\nUnique values in columns:\n", unique_values_multi)


In [None]:
# Print the distinct values of each categorical column, one column per line
columns = df[['property_type', 'subproperty_type', 'region', 'province', 'locality', 'equipped_kitchen', 'state_building', 'epc', 'heating_type']]
for column, values in columns.items():
    multi_columns = values.unique()
    print(f"Unique values in column '{column}': {multi_columns}")

## Handling NaNs with imputation

Mean/Median Imputation

In [None]:
# Specify columns to impute
impute_columns = ["total_area_sqm", "surface_land_sqm", "nbr_frontages", "nbr_bedrooms", "terrace_sqm", "garden_sqm", "primary_energy_consumption_sqm", "cadastral_income"]

# Select numerical data
impute_df = df[impute_columns]

In [None]:
df.select_dtypes(include=float)

In [None]:
# Mean imputation: fill every NaN with the mean of its own column,
# working on the numeric subset selected above (impute_df)
mean_values = impute_df.mean()
mean_imputation = impute_df.fillna(mean_values)
mean_imputation.head()

In [None]:
# Median imputation: fill every NaN with the median of its own column,
# working on the numeric subset selected above (impute_df)
median_values = impute_df.median()
median_imputation = impute_df.fillna(median_values)
median_imputation.head()

Random Sample Imputation

In [None]:
def random_sample_imputation(dfcopy, random_state=0):
    """Fill missing values by sampling from each column's observed values.

    For every column that contains at least one NaN, replacement values are
    drawn with replacement from the non-missing entries of that same column
    and written into the missing positions.

    Args:
        dfcopy: DataFrame to impute. It is modified in place, so pass a copy
            (e.g. ``df.copy()``) to keep the original data untouched.
        random_state: Seed forwarded to ``Series.sample`` for every column.
            The default of 0 keeps the results reproducible.

    Returns:
        The same ``dfcopy`` object with the gaps filled. Columns without any
        observed value are left unchanged because there is nothing to sample
        from.
    """
    cols_with_missing_values = dfcopy.columns[dfcopy.isna().any()].tolist()

    for var in cols_with_missing_values:
        missing_mask = dfcopy[var].isna()
        observed = dfcopy.loc[~missing_mask, var]

        # An all-NaN column has no donor values; sampling from it would raise.
        if observed.empty:
            continue

        # Draw one donor value per missing entry
        random_sample = observed.sample(
            n=int(missing_mask.sum()),
            replace=True,
            random_state=random_state,
        )

        # Assign by position: the sampled values carry the donors' index
        # labels, which must not be used for alignment.
        dfcopy.loc[missing_mask, var] = random_sample.to_numpy()

    return dfcopy

In [None]:
dfcopy = df.copy()
random_sample_imp_df = random_sample_imputation(dfcopy)
random_sample_imp_df.head()

Plotting distribution

In [None]:
mean_imputation["garden_sqm Mean Imp"] = mean_imputation["garden_sqm"]
median_imputation["garden_sqm Median Imp"] = median_imputation["garden_sqm"]
random_sample_imp_df["garden_sqm Random Imp"] = random_sample_imp_df["garden_sqm"]

In [None]:
import matplotlib.pyplot as plt
plt.figure(figsize=(12,8))

# Overlay the density of the original column with the density obtained after
# each imputation strategy, so their effect on the distribution can be compared
df["garden_sqm"].plot(kind='kde',color='blue')
mean_imputation["garden_sqm Mean Imp"].plot(kind='kde',color='yellow')
median_imputation["garden_sqm Median Imp"].plot(kind='kde',color='red')
# The random sample imputation was prepared above but was missing from the plot
random_sample_imp_df["garden_sqm Random Imp"].plot(kind='kde',color='green')

# The legend labels come from the names of the plotted series
plt.legend()

## Convert categorical data into numerical features

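Only the dummy variables prefixed with **fl_** are already encoded as 1 and 0. The remaining categorical columns (e.g. property_type, region, epc) still hold text categories and need to be encoded.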

https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OneHotEncoder.html

https://www.geeksforgeeks.org/ml-one-hot-encoding-of-datasets-in-python/

In [None]:
# One hot encoding

# Specify columns to convert
category_columns = ["fl_furnished", "fl_open_fire", "fl_terrace", "fl_garden", "fl_swimming_pool", "fl_floodzone", "fl_double_glazing"]

In [None]:
# importing libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder

# Retrieve data
data = pd.read_csv("../data/properties.csv")

# Convert type of columns to category
data["fl_furnished"] = data["fl_furnished"].astype("category")
data["fl_open_fire"] = data["fl_open_fire"].astype("category")

# Assign numerical values (category codes) and store in another column
data["fur_new"] = data["fl_furnished"].cat.codes
data["fire_new"] = data["fl_open_fire"].cat.codes

# Create instance of One-hot-encoder
enc = OneHotEncoder()

# Encode the code columns; fit_transform returns a sparse matrix, hence toarray()
encoded = enc.fit_transform(data[["fur_new", "fire_new"]]).toarray()

# Name the new columns after the encoded feature/category and keep the row
# index of `data`, so the join below is explicit and the result is readable
enc_data = pd.DataFrame(
    encoded,
    columns=enc.get_feature_names_out(),
    index=data.index,
)

# Merge with main
new_df = data.join(enc_data)

display(new_df.head())

## Encoding categorical features

https://scikit-learn.org/stable/modules/preprocessing.html#encoding-categorical-features

## Rescale numeric features with standardization

In [None]:
from sklearn.preprocessing import StandardScaler

# define data: every column whose dtype is not object
# NOTE(review): this presumably also selects the "id" identifier and the
# "price" target - confirm whether they should really be rescaled
data = df.select_dtypes(exclude='object')
display(data.head())

# define standard scaler
scaler = StandardScaler()

# transform data (fit_transform returns a plain NumPy array)
scaled_data = scaler.fit_transform(data)
display(scaled_data)

# Rebuild a dataframe with the original column names AND row index, so the
# scaled values stay aligned with the rows of `data`
scaled_df = pd.DataFrame(scaled_data, columns=data.columns, index=data.index)
display(scaled_df.head())

In [None]:
# histograms of the variables
from matplotlib import pyplot
df.hist()
pyplot.show()