## Environment and inputs

In [1]:
#
from unidecode import unidecode
import pandas as pd
import numpy as np
import re

In [102]:
# read inputs
df = pd.read_csv("./.data/processed.csv")
print(f"Input dataset num obs and features: {df.shape}")

# get raw input stats
raw_feature_stats = []
for col in df.columns:
    raw_feature_stats.append(
        {
            "feature": col, 
            "unique" : df[col].nunique(), 
            "na_rate": df[col].isna().sum() / df.shape[0]
        }
    )

# examine the most dense features
print(f"Available apartment features: {df.columns}")
raw_feature_stats = pd.DataFrame(raw_feature_stats ) 
print(raw_feature_stats.sort_values("na_rate")[:10])

Input dataset num obs and features: (5817, 24)
Available apartment features: Index(['price', 'place', 'name', 'stavba', 'stav_objektu', 'vlastnictvi',
       'podlazi', 'energeticka_narocnost_budovy', 'vytah', 'uzitna_plocha',
       'umisteni_objektu', 'vybaveni', 'parkovani', 'terasa', 'balkon',
       'lodzie', 'sklep', 'bezbarierovy', 'garaz', 'bazen', 'id',
       'locality_district_id', 'voda', 'odpad'],
      dtype='object')
                 feature  unique   na_rate
1                  place    2030  0.000000
2                   name     967  0.000000
3                 stavba       7  0.000000
4           stav_objektu       9  0.000000
5            vlastnictvi       3  0.000000
6                podlazi     449  0.000000
9          uzitna_plocha     239  0.000000
21  locality_district_id      11  0.000000
20                    id    5817  0.000000
0                  price    2187  0.046416


## Treat NA's and dtypes
In this section we:
1. Treat categorical features by imputing `<NA>` strings.
2. Treat "flag" features (e.g. if an apartment includes garage).
3. Treat numeric features (e.g. apartment area) by converting them to valid floats.

*Note*: The choice to leave missing categorical feature values and assign them a separate level was made because<br> of potential unwanted noise that could be introduced by trying to estimate their likely real values.

In [103]:
# impute flat NA's for set features
df.loc[df["odpad"].isna(), "odpad"] = "<NA>"
df.loc[df["voda" ].isna(), "voda" ] = "<NA>"

# parse target (price) - and drop corrupted
def func(val):
    if isinstance(val, str):
        val = unidecode(val)
        return float(val.replace(" ", ""))
    else: return float("nan")

# extract price, convert to mil CZK, filter
df["price"] = df["price"].apply(func)
df = df.loc[~df["price"].isna(), :]
df["price"] = df["price"] / 1e6
df = df.copy()

# parse binary features (flags)
def func(val):
    if (str(val) == "nan" or val == "False" or val is None): 
        return 0
    else: 
        return 1

flag_features = ["vytah", "bezbarierovy", "garaz", 
"bazen", "vybaveni", "parkovani"]
for feature in flag_features: 
    df[feature] = df[feature].apply(func)
df = df.copy()

# parse area features
def func(val):
    try: 
        if str(val) != "nan": return float(val)
        else: return 0.0
    except: return 0.0

area_features = ["terasa", "balkon", "lodzie", "sklep"]
for feature in area_features: 
    df[feature] = df[feature].apply(func)
df = df.copy()

# parse categorical features
def func(val):
    if not isinstance(val, str):
        return "<NA>"
    else:
        return str(val)

df["locality_district_id"] = df["locality_district_id"].astype("str")
cat_features = ["stavba", "stav_objektu", "vlastnictvi", 
"umisteni_objektu"]
for feature in cat_features: 
    df[feature] = df[feature].apply(func)
df = df.copy()

## Extract special features
In this section we extract features that are not readily available, e.g. apartment planning type is embedded within an unstructured text string and needs to be fetched using regex.

In [104]:
# extract energy class of an apartment
def func(val):
    try: return val.split(" - ")[0]
    except: return "<NA>"

df["energy_class"] = df["energeticka_narocnost_budovy"].apply(func)
df = df.drop("energeticka_narocnost_budovy", axis=1)
df = df.copy()

def extract_floor(df):
    """Extracts floor number."""

    floors = []
    for floor_str in df["podlazi"]:
        if "přízemí" not in floor_str:
            floor = re.findall(r".*?podlaží", floor_str)[0]
            floor = floor.replace(". podlaží", "")
        else: floor = "0"
        floors.append(floor)

    df = df.drop("podlazi", axis=1)
    df["floor"] = floors

    return df

df = extract_floor(df)
df = df.copy()

# extract apartment planning (number of rooms and kitchen placement)
def extract_planning(df):
    """Extracts apartment planning from name."""

    def func(name):

        name = name.replace("Prodej bytu ", "" )
        name = name.replace("Dražba bytu ", "" )
        name = name.replace("atypické "   , "" )
        name = unidecode(name.strip( ))

        if "+1" in name or "+kk" in name: return name.split(" ")[0]
        elif "6 pokojů a více" in name: return "6 pokojů a více"
        else: return "single"
    
    df["planning"] = df["name"].apply(func)

    return df

df = extract_planning(df)
df = df.copy()

# extract apartment mutation (e.g. loft)
def extract_mutation(df):
    """Extracts apratment mutation from name."""

    def func(name):
        try: return re.findall(r"\(.*?\)", name)[0]
        except: return "(Basic)"

    df["mutation"] = df["name"].apply(func)

    return df

df = extract_mutation(df)
df = df.drop("name", axis=1)
df = df.copy()

# extract Prague district number (location) from address
def extract_district(val):
    """Extracts Prague district number."""
    try:
        if " - " in val:
            parts = val.split(" - ")
            try: num = parts[-2].split(" ")[-1]
            except: pass
        else:
            parts = val.split(" ")
            num = parts[-1]
        num = int(num)
        return str(num)
    except:
        return "<NA>"

df["district"] = df["place"].apply(extract_district)
df = df.copy()

# extract locality (sub-district level)
def extract_locality(val):
    """Extracts locality name."""
    if " - " in val:
        parts = val.split(" - ")
        try:
            dist_ = parts[-1]
            if "Praha" not in dist_:
                return dist_
            else:
                return "<NA>"
        except:
            return "<NA>"
    else:
        return "<NA>"

df["locality"] = df["place"].apply(extract_locality)
df = df.drop("place", axis=1)
df = df.copy()

## Prettify feature names
In this section we prettify feature names and categorical feature levels by:
1. Prefixing features names with their data types: ``cat_`` for categorical, ``num_`` for numeric, and ``has_`` for binary.
2. Replacing spaces and translating them from Czech to English.

The description of final features is the following:
- ``num_price``         : asked price in million CZK, target
- ``num_area``          : area of an apartment in square meters
- ``num_terrace_area``  : area of a terrace, 0.0 if not present (applies to following num_ features)
- ``num_balcony_area``  : area of a balcony
- ``num_storage_area``  : area of a dedicated storage space
- ``num_cellar_area``   : area of a dedicated cellar space
- ``has_elevator``      : binary integer flag indicating presence of an elevator
- ``has_furniture``     : indicator if the apartment comes with furniture
- ``has_parking``       : indicator if the apartment comes with a dedicated parking slot
- ``has_barrierfree``   : indicator if the apartment building provides barrier-free entrance
- ``has_garage``        : indicator if the apartment comes with a dedicated garage
- ``has_swimming_pool`` : indicator if the apartment has a swimming pool
- ``cat_locality_district``: integer id of apartment location used for sreality internal purposes (mapping to real locations is unknown.)
- ``cat_locality``      : name of Prague sub-district where the aparment is located (e.g. Vinohrady, Prosek, etc.)
- ``cat_district``      : name of Prague numeric district (e.g. Praha 2, Praha 9, etc.)
- ``cat_quality_state`` : category indicating if the apartment is in a good state (e.g. "new", "bad sate", etc.)
- ``cat_bulding_type``  : type of the building the apartment is in (e.g. block of flats)
- ``cat_ownership``     : type of current ownership of the apartment (e.g. personal / communal ownership)
- ``cat_neighborhood``  : type of neighborhood (e.g. calm suburb / busy centre, etc.)
- ``cat_watersupply``   : type of water supply (e.g. central / local, etc.)
- ``cat_wasteamenities``: type of waste disposal available
- ``cat_energy_class``  : energy-efficiency classification (e.g. A/B/C, etc.)
- ``cat_floor``         : the floor the apartment is located in
- ``cat_planning``      : type of planning (determines the number of rooms and kitchen disposition)
- ``cat_apartment_type``: special type of the apartment if applicable (e.g. apartment is the last floor in the building)

In [105]:
naming_dict = {
    "price"               : "num_price",
    "stav_objektu"        : "cat_quality_state",
    "stavba"              : "cat_building_type",
    "vlastnictvi"         : "cat_ownership",
    "vytah"               : "has_elevator",
    "uzitna_plocha"       : "num_area",
    "umisteni_objektu"    : "cat_neighborhood",
    "vybaveni"            : "has_furniture",
    "parkovani"           : "has_parking",
    "terasa"              : "num_terrace_area",
    "balkon"              : "num_balcony_area",
    "lodzie"              : "num_storage_area",
    "sklep"               : "num_cellar_area",
    "bezbarierovy"        : "has_barrierfree",
    "garaz"               : "has_garage",
    "bazen"               : "has_swimming_poool",
    "locality_district_id": "cat_locality_district",
    "voda"                : "cat_watersupply",
    "odpad"               : "cat_wasteamenities",
    "energy_class"        : "cat_energy_class",
    "floor"               : "cat_floor",
    "planning"            : "cat_planning",
    "mutation"            : "cat_apartment_type",
    "district"            : "cat_district",
    "locality"            : "cat_locality"
}

df = df.rename(naming_dict, axis=1)

def func(val):
    return unidecode(val).replace(" ", "_").lower()
cat_features = [feature for feature in df.columns if "cat" in feature]
for feature in cat_features:
    df[feature] = df[feature].apply(func)
    df[feature] = df[feature].astype(str)

has_features = [feature for feature in df.columns if "has" in feature]
for feature in has_features:
    df[feature] = df[feature].astype(int)

num_features = [feature for feature in df.columns if "num" in feature]
for feature in num_features:
    df[feature] = df[feature].astype(float)

In [106]:
df.dtypes

num_price                float64
cat_building_type         object
cat_quality_state         object
cat_ownership             object
has_elevator               int64
num_area                 float64
cat_neighborhood          object
has_furniture              int64
has_parking                int64
num_terrace_area         float64
num_balcony_area         float64
num_storage_area         float64
num_cellar_area          float64
has_barrierfree            int64
has_garage                 int64
has_swimming_poool         int64
id                         int64
cat_locality_district     object
cat_watersupply           object
cat_wasteamenities        object
cat_energy_class          object
cat_floor                 object
cat_planning              object
cat_apartment_type        object
cat_district              object
cat_locality              object
dtype: object

## Dump features locally

In [107]:
df.to_parquet("./.data/features", index=None)