# Preprocessing NaN values
### for the 7 last columns
    17  Можно с детьми/животными  17272 non-null  object
        Create columns: "pets_okay" (and fill NA), "kids_okay" (and fill NA)
    18  Дополнительно             23011 non-null  object
        Create columns for potentially impactful factors: Мебель в комнатах (and count, then fill NA if needed or drop the column), Кондиционер (likewise), Посудомоечная машина (likewise)
    15  Окна                      16755 non-null  object
        Values: categorical, [На улицу и двор' 'Во двор' 'На улицу']. Create "only_street_view"
    16  Санузел                   20696 non-null  object
        Values: need to make them numerical and categorical ("total_bathrooms", "only_one"), then fill NaN.
    22  Лифт                      17868 non-null  object
        Create columns: "has_elevator", "has_cargo_elevator"
    21  Высота потолков, м        11206 non-null  float64
        Remove outliers, transform into meters, fill NaN, consider creating categorical (low, medium, high)
    23  Мусоропровод              12846 non-null  object
        Fill NA.

In [3]:
import pandas as pd

import numpy as np
from numpy import mean
from numpy import std
from numpy import ravel

import seaborn as sns
import matplotlib.pyplot as plt

import scipy
import joblib
import re
import glob
import sklearn
from math import sqrt
from decimal import Decimal

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

from sklearn.model_selection import cross_val_score

from sklearn.model_selection import GridSearchCV

from sklearn.metrics import make_scorer
from sklearn.metrics import mean_squared_error
# from sklearn.metrics import mean_absolute_percentage_error


from geopy.exc import GeocoderTimedOut
from geopy.distance import geodesic
from geopy.geocoders import Nominatim

from tqdm import tqdm

# !pip install CurrencyConverter
from currency_converter import CurrencyConverter

In [4]:
df = pd.read_csv('../data/_data.csv')

## Column "Можно с детьми/животными"

####  New column "Можно с животными"

In [5]:
df['pets_okay'] = df['Можно с детьми/животными'].apply(lambda x: 1 if isinstance(x, str) and 'Можно с животными' in x else (0 if isinstance(x, str) else np.nan))
mode_pets_okay = df['pets_okay'].mode()[0]
df['pets_okay'].fillna(mode_pets_okay, inplace=True)
print(df['pets_okay'])

0        1.0
1        0.0
2        0.0
3        1.0
4        0.0
        ... 
23363    0.0
23364    0.0
23365    0.0
23366    0.0
23367    1.0
Name: pets_okay, Length: 23368, dtype: float64


In [6]:
percent_counts = df['pets_okay'].value_counts(normalize=True) * 100

# Print the percentage counts
print("Percentage of each value in 'pets_okay':")
print(percent_counts)

Percentage of each value in 'pets_okay':
pets_okay
0.0    69.453954
1.0    30.546046
Name: proportion, dtype: float64


####  New column "Можно с детьми"

In [7]:
df['kids_okay'] = df['Можно с детьми/животными'].apply(lambda x: 1 if isinstance(x, str) and 'Можно с детьми' in x else (0 if isinstance(x, str) else np.nan))
mode_kids_okay = df['kids_okay'].mode()[0] # 🚧
df['kids_okay'].fillna(mode_kids_okay, inplace=True)
print(df['kids_okay'])

0        1.0
1        1.0
2        1.0
3        0.0
4        1.0
        ... 
23363    1.0
23364    1.0
23365    1.0
23366    1.0
23367    1.0
Name: kids_okay, Length: 23368, dtype: float64


In [8]:
percent_counts = df['kids_okay'].value_counts(normalize=True) * 100

# Print the percentage counts
print("Percentage of each value in 'kids_okay':")
print(percent_counts)

Percentage of each value in 'kids_okay':
kids_okay
1.0    98.977234
0.0     1.022766
Name: proportion, dtype: float64


## Column "Дополнительно"
#### Create columns for potentially impactful factors: Мебель в комнатах (and count, then fill NA if needed or drop the column), Кондиционер (likewise), Посудомоечная машина (likewise)

In [10]:
def create_column(df, new_col_name, phrase):
    df[new_col_name] = df['Дополнительно'].apply(
        lambda x: 1 if isinstance(x, str) and phrase in x else (0 if isinstance(x, str) else np.nan)
    )
    # Fill NaN values with the mode
    mode_value = df[new_col_name].mode()[0]
    df[new_col_name].fillna(mode_value, inplace=True)
    return df

# Create new columns
df = create_column(df, 'furniture_in_the_room', 'Мебель в комнатах')
df = create_column(df, 'air_conditioner', 'Кондиционер')
df = create_column(df, 'dishwashing', 'Посудомоечная машина')
df = create_column(df, 'fridge', 'Холодильник')
df = create_column(df, 'internet', 'Интернет')

In [13]:
# Checking descriptive stats for each main feature
def calculate_percentage(df, column_name):
    percent_counts = df[column_name].value_counts(normalize=True) * 100
    print(f"Percentage of each value in '{column_name}':")
    print(percent_counts)

calculate_percentage(df, 'furniture_in_the_room')
calculate_percentage(df, 'air_conditioner')
calculate_percentage(df, 'dishwashing')
# Doing reality check - these columns should largely have 1 
calculate_percentage(df, 'fridge')
calculate_percentage(df, 'internet')


print(df[['furniture_in_the_room', 'air_conditioner', 'dishwashing', 'fridge', 'internet']])


Percentage of each value in 'furniture_in_the_room':
furniture_in_the_room
1.0    92.643786
0.0     7.356214
Name: proportion, dtype: float64
Percentage of each value in 'air_conditioner':
air_conditioner
0.0    62.358781
1.0    37.641219
Name: proportion, dtype: float64
Percentage of each value in 'dishwashing':
dishwashing
0.0    66.48836
1.0    33.51164
Name: proportion, dtype: float64
Percentage of each value in 'fridge':
fridge
1.0    88.124786
0.0    11.875214
Name: proportion, dtype: float64
Percentage of each value in 'internet':
internet
1.0    80.498973
0.0    19.501027
Name: proportion, dtype: float64
       furniture_in_the_room  air_conditioner  dishwashing  fridge  internet
0                        1.0              1.0          1.0     1.0       1.0
1                        1.0              1.0          1.0     1.0       1.0
2                        1.0              1.0          1.0     1.0       1.0
3                        1.0              1.0          1.0     1.0      

## Column "Окна"
#### Create column only_street_view with supposedly negative impact


In [14]:
df['only_street_view'] = df['Окна'].apply(
    lambda x: 1 if isinstance(x, str) and 'На улицу' in x else (0 if isinstance(x, str) else np.nan)
)
mode_only_street_view = df['only_street_view'].mode()[0]
df['only_street_view'].fillna(mode_only_street_view, inplace=True)
percent_counts = df['only_street_view'].value_counts(normalize=True) * 100

print("Percentage of each value in 'only_street_view':")
print(percent_counts)


Percentage of each value in 'only_street_view':
only_street_view
0.0    74.815988
1.0    25.184012
Name: proportion, dtype: float64


## Column "Высота потолков"
#### Values: transform into meters, ❗️ Remove outliers, fill NaN, consider creating ❗️ categorical levels (low, medium, high)


## Column "Санузел"
#### Values: need to make them numerical and categorical ("total number", "only совмещенный"), then fill NaN.


## Column "Лифт"
#### Create columns: "has_elevator", "has_cargo_elevator"


## Column "Мусоропровод"

