In [1]:
import sys
import os
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import re
from collections import Counter
from sklearn.preprocessing import MinMaxScaler

# Show every column and the full cell text when a DataFrame is displayed.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)
# Silence NumPy's divide-by-zero warning (np.log is applied to zeros further down).
np.seterr(divide = 'ignore') 

# Make the helper scripts importable. The membership test uses the exact path
# that gets appended, so re-running this cell does not add duplicate entries.
module_path = os.path.abspath(os.path.join('..'))
scripts_path = module_path+"/pythonScripts"
if scripts_path not in sys.path:
    sys.path.append(scripts_path)

from URL_helper import URL_helper

# NOTE(review): absolute, machine-specific path; consider a configurable data directory.
df = pd.read_csv('/Users/eugenganenco/Desktop/srealtyAnalysis/data/DataFile 16_11_2022_22_43/housesDf_17_11_2022_00_06_15.csv translatedWithCoordAndPrice.csv')
# Drop columns whose name starts with 'Unnamed' and columns whose name contains 'ID'.
df = df.loc[:, ~df.columns.str.contains('^Unnamed')]
df = df.loc[:, ~df.columns.str.contains('ID')]


In [2]:
# Inspect the distinct values present in the 'Gas' column.
df['Gas'].unique()

array([nan, 'Individuální', 'Plynovod', 'Individuální, Plynovod'],
      dtype=object)

In [3]:
# Keep only rows with a non-zero price and drop columns that are not used further.
# Written as one non-inplace chain so `df` is a fresh frame rather than a slice
# mutated in place (avoids SettingWithCopy issues on later assignments).
df = df.loc[df['price'] != 0].drop(
    columns=['link', 'location', 'total price', 'Update', 'Move-in date', 'Discounted', 'Original price']
)
df.head()

Unnamed: 0,district,HouseType,Note on price,Building,Object status,Location of the house,Object location,House type,The floor,Usable area,Land area,Parking,Year of reconstruction,Water,Heating,Garbage,Electricity,Transport,Energy efficiency of the building,Equipment,Lift,companyName,proximityIndex,Index,Built-up area,Store,Garage,Gas,Telecommunication,Garden area,Year of approval,Swimming pool,Cena,Floor area,Communication,Indicator of energy efficiency of the building,Certificate of energy performance of the building,Barrier-free,Ceiling height,Housing costs,Number of flats,Starting price,Expert opinion,Minimum bid,Auction principal,Type of auction,Place of auction,The date of the auction,The date of the 1st inspection,The date of the 2nd inspection,Auction decree,Expert opinion.1,Pool area,Condition,Date of completion of construction,Soil construction,Sale start date,Inspection date,Inspection date to,Minimum purchase price,Share size,Number of owners,locationLat,locationLong,price
0,cheb,rodinne-domy\n,V případě více zájemců může RK využít pro výběr kupujícího formu aukce.,Cihlová,Dobrý,Samostatný,Okraj obce,Patrový,2,196 m2,681 m2,3.0,2022.0,Dálkový vodovod,Lokální plynové,"Veřejná kanalizace, Jímka",230V,"Silnice, Autobus",Třída C - Úsporná č. 148/2007 Sb. podle vyhlášky,0,0.0,M&M reality,846.615385,0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,50.1719781,12.3687621,6900000
1,cheb,rodinne-domy\n,"včetně provize, právních i finančních služeb",Cihlová,Dobrý,Řadový,Klidná část obce,Patrový,3,150 m2,356 m2,,,Místní zdroj,,ČOV pro celý objekt,230V,"Silnice, Autobus",Třída G - Mimořádně nehospodárná,,,Česká Společnost Realitní / Jura ex alto s.r.o.,1422.466667,1,73 m2,1.0,1.0,Individuální,"Telefon, Internet",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,50.2123211,12.2101758,2990000
2,cheb,rodinne-domy\n,"včetně provize, právních i finančních služeb",Cihlová,Velmi dobrý,Samostatný,Klidná část obce,Patrový,2,230 m2,977 m2,,,Dálkový vodovod,Ústřední plynové,Veřejná kanalizace,230V,"Vlak, Dálnice, Silnice, MHD, Autobus",Třída G - Mimořádně nehospodárná,1,,Česká Společnost Realitní / Jura ex alto s.r.o.,1923.666667,2,130 m2,,1.0,Plynovod,"Telefon, Internet",806 m2,2011.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,50.0574633,12.3974368,9900000
3,cheb,rodinne-domy\n,V případě více zájemců může RK využít pro výběr kupujícího formu aukce.,Cihlová,Před rekonstrukcí,Samostatný,Klidná část obce,Patrový,3,300 m2,11600 m2,4.0,,Místní zdroj,Lokální tuhá paliva,Septik,"230V, 400V","Silnice, Autobus",Třída F - Velmi nehospodárná č. 148/2007 Sb. podle vyhlášky,Částečně,0.0,M&M reality,2438.769231,3,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,50.2234326,12.3814405,4590000
4,cheb,rodinne-domy\n,,Cihlová,Velmi dobrý,,,Patrový,3,675 m2,1004 m2,,,,,,,,,,,Tout Puissant Almaz,1344.166667,5,,,,,,,,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,49.95143,12.6850936,15000000


In [4]:
# Number of rows that have no 'Parking' value.
df['Parking'].isna().sum()

7751

In [5]:
def createFeatureDict(column, data=None):
    """Count how often each token occurs in a multi-valued text column.

    Every non-missing cell is split on ',', ';' or '/' followed by one
    character, and each piece is lower-cased and left-stripped before counting.

    Parameters
    ----------
    column : str
        Name of the column to tokenize.
    data : pandas.DataFrame, optional
        Frame to read from. Defaults to the notebook-level `df`.

    Returns
    -------
    collections.Counter
        Mapping token -> number of occurrences.
    """
    if data is None:
        data = df
    featureDict = Counter()
    for cell in data[column].dropna():
        # NOTE(review): '/.' consumes the character right after the slash;
        # kept as-is because the generated feature names depend on it.
        featureDict.update(token.lower().lstrip() for token in re.split(',|;|/.', cell))
    return featureDict

def followsPattern(text, pattern):
    """Return True when `pattern` is found in the lower-cased, left-stripped `text`."""
    normalized = text.lower().lstrip()
    match = re.search(pattern, normalized)
    return match is not None

def createFeature(column, featureDict, data=None):
    """Add one indicator column per token of a multi-valued text column.

    For every non-empty key of `featureDict`, a column named after the token is
    added to the frame. It holds the string 'True' in rows whose lower-cased
    `column` text contains the token, and NaN elsewhere.

    Name and matcher are derived in the same loop iteration, so an empty key
    cannot shift the name/pattern pairing. The token is matched literally, so
    characters such as '.' or '+' are not interpreted as regex syntax.

    Parameters
    ----------
    column : str
        Source column holding the raw text.
    featureDict : mapping
        Its keys are the tokens to turn into columns (empty keys are skipped).
    data : pandas.DataFrame, optional
        Frame to modify in place. Defaults to the notebook-level `df`.
    """
    if data is None:
        data = df
    loweredText = data[column].str.lower()
    for feature in featureDict:
        if feature == '':
            continue
        print(feature)
        mask = loweredText.str.contains(feature, regex=False, na=False)
        # Index alignment leaves NaN in every row that is not matched.
        data[feature] = pd.Series('True', index=data.index[mask], dtype=object)


In [6]:
# Multi-valued text columns that are expanded into one indicator column per token below.
columnList = ['Heating', 'Garbage', 'Electricity', 'Telecommunication', 'Communication']

In [7]:
# Blank out cells containing '<selenium' followed by any character
# (presumably leftover Selenium object reprs from scraping; confirm against the scraper).
# np.nan is used because the np.NaN alias was removed in NumPy 2.0.
for column in columnList:
    df.loc[df[column].str.contains('<selenium.', na=False), column] = np.nan

In [8]:
# Replace each multi-valued column by one indicator column per token.
for column in columnList:
    # Assign the result back instead of a chained inplace fillna, which is not
    # guaranteed to write through to `df` under pandas Copy-on-Write.
    df[column] = df[column].fillna(f'{column}_Unknown')
    featureDict = createFeatureDict(column)
    createFeature(column, featureDict)
    df.drop(column, inplace=True, axis=1)

lokální plynové
heating_unknown
ústřední plynové
lokální tuhá paliva
jiné
lokální elektrické
ústřední tuhá paliva
ústřední elektrické
ústřední dálkové
podlahové
veřejná kanalizace
jímka
čov pro celý objekt
septik
garbage_unknown
trativod
230v
400v
electricity_unknown
120v
telecommunication_unknown
telefon
internet
kabelové rozvody
satelit
kabelová televize
ostatní
communication_unknown
asfaltová
neupravená
dlážděná
betonová
zpevněná
šotolina
štěrková


### I will first perform some modifications to the dataframe that should not cause data leakage

In [9]:
def extractInt(string):
    """Remove every 'm2' occurrence from `string`; the result is still a string."""
    return re.sub('(m2)', '', string)

# Every column whose name contains 'area'.
featureList = list(df.filter(like='area').columns)
featuresModified = df[featureList].applymap(extractInt, na_action='ignore')

# Convert the cleaned text to numbers; anything unparsable becomes NaN.
for feature in featureList:
    df[feature] = pd.to_numeric(featuresModified[feature].values, errors='coerce')

print(df.loc[:,featureList])

print(df['Built-up area'].dtypes)

       Usable area  Land area  Built-up area  Garden area  Floor area  \
0              196      681.0            NaN          NaN         NaN   
1              150      356.0           73.0          NaN         NaN   
2              230      977.0          130.0        806.0         NaN   
3              300    11600.0            NaN          NaN         NaN   
4              675     1004.0            NaN          NaN         NaN   
...            ...        ...            ...          ...         ...   
15544          166      691.0          222.0          NaN         NaN   
15545           31      100.0           25.0          NaN         NaN   
15546          101     1642.0            NaN          NaN         NaN   
15550          112        1.0          145.0          NaN         NaN   
15551          630     1050.0          218.0          NaN       831.0   

       Pool area  
0            NaN  
1            NaN  
2            NaN  
3            NaN  
4            NaN  
...      

This extracts the numerical value of the "area" variables.

In [10]:
# Tally every distinct fragment found in 'Note on price'
# (lower-cased, left-stripped, split on ',', ';' and '/' plus one character).
notesList = df['Note on price'].tolist()
notesDict = Counter(
    fragment.lower().lstrip()
    for note in notesList
    if not pd.isna(note)
    for fragment in re.split(',|;|/.', note)
)
print(notesDict)

Counter({'včetně provize': 1489, 'včetně právního servisu': 1306, 'v případě více zájemců může rk využít pro výběr kupujícího formu aukce.': 1293, 'včetně poplatků': 664, 'včetně dph': 561, 'cena k jednání': 379, '+ provize rk': 252, 'včetně provize rk': 206, 'včetně provize a právního servisu': 154, 'včetně provize a právních služeb': 75, 'včetně provize rk a právního servisu': 69, 'cena včetně provize a právního servisu': 61, 'cena včetně provize rk': 57, 'cena včetně 1% provize': 48, 'včetně právního servisu a provize rk': 44, 'cena včetně provize': 41, 'na klíč': 40, 'vč. zákl. desky a dph': 39, '+ provize': 39, 'bez poplatků': 35, 'včetně provize rk a právních služeb': 31, 'vč. provize rk': 30, 'konečná cena': 29, 'neplatíte provizi': 29, 'advokátní úschovy': 28, 'vč. provize a právního servisu': 27, 'vč. provize a právních služeb': 27, 'k jednání': 26, 'vč. provize': 26, 'cena včetně provize a právních služeb': 26, 'právního servisu': 23, 'kupní cena včetně provize zprostředkovat

The dictionary above lists the types of notes found in the 'Note on price' variable, sorted by frequency of occurrence.

In [11]:
# Assign back rather than using a chained inplace fillna (unreliable under Copy-on-Write).
df['Note on price'] = df['Note on price'].fillna('Nothing')
# The second positional argument of str.contains is `case`, not `flags`, so passing
# re.IGNORECASE there left the search case-sensitive; request it explicitly instead.
df.loc[df['Note on price'].str.contains('poplatků', case=False), ['Note on price']]

Unnamed: 0,Note on price
8,"včetně DPH, včetně poplatků, včetně provize, včetně právního servisu"
10,"včetně DPH, včetně poplatků, včetně provize, včetně právního servisu, cena k jednání"
32,"včetně poplatků, včetně provize, včetně právního servisu, cena k jednání"
154,včetně veškerých poplatků
188,"včetně poplatků, včetně provize, včetně právního servisu, cena k jednání"
...,...
15395,"včetně poplatků, včetně provize, včetně právního servisu"
15415,"včetně poplatků, včetně provize, včetně právního servisu"
15418,"včetně poplatků, včetně provize, včetně právního servisu"
15422,Uvedená cena bez provize RK a poplatků


Next, I will try to extract from the 'Note on price' variable whether the house price includes commission, legal fees, VAT and other fees, and whether the price is negotiable.

In [12]:
        
# Keywords looked for in 'Note on price'; each becomes a column of the same name.
notelist = ['provize', 'právního', 'dph', 'poplatků']
# Raw string: '\+' and '\s' are regex escapes, not valid Python string escapes.
# NOTE(review): the '.*' parts span commas, so 'včetně' in one clause can flag a
# keyword that appears in a later clause; confirm this is intended.
patternList = [re.compile(r'.*(včetně|\+)\s?.*{}.*'.format(note)) for note in notelist]
patternList.append(re.compile('.*cena k jednání'))
notelist.append('cena k jednání')

# Matching rows get the string 'True'; all other rows are left as NaN.
for note,patternToFollow in zip(notelist,patternList):
    df[note] = df.loc[df['Note on price'].apply(followsPattern, pattern=patternToFollow),'Note on price'].map(lambda x: 'True', na_action='ignore')

In [13]:
# The raw note text is no longer needed once the flag columns exist.
df.drop(columns=['Note on price'], inplace=True)

### Data separation into training set and testing set

In [14]:
from sklearn.model_selection import train_test_split
# Hold out 10% of the rows with a fixed seed.
# NOTE(review): the whole frame, including the 'price' target, is passed as X, so
# X_train/X_test still contain 'price'; later cells transform it as a regular column.
X_train, X_test, y_train, y_test = train_test_split(df,df['price'], test_size=0.1, random_state=0)

In [15]:
# Row/column counts of the two partitions.
X_train.shape, X_test.shape

((12475, 99), (1387, 99))

## Missing values

### Categorical Values

In [16]:
## First lets handle categorical features which are missing

def percentageMissing(feature):
    """Return the percentage of missing values of `feature` in X_train, rounded to 3 decimals."""
    missingValues = X_train[feature].isna().sum()
    totalRows = len(X_train.axes[0])
    return round((missingValues/totalRows) * 100, 3)

def getMissingFeaturesDict(featuresWithNan):
    """Return (feature, percentage missing) pairs sorted by percentage, highest first.

    Despite the name, the result is a list of tuples, not a dict.
    """
    percentages = {feature: percentageMissing(feature) for feature in featuresWithNan}
    return sorted(percentages.items(), key=lambda item: item[1], reverse=True)

# Object-dtype columns of the training set that contain at least one missing value.
featuresWithNan=[feature for feature in X_train.columns if X_train[feature].isnull().sum() > 0 and X_train[feature].dtypes=='O']
missingFeaturesDict = getMissingFeaturesDict(featuresWithNan)

# Iterate the (name, percentage) pairs directly; the message no longer has a stray quote.
for featureName, percentage in missingFeaturesDict:
    print('Variable "{}" has {}% of its values missing'.format(featureName, percentage))
    

Variable "Cena"" has 100.0% of its values missing
Variable "Starting price"" has 100.0% of its values missing
Variable "Expert opinion"" has 100.0% of its values missing
Variable "Minimum bid"" has 100.0% of its values missing
Variable "Auction principal"" has 100.0% of its values missing
Variable "Type of auction"" has 100.0% of its values missing
Variable "Place of auction"" has 100.0% of its values missing
Variable "The date of the auction"" has 100.0% of its values missing
Variable "The date of the 1st inspection"" has 100.0% of its values missing
Variable "The date of the 2nd inspection"" has 100.0% of its values missing
Variable "Auction decree"" has 100.0% of its values missing
Variable "Expert opinion.1"" has 100.0% of its values missing
Variable "Minimum purchase price"" has 100.0% of its values missing
Variable "Share size"" has 99.968% of its values missing
Variable "Inspection date to"" has 99.904% of its values missing
Variable "šotolina"" has 99.816% of its values missing

In [17]:
print(X_train.shape)
# Fraction of missing values per column; columns above 98% are removed.
missingShare = X_train.isnull().mean()
cols = X_train.columns[missingShare > 0.98]
X_train.drop(columns=cols, inplace=True)
print(X_train.shape)

(12475, 99)
(12475, 70)


This deletes the columns that have more than 98% missing values.

In [18]:
# Object-dtype columns that still contain missing values after the sparse columns were dropped.
catFeaturesWithNan=[feature for feature in X_train.columns if X_train[feature].isnull().sum() > 0 and X_train[feature].dtypes=='O']
missingFeaturesDict = getMissingFeaturesDict(catFeaturesWithNan)

# Iterate the (name, percentage) pairs directly; the message no longer has a stray quote.
for featureName, percentage in missingFeaturesDict:
    print('Variable "{}" has {}% of its values missing'.format(featureName, percentage))
    

Variable "neupravená"" has 97.571% of its values missing
Variable "zpevněná"" has 97.443% of its values missing
Variable "ústřední dálkové"" has 97.267% of its values missing
Variable "podlahové"" has 97.114% of its values missing
Variable "cena k jednání"" has 96.986% of its values missing
Variable "dlážděná"" has 96.842% of its values missing
Variable "Housing costs"" has 96.689% of its values missing
Variable "čov pro celý objekt"" has 96.489% of its values missing
Variable "kabelové rozvody"" has 95.992% of its values missing
Variable "kabelová televize"" has 95.784% of its values missing
Variable "satelit"" has 95.679% of its values missing
Variable "dph"" has 94.926% of its values missing
Variable "Certificate of energy performance of the building"" has 94.701% of its values missing
Variable "120v"" has 94.493% of its values missing
Variable "poplatků"" has 93.916% of its values missing
Variable "ústřední elektrické"" has 93.739% of its values missing
Variable "jiné"" has 92.265%

The model starts here

In [19]:
def fillNaCatFeature(dataset,catFeaturesWithNan):
    """Return a copy of `dataset` with missing values in the given columns set to 'Missing'.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame to impute; it is not modified.
    catFeaturesWithNan : list of str
        Columns whose missing values are replaced.

    Returns
    -------
    pandas.DataFrame
        Imputed copy of `dataset`.
    """
    # Copy the argument, not the global X_train, so the function works for any frame.
    data = dataset.copy()
    data[catFeaturesWithNan]=data[catFeaturesWithNan].fillna('Missing')
    return data

# Replace X_train by the copy returned from fillNaCatFeature for the listed categorical columns.
X_train = fillNaCatFeature(X_train,catFeaturesWithNan)

Imputes the cells that have missing values with a value called 'Missing'.

### Numerical variables

In [20]:
# Numeric-looking columns that are treated as categorical: missing entries get
# the label 'Missing', which leaves the columns with mixed number/string values.
numValsToCat = ['Barrier-free', 'Parking', 'Garage', 'Lift', 'Swimming pool']
X_train[numValsToCat] = X_train[numValsToCat].fillna('Missing')

In [21]:
# Every non-object column except 'Index' is treated as a numerical feature.
numFeatures = [feature for feature in X_train.columns if X_train[feature].dtypes != 'O' and feature not in ['Index']]
X_train[numFeatures]

Unnamed: 0,Usable area,Land area,Year of reconstruction,proximityIndex,Built-up area,Garden area,Year of approval,Floor area,price
151,734,5793.0,,249.083333,584.0,5209.0,,734.0,11900000
12063,643,2092.0,,1968.388889,349.0,1743.0,,,30900000
9880,236,801.0,,1539.266667,236.0,,,,6499000
557,90,521.0,,1889.875000,60.0,,,,1299000
9011,40,1068.0,,2229.230769,47.0,,,,2500000
...,...,...,...,...,...,...,...,...,...
14717,50,320.0,,1772.125000,25.0,,,,1500000
3571,180,717.0,,699.470588,96.0,,,,9950000
10965,276,1618.0,,1827.266667,172.0,,,,3150000
12062,650,2052.0,,1236.263158,269.0,,,,69000000


In [22]:
# Now let's handle numerical features which are missing.
# '> 0' (not '> 1'): a column with exactly one missing value must be imputed as well.
numFeaturesWithNan=[feature for feature in numFeatures if X_train[feature].isnull().sum()>0]
missingFeaturesDict = getMissingFeaturesDict(numFeaturesWithNan)

# Iterate the (name, percentage) pairs directly; the message no longer has a stray quote.
for featureName, percentage in missingFeaturesDict:
    print('Variable "{}" has {}% of its values missing'.format(featureName, percentage))

Variable "Year of approval"" has 91.142% of its values missing
Variable "Year of reconstruction"" has 89.892% of its values missing
Variable "Garden area"" has 70.814% of its values missing
Variable "Floor area"" has 70.405% of its values missing
Variable "Built-up area"" has 34.453% of its values missing
Variable "Land area"" has 0.072% of its values missing


### Impute missing values in numerical variables

In [23]:
# Numerical columns with missing values, before imputation.
X_train[numFeaturesWithNan]

Unnamed: 0,Land area,Year of reconstruction,Built-up area,Garden area,Year of approval,Floor area
151,5793.0,,584.0,5209.0,,734.0
12063,2092.0,,349.0,1743.0,,
9880,801.0,,236.0,,,
557,521.0,,60.0,,,
9011,1068.0,,47.0,,,
...,...,...,...,...,...,...
14717,320.0,,25.0,,,
3571,717.0,,96.0,,,
10965,1618.0,,172.0,,,
12062,2052.0,,269.0,,,


In [24]:
for feature in numFeaturesWithNan:
    # There are many outliers in the dataset, therefore the median is used.
    median=X_train[feature].median()

    # Keep a 0/1 indicator of which rows were originally missing.
    X_train[feature + 'Nan'] = np.where(X_train[feature].isnull(), 1, 0)
    # Assign back instead of a chained inplace fillna (unreliable under Copy-on-Write).
    X_train[feature] = X_train[feature].fillna(median)

In [25]:
# The same columns after median imputation.
X_train[numFeaturesWithNan]

Unnamed: 0,Land area,Year of reconstruction,Built-up area,Garden area,Year of approval,Floor area
151,5793.0,2016.0,584.0,5209.0,2013.0,734.0
12063,2092.0,2016.0,349.0,1743.0,2013.0,148.0
9880,801.0,2016.0,236.0,504.0,2013.0,148.0
557,521.0,2016.0,60.0,504.0,2013.0,148.0
9011,1068.0,2016.0,47.0,504.0,2013.0,148.0
...,...,...,...,...,...,...
14717,320.0,2016.0,25.0,504.0,2013.0,148.0
3571,717.0,2016.0,96.0,504.0,2013.0,148.0
10965,1618.0,2016.0,172.0,504.0,2013.0,148.0
12062,2052.0,2016.0,269.0,504.0,2013.0,148.0


### Normalization of numerical data

In [26]:
# Numerical columns, excluding 'Index' and the '...Nan' indicator columns.
numFeatures = [
    feature for feature in X_train.columns
    if re.search(r'Nan$', feature) is None
    and X_train[feature].dtypes != 'O'
    and feature not in ['Index']
]
# Natural log of every selected column at once (zeros become -inf).
X_train[numFeatures] = np.log(X_train[numFeatures])
X_train[numFeatures].head(5)

Unnamed: 0,Usable area,Land area,Year of reconstruction,proximityIndex,Built-up area,Garden area,Year of approval,Floor area,price
151,6.598509,8.664406,7.608871,5.517788,6.369901,8.558143,7.607381,6.598509,16.292049
12063,6.466145,7.645876,7.608871,7.584971,5.855072,7.463363,7.607381,4.997212,17.246267
9880,5.463832,6.685861,7.608871,7.339061,5.463832,6.222576,7.607381,4.997212,15.687159
557,4.49981,6.25575,7.608871,7.544266,4.094345,6.222576,7.607381,4.997212,14.077105
9011,3.688879,6.973543,7.608871,7.709412,3.850148,6.222576,7.607381,4.997212,14.731801


### Feature scaling

In [27]:
# Print the range of each log-transformed column to spot non-finite values.
for feature in numFeatures:
    print('{}; min: {}; max: {}'.format(feature, X_train[feature].min(), X_train[feature].max()))

Usable area; min: 0.0; max: 10.699213884343516
Land area; min: 0.0; max: 13.666759938199
Year of reconstruction; min: 0.6931471805599453; max: 9.91447694733573
proximityIndex; min: -inf; max: 8.42063334221138
Built-up area; min: 0.0; max: 9.297160063928743
Garden area; min: 0.0; max: 12.30965758578655
Year of approval; min: 7.495541943884256; max: 7.6128310304073565
Floor area; min: 0.0; max: 9.722924448757036
price; min: 11.407564949312402; max: 19.209138104316636


In [28]:
# Replace only genuine -inf entries (log of zero). Comparing against min() would
# overwrite a legitimate minimum whenever the column holds no -inf at all.
proximity = X_train['proximityIndex']
negInfMask = proximity == -np.inf
# The median is taken over the remaining values so the -inf rows do not skew it.
X_train.loc[negInfMask, 'proximityIndex'] = proximity[~negInfMask].median()

Replace the '-inf' values of the proximity index with the median.

In [29]:
# Fit a MinMaxScaler on the training columns and overwrite them with the scaled values.
# NOTE(review): numFeatures includes 'price', so the target is scaled together with the features.
scaler=MinMaxScaler()
X_train[numFeatures] = scaler.fit_transform(X_train[numFeatures])

X_train[numFeatures]

Unnamed: 0,Usable area,Land area,Year of reconstruction,proximityIndex,Built-up area,Garden area,Year of approval,Floor area,price
151,0.616728,0.633977,0.74997,0.213542,0.685145,0.695238,0.953537,0.678655,0.626090
12063,0.604357,0.559451,0.74997,0.773597,0.629770,0.606301,0.953537,0.513962,0.748401
9880,0.510676,0.489206,0.74997,0.706973,0.587688,0.505504,0.953537,0.513962,0.548555
557,0.420574,0.457735,0.74997,0.762569,0.440387,0.505504,0.953537,0.513962,0.342180
9011,0.344780,0.510256,0.74997,0.807311,0.414121,0.505504,0.953537,0.513962,0.426098
...,...,...,...,...,...,...,...,...,...
14717,0.365636,0.422069,0.74997,0.745140,0.346221,0.505504,0.953537,0.513962,0.360621
3571,0.485359,0.481100,0.74997,0.493283,0.490940,0.505504,0.953537,0.513962,0.603150
10965,0.525310,0.540651,0.74997,0.753441,0.553663,0.505504,0.953537,0.513962,0.455722
12062,0.605369,0.558038,0.74997,0.647583,0.601766,0.505504,0.953537,0.513962,0.851373


In [30]:
# Write both partitions to CSV in the current working directory.
# NOTE(review): in the visible cells only X_train is filtered, imputed, log-transformed
# and scaled; X_test is saved exactly as returned by train_test_split.
X_train.to_csv('TrainingData.csv')
X_test.to_csv('TestingData.csv')

In [31]:
# Recompute the numerical columns; this time the '...Nan' indicator columns are included.
numFeatures = [feature for feature in X_train.columns if X_train[feature].dtypes != 'O' and feature not in ['Index']]
X_train[numFeatures]

Unnamed: 0,Usable area,Land area,Year of reconstruction,proximityIndex,Built-up area,Garden area,Year of approval,Floor area,price,Land areaNan,Year of reconstructionNan,Built-up areaNan,Garden areaNan,Year of approvalNan,Floor areaNan
151,0.616728,0.633977,0.74997,0.213542,0.685145,0.695238,0.953537,0.678655,0.626090,0,1,0,0,1,0
12063,0.604357,0.559451,0.74997,0.773597,0.629770,0.606301,0.953537,0.513962,0.748401,0,1,0,0,1,1
9880,0.510676,0.489206,0.74997,0.706973,0.587688,0.505504,0.953537,0.513962,0.548555,0,1,0,1,1,1
557,0.420574,0.457735,0.74997,0.762569,0.440387,0.505504,0.953537,0.513962,0.342180,0,1,0,1,1,1
9011,0.344780,0.510256,0.74997,0.807311,0.414121,0.505504,0.953537,0.513962,0.426098,0,1,0,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14717,0.365636,0.422069,0.74997,0.745140,0.346221,0.505504,0.953537,0.513962,0.360621,0,1,0,1,1,1
3571,0.485359,0.481100,0.74997,0.493283,0.490940,0.505504,0.953537,0.513962,0.603150,0,1,0,1,1,1
10965,0.525310,0.540651,0.74997,0.753441,0.553663,0.505504,0.953537,0.513962,0.455722,0,1,0,1,1,1
12062,0.605369,0.558038,0.74997,0.647583,0.601766,0.505504,0.953537,0.513962,0.851373,0,1,0,1,1,1


In [32]:
# Display the processed training frame.
X_train

Unnamed: 0,district,HouseType,Building,Object status,Location of the house,Object location,House type,The floor,Usable area,Land area,Parking,Year of reconstruction,Water,Transport,Energy efficiency of the building,Equipment,Lift,companyName,proximityIndex,Index,Built-up area,Store,Garage,Gas,Garden area,Year of approval,Swimming pool,Floor area,Indicator of energy efficiency of the building,Certificate of energy performance of the building,Barrier-free,Housing costs,locationLat,locationLong,price,lokální plynové,heating_unknown,ústřední plynové,lokální tuhá paliva,jiné,lokální elektrické,ústřední tuhá paliva,ústřední elektrické,ústřední dálkové,podlahové,veřejná kanalizace,jímka,čov pro celý objekt,septik,garbage_unknown,230v,400v,electricity_unknown,120v,telecommunication_unknown,telefon,internet,kabelové rozvody,satelit,kabelová televize,communication_unknown,asfaltová,neupravená,dlážděná,zpevněná,provize,právního,dph,poplatků,cena k jednání,Land areaNan,Year of reconstructionNan,Built-up areaNan,Garden areaNan,Year of approvalNan,Floor areaNan
151,cheb,chalupy\n,Cihlová,Velmi dobrý,Missing,Missing,Patrový,2,0.616728,0.633977,1.0,0.74997,Studna,Missing,Třída G - Mimořádně nehospodárná č. 78/2013 Sb. podle vyhlášky,1,Missing,ERA Home Service,0.213542,154,0.685145,Missing,1.0,Missing,0.695238,0.953537,Missing,0.678655,Missing,Missing,Missing,Missing,49.9875604,12.874645217105748,0.626090,Missing,Missing,Missing,True,True,Missing,Missing,Missing,Missing,Missing,Missing,Missing,Missing,Missing,True,Missing,True,Missing,Missing,Missing,Missing,True,Missing,Missing,Missing,True,Missing,Missing,Missing,Missing,Missing,Missing,Missing,Missing,Missing,0,1,0,0,1,0
12063,brno,vily\n,Cihlová,Velmi dobrý,Samostatný,Okraj obce,Patrový,3,0.604357,0.559451,2.0,0.74997,"Dálkový vodovod, Studna","Silnice, MHD, Autobus",Třída D - Méně úsporná č. 78/2013 Sb. podle vyhlášky,Částečně,Missing,REAL OFFICE,0.773597,12462,0.629770,1,1.0,Plynovod,0.606301,0.953537,1.0,0.513962,"116,0 kWh/m^2 za rok",Missing,Missing,Missing,49.2774571,16.6095407,0.748401,Missing,Missing,True,True,Missing,Missing,Missing,Missing,Missing,Missing,True,Missing,Missing,Missing,Missing,Missing,Missing,True,Missing,True,Missing,Missing,Missing,Missing,Missing,Missing,True,Missing,True,Missing,True,True,Missing,Missing,Missing,0,1,0,0,1,1
9880,usti-nad-orlici,chalupy\n,Cihlová,Novostavba,Samostatný,Centrum obce,Patrový,3,0.510676,0.489206,Missing,0.74997,Dálkový vodovod,"Vlak, Silnice, Autobus",Třída G - Mimořádně nehospodárná č. 148/2007 Sb. podle vyhlášky,Missing,Missing,RE/MAX Dynamic,0.706973,10203,0.587688,9 m2,1.0,Missing,0.505504,0.953537,Missing,0.513962,Missing,Zobrazit průkaz energetické náročnosti budovy,Missing,Missing,50.0479103,16.5949808,0.548555,Missing,Missing,Missing,True,Missing,True,Missing,Missing,Missing,Missing,Missing,Missing,Missing,True,Missing,True,True,Missing,Missing,True,Missing,Missing,Missing,Missing,Missing,True,Missing,Missing,Missing,Missing,True,True,Missing,Missing,Missing,0,1,0,1,1,1
557,sokolov,rodinne-domy\n,Cihlová,Před rekonstrukcí,Samostatný,Missing,Patrový,1,0.420574,0.457735,1.0,0.74997,Dálkový vodovod,Autobus,Třída G - Mimořádně nehospodárná,Missing,Missing,Dumrealit.cz Bonus,0.762569,565,0.440387,1,Missing,Missing,0.505504,0.953537,Missing,0.513962,Missing,Missing,Missing,Missing,50.3487052,12.5103994,0.342180,Missing,Missing,Missing,Missing,True,Missing,Missing,Missing,Missing,Missing,Missing,Missing,Missing,True,Missing,True,Missing,Missing,Missing,True,Missing,Missing,Missing,Missing,Missing,True,Missing,Missing,Missing,Missing,True,True,Missing,Missing,Missing,0,1,0,1,1,1
9011,zdar-nad-sazavou,chaty\n,Dřevěná,Dobrý,Samostatný,Klidná část obce,Přízemní,1. podlaží z celkem 1,0.344780,0.510256,1.0,0.74997,"Místní zdroj, Dálkový vodovod","Silnice, Autobus",Missing,1,Missing,Fincentrum Reality,0.807311,9311,0.414121,Missing,Missing,Plynovod,0.505504,0.953537,Missing,0.513962,Missing,Missing,Missing,Missing,49.4705755,16.0874183,0.426098,Missing,Missing,Missing,True,Missing,True,Missing,Missing,Missing,Missing,Missing,True,Missing,Missing,Missing,Missing,Missing,True,Missing,True,Missing,Missing,Missing,Missing,Missing,Missing,True,Missing,Missing,Missing,Missing,Missing,Missing,Missing,Missing,0,1,0,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14717,frydek-mistek,chaty\n,Smíšená,Dobrý,Samostatný,Klidná část obce,Patrový,2,0.365636,0.422069,Missing,0.74997,Místní zdroj,Missing,Třída G - Mimořádně nehospodárná,Missing,Missing,"Optimal Finance, s.r.o.",0.745140,15210,0.346221,Missing,Missing,Missing,0.505504,0.953537,Missing,0.513962,Missing,Missing,0.0,Missing,49.59994,18.3812642,0.360621,Missing,Missing,Missing,True,Missing,True,Missing,Missing,Missing,Missing,Missing,Missing,Missing,Missing,True,Missing,Missing,True,Missing,True,Missing,Missing,Missing,Missing,Missing,True,Missing,Missing,Missing,Missing,Missing,Missing,Missing,Missing,Missing,0,1,0,1,1,1
3571,beroun,vily\n,Panelová,Velmi dobrý,Samostatný,Centrum obce,Patrový,3,0.485359,0.481100,Missing,0.74997,Missing,Missing,Třída E - Nehospodárná č. 264/2020 Sb. podle vyhlášky,0,Missing,RE/MAX G8 Reality,0.493283,3672,0.490940,Missing,Missing,Missing,0.505504,0.953537,Missing,0.513962,Missing,Zobrazit průkaz energetické náročnosti budovy,Missing,Missing,49.8321686,13.8850731,0.603150,Missing,True,Missing,Missing,Missing,Missing,Missing,Missing,Missing,Missing,Missing,Missing,Missing,Missing,True,Missing,Missing,True,Missing,True,Missing,Missing,Missing,Missing,Missing,True,Missing,Missing,Missing,Missing,Missing,Missing,Missing,Missing,Missing,0,1,0,1,1,1
10965,ceska-lipa,rodinne-domy\n,Smíšená,Před rekonstrukcí,Samostatný,Klidná část obce,Patrový,2,0.525310,0.540651,2.0,0.74997,Dálkový vodovod,Autobus,Třída G - Mimořádně nehospodárná,Missing,Missing,Reality 11 Liberecko,0.753441,11319,0.553663,1,1.0,Missing,0.505504,0.953537,Missing,0.513962,Missing,Missing,Missing,Missing,50.7654108,14.4421418,0.455722,Missing,Missing,Missing,Missing,Missing,Missing,True,Missing,Missing,Missing,Missing,Missing,Missing,True,Missing,True,Missing,Missing,Missing,True,Missing,Missing,Missing,Missing,Missing,True,Missing,Missing,Missing,Missing,True,Missing,Missing,True,Missing,0,1,0,1,1,1
12062,brno,vily\n,Cihlová,V rekonstrukci,Samostatný,Missing,Patrový,3 včetně 1 podzemního,0.605369,0.558038,1.0,0.74997,Missing,Missing,Třída G - Mimořádně nehospodárná,Missing,Missing,AMANDLA,0.647583,12461,0.601766,1,1.0,Missing,0.505504,0.953537,1.0,0.513962,Missing,Missing,Missing,Missing,49.2583565,16.593727,0.851373,Missing,True,Missing,Missing,Missing,Missing,Missing,Missing,Missing,Missing,Missing,Missing,Missing,Missing,True,Missing,Missing,True,Missing,True,Missing,Missing,Missing,Missing,Missing,True,Missing,Missing,Missing,Missing,True,Missing,Missing,Missing,Missing,0,1,0,1,1,1


In [33]:
# Check which values 'Swimming pool' holds after the 'Missing' fill.
X_train['Swimming pool'].unique()

array(['Missing', 1.0], dtype=object)