In [44]:
import pandas as pd
from unidecode import unidecode

In [45]:
df = pd.read_csv("Brno_nemovitosti.csv", encoding="utf-8")
print("Data loaded successfully")

print(df.info())


# Ukážka niektorých riadkov pred odstránením diakritiky
print("Data before removing diacritics:")
print(df[['nazev_inzeratu', 'oblast']].head(10))

# Funkcia na odstránenie diakritiky z textu
def remove_diacritics(text):
    if isinstance(text, str):
        return unidecode(text)
    return text

# Odstránenie diakritiky zo všetkých textových stĺpcov
text_columns = df.select_dtypes(include=['object']).columns
for col in text_columns:
    df[col] = df[col].apply(remove_diacritics)

# Ukážka niektorých riadkov po odstránení diakritiky
print("Data after removing diacritics:")
print(df[['nazev_inzeratu', 'oblast']].head(10))

print("Diacritics removed successfully")


Data loaded successfully
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 324599 entries, 0 to 324598
Data columns (total 28 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   id                 324599 non-null  object 
 1   datum_vytvoreni    324599 non-null  object 
 2   data_address       322162 non-null  object 
 3   dipozice           217193 non-null  object 
 4   ma_balkon          79474 non-null   object 
 5   velilkost_balkonu  15713 non-null   float64
 6   mesto              324452 non-null  object 
 7   kod_krajiny        324599 non-null  object 
 8   oblast             324599 non-null  object 
 9   patro              215639 non-null  float64
 10  ma_garaz           39482 non-null   object 
 11  ma_zahradu         11369 non-null   object 
 12  rozloha_zahrady    10156 non-null   float64
 13  zemepisna_sirka    296337 non-null  float64
 14  zemepisna_delka    296337 non-null  float64
 15  gps_typ            176808 

In [46]:
column_mapping = {
    
    "data_address": "adresa",
}

df.rename(columns=column_mapping, inplace=True)
print("Columns renamed successfully")
print(df.head())  # Ukážka dát po premenovaní stĺpcov

# Nahradenie prázdnych hodnôt hodnotou False v boolovských stĺpcoch
bool_columns = ["ma_balkon", "ma_garaz", "ma_zahradu", "ma_parkovani"]
for col in bool_columns:
    df[col] = df[col].fillna(False).astype(bool)

print("Boolean columns updated:")
print(df[bool_columns].head(10))

Columns renamed successfully
                  id           datum_vytvoreni                        adresa  \
0  3vnrSu8uDu8k9PRyA  2022-05-25T13:53:45.798Z        Lidicka, Brno - Veveri   
1  3dF757eGeuddastQG  2022-05-25T13:53:47.335Z   Spolkova, Brno - Zabrdovice   
2  ptRJnQ4b8pNBPoLz7  2022-05-25T13:53:57.189Z  Poznanska, Brno - Zabovresky   
3  opXzryCRQsw4cjLi2  2022-05-25T13:53:57.569Z      Jilkova, Brno - Zidenice   
4  vKNfCkhikKL868i4R  2022-05-25T13:53:58.080Z             Brno - Brno-mesto   

  dipozice ma_balkon  velilkost_balkonu mesto kod_krajiny      oblast  patro  \
0      NaN       NaN                NaN  Brno          CZ  Brno-mesto    NaN   
1     1+kk      True                NaN  Brno          CZ  Brno-mesto    2.0   
2      3+1       NaN                NaN  Brno          CZ  Brno-mesto    3.0   
3     3+kk       NaN                NaN  Brno          CZ  Brno-mesto    1.0   
4      NaN       NaN                NaN  Brno          CZ  Brno-mesto    NaN   

   ... ty

  df[col] = df[col].fillna(False).astype(bool)
  df[col] = df[col].fillna(False).astype(bool)
  df[col] = df[col].fillna(False).astype(bool)
  df[col] = df[col].fillna(False).astype(bool)


In [47]:
# Konverzia všetkých hodnôt v stĺpci 'nazev_inzeratu' na text
df["nazev_inzeratu"] = df["nazev_inzeratu"].astype(str)

# Vytvorenie nového stĺpca 'nebytove_udaje'
df["nebytove_udaje"] = None

# Skontrolujeme výskyt kľúčových slov v stĺpci 'nazev_inzeratu'
keywords_nebytove = {
    "louka": "louka",
    "les": "les",
    "pozemek": "pozemek",
    "chata": "chata",
    "chalupa": "chalupa",
    "kancelar": "kancelar",
    "zahrada": "zahrada",
    "skladovaci prostory": "skladovaci prostory",
    "prodej domu": "prodej domu",
    "obchodni prostory": "obchodni prostory",
    "restaurace": "restaurace",
    "komercni prostor": "komercni prostor",
    "pronajem": "pronajem"
}

# Doplníme hodnoty do 'nebytove_udaje' na základe kľúčových slov
for keyword, value in keywords_nebytove.items():
    matches = df["nazev_inzeratu"].str.contains(keyword, case=False, na=False)
    df.loc[matches, "nebytove_udaje"] = value
    print(f"Number of matches for '{keyword}': {matches.sum()}")

# Špeciálne spracovanie pre 'pole' s výnimkou 'Kralovo pole'
pole_matches = df["nazev_inzeratu"].str.contains(r'\bpole\b', case=False, na=False) & ~df["nazev_inzeratu"].str.contains("Kralovo pole", case=False, na=False)
df.loc[pole_matches, "nebytove_udaje"] = "pole"
print(f"Number of matches for 'pole' excluding 'Kralovo pole': {pole_matches.sum()}")

# Doplníme 'dipozice' hodnotou 'garsoniera' ak je prázdny a 'nazev_inzeratu' obsahuje 'garsoniera'
garsoniera_matches = df["nazev_inzeratu"].str.contains("garsoniera", case=False, na=False)
df.loc[df["dipozice"].isna() & garsoniera_matches, "dipozice"] = "garsoniera"
print(f"Number of matches for 'garsoniera': {garsoniera_matches.sum()}")


Number of matches for 'louka': 16
Number of matches for 'les': 2479
Number of matches for 'pozemek': 21104
Number of matches for 'chata': 346
Number of matches for 'chalupa': 43
Number of matches for 'kancelar': 23180
Number of matches for 'zahrada': 686
Number of matches for 'skladovaci prostory': 14
Number of matches for 'prodej domu': 12416
Number of matches for 'obchodni prostory': 341
Number of matches for 'restaurace': 2314
Number of matches for 'komercni prostor': 209
Number of matches for 'pronajem': 186769
Number of matches for 'pole' excluding 'Kralovo pole': 5109
Number of matches for 'garsoniera': 424


In [48]:
# Vypočítanie ceny za meter štvorcový
df["cena_za_m2"] = df["cena"] / df["uzitna_plocha"]
df.loc[df["uzitna_plocha"] == 0, "cena_za_m2"] = None

print("New column 'cena_za_m2' added:")
print(df[["cena", "uzitna_plocha", "cena_za_m2"]].head(10))


New column 'cena_za_m2' added:
        cena  uzitna_plocha     cena_za_m2
0    15300.0          102.0     150.000000
1  3850000.0           32.0  120312.500000
2  7400000.0           82.0   90243.902439
3  9890000.0          102.0   96960.784314
4    30000.0          160.0     187.500000
5    12500.0           38.0     328.947368
6    15000.0            NaN            NaN
7    10500.0           35.0     300.000000
8        NaN          213.0            NaN
9    12500.0           46.0     271.739130


In [49]:
# Získanie roku z dátumu v stĺpci 'datum_vytvoreni'
df["rok"] = df["datum_vytvoreni"].str[:4]

print("New column 'rok_vytvoreni' added:")
print(df[["datum_vytvoreni", "rok"]].head(10))

New column 'rok_vytvoreni' added:
            datum_vytvoreni   rok
0  2022-05-25T13:53:45.798Z  2022
1  2022-05-25T13:53:47.335Z  2022
2  2022-05-25T13:53:57.189Z  2022
3  2022-05-25T13:53:57.569Z  2022
4  2022-05-25T13:53:58.080Z  2022
5  2022-05-25T13:53:58.653Z  2022
6  2022-05-25T14:33:02.925Z  2022
7  2022-05-25T14:33:08.767Z  2022
8  2022-05-25T14:43:17.723Z  2022
9  2022-05-25T14:52:39.060Z  2022


In [50]:
df.to_csv("Brno_vycistene_udaje.csv", index=False, encoding="utf-8")
print("Cleaned data saved successfully")

Cleaned data saved successfully
