# Lublin House Prices Prediction

In [58]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

## Data preprocessing

In [59]:
# Load the raw listings from a CSV path relative to the notebook's working directory.
data = pd.read_csv('data/houses_data.csv')

In [60]:
# Print column names, non-null counts and dtypes of the freshly loaded frame.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3182 entries, 0 to 3181
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Price     3182 non-null   object
 1   Name      3182 non-null   object
 2   Rooms     3182 non-null   object
 3   Area      3182 non-null   object
 4   Floor     3121 non-null   object
 5   Region    3182 non-null   object
 6   Year      3182 non-null   object
 7   Elevator  3182 non-null   object
dtypes: object(8)
memory usage: 199.0+ KB


In [61]:
# The "Name" column is not used further, so remove it from the frame.
data = data.drop(columns=["Name"])

In [62]:
# 'Price' and 'Area' are skipped here; every other column has its distinct values listed.
skipped_columns = {'Price', 'Area'}

print("\nUnique values in each column (excluding 'Price' and 'Area'):")
for col_name in (name for name in data.columns if name not in skipped_columns):
    print(f"{col_name}: {data[col_name].unique()}")


Unique values in each column (excluding 'Price' and 'Area'):
Rooms: ['4 pokoje' '2 pokoje' '3 pokoje' '1 pokój' '5 pokoi' '6 pokoi'
 '10+ pokoi' '7 pokoi']
Floor: ['3 piętro' '2 piętro' '1 piętro' nan '10+ piętro' '10 piętro' '4 piętro'
 'parter' '6 piętro' '5 piętro' '9 piętro' '7 piętro' '8 piętro'
 'suterena' 'poddasze']
Region: ['Rury' 'Czechów Północny' 'Wrotków' 'Czechów Południowy' 'Kośminek'
 'Wieniawa' 'Ponikwoda' 'Śródmieście' 'Bronowice' 'Węglin Południowy'
 'Dziesiąta' 'Tatary' 'Felin' 'Kalinowszczyzna' 'Sławin' 'Czuby Północne'
 'Konstantynów' 'Szerokie' 'Czuby Południowe' 'Stare Miasto'
 'Za Cukrownią' 'Zemborzyce' 'Węglin Północny' 'Hajdów-Zadębie']
Year: ['brak informacji' '2025' '2026' '2023' '2015' '1980' '1982' '2008' '2024'
 '2018' '2003' '2021' '1976' '1963' '2017' '1969' '2000' '1978' '1975'
 '1983' '1993' '1974' '1995' '2022' '1994' '1897' '1986' '1984' '2010'
 '1988' '1973' '1996' '1967' '1972' '1964' '1998' '2014' '1990' '1977'
 '2005' '2012' '2016' '1985' '19

### Change the data type of the columns

In [63]:
# Listings priced as 'Zapytaj o cenę' carry no number, so they become NaN.
price_text = data['Price'].replace('Zapytaj o cenę', np.nan)
# Strip the non-breaking-space separators and the 'zł' suffix, and turn decimal
# commas into dots; NaN entries pass through the string operations untouched.
for fragment, substitute in (('\xa0', ''), ('zł', ''), (',', '.')):
    price_text = price_text.str.replace(fragment, substitute)
data['Price'] = price_text.astype('float32')

In [64]:
# Strip spaces and the 'm²' unit from 'Area', then cast the remaining text to float32.
area_text = data['Area'].str.replace(' ', '')
area_text = area_text.str.replace('m²', '')
data['Area'] = area_text.astype('float32')

In [65]:
# 'brak informacji' marks an unknown construction year; treat it as missing.
year_text = data['Year'].replace('brak informacji', np.nan)
# Cast via float first, then to the nullable Int32 dtype so the missing
# entries are kept instead of breaking the integer conversion.
year_numeric = year_text.astype(float)
data['Year'] = year_numeric.astype('Int32')

In [66]:
# Encode the 'Elevator' flag as an integer: 'nie' -> 0, 'tak' -> 1.
# Series.map is used instead of Series.replace because replacing every string
# in an object column with ints relies on pandas' deprecated silent
# downcasting and emits a FutureWarning.
# Any value other than 'nie'/'tak' maps to NaN, so the int32 cast below fails
# loudly instead of letting an unexpected label through.
data['Elevator'] = data['Elevator'].map({'nie': 0, 'tak': 1}).astype('int32')

  data['Elevator'] = data['Elevator'].replace({'nie': 0, 'tak': 1}).astype('int32')


In [67]:
# Tally how many listings fall under each distinct 'Rooms' label.
room_counts = data['Rooms'].value_counts()
print("Number of rows for each unique value in 'Rooms' column:", room_counts, sep="\n")

Number of rows for each unique value in 'Rooms' column:
Rooms
3 pokoje     1364
2 pokoje     1124
4 pokoje      404
1 pokój       232
5 pokoi        46
6 pokoi         7
7 pokoi         3
10+ pokoi       2
Name: count, dtype: int64


In [68]:
# Keep only the leading number of each 'Rooms' label and store it as int32
# (e.g. '4 pokoje' -> 4, '1 pokój' -> 1). The open-ended '10+ pokoi' bucket
# becomes 10 because the pattern captures its digits directly, so no separate
# replacement step is needed.
# The pattern is a raw string so that '\d' is not read as an invalid Python
# escape sequence, and expand=False makes str.extract return a Series rather
# than a one-column DataFrame.
data['Rooms'] = data['Rooms'].str.extract(r'(\d+)', expand=False).astype('int32')

  data['Rooms'] = data['Rooms'].str.extract('(\d+)').astype('int32')


In [69]:
# Keep only the listings that have a 'Price' value.
has_price = data['Price'].notna()
data = data.loc[has_price]

In [70]:
# Put 'Price' first, followed by the numeric columns and then the two text columns.
column_order = ['Price', 'Area', 'Elevator', 'Year', 'Rooms', 'Floor', 'Region']
data = data[column_order]

In [71]:
# Count the missing entries in every column of the frame.
missing_data = data.isna().sum()
print("Missing data in each column:", missing_data, sep="\n")

Missing data in each column:
Price         0
Area          0
Elevator      0
Year        430
Rooms         0
Floor        61
Region        0
dtype: int64


In [72]:
# Preview the first rows of the frame after the conversions and column reordering.
data.head()

Unnamed: 0,Price,Area,Elevator,Year,Rooms,Floor,Region
0,567000.0,52.5,0,,4,3 piętro,Rury
1,728500.0,77.5,1,2025.0,4,3 piętro,Czechów Północny
2,668360.0,68.199997,1,2026.0,4,2 piętro,Wrotków
3,833414.0,76.459999,1,2023.0,4,1 piętro,Czechów Południowy
4,696500.0,79.599998,1,2025.0,4,,Kośminek


In [73]:
# Show the resulting dtype of each column.
data.dtypes

Price       float32
Area        float32
Elevator      int32
Year          Int32
Rooms         int32
Floor        object
Region       object
dtype: object

In [74]:
# Count the rows where 'Floor' and 'Region' are missing at the same time.
# NOTE(review): the missing-data summary reports 0 missing values for 'Region',
# so this count is necessarily 0 -- 'Year' may have been the intended column; confirm.
both_missing_mask = data['Floor'].isna() & data['Region'].isna()
missing_floor_region = int(both_missing_mask.sum())
print(f"Number of rows with both 'Floor' and 'Region' missing values: {missing_floor_region}")

Number of rows with both 'Floor' and 'Region' missing values: 0


No rows have missing values in both the 'Floor' and 'Region' columns at the same time, so dropping every row that contains an NA value is a bad idea: we would lose nearly 500 rows from the dataset. Instead, we will fill the missing values in the 'Floor' column with the median value.