In [None]:
import math

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [None]:
# Load the raw apartment listings CSV into `df`.
RAW_LISTINGS_CSV = 'data/apartments_pl_2024_06.csv'
df = pd.read_csv(RAW_LISTINGS_CSV)

In [None]:
# Listings per city in the raw data, drawn as a bar chart with a log-scaled y-axis.
city_counts = df["city"].value_counts()

fig, ax = plt.subplots(figsize=(12, 6))
ax.bar(city_counts.index, city_counts.values)
ax.set_yscale("log")
ax.set_xlabel("Miasto")
ax.set_ylabel("Ilość mieszkań z danego miasta (skala logarytmiczna)")
ax.set_title("Ilość mieszkań z danego miasta (skala logarytmiczna)")
ax.tick_params(axis="x", labelrotation=45)
fig.tight_layout()
plt.show()

In [None]:
# Listings per city, smallest first: plain-text dump, then a styled table.
counts_ascending = city_counts.sort_values()
print(counts_ascending)

city_table = counts_ascending.reset_index()
city_table.columns = ['Miasto', 'Liczba mieszkań']
city_table['Liczba mieszkań logarytmicznie'] = np.log10(city_table['Liczba mieszkań'])

# Number the rows from 1 instead of 0 for display.
city_table.index = pd.RangeIndex(start=1, stop=len(city_table) + 1)

number_formats = {
    'Liczba mieszkań': '{:,.0f}',
    'Liczba mieszkań logarytmicznie': '{:.2f}',
}
city_table.style.format(number_formats).set_caption("Ilość mieszkań w konkretnych miastach:")

In [None]:
# Keep only the listings from cities that have at least MIN_LISTINGS_PER_CITY rows.
MIN_LISTINGS_PER_CITY = 400

has_enough_listings = city_counts >= MIN_LISTINGS_PER_CITY
cities_to_keep = city_counts[has_enough_listings].index

in_kept_city = df["city"].isin(cities_to_keep)
df_usable = df[in_kept_city].copy()

In [None]:
# Same bar chart as for the raw data, now on the filtered frame.
# Note: this rebinds `city_counts` to the counts of df_usable.
city_counts = df_usable["city"].value_counts()

fig, ax = plt.subplots(figsize=(12, 6))
ax.bar(city_counts.index, city_counts.values)
ax.set_yscale("log")
ax.set_xlabel("Miasto")
ax.set_ylabel("Ilość mieszkań z danego miasta (skala logarytmiczna)")
ax.set_title("Ilość mieszkań z danego miasta (skala logarytmiczna)")
ax.tick_params(axis="x", labelrotation=45)
fig.tight_layout()
plt.show()

In [None]:
from sklearn.model_selection import train_test_split

# Target is the listing price; every other column of df_usable is a feature here.
y = df_usable["price"]
X = df_usable.drop(columns="price")

# 90/10 split, stratified on the city column, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=df_usable["city"],
    test_size=0.1,
    random_state=42,
)

In [None]:
# Missing values: percentage of nulls per column, restricted to columns that have any.
null_share = df_usable.isnull().mean() * 100
missing_percentages = null_share[null_share > 0]

fig, ax = plt.subplots(figsize=(10, 5))
missing_percentages.sort_values().plot(kind='barh', color='tomato', ax=ax)
ax.set_xlabel("Ilość brakujących danych w procentach")
ax.set_title("Procent brakujących danych w poszczególnych kolumnach")
ax.grid(True, axis='x', linestyle='--', alpha=0.7)
fig.tight_layout()
plt.show()

In [None]:
# Drop the columns whose share of missing values exceeds 17.5%
# (uses `missing_percentages` as computed by the preceding cell).
is_too_sparse = missing_percentages > 17.5
columns_to_drop = missing_percentages[is_too_sparse].index

df_usable = df_usable.drop(columns=columns_to_drop)

In [None]:
# Re-compute and re-plot the per-column missing percentages on the current df_usable.
null_share = df_usable.isnull().mean() * 100
missing_percentages = null_share[null_share > 0]

fig, ax = plt.subplots(figsize=(10, 5))
missing_percentages.sort_values().plot(kind='barh', color='tomato', ax=ax)
ax.set_xlabel("Ilość brakujących danych w procentach")
ax.set_title("Procent brakujących danych w poszczególnych kolumnach")
ax.grid(True, axis='x', linestyle='--', alpha=0.7)
fig.tight_layout()
plt.show()

In [None]:
# Names of all columns whose dtype is not numeric (object, category, ...).
non_numerical_columns = list(df_usable.select_dtypes(exclude=['number']).columns)

# Header line, then the list itself on the next line.
print("Non-numerical columns:", non_numerical_columns, sep="\n")

In [None]:
# Remove the 'id' column from the working frame.
df_usable = df_usable.drop(columns='id')

In [None]:
# Non-numeric columns of the current df_usable
non_numerical_columns = list(df_usable.select_dtypes(exclude=['number']).columns)

# Distinct non-null values of each of those columns
unique_values = {}
for column_name in non_numerical_columns:
    unique_values[column_name] = df_usable[column_name].dropna().unique().tolist()

# One table column per source column; wrapping each list in a Series lets
# columns of different lengths sit side by side (shorter ones are NaN-padded).
unique_df = pd.DataFrame(
    {column_name: pd.Series(values) for column_name, values in unique_values.items()}
)

# Styled table as the cell output
unique_df.style.set_caption("Unique Values in Non-Numerical Columns")

In [None]:
# None of the categorical columns is ordinal, so all of them go through get_dummies.

# Keep a handle on the un-encoded 'city' column; the splits below stratify on it.
stratify_col = df_usable["city"]

# One-hot encode every categorical column ('city' included), dropping the first level of each.
df_ready = pd.get_dummies(data=df_usable, drop_first=True)


In [None]:
# Save per-column default values computed on the frame produced by get_dummies.

# Column groups by dtype
numeric_cols = df_ready.select_dtypes(include='number').columns
categorical_cols = df_ready.select_dtypes(include='object').columns

# Default value: mean for numeric columns, most frequent value for every other column.
fill_values = {
    col: (df_ready[col].mean() if col in numeric_cols else df_ready[col].mode()[0])
    for col in df_ready.columns
}

# Write the defaults out, one row per column.
pd.Series(fill_values).to_csv("fill_defaults.csv")

In [None]:
# Training

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

# Features are every encoded column except the price; the price is the target.
y = df_ready['price']
X = df_ready.drop(columns='price')

# 90/10 split stratified on the original (un-encoded) city column.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=stratify_col,
    test_size=0.1,
    random_state=42,
)

rf_model = RandomForestRegressor(random_state=42, n_estimators=100)
rf_model.fit(X_train, y_train)

# Predictions for the held-out rows, in the row order of X_test.
y_pred = rf_model.predict(X_test)

In [None]:
# RMSE-per-city chart

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.metrics import mean_squared_error

# City of every test row, looked up through the index labels y_test carries
# over from df_usable. The result is in the same row order as y_test / y_pred.
city_test = df_usable.loc[y_test.index, 'city']

# RMSE per city. y_pred is a positional array, so the boolean mask is converted
# to a plain array and applied positionally to both y_test and y_pred.
rmse_per_city = {}
for city in city_test.unique():
    idx = (city_test == city).to_numpy()
    rmse = np.sqrt(mean_squared_error(y_test[idx], y_pred[idx]))
    rmse_per_city[city] = rmse
# NOTE: after the loop `rmse` holds the value of the LAST city only;
# it is not the overall test-set RMSE.

# One row per city, sorted from the lowest to the highest RMSE
rmse_df = pd.DataFrame.from_dict(rmse_per_city, orient='index', columns=['RMSE'])
rmse_df = rmse_df.sort_values('RMSE')

# Plot. DataFrame.plot() opens its own figure unless it is handed an Axes,
# which would leave a blank 10x6 figure behind; pass `ax` explicitly.
fig, ax = plt.subplots(figsize=(10, 6))
rmse_df.plot(kind='bar', legend=False, ax=ax)
ax.set_ylabel('RMSE')
ax.set_title('RMSE dla każdego miasta')
ax.tick_params(axis='x', labelrotation=45)
fig.tight_layout()
plt.show()




In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Overall RMSE over the whole test set, computed here so the annotation does not
# depend on a leftover `rmse` variable from another cell (which would be the
# RMSE of a single city, not of the whole test set).
overall_rmse = float(np.sqrt(np.mean(np.square(np.asarray(y_test) - np.asarray(y_pred)))))

plt.figure(figsize=(8, 6))
plt.scatter(y_test, y_pred, alpha=0.5)
plt.xlabel("Actual Price")
plt.ylabel("Predicted Price")
plt.title("Actual vs Predicted Apartment Prices")

# Add RMSE text to plot (positioned in data coordinates)
plt.text(
    x=0.05 * max(y_test),  # 5% of the largest actual price
    y=0.95 * max(y_pred),  # 95% of the largest predicted price
    s=f"RMSE: {overall_rmse:,.0f} PLN",  # Rounded, thousands separator, PLN unit
    fontsize=12,
    bbox=dict(facecolor='white', alpha=0.8)
)

plt.tight_layout()
plt.show()

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Pair each importance of the fitted forest with the column names of X,
# most important feature first.
feature_names = X.columns
importances = rf_model.feature_importances_
feat_imp = pd.Series(data=importances, index=feature_names).sort_values(ascending=False)

ax = feat_imp.plot.bar(figsize=(12, 6))
ax.set_title("Feature Importance")
plt.tight_layout()
plt.show()


In [None]:
# Filling nulls
#
# The imputation is applied to a copy of the whole frame and the one-hot
# encoding is built FROM THAT COPY. (Imputing a separate X and then re-encoding
# the untouched df_usable would silently throw the imputed values away.)
df_filled = df_usable.copy()

# Numeric feature columns: fill with the column median. 'price' is left out of
# the medians so the target itself is never imputed.
feature_medians = df_filled.drop(columns='price').median(numeric_only=True)
df_filled = df_filled.fillna(feature_medians)

# Text columns: fill with the most frequent value. A column with no non-null
# value has no mode and is left as it is.
for col in df_filled.select_dtypes(include='object'):
    modes = df_filled[col].mode()
    if not modes.empty:
        df_filled[col] = df_filled[col].fillna(modes.iloc[0])

# One-hot encode the imputed frame. The index is unchanged, so `stratify_col`
# (taken from df_usable) still lines up row by row.
df_ready = pd.get_dummies(df_filled, drop_first=True)


# Training inputs after the null fill
X = df_ready.drop('price', axis=1)
y = df_ready['price']

In [36]:
# Column groups of the feature matrix by dtype
numeric_cols = X.select_dtypes(include='number').columns
categorical_cols = X.select_dtypes(include='object').columns

# Default value per feature: mean for numeric columns, most frequent value otherwise.
fill_values = {
    col: (X[col].mean() if col in numeric_cols else X[col].mode()[0])
    for col in X.columns
}

# Write the defaults out, one row per feature.
pd.Series(fill_values).to_csv("fill_defaults.csv")

In [None]:

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

# 90/10 split of the current X / y, stratified on the original city column.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=stratify_col,
    test_size=0.1,
    random_state=42,
)

rf_model = RandomForestRegressor(random_state=42, n_estimators=100)
rf_model.fit(X_train, y_train)

# Predictions for the held-out rows, in the row order of X_test.
y_pred = rf_model.predict(X_test)

In [None]:
from sklearn.metrics import mean_squared_error
import numpy as np
import matplotlib.pyplot as plt
import math

# Overall RMSE on the held-out rows
residuals = y_test - y_pred
mse = (residuals ** 2).mean()
rmse = math.sqrt(mse)

# Predicted vs. actual prices
fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(y_test, y_pred, alpha=0.5, color='steelblue')
ax.set_xlabel("Actual Price (zł)")
ax.set_ylabel("Predicted Price (zł)")
ax.set_title("Actual vs Predicted Apartment Prices")

# Diagonal on which a perfect prediction would fall, spanning the full value range
min_val = min(min(y_test), min(y_pred))
max_val = max(max(y_test), max(y_pred))
ax.plot([min_val, max_val], [min_val, max_val], 'r--', label="Perfect Prediction")

# RMSE text box, anchored in axes coordinates (top-left corner)
ax.text(
    0.05,
    0.95,
    f"Mean RMSE: {rmse:,.0f} zł",
    transform=ax.transAxes,
    fontsize=12,
    verticalalignment='top',
    bbox=dict(boxstyle='round,pad=0.3', facecolor='lightyellow', edgecolor='gray'),
)

ax.legend()
fig.tight_layout()
plt.show()



In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.metrics import mean_squared_error

# City of every test row, looked up through the index labels y_test carries
# over from df_usable. The result is in the same row order as y_test / y_pred.
city_test = df_usable.loc[y_test.index, 'city']

# RMSE per city. y_pred is a positional array, so the boolean mask is converted
# to a plain array and applied positionally to both y_test and y_pred.
# The per-city value gets its own name so it does not overwrite an overall
# `rmse` computed elsewhere in the notebook.
rmse_per_city = {}
for city in city_test.unique():
    idx = (city_test == city).to_numpy()
    city_rmse = np.sqrt(mean_squared_error(y_test[idx], y_pred[idx]))
    rmse_per_city[city] = city_rmse

# One row per city, sorted from the lowest to the highest RMSE
rmse_df = pd.DataFrame.from_dict(rmse_per_city, orient='index', columns=['RMSE'])
rmse_df = rmse_df.sort_values('RMSE')

# Plot. DataFrame.plot() opens its own figure unless it is handed an Axes,
# which would leave a blank 10x6 figure behind; pass `ax` explicitly.
fig, ax = plt.subplots(figsize=(10, 6))
rmse_df.plot(kind='bar', legend=False, ax=ax)
ax.set_ylabel('RMSE')
ax.set_title('RMSE dla każdego miasta')
ax.tick_params(axis='x', labelrotation=45)
fig.tight_layout()
plt.show()

In [None]:
# Mean absolute prediction error per square metre.
# (This is an average of |actual - predicted| / area, i.e. an MAE-style figure,
# not an RMSE - the printed label says so.)

import pandas as pd
import numpy as np


# Absolute error of every test prediction; the result keeps y_test's index.
errors = np.abs(y_test - y_pred)

# Area of the same rows: train_test_split kept the original index labels,
# so they can be looked up in df_usable.
square_meters = df_usable.loc[y_test.index, 'squareMeters']

# Absolute error divided by the apartment's area
error_per_sqm = errors / square_meters

# Everything side by side, one row per test listing
error_df = pd.DataFrame({
    'Actual_Price': y_test,
    'Predicted_Price': y_pred,
    'Absolute_Error': errors,
    'SquareMeters': square_meters,
    'Error_per_SquareMeter': error_per_sqm
})

# Average of the per-listing error per square metre
mean_error_per_sqm = error_df['Error_per_SquareMeter'].mean()

print("Średni błąd bezwzględny na metr kwadratowy:", mean_error_per_sqm)



In [None]:
# Which columns go into the model
# (df_usable is the frame from before get_dummies)

for column_name in df_usable.columns:
    print(column_name)



In [None]:
# Human-readable UI label for each dataset column.
# !!! Values entered in the UI will have to be converted from m to km,
# because that is the unit the model works with !!!

ui_label_pairs = [
    ('city', 'Miasto'),
    ('squareMeters', 'Powierzchnia (m²)'),
    ('rooms', 'Liczba pokoi'),
    ('floor', 'Piętro'),
    ('floorCount', 'Liczba pięter w budynku'),
    ('ownership', 'Forma własności'),
    ('hasParkingSpace', 'Miejsce parkingowe'),
    ('hasBalcony', 'Balkon'),
    ('hasElevator', 'Winda'),
    ('hasSecurity', 'Ochrona'),
    ('buildYear', 'Rok budowy'),
    ('latitude', 'Szerokość geograficzna'),
    ('longitude', 'Długość geograficzna'),
    ('centreDistance', 'Odległość od centrum (km)'),
    ('pharmacyDistance', 'Odległość do apteki (m)'),
    ('clinicDistance', 'Odległość do przychodni (m)'),
    ('postOfficeDistance', 'Odległość do poczty (m)'),
    ('restaurantDistance', 'Odległość do restauracji (m)'),
    ('kindergartenDistance', 'Odległość do przedszkola (m)'),
    ('schoolDistance', 'Odległość do szkoły (m)'),
    ('collegeDistance', 'Odległość do uczelni (m)'),
    ('poiCount', 'Liczba ważnych punktów w pobliżu (sklepy, szkoły etc.)'),
    ('hasStorageRoom', 'Komórka lokatorska'),
    ('price', 'Cena (zł)'),
]

# Mapping column name -> label, in the order listed above.
labels_for_ui = dict(ui_label_pairs)


In [None]:
# Save the model

import joblib

# rf_model is the fitted model object; it is written to MODEL_PATH.
MODEL_PATH = 'rf_model.pkl'
joblib.dump(rf_model, MODEL_PATH)


In [None]:
# Save the column-name -> UI-label correspondence

df_labels = pd.DataFrame(
    {
        'column': list(labels_for_ui.keys()),
        'ui_label': list(labels_for_ui.values()),
    }
)

df_labels.to_csv('labels_for_ui.csv', index=False)


In [None]:
# Unit check: show the first row that has no missing values at all.
complete_rows = df_usable.dropna()
first_full_row = complete_rows.iloc[0]

print(first_full_row)


In [None]:
# Save the dataset in use as a CSV file (without the index column).

df_usable.to_csv(path_or_buf="df_usable.csv", index=False)


In [None]:
# Rough coverage: which cities are present in the saved dataset.

# Read the saved file back under its own name. Re-using `df` here would replace
# the raw listings loaded at the top of the notebook with the filtered ones and
# change what the early cells produce when they are re-run.
df_usable_saved = pd.read_csv("df_usable.csv")

# Print all unique cities, alphabetically
city_names = sorted(df_usable_saved["city"].dropna().unique())
print("All cities in df_usable:")
for city_name in city_names:
    print(city_name)

In [None]:
# Reference data per city: display name, geographic position of the city,
# and the distance from the centre to the farthest point of the city (km).
city_reference_rows = [
    ("Warszawa", "52°13′47″N 21°00′41″E", 22.7),
    ("Kraków", "50°03′41″N 19°56′12″E", 15.0),
    ("Szczecin", "53°25′44″N 14°34′22″E", 17.0),
    ("Łódź", "51°46′30″N 19°28′00″E", 10.0),
    ("Wrocław", "51°06′29″N 17°02′19″E", 12.0),
    ("Gdańsk", "54°21′07″N 18°38′47″E", 15.0),
    ("Poznań", "52°24′23″N 16°55′31″E", 12.0),
    ("Bydgoszcz", "53°07′26″N 18°00′27″E", 10.0),
    ("Katowice", "50°15′54″N 19°01′26″E", 8.0),
    ("Lublin", "51°15′00″N 22°34′00″E", 9.0),
    ("Gdynia", "54°31′08″N 18°32′27″E", 10.0),
]

# Table of geographic positions
city_geo = {
    "Miasto": [name for name, _, _ in city_reference_rows],
    "Położenie geograficzne": [position for _, position, _ in city_reference_rows],
}

# Table of distances between the centre and the farthest point of the city
city_radius = {
    "Miasto": [name for name, _, _ in city_reference_rows],
    "Maksymalna odległość od centrum do granicy miasta (km)": [
        radius_km for _, _, radius_km in city_reference_rows
    ],
}

df_city_radius = pd.DataFrame(city_radius)

df_city_radius

In [None]:
# Min / max latitude and longitude per city
#
# Approximation used below:
#   1 degree of latitude  ~ 111 km
#   1 degree of longitude ~ 111.32 km * cos(latitude)

# Cities with the coordinates of their centre (decimal degrees) and a radius in km.
# City keys are lower-case ASCII without diacritics; "poznan" was previously
# spelled "poznań", the only key that broke that convention.
# NOTE(review): presumably these keys must match the values of the dataset's
# `city` column - confirm against the data.
cities = [
    {"Miasto": "warszawa", "Lat": 52.2297, "Lon": 21.0122, "Radius_km": 22.7},
    {"Miasto": "krakow", "Lat": 50.0647, "Lon": 19.9450, "Radius_km": 15.0},
    {"Miasto": "szczecin", "Lat": 53.4285, "Lon": 14.5528, "Radius_km": 17.0},
    {"Miasto": "lodz", "Lat": 51.7592, "Lon": 19.4550, "Radius_km": 10.0},
    {"Miasto": "wroclaw", "Lat": 51.1079, "Lon": 17.0385, "Radius_km": 12.0},
    {"Miasto": "gdansk", "Lat": 54.3520, "Lon": 18.6466, "Radius_km": 15.0},
    {"Miasto": "poznan", "Lat": 52.4064, "Lon": 16.9252, "Radius_km": 12.0},
    {"Miasto": "bydgoszcz", "Lat": 53.1235, "Lon": 18.0084, "Radius_km": 10.0},
    {"Miasto": "katowice", "Lat": 50.2649, "Lon": 19.0238, "Radius_km": 8.0},
    {"Miasto": "lublin", "Lat": 51.2465, "Lon": 22.5684, "Radius_km": 9.0},
    {"Miasto": "gdynia", "Lat": 54.5189, "Lon": 18.5305, "Radius_km": 10.0},
]

# Kilometres per degree of latitude
km_per_deg_lat = 111.0

# One output row per city: the radius converted to degrees and applied on both
# sides of the centre, rounded to one decimal place.
city_geo_range = []
for city in cities:
    lat = city["Lat"]
    lon = city["Lon"]
    r_km = city["Radius_km"]

    deg_lat = r_km / km_per_deg_lat
    # A degree of longitude shrinks with the cosine of the latitude.
    km_per_deg_lon = 111.320 * math.cos(math.radians(lat))
    deg_lon = r_km / km_per_deg_lon

    row = {
        "Miasto": city["Miasto"],
        "Minimalna Szerokość": round(lat - deg_lat, 1),
        "Maksymalna Szerokość": round(lat + deg_lat, 1),
        "Minimalna Długość": round(lon - deg_lon, 1),
        "Maksymalna Długość": round(lon + deg_lon, 1),
    }
    city_geo_range.append(row)

# Build a DataFrame from the rows above and show it
df_geo_range = pd.DataFrame(city_geo_range)
df_geo_range



In [None]:
# Write the coordinate-range table built above to a UTF-8 CSV (no index column).

df_geo_range.to_csv("city_coordinates_ranges.csv", encoding='utf-8', index=False)

In [None]:
# Converting non-numeric values to numeric ones.
#
# DISABLED: the code below is wrapped in a bare string literal, so none of it
# runs. It label-encodes every object/category column of X.
# Because the string is the last expression of the cell, the notebook echoes
# its text as the cell output.

"""from sklearn.preprocessing import LabelEncoder

non_numeric_cols = X.select_dtypes(include=['object', 'category']).columns


le = LabelEncoder()
for col in non_numeric_cols:
    X[col] = le.fit_transform(X[col].astype(str))
"""



In [None]:
# Training.
#
# DISABLED: the code below is wrapped in a bare string literal, so none of it
# runs. It is a variant of the training cell that stratifies on
# df_ready['city'].
# NOTE(review): the df_ready built in this notebook comes from pd.get_dummies,
# which presumably replaces 'city' with indicator columns - confirm the column
# still exists before re-enabling this.
"""
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42, stratify=df_ready['city'])


rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)


y_pred = rf_model.predict(X_test)"""