## Apartment AI

In [27]:
# Libraries
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import root_mean_squared_error
import gradio as gr
import pickle

# Ignore warnings
import warnings
warnings.filterwarnings("ignore")

### Get Data

In [28]:
# Read the data to a pandas data frame
df_data = pd.read_csv('apartments_data_enriched_lat_lon_combined.csv', sep=',', encoding='utf-8')
# Get number of rows and columns
df_data.shape
print(df_data)

      bfs_number  rooms  area  price  postalcode  \
0            261    4.5   148   4180        8050   
1            261    2.0   122   3190        8050   
2            261    3.5    78   2780        8050   
3            261    3.5    69   3750        8050   
4            261    3.5    74   2390        8050   
...          ...    ...   ...    ...         ...   
2395           3    4.5   104   2055        8906   
2396         196    4.5    98   1950        8617   
2397           4    5.5   175   3720        8915   
2398         173    4.5   130   2400        8335   
2399         192    3.5   102   2272        8132   

                                     address              town  \
0       Schaffhauserstrasse 363, 8050 Zürich            Zürich   
1              Max Bill Platz 5, 8050 Zürich            Zürich   
2          Regensbergstrasse 30, 8050 Zürich            Zürich   
3             Dörflistrasse 112, 8050 Zürich            Zürich   
4       Schaffhauserstrasse 445, 8050 Zürich 

## Feature Engineering

In [29]:
# Define Zurich HB (City Center) coordinates
ZURICH_HB_LAT = 47.378177
ZURICH_HB_LON = 8.540192

# Function to calculate Haversine distance (great-circle distance)
def haversine(lat1, lon1, lat2, lon2):
    R = 6371  # Earth radius in kilometers
    lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])  # Convert degrees to radians
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = np.sin(dlat/2)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon/2)**2
    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))
    return R * c  # Distance in km

# Apply function to dataset to compute distance to Zürich HB
df_data["distance_to_zurich_hb_in_km"] = df_data.apply(
    lambda row: haversine(row["lat"], row["lon"], ZURICH_HB_LAT, ZURICH_HB_LON), axis=1
)
print(df_data)

      bfs_number  rooms  area  price  postalcode  \
0            261    4.5   148   4180        8050   
1            261    2.0   122   3190        8050   
2            261    3.5    78   2780        8050   
3            261    3.5    69   3750        8050   
4            261    3.5    74   2390        8050   
...          ...    ...   ...    ...         ...   
2395           3    4.5   104   2055        8906   
2396         196    4.5    98   1950        8617   
2397           4    5.5   175   3720        8915   
2398         173    4.5   130   2400        8335   
2399         192    3.5   102   2272        8132   

                                     address              town  \
0       Schaffhauserstrasse 363, 8050 Zürich            Zürich   
1              Max Bill Platz 5, 8050 Zürich            Zürich   
2          Regensbergstrasse 30, 8050 Zürich            Zürich   
3             Dörflistrasse 112, 8050 Zürich            Zürich   
4       Schaffhauserstrasse 445, 8050 Zürich 

### Training

In [30]:
# Create train and test samples
features = ['rooms', 'area', 'distance_to_zurich_hb_in_km']

X_train, X_test, y_train, y_test = train_test_split(df_data[features], 
                                                         df_data['price'], 
                                                        test_size=0.20, 
                                                        random_state=42)

# train random_forest_model = RandomForestRegressor()
random_forest_model = RandomForestRegressor(random_state=42)

# Fit the model
random_forest_model.fit(X_train, y_train)
print("Train score: ", random_forest_model.score(X_train, y_train))
print("Test score: ", random_forest_model.score(X_test, y_test))

print("Train RMSE: ", root_mean_squared_error(y_train, random_forest_model.predict(X_train)))
print("Test RMSE: ", root_mean_squared_error(y_test, random_forest_model.predict(X_test)))

Train score:  0.9553916709079772
Test score:  0.5919896508378145
Train RMSE:  257.95756363746
Test RMSE:  805.2618868619544


## Build Website

In [31]:
# Define the core prediction function
def predict_apartment(rooms, area, town):
    bfs_number = 261
    df = df_data[df_data['bfs_number']==bfs_number].copy()
    df.reset_index(inplace=True)
    df.loc[0, 'rooms'] = rooms
    df.loc[0, 'area'] = area
    if len(df) != 1: # if there are more than two records with the same bfs_number reutrn -1
        return -1
    prediction = random_forest_model.predict(df[['rooms', 'area', 'pop', 'pop_dens', 'frg_pct', 'emp', 'tax_income']])
    return np.round(prediction[0], 0)

predict_apartment(3, 100, 'Zürich')

# Create Gradio interface
iFace = gr.Interface(
    fn=predict_apartment,
    inputs=[
        gr.Number(label="Rooms"),
        gr.Number(label="Area"),
        gr.Number(label="Distance To Zürich HB in km"),
        gr.Dropdown(choices=df["town"].unique().tolist(), label="Town")
    ],
    outputs="text",
    examples=[
        [5.1, 3.5, 1.4, 0.2],
        [6.2, 2.9, 4.3, 1.3],
        [7.7, 3.8, 6.7, 2.2],
    ],
    title="Apartment price Prediction",
    description="Enter the Apartment details to predict the price"
)

iFace.launch()


NameError: name 'df' is not defined