In [2]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
import pickle
import re
from math import radians, cos, sin, asin, sqrt

# Load dataset
df = pd.read_csv('apartments_data_enriched_lat_lon_combined.csv')

# Function to calculate distance between two points using Haversine formula
def haversine_distance(lat1, lon1, lat2, lon2):
    """Calculate the great circle distance between two points on earth."""
    # Convert decimal degrees to radians
    lon1, lat1, lon2, lat2 = map(radians, [lon1, lat1, lon2, lat2])
    
    # Haversine formula
    dlon = lon2 - lon1
    dlat = lat2 - lat1
    a = sin(dlat/2)**2 + cos(lat1) * cos(lat2) * sin(dlon/2)**2
    c = 2 * asin(sqrt(a))
    r = 6371  # Radius of earth in kilometers
    return c * r

# Zurich city center coordinates (approximate)
zurich_center_lat = 47.3769
zurich_center_lon = 8.5417

# Create new features
df['price_per_room'] = df['price'] / df['rooms']
df['price_per_m2'] = df['price'] / df['area']

# Calculate distance to city center
df['distance_to_center'] = df.apply(
    lambda row: haversine_distance(row['lat'], row['lon'], zurich_center_lat, zurich_center_lon), 
    axis=1
)

# Check if balcony is mentioned in description
df['has_balcony'] = df['description_raw'].str.contains('balkon|balcony', case=False, regex=True).astype(int)

# Check if renovated is mentioned in description
df['is_renovated'] = df['description_raw'].str.contains('renoviert|renovated|saniert', case=False, regex=True).astype(int)

# Features to use for model
features = [
    'rooms', 'area', 'pop', 'pop_dens', 'frg_pct', 'emp', 'tax_income',
    'price_per_room', 'distance_to_center', 'has_balcony', 'is_renovated'
]

# Drop rows with NaN values
df_clean = df.dropna(subset=features)

# Split data
X = df_clean[features]
y = df_clean['price']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train model
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Evaluate model
train_score = model.score(X_train, y_train)
test_score = model.score(X_test, y_test)
print(f"Train R²: {train_score:.2f}, Test R²: {test_score:.2f}")

# Feature importance
feature_importance = pd.DataFrame({
    'Feature': features,
    'Importance': model.feature_importances_
}).sort_values('Importance', ascending=False)
print("\nFeature Importance:")
print(feature_importance)

# Save the model 
with open("apartment_price_model.pkl", "wb") as f:
    pickle.dump(model, f)

print("\nModel saved as apartment_price_model.pkl")



Train R²: 1.00, Test R²: 0.97

Feature Importance:
               Feature  Importance
7       price_per_room    0.384680
1                 area    0.265346
0                rooms    0.193932
8   distance_to_center    0.145861
6           tax_income    0.003544
3             pop_dens    0.003495
2                  pop    0.001253
4              frg_pct    0.000845
5                  emp    0.000756
10        is_renovated    0.000204
9          has_balcony    0.000083

Model saved as apartment_price_model.pkl
