In [28]:
import pandas as pd
import numpy as np
import pickle
from geopy.distance import geodesic
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score, train_test_split

random_forest_model = RandomForestRegressor()

# Lade den Datensatz
df = pd.read_csv("original_apartment_data_analytics_hs24_with_lat_lon.csv")

# Feature 1: Arbeitsmarktattraktivität der Region
df["employment_ratio"] = df["emp"] / df["pop"]

def calculate_center_distance(row, city_centers):
    """Berechnet die Entfernung zum Stadtzentrum."""
    city = row["town"]
    if city in city_centers:
        center_coords = city_centers[city]
        return geodesic((row["lat"], row["lon"]), center_coords).km
    return np.nan

# Berechnung der Stadtzentren (Mittelwert der Koordinaten pro Stadt)
city_centers = df.groupby("town")[["lat", "lon"]].mean().to_dict(orient="index")
city_centers = {k: (v["lat"], v["lon"]) for k, v in city_centers.items()}

# Feature 2: Entfernung zum Stadtzentrum
df["distance_to_center"] = df.apply(lambda row: calculate_center_distance(row, city_centers), axis=1)

# Speichern des erweiterten Datensatzes
df.to_csv("enhanced_apartment_data.csv", index=False)

# Auswahl der Features und Target-Variable
features = ['rooms', 'area', 'pop', 'pop_dens', 'frg_pct', 'emp', 'tax_income', 'employment_ratio', 'distance_to_center']
target = "price"
X = df[features].dropna()
y = df.loc[X.index, target]

# Train-Test-Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Random Forest Modell mit Cross Validation
model = RandomForestRegressor(n_estimators=100, random_state=42)
scores = cross_val_score(model, X_train, y_train, cv=5, scoring='r2')

# Modell trainieren
model.fit(X_train, y_train)

# Modell speichern
with open("apartment_price_model.pkl", "wb") as f:
    pickle.dump(model, f)

def model_performance(features, df, random_forest_model = RandomForestRegressor(random_state=42)):
    df = df.sample(frac=1, random_state=42)
    X, y = df[features], df['price']
    scores = cross_val_score(random_forest_model, X, y, scoring="neg_root_mean_squared_error", cv=5)
    print('CV results RMSE:', np.round(scores))
    print('Mean RMSE:', np.mean(np.round(scores, 0)))

features = ['rooms', 'area', 'pop', 'pop_dens', 'frg_pct', 'emp', 'tax_income', 'employment_ratio', 'distance_to_center']
model_performance(features, df)

CV results RMSE: [ -870.  -590.  -806. -1062.  -817.]
Mean RMSE: -829.0
