# Praxisübung: Ensemble Modelle

In dieser Übung wenden Sie die `scikit-learn`-Implementierungen des Random Forest und des Gradient Boosting auf den Immoscout-Datensatz an. Es soll die Kaltmiete einer Wohnung vorhergesagt werden, d.h. es handelt sich um ein Regressionsproblem. Der Einfachheit halber ist der Code für die Vorverarbeitung bereits vorgegeben.

In [17]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn import linear_model

from IPython.display import display

# Let pandas show up to 50 columns when a DataFrame is displayed.
pd.options.display.max_columns = 50

In [18]:
# Load the data set; paths are relative to the working directory.
df = pd.read_csv("Daten/immo_data.csv")
# Second CSV; judging by its file name it describes the columns of `df` (not verified here).
desc = pd.read_csv("Daten/immo_data_column_description.csv")

In [19]:
def drop_columns(df):
    """Return ``df`` without the columns that are (presumably) unimportant.

    The input frame is left untouched; a new DataFrame is returned.
    A ``KeyError`` is raised by pandas if one of the listed columns is missing.
    """
    unused_columns = [
        "scoutId",
        "houseNumber",
        "geo_bln",
        "geo_krs",
        "geo_plz",
        "date",
        "street",
        "streetPlain",
        "description",
        "facilities",
        "regio3",
        "firingTypes",
        "telekomHybridUploadSpeed",
        "totalRent",
        "baseRentRange",
    ]
    return df.drop(columns=unused_columns)


def remove_outliers(df, lower_limit=0.005, upper_limit=0.995):
    """Drop rows that contain a lower or upper outlier in one of the checked columns.

    For every checked column the ``lower_limit`` and ``upper_limit`` quantiles
    are computed on the unfiltered input. A row is kept only if, in every
    checked column, its value lies within these two quantiles (inclusive) or
    is NaN. NaNs are kept on purpose; they are handled in a later step.

    Returns a new DataFrame; ``df`` itself is not modified.
    """
    checked_columns = [
        "serviceCharge",
        "yearConstructed",
        "noParkSpaces",
        "baseRent",
        "livingSpace",
        "noRooms",
        "floor",
        "numberOfFloors",
        "heatingCosts",
        "lastRefurbish",
    ]

    values = df[checked_columns]
    lower_bounds = values.quantile(lower_limit)
    upper_bounds = values.quantile(upper_limit)

    # Column-wise comparison against the per-column bounds; comparisons with
    # NaN are False, so missing values are re-admitted explicitly.
    within_bounds = values.ge(lower_bounds, axis="columns") & values.le(upper_bounds, axis="columns")
    keep_row = (within_bounds | values.isna()).all(axis="columns")

    return df[keep_row].copy()


def remove_rows_with_NaN_target(df):
    """Return only the rows of ``df`` that have a label.

    A row counts as labelled when its ``"baseRent"`` value is not missing.
    The input frame is not modified.
    """
    # `notna()` states the intent directly instead of comparing `isna()` to False.
    return df[df["baseRent"].notna()]


def impute_NaNs(df):
    """Replace NaNs by the column mode (non-numeric) or the column mean (numeric).

    Works on a copy of ``df`` and returns that copy. Non-numeric columns are
    imputed with their most frequent value, numeric columns with their mean.
    An imputation step is skipped when the frame has no column of the
    respective kind, so purely numeric or purely categorical frames are
    handled as well.
    """
    dfc = df.copy()

    categorical_columns = dfc.select_dtypes(exclude=np.number).columns
    # Fitting an imputer on a selection with zero columns raises an error,
    # so only impute when there is something to impute.
    if len(categorical_columns) > 0:
        imp_freq = SimpleImputer(missing_values=np.nan, strategy="most_frequent")
        dfc.loc[:, categorical_columns] = imp_freq.fit_transform(dfc[categorical_columns])

    numeric_columns = dfc.select_dtypes(include=np.number).columns
    if len(numeric_columns) > 0:
        imp_mean = SimpleImputer(missing_values=np.nan, strategy="mean")
        dfc.loc[:, numeric_columns] = imp_mean.fit_transform(dfc[numeric_columns])

    return dfc


def print_evaluation(pipeline_or_model, X_train, X_test, y_train, y_test, y_train_pred, y_test_pred, feature_names):
    """Print R², MSE and MAE plus the data shape for the training and the test set.

    Args:
        pipeline_or_model: Object whose string representation is used as the title.
        X_train, X_test: Feature matrices; only their ``shape`` is reported.
        y_train, y_test: True target values.
        y_train_pred, y_test_pred: Predicted target values.
        feature_names: Part of the signature but not used by this function.
    """

    def metrics_row(label, X, y_true, y_pred):
        # One formatted table line: label, the three metrics, rows and columns of X.
        r2 = r2_score(y_true, y_pred)
        mse = mean_squared_error(y_true, y_pred)
        mae = mean_absolute_error(y_true, y_pred)
        return f"{label:6} {r2:10.5f} | {mse:14.2f} | {mae:10.2f} | {X.shape[0]:8} | {X.shape[1]:8}\n"

    train_row = metrics_row("Train", X_train, y_train, y_train_pred)
    test_row = metrics_row("Test", X_test, y_test, y_test_pred)
    header = f"{'':6} {'R²':>10} | {'MSE':>14} | {'MAE':>10} | {'rows':>8} | {'columns':>8}\n"

    print(f"{pipeline_or_model} Evaluation:\n" + header + train_row + test_row)

In [20]:
from sklearn.model_selection import GridSearchCV

# Preprocessing: draw a reproducible sample of 10000 rows, then drop columns,
# remove outliers and unlabelled rows, impute NaNs and one-hot encode.
df_reduced = (
    df.sample(10000, random_state=42)
    .pipe(drop_columns)
    .pipe(remove_outliers)
    .pipe(remove_rows_with_NaN_target)
    .pipe(impute_NaNs)
    .pipe(pd.get_dummies)
)
# Split off the target; `pop` also removes it from the feature frame.
y = df_reduced.pop("baseRent")

# Train/test split (80 % / 20 %)
X_train, X_test, y_train, y_test = train_test_split(
    df_reduced,
    y,
    test_size=0.2,
    random_state=0,
)

# Training and hyperparameter search over the Ridge regularisation strength
parameters = {"alpha": [1e-3, 1e-1, 1, 10]}
m = linear_model.Ridge(random_state=42)
gs = GridSearchCV(estimator=m, param_grid=parameters)
gs.fit(X_train, y_train)

# Predict with the best model found by the search
y_train_pred = gs.predict(X_train)
y_test_pred = gs.predict(X_test)

# Evaluation. The best model is stored in gs.best_estimator_.
print_evaluation(
    gs.best_estimator_,
    X_train,
    X_test,
    y_train,
    y_test,
    y_train_pred,
    y_test_pred,
    feature_names=df_reduced.columns,
)

Ridge(alpha=1, random_state=42) Evaluation:
               R² |            MSE |        MAE |     rows |  columns
Train     0.84369 |       25629.80 |     104.07 |     7742 |      505
Test      0.82320 |       28617.03 |     113.75 |     1936 |      505



## ✏ Aufgabe 1
Trainieren Sie einen Random Forest für das Wohnungsbeispiel. Experimentieren Sie mit den Parametern ``n_estimators``, ``max_depth`` und ``max_features``, um die Performance zu optimieren (entweder händisch oder mit GridSearchCV --> Vorsicht, Rechenzeit!). Was bedeuten die Parameter? 

In [8]:
# TO DO

## ✏ Aufgabe 2
Trainieren Sie ein Gradient Boosting Regressionsmodell für das Wohnungsbeispiel. Experimentieren Sie mit den Parametern ``learning_rate``, ``max_leaf_nodes``, ``n_estimators``, ``max_depth`` und ``max_features``, um die Performance zu optimieren (entweder händisch oder mit GridSearchCV --> Vorsicht, Rechenzeit!). Was bedeuten die Parameter?

In [9]:
# TO DO

## ✏ Aufgabe 3
Welcher Algorithmus ist in folgendem Code realisiert?

In [10]:
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.utils import resample
from sklearn.metrics import accuracy_score
from scipy.stats import mode

# Load the iris data and hold out 20 % of it as test set.
iris = load_iris()
X = iris.data
y = iris.target

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

n_trees = 500
accuracy_scores = []

# One row of test-set predictions per tree.
Y_pred = np.empty([n_trees, len(X_test)], dtype=np.uint8)

for k in range(n_trees):
    # Draw a training sample with replacement; seeding with the loop index
    # gives every tree a different sample while keeping runs reproducible.
    X_train_, y_train_ = resample(X_train, y_train, replace=True, random_state=k)

    clf = DecisionTreeClassifier(random_state=42)
    clf.fit(X_train_, y_train_)

    # Predict once and reuse the stored row for the per-tree accuracy.
    Y_pred[k] = clf.predict(X_test)
    accuracy_scores.append(accuracy_score(y_test, Y_pred[k]))

# Most frequent prediction per test sample across all trees. Depending on the
# SciPy version `mode` returns shape (1, n_test) or (n_test,); `np.ravel`
# yields a 1-D vector in both cases.
y_pred, count = mode(Y_pred, axis=0)
y_pred = np.ravel(y_pred)

# Accuracy of the combined prediction vs. mean accuracy of the single trees.
print(accuracy_score(y_test, y_pred))
print(np.mean(accuracy_scores))

1.0
0.9878666666666668
