In [90]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV

#Imputation
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.impute import KNNImputer
from sklearn.impute import SimpleImputer


In [91]:
train = pd.read_csv("house-prices-advanced-regression-techniques/train.csv")
test = pd.read_csv("house-prices-advanced-regression-techniques/test.csv")

submission = pd.read_csv("house-prices-advanced-regression-techniques/sample_submission.csv")

In [92]:
print("Full train dataset shape is {}".format(train.shape))

Full train dataset shape is (1460, 81)


In [93]:
train = train.drop(columns="Id")

In [94]:
train.head()

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,...,0,,,,0,2,2008,WD,Normal,208500
1,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,...,0,,,,0,5,2007,WD,Normal,181500
2,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,...,0,,,,0,9,2008,WD,Normal,223500
3,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,...,0,,,,0,2,2006,WD,Abnorml,140000
4,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,...,0,,,,0,12,2008,WD,Normal,250000


In [95]:
test.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1461,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,...,120,0,,MnPrv,,0,6,2010,WD,Normal
1,1462,20,RL,81.0,14267,Pave,,IR1,Lvl,AllPub,...,0,0,,,Gar2,12500,6,2010,WD,Normal
2,1463,60,RL,74.0,13830,Pave,,IR1,Lvl,AllPub,...,0,0,,MnPrv,,0,3,2010,WD,Normal
3,1464,60,RL,78.0,9978,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,6,2010,WD,Normal
4,1465,120,RL,43.0,5005,Pave,,IR1,HLS,AllPub,...,144,0,,,,0,1,2010,WD,Normal


In [96]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 80 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   MSSubClass     1460 non-null   int64  
 1   MSZoning       1460 non-null   object 
 2   LotFrontage    1201 non-null   float64
 3   LotArea        1460 non-null   int64  
 4   Street         1460 non-null   object 
 5   Alley          91 non-null     object 
 6   LotShape       1460 non-null   object 
 7   LandContour    1460 non-null   object 
 8   Utilities      1460 non-null   object 
 9   LotConfig      1460 non-null   object 
 10  LandSlope      1460 non-null   object 
 11  Neighborhood   1460 non-null   object 
 12  Condition1     1460 non-null   object 
 13  Condition2     1460 non-null   object 
 14  BldgType       1460 non-null   object 
 15  HouseStyle     1460 non-null   object 
 16  OverallQual    1460 non-null   int64  
 17  OverallCond    1460 non-null   int64  
 18  YearBuil

In [97]:
train.isna().sum()

MSSubClass         0
MSZoning           0
LotFrontage      259
LotArea            0
Street             0
                ... 
MoSold             0
YrSold             0
SaleType           0
SaleCondition      0
SalePrice          0
Length: 80, dtype: int64

In [98]:
# Identifiziere Spalten mit mehr als 200 fehlenden Werten
cols_with_many_nas = train.columns[train.isnull().sum() > 200]

print("Spalten mit mehr als 200 NA-Werten:")
print(cols_with_many_nas)

Spalten mit mehr als 200 NA-Werten:
Index(['LotFrontage', 'Alley', 'FireplaceQu', 'PoolQC', 'Fence',
       'MiscFeature'],
      dtype='object')


In [99]:
# Drop columns with more than 400 missing values
train = train.dropna(axis=1, thresh=len(train) - 200)

In [100]:
train.isna().sum()

MSSubClass       0
MSZoning         0
LotArea          0
Street           0
LotShape         0
                ..
MoSold           0
YrSold           0
SaleType         0
SaleCondition    0
SalePrice        0
Length: 74, dtype: int64

In [101]:
# drop all rows with NA
#train.dropna(inplace=True)
#print("Aktuelle Größe des DataFrames nach Entfernen der Zeilen mit fehlenden Werten:", train.shape)

In [102]:
# Split the data into predictors and target
X = train.drop(columns=["SalePrice"])
y = train["SalePrice"]

In [103]:
print(X)

      MSSubClass MSZoning  LotArea Street LotShape LandContour Utilities  \
0             60       RL     8450   Pave      Reg         Lvl    AllPub   
1             20       RL     9600   Pave      Reg         Lvl    AllPub   
2             60       RL    11250   Pave      IR1         Lvl    AllPub   
3             70       RL     9550   Pave      IR1         Lvl    AllPub   
4             60       RL    14260   Pave      IR1         Lvl    AllPub   
...          ...      ...      ...    ...      ...         ...       ...   
1455          60       RL     7917   Pave      Reg         Lvl    AllPub   
1456          20       RL    13175   Pave      Reg         Lvl    AllPub   
1457          70       RL     9042   Pave      Reg         Lvl    AllPub   
1458          20       RL     9717   Pave      Reg         Lvl    AllPub   
1459          20       RL     9937   Pave      Reg         Lvl    AllPub   

     LotConfig LandSlope Neighborhood  ... OpenPorchSF EnclosedPorch  \
0       Inside 

In [104]:
# Identify numerical and categorical columns
numerical_cols = X.select_dtypes(include=["int64", "float64"]).columns.tolist()

categorical_cols = X.select_dtypes(include=["object"]).columns.tolist()

In [105]:
# Create transformers and categorical columns
numerical_transformer = StandardScaler()
categorical_transformer = Pipeline(steps=[("onehot", OneHotEncoder(handle_unknown="ignore"))])

In [106]:
# columns transformer to apply transformers to the appropriate columns
# Hier füge sowohl den SimpleImputer für kategoriale Spalten als auch den IterativeImputer für numerische Spalten hinzu
numerical_pipeline = Pipeline([
    ('imputer', IterativeImputer()),
    ('scaler', StandardScaler())
])

categorical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown="ignore"))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_pipeline, numerical_cols),
        ('cat', categorical_pipeline, categorical_cols)
    ])


In [107]:
# Raster von Hyperparametern für RandomForest
param_dist = {
    'model__n_estimators': np.arange(10, 400, 10),
    'model__max_features': ['sqrt', 'log2'],
    'model__max_depth': np.arange(1, 20),
    'model__min_samples_split': np.arange(2, 10),
    'model__min_samples_leaf': np.arange(1, 10),
    'model__bootstrap': [True, False]
}


In [108]:
# Define the model
model = RandomForestRegressor(n_estimators=100, random_state=42)

In [109]:
# Create and evaluate the pipeline
pipeline = Pipeline(steps=[("preprocessor", preprocessor),
                           ("model", model)
                           ])

In [110]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [111]:
random_search = RandomizedSearchCV(
    estimator=pipeline,
    param_distributions=param_dist,
    n_iter=100,
    cv=5,
    verbose=2,
    random_state=42,
    n_jobs=-1
)

random_search.fit(X_train, y_train)


Fitting 5 folds for each of 100 candidates, totalling 500 fits


In [112]:
print("Beste Hyperparameter:", random_search.best_params_)
print("Bester Score:", random_search.best_score_)


Beste Hyperparameter: {'model__n_estimators': 300, 'model__min_samples_split': 4, 'model__min_samples_leaf': 1, 'model__max_features': 'sqrt', 'model__max_depth': 18, 'model__bootstrap': False}
Bester Score: 0.8531720275599044


In [113]:
# Evaluate the model
y_pred = random_search.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error: {mse}")


Mean Squared Error: 955241707.3637645


In [114]:
r2_score_train = random_search.score(X_train, y_train)
print(f"R^2 Score for train_data: {r2_score_train}")

R^2 Score for train_data: 0.9964198257984684


In [115]:
r2_score_test = random_search.score(X_test, y_test)
print(f"R^2 Score for test_data: {r2_score_test}")

R^2 Score for test_data: 0.8754627011210163


In [116]:
test.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1461,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,...,120,0,,MnPrv,,0,6,2010,WD,Normal
1,1462,20,RL,81.0,14267,Pave,,IR1,Lvl,AllPub,...,0,0,,,Gar2,12500,6,2010,WD,Normal
2,1463,60,RL,74.0,13830,Pave,,IR1,Lvl,AllPub,...,0,0,,MnPrv,,0,3,2010,WD,Normal
3,1464,60,RL,78.0,9978,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,6,2010,WD,Normal
4,1465,120,RL,43.0,5005,Pave,,IR1,HLS,AllPub,...,144,0,,,,0,1,2010,WD,Normal


In [117]:
# Entfernen Sie Spalten aus `test`, die mehr als 200 fehlende Werte haben
test = test.dropna(axis=1, thresh=len(test) - 200)

# Speichern Sie die 'Id'-Werte für spätere Verwendung
ids = test.Id
test = test.drop("Id", axis=1)

In [118]:
# Machen Sie die Vorhersagen
predict_test = random_search.predict(test)

# Erstellen Sie einen DataFrame für die Ausgabe
output = pd.DataFrame({'Id': ids,
                       'SalePrice': predict_test.squeeze()})

# Zeigen Sie die ersten Zeilen des Ausgabe-DataFrames an
print(output.head())

     Id      SalePrice
0  1461  124993.795981
1  1462  153707.696114
2  1463  183476.090472
3  1464  192758.255338
4  1465  199994.898210


In [119]:
output.to_csv("submission.csv", index=False)

In [120]:
# Holen Sie die Feature-Wichtigkeiten
importances = random_search.best_estimator_.named_steps['model'].feature_importances_

# Zuerst holen wir die Feature-Namen aus dem OneHotEncoder:
ohe_feature_names = random_search.best_estimator_.named_steps['preprocessor'].named_transformers_['cat'].named_steps['onehot'].get_feature_names_out(categorical_cols)

# Kombinieren Sie die numerischen und OHE-Feature-Namen:
all_feature_names = np.concatenate([numerical_cols, ohe_feature_names])

# Holen Sie die Indices der Features, sortiert nach ihrer Wichtigkeit in absteigender Reihenfolge:
sorted_indices = np.argsort(importances)[::-1]

# Extrahieren Sie die Namen der Top-10-Features:
top_10_feature_names = all_feature_names[sorted_indices][:10]

print("Top 10 Features:")
print(top_10_feature_names)




Top 10 Features:
['OverallQual' 'GrLivArea' 'TotalBsmtSF' '1stFlrSF' 'GarageCars'
 'GarageArea' 'YearBuilt' 'ExterQual_TA' 'GarageYrBlt' '2ndFlrSF']


Damit habe ich Position 2413 von 4234 erreicht