## Step 1: Load data & packages

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

In [3]:
!pwd

/Users/sabasartipi/code/elissus/flatquest/notebooks


In [7]:
df_berlin = pd.read_csv("/Users/sabasartipi/code/elissus/flatquest/package_folder/berlin_cleaned.csv")

In [8]:
df_berlin.head()

Unnamed: 0.1,Unnamed: 0,serviceCharge,heatingType,newlyConst,balcony,telekomUploadSpeed,totalRent,yearConstructed,scoutId,hasKitchen,...,lift,typeOfFlat,noRooms,floorNo,numberOfFloors,garden,regio3,heatingCosts,ParkSpaces,fullAddress
0,0,320.0,central_heating,0,1,32.1,1140.0,1980.0,115671495,1,...,1,ground_floor,3.0,0.0,3.0,0,Staaken_Spandau,105.4,1,"Metropolitan Park 63, 13591 Berlin, Germany"
1,1,79.0,central_heating,0,0,40.0,955.0,1918.0,113359456,0,...,0,ground_floor,2.0,0.0,3.0,0,Weißensee_Weißensee,68.0,0,"Börnestraße 11, 13086 Berlin, Germany"
2,2,150.0,floor_heating,1,1,40.0,1300.0,2019.0,113704695,1,...,1,apartment,2.0,3.0,5.0,0,Mitte_Mitte,70.3,1,"Stallschreiberstraße 27, 10179 Berlin, Germany"
3,3,228.78,floor_heating,0,1,32.1,1428.78,2017.0,107589685,1,...,1,apartment,2.5,6.0,7.0,0,Kreuzberg_Kreuzberg,87.8,0,"Hallesche Straße 5a, 10963 Berlin, Germany"
4,4,147.08,district_heating,1,1,40.0,1559.05,2019.0,114773727,1,...,1,ground_floor,2.0,0.0,6.0,0,Tiergarten_Tiergarten,73.5,0,"Heidestraße 19, 10557 Berlin, Germany"


In [None]:
df_berlin.shape

(8879, 26)

In [None]:
df_berlin.isnull().sum()

Unnamed: 0,0
Unnamed: 0,0
serviceCharge,0
heatingType,0
newlyConst,0
balcony,0
telekomUploadSpeed,0
totalRent,0
yearConstructed,0
scoutId,0
hasKitchen,0


In [18]:
df_berlin.regio3

79

## Step 2: Feature Engineering

In [9]:
average_price = df_berlin['baseRent'].mean()

df_berlin['price_category'] = np.where(df_berlin['baseRent'] > average_price, 'above_average',
                                np.where(df_berlin['baseRent'] < average_price, 'below_average', 'at_average'))

df_berlin['price_category'].value_counts()


price_category
below_average    5450
above_average    3429
Name: count, dtype: int64

## Step 3: Data Preprocessing

In [12]:
# Features: You can include more features if you want
features = ['livingSpace', 'noRooms', 'balcony', 'yearConstructed', 'regio3']
X = df_berlin[features]

# Target: price_category added above
y = df_berlin['price_category']

# Encode categorical variables if any
X = pd.get_dummies(X, drop_first=True)

# Split training and testing data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize feature variables
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


In [None]:
X_train

array([[-0.94996146, -0.96363108, -1.66438272,  0.84923644],
       [ 0.42008383, -0.32291221,  0.60082335,  0.78544087],
       [ 1.10871251, -0.32291221,  0.60082335,  0.97682758],
       ...,
       [-0.75647894, -0.32291221, -1.66438272, -1.21348702],
       [-0.65526121, -0.32291221,  0.60082335,  1.08315353],
       [-0.59731593, -0.64327165,  0.60082335, -0.85197879]])

In [None]:
X_test

array([[ 0.08907452,  0.31780666,  0.60082335,  1.08315353],
       [ 0.83365895,  0.95852554,  0.60082335,  0.42393263],
       [ 0.00626002,  0.31780666,  0.60082335,  0.63658454],
       ...,
       [ 0.61132812,  0.95852554,  0.60082335,  1.06188834],
       [ 0.46211281,  0.31780666,  0.60082335,  1.06188834],
       [-0.76990831, -0.32291221, -1.66438272, -1.4899345 ]])

## Step 4: Model Training

In [13]:
# Initiate Random Forest classifier
model = RandomForestClassifier(random_state=42)

# Train the model
model.fit(X_train, y_train)

# Predict on the test only
y_pred = model.predict(X_test)


## Step 5: Model Evaluation

In [14]:
# Confusion matrix
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

# Classification report
print("\nClassification Report:")
print(classification_report(y_test, y_pred))


Confusion Matrix:
[[545 133]
 [129 969]]

Classification Report:
               precision    recall  f1-score   support

above_average       0.81      0.80      0.81       678
below_average       0.88      0.88      0.88      1098

     accuracy                           0.85      1776
    macro avg       0.84      0.84      0.84      1776
 weighted avg       0.85      0.85      0.85      1776



In [15]:
# Predict on the entire dataset (? check this)
y_all_pred = model.predict(X)

# Add the predictions to the original dataframes
df_berlin['price_category_pred'] = y_all_pred




## Step 6: Save to a new df

In [16]:
df_berlin.to_csv('updated_apartment_data_with_predictions.csv', index=False)

In [None]:
df_predictions = pd.read_csv("updated_apartment_data_with_predictions.csv")

In [None]:
df_predictions.shape

(8879, 27)

In [None]:
df_predictions.head()

Unnamed: 0.1,Unnamed: 0,serviceCharge,heatingType,newlyConst,balcony,telekomUploadSpeed,totalRent,yearConstructed,scoutId,hasKitchen,...,noRooms,floorNo,numberOfFloors,garden,regio3,heatingCosts,ParkSpaces,fullAddress,price_category,price_category_pred
0,0,320.0,central_heating,0,1,32.1,1140.0,1980.0,115671495,1,...,3.0,0.0,3.0,0,Staaken_Spandau,105.4,1,"Metropolitan Park 63, 13591 Berlin, Germany",below_average,above_average
1,1,79.0,central_heating,0,0,40.0,955.0,1918.0,113359456,0,...,2.0,0.0,3.0,0,Weißensee_Weißensee,68.0,0,"Börnestraße 11, 13086 Berlin, Germany",below_average,above_average
2,2,150.0,floor_heating,1,1,40.0,1300.0,2019.0,113704695,1,...,2.0,3.0,5.0,0,Mitte_Mitte,70.3,1,"Stallschreiberstraße 27, 10179 Berlin, Germany",above_average,above_average
3,3,228.78,floor_heating,0,1,32.1,1428.78,2017.0,107589685,1,...,2.5,6.0,7.0,0,Kreuzberg_Kreuzberg,87.8,0,"Hallesche Straße 5a, 10963 Berlin, Germany",above_average,above_average
4,4,147.08,district_heating,1,1,40.0,1559.05,2019.0,114773727,1,...,2.0,0.0,6.0,0,Tiergarten_Tiergarten,73.5,0,"Heidestraße 19, 10557 Berlin, Germany",above_average,above_average


In [None]:
df_predictions.price_category

Unnamed: 0,price_category
0,below_average
1,below_average
2,above_average
3,above_average
4,above_average
...,...
8874,below_average
8875,below_average
8876,below_average
8877,below_average


In [None]:
df_berlin['price_category'].value_counts()

Unnamed: 0_level_0,count
price_category,Unnamed: 1_level_1
below_average,5450
above_average,3429
