In [1]:
import numpy as np
import pandas as pd 
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

In [2]:
# Load the dataset from data_cleaned.csv (path is relative to the working directory).
df = pd.read_csv('data_cleaned.csv')

In [3]:
# Location columns are stored as pandas categoricals.
for column in ('city', 'district', 'neighborhood'):
    df[column] = df[column].astype('category')

# The remaining columns, including the price target, are cast to integers.
for column in ('room', 'livingRoom', 'area', 'age', 'floor', 'price'):
    df[column] = df[column].astype('int')

In [4]:
# Columns routed to the one-hot encoder.
categoricalFeatures = [
    'city',
    'district',
    'neighborhood',
]
# Columns routed to the standard scaler.
numericalFeatures = [
    'room',
    'livingRoom',
    'area',
    'age',
    'floor',
]

In [5]:
# Preprocessing: scale the numerical columns and one-hot encode the categorical
# ones. handle_unknown='ignore' lets categories unseen during fit pass through
# transform without raising.
fullPipeline = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numericalFeatures),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categoricalFeatures),
    ]
)

In [6]:
# Feature matrix: every column except the target.
X = df.drop(columns='price')
# Target: the raw price column (binned into classes in a later cell).
y = df['price']

In [8]:
# Edges of the 10,000-wide price intervals: 0, 10000, ..., 60000.
bins = list(range(0, 70000, 10000))
# One integer class label (1..6) per interval between consecutive edges.
labels = list(range(1, len(bins)))

In [9]:
# Bin the raw price column into the ordinal classes defined by `bins`/`labels`.
# Reading from df['price'] instead of from `y` keeps this cell re-runnable: the
# self-referential form `y = pd.cut(y, ...)` cannot be run again once `y` has
# already been replaced by its binned version.
# pd.cut uses right-closed intervals by default and yields NaN for values that
# fall outside the outermost edges, so guard against silently unlabeled rows.
y = pd.cut(df['price'], bins=bins, labels=labels)
assert y.notna().all(), "some prices fall outside the bin range and were left unlabeled"

In [11]:
# Display the distinct class labels present in the binned target.
y.unique()

[2, 5, 4, 3, 6, 1]
Categories (6, int64): [1 < 2 < 3 < 4 < 5 < 6]

In [12]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [14]:
# End-to-end estimator: preprocessing followed by a 100-tree random forest.
# random_state is fixed (same seed as the train/test split) so that the fitted
# forest, and therefore the reported metrics, are reproducible across re-runs.
model = Pipeline([
    ('preparation', fullPipeline),
    ('model', RandomForestClassifier(n_estimators=100, random_state=42))
])

In [15]:
# Fit the preprocessing steps and the classifier on the training split only.
model.fit(X_train, y_train)

In [16]:
# Predict price classes for the held-out test split.
y_pred = model.predict(X_test)

In [17]:
# Confusion matrix on the test split (scikit-learn convention: rows are the
# true classes, columns are the predicted classes).
print(confusion_matrix(y_test,y_pred))

[[151  82   0   0   1   0]
 [ 43 589  38   2   1   3]
 [  3  97  78  12   7   2]
 [  0  16  25  13   6   3]
 [  1  10  11   4   8   1]
 [  0   5   4   2   4   2]]


In [18]:
# Per-class precision, recall, F1 and support on the test split.
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           1       0.76      0.65      0.70       234
           2       0.74      0.87      0.80       676
           3       0.50      0.39      0.44       199
           4       0.39      0.21      0.27        63
           5       0.30      0.23      0.26        35
           6       0.18      0.12      0.14        17

    accuracy                           0.69      1224
   macro avg       0.48      0.41      0.43      1224
weighted avg       0.67      0.69      0.67      1224

