In [257]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_log_error

# LOAD DATA

In [189]:
props_raw = pd.read_csv("props_polygons.csv", index_col = 0)

In [190]:
props_raw.head()

Unnamed: 0,X,address,bath,bed,description,link,price,propname,proptype,sqm,...,ADM0_AR,ADM0_PCODE,date,validOn,ValidTo,Shape_Leng,Shape_Area,coords.x1,coords.x2,optional
1,0,"Villa for Sale in October Hills, South Dahshur...",5.0,5.0,October hills VILLA delivered ...,https://m.propertyfinder.eg/en/buy/villa-for-s...,6800000.0,October hills VILLA delivered,Villa,670,...,مِصر,EG,2006/01/01,2017/04/21,,0.254109,0.003076,30.974007,30.012644,True
2,1,"Apartment for Sale in October Hills, South Dah...",3.0,3.0,"#Jadeer_Now_in_Zayed,\nWe present to you our n...",https://m.propertyfinder.eg/en/buy/apartment-f...,1219400.0,#Jadeer #Zayed (182 M²),Apartment,182,...,مِصر,EG,2006/01/01,2017/04/21,,0.254109,0.003076,30.974007,30.012644,True
3,2,"Villa for Sale in October Hills, South Dahshur...",5.0,5.0,"October hills ,\nStandalone villa ,\nLand = 58...",https://m.propertyfinder.eg/en/buy/villa-for-s...,9000000.0,October Hills Villa very Prime location,Villa,580,...,مِصر,EG,2006/01/01,2017/04/21,,0.254109,0.003076,30.974007,30.012644,True
4,3,"Apartment for Sale in October Hills, South Dah...",2.0,2.0,"#Jadeer_Now_in_Zayed,\nWe present to you our n...",https://m.propertyfinder.eg/en/buy/apartment-f...,988000.0,Ground floor apartment 152 M² + 80 M² garden (...,Apartment,152,...,مِصر,EG,2006/01/01,2017/04/21,,0.254109,0.003076,30.974007,30.012644,True
5,4,"Apartment for Sale in October Hills, South Dah...",2.0,2.0,"#Jadeer_Now_in_Zayed,\nWe present to you our n...",https://m.propertyfinder.eg/en/buy/apartment-f...,799500.0,Typical apartment 123 M² #Jadeer #Zayed,Apartment,123,...,مِصر,EG,2006/01/01,2017/04/21,,0.254109,0.003076,30.974007,30.012644,True


# CLEANING

In [191]:
props = props_raw[["proptype", "bed", "bath", "sqm", "Compound", "Developer", "ADM2_EN", 
                   "ADM1_EN", "price"]]

In [192]:
props

Unnamed: 0,proptype,bed,bath,sqm,Compound,Developer,ADM2_EN,ADM1_EN,price
1,Villa,5.0,5.0,670,October Hills,Other,6 October-1,Giza,6800000.0
2,Apartment,3.0,3.0,182,October Hills,Other,6 October-1,Giza,1219400.0
3,Villa,5.0,5.0,580,October Hills,Other,6 October-1,Giza,9000000.0
4,Apartment,2.0,2.0,152,October Hills,Other,6 October-1,Giza,988000.0
5,Apartment,2.0,2.0,123,October Hills,Other,6 October-1,Giza,799500.0
...,...,...,...,...,...,...,...,...,...
22943,Villa,6.0,6.0,2500,Bani Sweif Giza,Other,Badrashain,Giza,30000000.0
22944,Apartment,3.0,3.0,122,Katameya Gate,Other,Basatin,Cairo,1317600.0
22945,Apartment,2.0,2.0,140,Retaj,Other,New Cairo-1,Cairo,1500000.0
22946,Chalet,1.0,1.0,66,Pyramisa Beach Resort,Other,Safaga,Red Sea,250000.0


In [193]:
for column in props:
    print(props[column].value_counts())

Apartment          10011
Villa               5554
Chalet              2690
Townhouse           1703
Twin House          1359
Duplex               932
Penthouse            663
Hotel Apartment       34
Bungalow               1
Name: proptype, dtype: int64
3.0    10244
4.0     4842
2.0     4095
5.0     1838
1.0      843
6.0      578
7.0      413
0.0       88
Name: bed, dtype: int64
3.0    10244
4.0     4842
2.0     4095
5.0     1838
1.0      843
6.0      578
7.0      413
0.0       88
Name: bath, dtype: int64
200      548
140      466
160      372
300      348
120      333
        ... 
823        1
935        1
23500      1
1047       1
32         1
Name: sqm, Length: 903, dtype: int64
 Mivida                     1297
 Madinaty                    870
 Marassi                     822
 Mountain View Hyde Park     665
 Hyde Park                   589
                            ... 
 Street 66                     1
 Al Nakhla Street              1
 Rashdan St.                   1
 Al Hay Al T

In [194]:
props = props.applymap(lambda x: x.strip()if isinstance(x, str) else x)

Filter out properties with less than 1 bedroom

In [195]:
props = props[props.bed >0]

In [196]:
nacol = pd.DataFrame(props.isna().any()).reset_index()
nacol = nacol[nacol[0] == True]
nacol

Unnamed: 0,index,0
6,ADM2_EN,True
7,ADM1_EN,True


In [246]:
for i in nacol["index"]:
    props[i] = np.where(props[i].isna(),"None", props[i])

Split x and y 

In [247]:
x = props.loc[:,"proptype":"ADM1_EN"]
y = props.price

Transform y to log

In [249]:
y = np.log(y)

# LABEL ENCODING AND TRAIN/TEST SPLIT

In [250]:
props_obj = props.dtypes== object
xcat = props_obj[props_obj == True].index

le = preprocessing.LabelEncoder()

for i in xcat:
    x[i] = le.fit_transform(x[i])

In [251]:
x, x_test, y, y_test = train_test_split(x, y, test_size=0.3, random_state=42)

# RANDOM FOREST - INITIAL 

In [252]:
rf = RandomForestRegressor()
rf.fit(x,y)



RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=10,
                      n_jobs=None, oob_score=False, random_state=None,
                      verbose=0, warm_start=False)

In [253]:
scores = cross_val_score(rf, x, y, cv=5)
scores

array([0.71354552, 0.73992536, 0.75185636, 0.73477285, 0.75331112])

In [254]:
ypred_test = rf.predict(x_test)
rmsle = np.sqrt(mean_squared_log_error(np.exp(y_test),np.exp(ypred_test)))
rmsle

0.43703443721804447

# GRID SEARCH

In [259]:
param_grid = { 
    'n_estimators': list(range(2,30,10)),
    'max_features': ['auto'], 
    'max_depth': list(range(3,15)),  
}


rf_gs = GridSearchCV(rf, param_grid, cv = 5,return_train_score=True, n_jobs = -1, verbose = True)
rf_gs.fit(x,y)

Fitting 5 folds for each of 36 candidates, totalling 180 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    9.7s
[Parallel(n_jobs=-1)]: Done 180 out of 180 | elapsed:   23.2s finished


GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=RandomForestRegressor(bootstrap=True, criterion='mse',
                                             max_depth=None,
                                             max_features='auto',
                                             max_leaf_nodes=None,
                                             min_impurity_decrease=0.0,
                                             min_impurity_split=None,
                                             min_samples_leaf=1,
                                             min_samples_split=2,
                                             min_weight_fraction_leaf=0.0,
                                             n_estimators=10, n_jobs=None,
                                             oob_score=False, random_state=None,
                                             verbose=0, warm_start=False),
             iid='warn', n_jobs=-1,
             param_grid={'max_depth': [3, 4, 5, 6, 7, 8, 9, 10

In [264]:
best_rf = rf_gs.best_estimator_
scores = cross_val_score(best_rf, x, y, cv=5)
scores

array([0.73143579, 0.76922828, 0.76957689, 0.7364101 , 0.76606763])

In [267]:
ypred_test = best_rf.predict(x_test)
rmsle = np.sqrt(mean_squared_log_error(np.exp(y_test), np.exp(ypred_test)))
rmsle

0.4179349867610109

In [272]:
pd.Series(rf.feature_importances_, index=x.columns).sort_values(ascending=False)

sqm          0.697365
Compound     0.076585
Developer    0.075611
proptype     0.048113
ADM2_EN      0.045785
ADM1_EN      0.031451
bed          0.012802
bath         0.012288
dtype: float64