In [92]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
#import seaborn as sns; sns.set()

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV


from imblearn.under_sampling import RandomUnderSampler

from IPython.display import Image  
from sklearn.tree import export_graphviz
import pydotplus

pd.set_option('precision', 3) # set precision to 2 decimal points
%matplotlib inline

In [93]:
file_path = "./sao-paulo-properties-april-2019.csv"
df = pd.read_csv(file_path)

In [94]:
sp_lat = -23.533773
sp_lon = -46.625290

In [95]:
df['Distance'] = (df['Longitude'] - sp_lon)**2 + (df['Latitude'] - sp_lat)**2

In [96]:
df_norm = df.copy()

std_scaler = StandardScaler()
feature_list = ['Price', 'Condo','Size','Distance']

for i in feature_list:
    df_norm[i] = std_scaler.fit_transform(df_norm[i].values.reshape(-1, 1))





In [97]:
df_norm

Unnamed: 0,Price,Condo,Size,Rooms,Toilets,Suites,Parking,Elevator,Furnished,Swimming Pool,New,District,Negotiation Type,Property Type,Latitude,Longitude,Distance
0,-0.485,-0.620,-0.640,2,2,1,1,0,0,0,0,Artur Alvim/São Paulo,rent,apartment,-23.543,-46.479,-0.265
1,-0.485,-0.715,-0.674,2,2,1,1,0,0,0,0,Artur Alvim/São Paulo,rent,apartment,-23.550,-46.481,-0.265
2,-0.485,-0.779,-0.622,2,2,1,1,0,0,0,0,Artur Alvim/São Paulo,rent,apartment,-23.543,-46.486,-0.265
3,-0.485,-0.647,-0.622,2,2,1,1,0,0,0,0,Artur Alvim/São Paulo,rent,apartment,-23.547,-46.483,-0.265
4,-0.485,-0.369,-0.503,2,2,1,1,1,0,0,0,Artur Alvim/São Paulo,rent,apartment,-23.525,-46.482,-0.265
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13635,-0.038,-0.356,-0.571,2,1,0,1,0,0,0,0,Jabaquara/São Paulo,sale,apartment,-23.653,-46.635,-0.265
13636,0.435,-0.079,-0.178,3,2,1,2,0,0,1,0,Jabaquara/São Paulo,sale,apartment,-23.649,-46.642,-0.265
13637,0.385,0.541,0.507,3,3,1,1,0,0,1,0,Jabaquara/São Paulo,sale,apartment,-23.650,-46.650,-0.265
13638,0.097,-0.847,-0.777,1,2,1,1,0,1,1,0,Jabaquara/São Paulo,sale,apartment,-23.652,-46.637,-0.265


# first we divide the dataset in train/test
# then we divide the train in train/validation
# model knn
# test
# accuracy, precision.

In [98]:
# take out some variables
df_train = df_norm.drop(columns = ["Latitude","Longitude", "Property Type"])
df_train["District"] = df_train["District"].str.normalize('NFKD').str.encode('ascii', errors='ignore').str.decode('utf-8')
df_train = pd.get_dummies(df_train, columns = ['District'], drop_first=True)
df_train = pd.get_dummies(df_train, columns = ['Negotiation Type'], drop_first=True)

In [99]:
# start the modelling
# divide train, test, validation
X = df_train.loc[:, df_train.columns != 'Price']
y = df_train.loc[:, df_train.columns == 'Price']

X_train, X_test, y_train, y_test  = train_test_split(X, y, test_size=0.2, random_state=1)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=1) # 0.25 x 0.8 = 0.2

In [100]:
# apply an random forest model
rf = RandomForestRegressor(n_estimators = 1000, random_state = 42)
rf.fit(X_train, y_train.values.ravel())

RandomForestRegressor(n_estimators=1000, random_state=42)

In [101]:
predictions = rf.predict(X_val)
#print(predictions)
#print(y_val.values.ravel())
errors = abs(predictions - y_val.values.ravel())
print('Mean Absolute Error:', round(np.mean(errors), 2))
# Calculate mean absolute percentage error (MAPE)
mape = 100 * (errors / y_val.values.ravel())
# Calculate and display accuracy
accuracy = 100 - np.mean(mape)
print('Accuracy:', round(accuracy, 2), '%.')

Mean Absolute Error: 0.09
Accuracy: 91.75 %.


In [112]:
# test set
rf.fit(X_train, y_train.values.ravel())
predictions = rf.predict(X_test)
#print(predictions)
#print(y_val.values.ravel())
errors = abs(predictions - y_test.values.ravel())
print('Mean Absolute Error:', round(np.mean(errors), 2))
# Calculate mean absolute percentage error (MAPE)
mape = 100 * (errors / y_test.values.ravel())
# Calculate and display accuracy
accuracy = 100 - np.mean(mape)
print('Accuracy:', round(accuracy, 2), '%.')

Mean Absolute Error: 0.09
Accuracy: 78.38 %.


In [102]:
# hyperparameter tuning

# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
print(random_grid)

{'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000], 'max_features': ['auto', 'sqrt'], 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4], 'bootstrap': [True, False]}


In [113]:
rf = RandomForestRegressor()
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 20, cv = 3, verbose=2, random_state=42, n_jobs = -1)

rf_random.fit(X_train, y_train.values.ravel())
rf_random.best_params_

Fitting 3 folds for each of 20 candidates, totalling 60 fits
[CV] END bootstrap=True, max_depth=50, max_features=sqrt, min_samples_leaf=2, min_samples_split=10, n_estimators=200; total time=   1.2s
[CV] END bootstrap=True, max_depth=50, max_features=sqrt, min_samples_leaf=2, min_samples_split=10, n_estimators=200; total time=   1.2s
[CV] END bootstrap=True, max_depth=50, max_features=sqrt, min_samples_leaf=2, min_samples_split=10, n_estimators=200; total time=   1.2s
[CV] END bootstrap=False, max_depth=90, max_features=sqrt, min_samples_leaf=4, min_samples_split=10, n_estimators=600; total time=   4.0s
[CV] END bootstrap=False, max_depth=90, max_features=sqrt, min_samples_leaf=4, min_samples_split=10, n_estimators=600; total time=   4.3s
[CV] END bootstrap=False, max_depth=90, max_features=sqrt, min_samples_leaf=4, min_samples_split=10, n_estimators=600; total time=   4.3s
[CV] END bootstrap=True, max_depth=30, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=14

{'n_estimators': 600,
 'min_samples_split': 2,
 'min_samples_leaf': 1,
 'max_features': 'auto',
 'max_depth': 70,
 'bootstrap': True}

In [114]:
rf_params = rf_random.best_params_

In [115]:
rf_final = RandomForestRegressor(n_estimators=rf_params['n_estimators'], min_samples_split=rf_params['min_samples_split'],
min_samples_leaf=rf_params['min_samples_leaf'], max_features=rf_params['max_features'], max_depth=rf_params['max_depth'], bootstrap=rf_params['bootstrap'])


In [116]:
rf_final.fit(X_train, y_train.values.ravel())
predictions = rf_final.predict(X_val)
#print(predictions)
#print(y_val.values.ravel())
errors = abs(predictions - y_val.values.ravel())
print('Mean Absolute Error:', round(np.mean(errors), 2))
# Calculate mean absolute percentage error (MAPE)
mape = 100 * (errors / y_val.values.ravel())
# Calculate and display accuracy
accuracy = 100 - np.mean(mape)
print('Accuracy:', round(accuracy, 2), '%.')

Mean Absolute Error: 0.09
Accuracy: 91.65 %.


In [117]:
# test set
rf_final.fit(X_train, y_train.values.ravel())
predictions = rf_final.predict(X_test)
#print(predictions)
#print(y_val.values.ravel())
errors = abs(predictions - y_test.values.ravel())
print('Mean Absolute Error:', round(np.mean(errors), 2))
# Calculate mean absolute percentage error (MAPE)
mape = 100 * (errors / y_test.values.ravel())
# Calculate and display accuracy
accuracy = 100 - np.mean(mape)
print('Accuracy:', round(accuracy, 2), '%.')

Mean Absolute Error: 0.09
Accuracy: 80.65 %.
