In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the listings file (decoded as ISO-8859-1); its first column becomes the row index.
df = pd.read_csv('DF_ZapIoveis.csv', index_col=0, encoding="ISO-8859-1")
# Leave out the 'bairro' column; nothing happens if the file has no such column.
df = df.drop(columns=['bairro'], errors='ignore')

In [3]:
# Prediction target: the 'preco' column, copied out of the frame as a NumPy array
labels = df['preco'].to_numpy(copy=True)

In [4]:
# Feature table: every column of the frame except the 'preco' target
features = df.drop(columns='preco')

In [5]:
# Remember the column names in order; they are needed after `features` is
# turned into a plain array, which carries no column labels
feature_list = [*features.columns]

In [6]:
# Convert to numpy array
# NOTE: this rebinds `features` from the DataFrame to an ndarray, so any cell
# that reads `features.columns` has to run before this one.
features = np.array(features)

In [7]:
# Using scikit-learn to split data into training and testing sets
from sklearn.model_selection import train_test_split

In [8]:
# Hold out 25% of the rows for testing; the seed is fixed at 42
(train_features, test_features,
 train_labels, test_labels) = train_test_split(
    features,
    labels,
    test_size=0.25,
    random_state=42,
)

In [9]:
# Show the dimensions of each piece of the split as a quick sanity check
for caption, split in (
    ('Training Features Shape:', train_features),
    ('Training Labels Shape:', train_labels),
    ('Testing Features Shape:', test_features),
    ('Testing Labels Shape:', test_labels),
):
    print(caption, split.shape)

Training Features Shape: (20394, 9)
Training Labels Shape: (20394,)
Testing Features Shape: (6798, 9)
Testing Labels Shape: (6798,)


In [10]:
# Import the model we are using
from sklearn.ensemble import RandomForestRegressor
# Instantiate model with 1000 decision trees.
# n_jobs=-1 lets scikit-learn build (and later query) the trees on all available
# cores instead of a single one; random_state still fixes the seed of the forest.
rf = RandomForestRegressor(n_estimators = 1000, random_state = 42, n_jobs = -1)
# Train the model on training data
# (the trailing ';' keeps the estimator repr out of the cell output)
rf.fit(train_features, train_labels);

In [11]:
# Score the held-out rows with the fitted forest
predictions = rf.predict(test_features)
# Element-wise absolute difference between prediction and label
errors = np.abs(predictions - test_labels)
# Report the mean absolute error (MAE)
print('Mean Absolute Error:', round(errors.mean(), 2), 'R$.')

Mean Absolute Error: 2128.57 R$.


In [12]:
# Absolute percentage error of every test row (its mean is the MAPE)
mape = (errors / test_labels) * 100
# "Accuracy" is defined here as 100 minus the mean absolute percentage error
accuracy = 100 - mape.mean()
print('Accuracy:', round(accuracy, 2), '%.')

Accuracy: 99.63 %.


In [20]:
# Get numerical feature importances
importances = list(rf.feature_importances_)
# Rank on the raw importances first: once rounded to two decimals, several
# features can share the same value and can no longer be told apart by the sort.
ranked = sorted(zip(feature_list, importances), key = lambda pair: pair[1], reverse = True)
# List of (variable, importance rounded to 2 decimals) tuples, most important first
feature_importances = [(feature, round(importance, 2)) for feature, importance in ranked]
# Print out the feature and importances with a plain loop, rather than building
# a throwaway list of None values just to call print
for pair in feature_importances:
    print('Variable: {:20} Importance: {}'.format(*pair))

Variable: m2                   Importance: 0.86
Variable: quartos              Importance: 0.05
Variable: suites               Importance: 0.04
Variable: vagas                Importance: 0.02
Variable: zona_centro          Importance: 0.01
Variable: zona_norte           Importance: 0.01
Variable: zona_leste           Importance: 0.0
Variable: zona_oeste           Importance: 0.0
Variable: zona_sul             Importance: 0.0


In [21]:
# New random forest with only the two most important variables.
# n_jobs=-1 builds the 1000 trees on all available cores instead of a single
# one; the seed (random_state) is unchanged.
rf_most_important = RandomForestRegressor(n_estimators= 1000, random_state=42, n_jobs=-1)
# Column positions of the two most important features
important_indices = [feature_list.index(name) for name in ('m2', 'quartos')]
train_important = train_features[:, important_indices]
test_important = test_features[:, important_indices]
# Train the random forest
rf_most_important.fit(train_important, train_labels)
# Make predictions and determine the error.
# NOTE: this rebinds `predictions`, `errors`, `mape` and `accuracy`, replacing
# the values computed for the full-feature model in the earlier cells; here
# `mape` is a single number, not a per-row array.
predictions = rf_most_important.predict(test_important)
errors = np.abs(predictions - test_labels)
# Display the performance metrics
print('Mean Absolute Error:', round(np.mean(errors), 2), 'R$.')
mape = np.mean(100 * (errors / test_labels))
accuracy = 100 - mape
print('Accuracy:', round(accuracy, 2), '%.')

Mean Absolute Error: 118413.49 R$.
Accuracy: 88.75 %.
