# Importing libs

In [1]:
import pandas as pd
import joblib
import json
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error

# Reading data

In [71]:
%%time
# Load the rental listings CSV into `df`; the %%time cell magic above reports
# how long the read takes (it must remain the first line of the cell).
df = pd.read_csv('./assets/houses_to_rent_v2.csv')

Wall time: 19 ms


## Checking data

In [72]:
# Summarize column names, non-null counts and dtypes to spot missing values
# and columns that were parsed with an unexpected type.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10692 entries, 0 to 10691
Data columns (total 13 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   city                 10692 non-null  object
 1   area                 10692 non-null  int64 
 2   rooms                10692 non-null  int64 
 3   bathroom             10692 non-null  int64 
 4   parking spaces       10692 non-null  int64 
 5   floor                10692 non-null  object
 6   animal               10692 non-null  object
 7   furniture            10692 non-null  object
 8   hoa (R$)             10692 non-null  int64 
 9   rent amount (R$)     10692 non-null  int64 
 10  property tax (R$)    10692 non-null  int64 
 11  fire insurance (R$)  10692 non-null  int64 
 12  total (R$)           10692 non-null  int64 
dtypes: int64(9), object(4)
memory usage: 1.1+ MB


In [73]:
# Preview the first three rows to see what the raw values look like.
df.head(3)

Unnamed: 0,city,area,rooms,bathroom,parking spaces,floor,animal,furniture,hoa (R$),rent amount (R$),property tax (R$),fire insurance (R$),total (R$)
0,São Paulo,70,2,1,1,7,acept,furnished,2065,3300,211,42,5618
1,São Paulo,320,4,4,0,20,acept,not furnished,1200,4960,1750,63,7973
2,Porto Alegre,80,1,1,1,6,acept,not furnished,1000,2800,0,41,3841


## Checking some columns' values

In [74]:
# Print the distinct values of the categorical / low-cardinality columns to
# spot placeholders and labels that need cleaning before encoding.
# Each column appears exactly once so no output line is repeated.
for col in ['city', 'rooms', 'bathroom', 'parking spaces', 'floor', 'animal', 'furniture']:
    print(col, df[col].unique())

city ['São Paulo' 'Porto Alegre' 'Rio de Janeiro' 'Campinas' 'Belo Horizonte']
rooms [ 2  4  1  3  7  5  8  6 10 13  9]
bathroom [ 1  4  3  2  6  5  7  9  8 10]
parking spaces [ 1  0  7  4  2  6  3  8  5 10 12]
floor ['7' '20' '6' '2' '1' '-' '4' '3' '10' '11' '24' '9' '8' '17' '18' '5'
 '13' '15' '16' '14' '26' '12' '21' '19' '22' '27' '23' '35' '25' '46'
 '28' '29' '301' '51' '32']
rooms [ 2  4  1  3  7  5  8  6 10 13  9]
animal ['acept' 'not acept']
furniture ['furnished' 'not furnished']


## Replacing some values

In [75]:
# 'floor' is stored as text and '-' is its only non-numeric value. Swap the
# placeholder for '0' and cast the column to int. The result is assigned back
# to the column rather than calling replace(..., inplace=True) on df['floor']:
# that chained in-place form is deprecated by pandas and does not update `df`
# when Copy-on-Write is enabled.
df['floor'] = df['floor'].replace('-', '0').astype(int)

# Relabel the two yes/no columns with the 'Sim'/'Não' labels that are later
# exported as the selectable choices for these fields.
# NOTE: .map() turns any value missing from the mapping into NaN, so this cell
# must run only once on freshly loaded data.
df['animal'] = df['animal'].map({'acept': 'Sim', 'not acept': 'Não'})

df['furniture'] = df['furniture'].map({'furnished': 'Sim', 'not furnished': 'Não'})

In [76]:
# Collect, for every categorical column, the distinct labels it contains
# (in order of first appearance) so they can be exported later.
choice_cols = ['city', 'animal', 'furniture']
choice_dict = {
    column: list(df[column].unique())
    for column in choice_cols
}

# Preprocessing

In [77]:
# One independent LabelEncoder per categorical column, so each can be fitted
# and saved separately.
city_encoder, pet_encoder, furniture_encoder = (LabelEncoder() for _ in range(3))

In [78]:
# Fit each encoder on its column and overwrite the column with the resulting
# integer codes.
# NOTE: re-running this cell would refit the encoders on the integer codes
# instead of the original labels.
for column, encoder in (
    ('city', city_encoder),
    ('animal', pet_encoder),
    ('furniture', furniture_encoder),
):
    df[column] = encoder.fit_transform(df[column])

# Splitting dataset into train and test datasets

In [79]:
# Features: every column except the target and 'total (R$)'.
# NOTE(review): 'total (R$)' presumably already includes the rent, which would
# leak the target — confirm against the dataset description.
X = df.drop(columns=['rent amount (R$)', 'total (R$)'])
# Target: the monthly rent amount.
y = df['rent amount (R$)']

In [80]:
# Hold out 30% of the rows for evaluation; the fixed seed keeps the split
# identical across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=0,
)

# Training the regression model

In [81]:
# NOTE: despite the variable name this is a regression tree (the target is a
# continuous rent amount); the name is kept because later cells reference it.
# random_state is fixed so repeated runs grow the same tree: scikit-learn
# permutes the features at every split, so ties between equally good splits
# could otherwise be resolved differently from one run to the next.
classifier = DecisionTreeRegressor(random_state=0)
classifier.fit(X_train, y_train)

# Predictions on the held-out rows, used for the error metrics.
y_pred = classifier.predict(X_test)

# Computing error metrics

In [82]:
# Score the hold-out predictions: MAE is in the target's units (R$), MAPE is
# returned by scikit-learn as a fraction rather than a percentage.
mae, mape = (
    metric(y_test, y_pred)
    for metric in (mean_absolute_error, mean_absolute_percentage_error)
)

print(f"MAE: {mae}")
print(f"MAPE: {mape}")

MAE: 86.5682668329177
MAPE: 0.0324769958320829


# Saving encoders and model

In [88]:
# Persist the fitted encoders so that the same label -> code mapping can be
# applied to new inputs at prediction time.
for encoder, target_path in (
    (city_encoder, './models/city_encoder.pkl'),
    (pet_encoder, './models/pet_encoder.pkl'),
    (furniture_encoder, './models/furniture_encoder.pkl'),
):
    joblib.dump(encoder, target_path)

# Persist the trained model; kept as the last expression so the cell displays
# the list of written files returned by joblib.dump.
joblib.dump(classifier, './models/br_house_rent_classifier.pkl')

['./models/br_house_rent_classifier.pkl']

In [84]:
# Display the labels collected for each categorical column.
choice_dict

{'city': ['São Paulo',
  'Porto Alegre',
  'Rio de Janeiro',
  'Campinas',
  'Belo Horizonte'],
 'animal': ['Sim', 'Não'],
 'furniture': ['Sim', 'Não']}

In [85]:
# Serialize the categorical choices to JSON text, then write it to disk.
serialized_choices = json.dumps(choice_dict)
with open('./assets/choices_dict.json', 'w') as out_file:
    out_file.write(serialized_choices)

# Testing the saved model and encoders

In [16]:
# Reload the persisted model and encoders from disk, as a consumer of the
# saved artifacts would.
# NOTE: joblib files are pickles — only load files from a trusted source.
classifier = joblib.load('./models/br_house_rent_classifier.pkl')
city_encoder = joblib.load('./models/city_encoder.pkl')
pet_encoder = joblib.load('./models/pet_encoder.pkl')
furniture_encoder = joblib.load('./models/furniture_encoder.pkl')

In [22]:
# A single hand-written sample, in the same column order as the training
# features: city, area, rooms, bathroom, parking spaces, floor, animal,
# furniture, hoa, property tax, fire insurance.
arr_test = ['São Paulo', 196, 6, 2, 1, 7, 'Não', 'Sim', 250, 520, 90]

# Replace the three categorical labels (positions 0, 6 and 7) with the integer
# codes produced by their fitted encoders.
for idx, encoder in ((0, city_encoder), (6, pet_encoder), (7, furniture_encoder)):
    arr_test[idx] = encoder.transform([arr_test[idx]])[0]


In [23]:
# Display the sample after encoding to check that the labels became integer codes.
arr_test

[4, 196, 6, 2, 1, 7, 0, 1, 250, 520, 90]

In [24]:
# Predict the rent for the single encoded sample; predict() expects a 2-D
# input (one row per sample), hence the wrapping list.
classifier.predict([arr_test])

array([7130.])