# Rent Calculator - Data Modelling

### Data Acquisition

In [1]:
import pandas as pd
from urllib.error import HTTPError

In [2]:
# get csv
columns = ['rent', 'rooms', 'bathrooms', 'parking', 'area', 'zone']

try:
    df = pd.read_csv('https://raw.githubusercontent.com/smalvar/CalculadoraAluguel-Novembro/main/banco_final.csv',
                    usecols=['aluguel', 'quartos', 'banheiro', 'vaga', 'area', 'zona'])
    # df2 = pd.read_csv('https://raw.githubusercontent.com/smalvar/CalculadoraAluguel-Novembro/main/banco_final.csv')
except HTTPError as e_http:
    print('HTTP Error: {e_http.code}')
    
#df.columns = columns
#df.reset_index(drop=True, inplace=True)
df.head()

Unnamed: 0,aluguel,quartos,banheiro,vaga,area,zona
0,3400,2.0,2.0,1.0,130,Oeste
1,3400,2.0,2.0,2.0,88,Oeste
2,5400,2.0,2.0,1.0,72,Oeste
3,2600,2.0,2.0,1.0,91,Oeste
4,3800,1.0,1.0,1.0,44,Oeste


In [3]:
def translate_zone(zone: str) -> str:
    '''Translates a zone to English'''
    if zone == 'Oeste':
        return 'West'
    elif zone == 'Leste':
        return 'East'
    elif zone == 'Sul':
        return 'South'
    elif zone == 'Norte':
        return 'North'
    elif zone == 'Centro':
        return 'CBD'
    else:
        raise ValueError('Invalid zone.')

In [4]:
# map the zones to English equivalent
#df.zone = df.zone.map(translate_zone)
#df.zone.value_counts()

### Data Preparation

In [5]:
# split data into train and test data
from sklearn.model_selection import train_test_split

In [6]:
train, test = train_test_split(df, test_size=0.25, random_state=1, stratify=df['zona'])

In [7]:
train.head()

Unnamed: 0,aluguel,quartos,banheiro,vaga,area,zona
504,15000,4.0,6.0,4.0,464,Sul
397,7000,4.0,5.0,3.0,169,Sul
74,8000,3.0,2.0,3.0,280,Oeste
737,850,1.0,1.0,0.0,98,Leste
477,6000,8.0,2.0,8.0,1000,Sul


In [8]:
import numpy as np
train['aluguel']=train['aluguel'].apply(np.log1p)
train['area']=train['area'].apply(np.log1p)
test['aluguel']=test['aluguel'].apply(np.log1p)
test['area']=test['area'].apply(np.log1p)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train['aluguel']=train['aluguel'].apply(np.log1p)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train['area']=train['area'].apply(np.log1p)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test['aluguel']=test['aluguel'].apply(np.log1p)
A value is trying to be set on a copy of a slice from a DataFra

In [9]:
# get dummies for the zone column
X_cols=['zona', 'area', 'quartos','banheiro','vaga']
y_col=['aluguel']

X_train = train[X_cols]
X_test = test[X_cols]
y_train = train[y_col]
y_test = test[y_col]

X_train = pd.get_dummies(X_train)
X_test = pd.get_dummies(X_test)

In [10]:
X_train.head()

Unnamed: 0,area,quartos,banheiro,vaga,zona_Centro,zona_Leste,zona_Norte,zona_Oeste,zona_Sul
504,6.142037,4.0,6.0,4.0,0,0,0,0,1
397,5.135798,4.0,5.0,3.0,0,0,0,0,1
74,5.638355,3.0,2.0,3.0,0,0,0,1,0
737,4.59512,1.0,1.0,0.0,0,1,0,0,0
477,6.908755,8.0,2.0,8.0,0,0,0,0,1


### KNN Model

In [11]:
from sklearn.neighbors import KNeighborsRegressor
neigh = KNeighborsRegressor()
# train the model
neigh.fit(X_train, y_train)

KNeighborsRegressor()

In [12]:
# get predictions for rent (y)
y_pred = neigh.predict(X_test)

In [13]:
for p, e in zip(y_pred[::10], y_test[::10]):
    print(f'Predicted: {p}, Expected: {e}')

Predicted: [7.76987168], Expected: aluguel


#### Metrics

In [14]:
import numpy as np

In [15]:
# Root Mean Square Error (RMSE)
np.sqrt(np.mean((y_pred-y_test.values)**2))

0.5838446059869178

In [16]:
# Mean Squared Errorfrom
np.mean((y_pred-y_test.values)**2)

0.34087452394001927

In [17]:
# Mean Absolute Error
np.mean(np.abs(y_pred-y_test.values))

0.44794482744682146

In [18]:
from sklearn.metrics import r2_score
r2_score(y_test, y_pred)

0.7209396684217904

In [19]:
from sklearn.linear_model import LinearRegression
reg = LinearRegression()
reg.fit(X_train, y_train)

reg.score(X_test, y_test)

0.7533225177776965

### Testing with multiple models

In [20]:
import warnings
warnings.filterwarnings("ignore") # ignore warning

# import sklearn Estimators
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
from lightgbm import LGBMRegressor
from sklearn.neural_network import MLPRegressor

regressor_list = [ KNeighborsRegressor(),
                  LinearRegression(),
                  DecisionTreeRegressor(),
                  RandomForestRegressor(),
                  XGBRegressor(),
                  MLPRegressor(),
                  LGBMRegressor(),
                  ]

for reg in regressor_list:
  print('Training', reg.__class__.__name__)
  reg.fit(X_train, y_train)
  train_score = reg.score(X_train, y_train)
  val_score = cross_val_score(reg, X_train, y_train, cv=5)
  test_score = reg.score(X_test, y_test)
  print('Train score', train_score)
  print('Validation score', val_score)
  print('Test score', test_score)
  print('='*80)

Training KNeighborsRegressor
Train score 0.7985911413749257
Validation score [0.66945804 0.75771847 0.70358054 0.7286198  0.60056736]
Test score 0.7209396684217904
Training LinearRegression
Train score 0.7413310011623597
Validation score [0.69283186 0.79287584 0.72278178 0.76214167 0.64519801]
Test score 0.7533225177776965
Training DecisionTreeRegressor
Train score 0.9758351645143398
Validation score [0.5576097  0.59875958 0.62456905 0.43395393 0.61589144]
Test score 0.6754666836537955
Training RandomForestRegressor
Train score 0.9457146297692551
Validation score [0.71465774 0.77252422 0.75317034 0.6971233  0.73926479]
Test score 0.7817272989455956
Training XGBRegressor
Train score 0.9630670799901603
Validation score [0.66998745 0.7272265  0.70926709 0.63177362 0.69306454]
Test score 0.7758716381640421
Training MLPRegressor
Train score 0.7698213357672445
Validation score [0.69496689 0.73799272 0.7392496  0.70783947 0.61395538]
Test score 0.7738769233653229
Training LGBMRegressor
Train 

### Testing with XGBoost

In [21]:
xgb = XGBRegressor()
xgb.fit(X_train, y_train)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.300000012, max_delta_step=0, max_depth=6,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=100, n_jobs=4, num_parallel_tree=1, random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
             tree_method='exact', validate_parameters=1, verbosity=None)

### Using GriSearchCV to find the best parameters for the XGBRegressor

In [22]:
from sklearn.model_selection import GridSearchCV
parameters=[{'learning_rate':[0.1,0.2,0.3,0.4],
             'max_depth':[3,4,5,6,7,8],
             'n_estimators':[50, 100, 200]}]
            
xgb = XGBRegressor(n_estimators=100, max_depth=1)
gs = GridSearchCV(xgb,parameters,scoring='r2',n_jobs=-1,cv=5)
gs.fit(X_train, y_train)

GridSearchCV(cv=5,
             estimator=XGBRegressor(base_score=None, booster=None,
                                    colsample_bylevel=None,
                                    colsample_bynode=None,
                                    colsample_bytree=None, gamma=None,
                                    gpu_id=None, importance_type='gain',
                                    interaction_constraints=None,
                                    learning_rate=None, max_delta_step=None,
                                    max_depth=1, min_child_weight=None,
                                    missing=nan, monotone_constraints=None,
                                    n_estimators=100, n_jobs=None,
                                    num_parallel_tree=None, random_state=None,
                                    reg_alpha=None, reg_lambda=None,
                                    scale_pos_weight=None, subsample=None,
                                    tree_method=None, validate_paramet

In [23]:
gs.best_params_

{'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 50}

In [24]:
gs.best_estimator_

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.1, max_delta_step=0, max_depth=3,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=50, n_jobs=4, num_parallel_tree=1, random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
             tree_method='exact', validate_parameters=1, verbosity=None)

In [25]:
reg = gs.best_estimator_

#### Run model using the best parameters

In [26]:
train_score = reg.score(X_train, y_train)
val_score = cross_val_score(reg, X_train, y_train, cv=3)
test_score = reg.score(X_test, y_test)
print('Train score', train_score)
print('Validation score', val_score)
print('Test score', test_score)

Train score 0.8045924988915034
Validation score [0.74617778 0.7633818  0.73210872]
Test score 0.7914393048009649


In [27]:
import pickle
# now you can save it to a file
with open('xgboost_regression.pkl', 'wb') as f:
    pickle.dump(reg, f)

### Testing all Estimators

In [28]:
from sklearn.utils import all_estimators

estimators = all_estimators(type_filter='regressor')

all_regs = []
for name, RegressorClass in estimators:
    try:
        print('Appending', name)
        reg = RegressorClass()
        all_regs.append(reg)
    except Exception as e:
        print(e)

Appending ARDRegression
Appending AdaBoostRegressor
Appending BaggingRegressor
Appending BayesianRidge
Appending CCA
Appending DecisionTreeRegressor
Appending DummyRegressor
Appending ElasticNet
Appending ElasticNetCV
Appending ExtraTreeRegressor
Appending ExtraTreesRegressor
Appending GammaRegressor
Appending GaussianProcessRegressor
Appending GradientBoostingRegressor
Appending HistGradientBoostingRegressor
Appending HuberRegressor
Appending IsotonicRegression
Appending KNeighborsRegressor
Appending KernelRidge
Appending Lars
Appending LarsCV
Appending Lasso
Appending LassoCV
Appending LassoLars
Appending LassoLarsCV
Appending LassoLarsIC
Appending LinearRegression
Appending LinearSVR
Appending MLPRegressor
Appending MultiOutputRegressor
__init__() missing 1 required positional argument: 'estimator'
Appending MultiTaskElasticNet
Appending MultiTaskElasticNetCV
Appending MultiTaskLasso
Appending MultiTaskLassoCV
Appending NuSVR
Appending OrthogonalMatchingPursuit
Appending OrthogonalM

In [29]:
for reg in all_regs:
  try:
    print('Treinando', reg.__class__.__name__)
    reg.fit(X_train, y_train)
    train_score = reg.score(X_train, y_train)
    val_score = cross_val_score(reg, X_train, y_train, cv=3)
    test_score = reg.score(X_test, y_test)
    print('Train score', train_score)
    print('Validation score', val_score)
    print('Test score', test_score)
    print('='*80)
  except Exception as e:
    print(e)

Treinando ARDRegression
Train score 0.7409320868670166
Validation score [0.73752212 0.74464116 0.69679906]
Test score 0.7513967418284355
Treinando AdaBoostRegressor
Train score 0.7756446457436511
Validation score [0.73579935 0.75645677 0.7198024 ]
Test score 0.7755017670279257
Treinando BaggingRegressor
Train score 0.9373329472822722
Validation score [0.71015006 0.72971452 0.6866026 ]
Test score 0.7597739233193238
Treinando BayesianRidge
Train score 0.7412595218911704
Validation score [0.73924346 0.73952322 0.69406951]
Test score 0.7525237557882294
Treinando CCA
Train score 2.4980018054066022e-14
Validation score [-5.84323116e-05 -4.15321710e-03 -5.54408125e-03]
Test score -0.0013093542476898534
Treinando DecisionTreeRegressor
Train score 0.9758351645143398
Validation score [0.53676094 0.53857147 0.54499831]
Test score 0.6631638489778463
Treinando DummyRegressor
Train score 0.0
Validation score [-5.84323116e-05 -4.15321710e-03 -5.54408125e-03]
Test score -0.001309354247712946
Treinando