In [34]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras
import re
import xgboost
from sklearn.preprocessing import StandardScaler, Normalizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.ensemble import RandomForestRegressor

Training a regressor to predict housing prices in a city of northeast Brazil with data scrapped from OLX listings

**Results**: much of the data is dirty because of site users inserting wrong/incomplete information. Only features that helped predict prices consistently were meter area and location. Would be nice if the data were more real and contained geolocalization info. 

In [35]:
tf.random.set_seed(33)
np.random.seed(33)

In [36]:
data = r'/content/drive/MyDrive/Colab Notebooks/datafiles/jp_data_07_22.csv'

In [37]:
df = pd.read_csv(data)
df = df.drop("Unnamed: 0", axis=1)
df

Unnamed: 0,area,bedrooms,location,price
0,277m²,4 quartos,"João Pessoa, Manaíra",R$ 1.300.000
1,7300m²,3 quartos,"João Pessoa, Gramame",R$ 230.000
2,277m²,4 quartos,"João Pessoa, Manaíra",R$ 1.300.000
3,13000m²,3 quartos,"João Pessoa, Cristo Redentor",R$ 235.000
4,,,,
...,...,...,...,...
5695,300m²,3 quartos,"João Pessoa, Pedro Gondim",R$ 400.000
5696,235m²,3 quartos,"João Pessoa, Portal do Sol",R$ 570.000
5697,180m²,2 quartos,"João Pessoa, Expedicionários",R$ 560.000
5698,58m²,2 quartos,"João Pessoa, Funcionários",R$ 155.000


In [38]:
original = pd.read_csv(data).drop("Unnamed: 0", axis=1)

In [39]:
def fmt_area(area):
  if isinstance(area, str) and "m²" in area:
    area = area.replace("m²", '')
    return float(area)
  else:
    return np.nan

def fix_area(area):
  if isinstance(area, float):
    if area >= 2000 and area <= 100000:
      area =  area / 100
    elif area > 100000 and area < 1000000:
      area =  area / 1000
    elif area > 1000000:
      area = np.nan
    
    if area < 50:
      area = np.nan
    else:
      return float(area)

def fmt_bedrooms(bedrooms):
  if not "quarto" in str(bedrooms):
    return np.nan
  match = re.search("\d", bedrooms)
  if match is not None:
    return int(match.group(0))
  else:
    return np.nan

def fmt_values(num):
  if not isinstance(num, str):
    return num
  else:
    price = re.sub(r"R[^0-9] ",'', num)
    price = re.sub(r"\.",'', price)
    price = float(price)
  
  if price < 50000 or price > 8000000:
    return np.nan

  return price



In [40]:
df.area = df.area.apply(fmt_area)
df.area = df.area.apply(fix_area)
df.bedrooms = df.bedrooms.apply(fmt_bedrooms)

In [41]:
df.price = df.price.apply(fmt_values)

In [42]:
original.iloc[72]

area                             1300m²
bedrooms              5 ou mais quartos
location    João Pessoa, Jardim Oceania
price                      R$ 4.500.000
Name: 72, dtype: object

In [43]:
df.dropna(axis=0, how='any', thresh=None, subset=None, inplace=True)

In [44]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4067 entries, 0 to 5699
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   area      4067 non-null   float64
 1   bedrooms  4067 non-null   float64
 2   location  4067 non-null   object 
 3   price     4067 non-null   float64
dtypes: float64(3), object(1)
memory usage: 158.9+ KB


In [45]:
locations = df.location.unique()
locations

array(['João Pessoa, Manaíra', 'João Pessoa, Gramame',
       'João Pessoa, Cristo Redentor', 'João Pessoa, Varjão',
       'João Pessoa, José Américo de Almeida',
       'João Pessoa, Distrito Industrial', 'João Pessoa, Ernani Sátiro',
       'João Pessoa, Paratibe', 'João Pessoa, Altiplano Cabo Branco',
       'João Pessoa, Valentina de Figueiredo',
       'João Pessoa, Funcionários', 'João Pessoa, Mangabeira',
       'João Pessoa, Aeroclube', 'João Pessoa, Indústrias',
       'João Pessoa, Brisamar', 'João Pessoa, Oitizeiro',
       'João Pessoa, Portal do Sol', 'João Pessoa, Jaguaribe',
       'João Pessoa, Bessa', 'João Pessoa, Ernesto Geisel',
       'João Pessoa, Estados', 'João Pessoa, Bancários',
       'João Pessoa, Jardim São Paulo', 'João Pessoa, Jardim Oceania',
       'João Pessoa, Expedicionários', 'João Pessoa, João Agripino',
       'João Pessoa, Ipês', 'João Pessoa, Mumbaba', 'João Pessoa, Cuiá',
       'João Pessoa, Muçumagro', 'João Pessoa, Tambiá',
       'Cabedelo

In [46]:
df["meter_price"] =  df.apply(lambda row: round(row.price / row.area, 2), axis=1)

In [47]:
df[df.location == "João Pessoa, Cabo Branco"].meter_price.mean()

5413.083333333333

Removing outliers based on price per square meter

In [48]:
stats = {}
for loc in locations:
  dff = df[df.location == loc]
  stats[loc] = {"mean": dff.meter_price.mean(), "std": dff.meter_price.std()}


In [49]:
def get_zscore(row):
  stat = stats[row.location]
  mean = stat["mean"]
  std = stat["std"]
  return (row.meter_price - mean )  / std

df["z_score"] = df.apply( get_zscore, axis =1)

  """


In [50]:
dff =  df[(df.z_score >= - 3 ) & (df.z_score <= 3 ) ]           # syntax #df[(df['col1'] >= 1) & (df['col1'] <=1 )]

In [51]:
dff

Unnamed: 0,area,bedrooms,location,price,meter_price,z_score
0,277.0,4.0,"João Pessoa, Manaíra",1300000.0,4693.14,0.299565
1,73.0,3.0,"João Pessoa, Gramame",230000.0,3150.68,0.477407
2,277.0,4.0,"João Pessoa, Manaíra",1300000.0,4693.14,0.299565
3,130.0,3.0,"João Pessoa, Cristo Redentor",235000.0,1807.69,-0.496481
6,250.0,3.0,"João Pessoa, Manaíra",1300000.0,5200.00,0.568619
...,...,...,...,...,...,...
5695,300.0,3.0,"João Pessoa, Pedro Gondim",400000.0,1333.33,-0.617782
5696,235.0,3.0,"João Pessoa, Portal do Sol",570000.0,2425.53,-1.587437
5697,180.0,2.0,"João Pessoa, Expedicionários",560000.0,3111.11,0.311313
5698,58.0,2.0,"João Pessoa, Funcionários",155000.0,2672.41,1.256131


In [52]:
def standardize_features( dataframe: pd.DataFrame, feature_names: list, operation="standardize"):
  """standardizes numerical features and returns a dataframe"""
  from sklearn.preprocessing import StandardScaler, Normalizer
  if operation == "normalize":
    operator = Normalizer()
  else:
    operator = StandardScaler()
  df = dataframe
  for column_name in feature_names:
      raw_data = np.array(df[column_name]).reshape(-1, 1)
      transformed = operator.fit_transform(raw_data)
      ready = pd.DataFrame(transformed, columns=[column_name])
      df = df.drop(column_name, axis=1).join(ready)
  return df

In [53]:
X = dff.drop(["price", "meter_price", "z_score","bedrooms"], axis=1,)

#One hot encoding  
X = pd.get_dummies(X, prefix='', prefix_sep='', columns=['location'],drop_first=True)
Y = dff.price

In [54]:
X.shape[1]

68

In [55]:
x_train_full, x_test, y_train_full, y_test =  train_test_split(X, Y, test_size=0.1)
x_valid, x_train =  x_train_full[:300], x_train_full[300:]
y_valid, y_train =  y_train_full[:300], y_train_full[300:]

In [56]:
x_train.shape

(3311, 68)

In [57]:
forests = RandomForestRegressor(n_estimators=100, random_state=32)
xgb = xgboost.XGBRegressor(n_estimators=200, learning_rate=0.2, random_state=42, max_depth=10)

selected_model = xgb

selected_model.fit(x_train, y_train)
pred = selected_model.predict(x_test)
squared_error = mean_squared_error(pred, y_test, squared=False)
absolute_error = mean_absolute_error(pred, y_test)
print(absolute_error, squared_error)

180905.6829329913 465106.8946409851


In [68]:
early_stopping_cb = keras.callbacks.EarlyStopping(patience=10, restore_best_weights=True, monitor="loss")



sample_to_normalize = x_train
norm_layer = keras.layers.Normalization(axis=1)
norm_layer.adapt(tf.constant(sample_to_normalize))

neural = keras.models.Sequential([
norm_layer,
keras.layers.Dense(64, activation="selu", kernel_initializer="lecun_normal", input_shape=X.shape[1:]),
keras.layers.Dense(64, kernel_initializer="lecun_normal", activation="selu"),
keras.layers.Dense(1, activation=None),
])

optimizer = keras.optimizers.SGD(learning_rate=0.0005, nesterov=True) #keras.optimizers.RMSprop(lr=0.001) 

neural.compile(loss=tf.keras.losses.MeanAbsoluteError(), optimizer= optimizer)

history = neural.fit(x_train, y_train, validation_data=(x_valid, y_valid), epochs=200, callbacks=[early_stopping_cb])

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
Epoch 75/200
Epoch 76/200
Epoch 77/200
Epoch 78

In [69]:
sample = x_test[10:20]
sample_answers = y_test[10:20]

In [70]:
neural.predict(sample)



array([[ 273382.03],
       [ 324423.6 ],
       [ 756497.5 ],
       [ 695332.5 ],
       [ 159591.39],
       [ 267543.6 ],
       [ 170341.47],
       [2089659.8 ],
       [ 297935.94],
       [ 953462.25]], dtype=float32)

In [71]:
list(sample_answers)

[450000.0,
 180000.0,
 530000.0,
 600000.0,
 150000.0,
 130000.0,
 165000.0,
 1100000.0,
 310000.0,
 799000.0]

In [72]:
pred = neural.predict(x_test)
real = y_test

squared_error = mean_squared_error(pred, real, squared=False)
absolute_error = mean_absolute_error(pred, real)
print(absolute_error, squared_error)

206356.1126982276 461599.8254902077


In [73]:
locations

array(['João Pessoa, Manaíra', 'João Pessoa, Gramame',
       'João Pessoa, Cristo Redentor', 'João Pessoa, Varjão',
       'João Pessoa, José Américo de Almeida',
       'João Pessoa, Distrito Industrial', 'João Pessoa, Ernani Sátiro',
       'João Pessoa, Paratibe', 'João Pessoa, Altiplano Cabo Branco',
       'João Pessoa, Valentina de Figueiredo',
       'João Pessoa, Funcionários', 'João Pessoa, Mangabeira',
       'João Pessoa, Aeroclube', 'João Pessoa, Indústrias',
       'João Pessoa, Brisamar', 'João Pessoa, Oitizeiro',
       'João Pessoa, Portal do Sol', 'João Pessoa, Jaguaribe',
       'João Pessoa, Bessa', 'João Pessoa, Ernesto Geisel',
       'João Pessoa, Estados', 'João Pessoa, Bancários',
       'João Pessoa, Jardim São Paulo', 'João Pessoa, Jardim Oceania',
       'João Pessoa, Expedicionários', 'João Pessoa, João Agripino',
       'João Pessoa, Ipês', 'João Pessoa, Mumbaba', 'João Pessoa, Cuiá',
       'João Pessoa, Muçumagro', 'João Pessoa, Tambiá',
       'Cabedelo

In [74]:
def predict_new(model, X, area, location, bedrooms=None):
  locations = list(X.columns)
  size=X.shape[1]
  entry = np.zeros(size)
  try:
    locations = list(locations)
    location_idx = locations.index(location)
    entry[location_idx] = 1
  except ValueError:
    pass
  entry[0] = area
  #entry[1] = bedrooms
  entry = entry.reshape(1,size)
  return model.predict(entry)

In [75]:
predict_new(neural, X, 182. ,"João Pessoa, José Américo de Almeida")



array([[376825.2]], dtype=float32)