# Scrap My Prop

### Laboratórios de Engenharia Informática

**"Development of an IT solution for the extraction and automatic analysis of data and relevant information for the calculation of land and properties."**

In [None]:
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from scipy.stats import norm, skew
from IPython.core.debugger import set_trace
import geopy.distance
import os.path
from sklearn import metrics

### Id

In [None]:
def preparationData(data_imo):
    """Reduce a raw listings frame to its ``Id`` and ``Preço`` columns.

    Rows whose ``Id`` is missing are discarded, the index is renumbered
    starting at 1, and ``Id`` is cast to ``int``.

    Parameters
    ----------
    data_imo : pandas.DataFrame
        Frame containing at least the columns ``Id`` and ``Preço``.

    Returns
    -------
    pandas.DataFrame
        A new two-column frame; the frame passed in is not modified.
    """
    wanted_columns = ['Id', 'Preço']
    trimmed = data_imo[wanted_columns].dropna(subset=['Id'])

    # Renumber 1..n so the index has no gaps left by the dropped rows
    trimmed.index = np.arange(1, len(trimmed) + 1)

    trimmed['Id'] = trimmed['Id'].astype(int)
    return trimmed

### Preço

In [None]:
def toNumeric(string):
    """Convert a raw price value to ``float``.

    The value is stringified, spaces are removed and a decimal comma is
    replaced by a dot before parsing.

    Parameters
    ----------
    string : object
        Raw cell value (text, number or NaN).

    Returns
    -------
    float
        The parsed number, or ``nan`` when the text is not numeric.
    """
    res = str(string).replace(" ", "").replace(",", ".")
    # errors='coerce' turns unparseable text into NaN. With errors='ignore'
    # the text came back unchanged and float() raised ValueError.
    # The result must be float because that type can represent np.nan.
    return float(pd.to_numeric(res, errors='coerce'))

In [None]:
def preparationPreco(data_imo):
    """Clean the ``Preço`` column and return the rows with a usable price.

    Each price is parsed with ``toNumeric``; rows whose price is NaN after
    parsing are removed, the index is renumbered starting at 1, and the
    remaining prices are cast to ``int``.

    Parameters
    ----------
    data_imo : pandas.DataFrame
        Frame containing a ``Preço`` column.

    Returns
    -------
    pandas.DataFrame
        A new frame; the frame passed in is left untouched.
    """
    # Work on a copy so the caller's frame is not modified as a side effect
    data_imo = data_imo.copy()
    data_imo['Preço'] = data_imo['Preço'].apply(toNumeric)

    # Filter with a boolean mask instead of dropping by index label: a label
    # based drop would also remove valid rows that share a label with a NaN row.
    data_imo = data_imo[data_imo['Preço'].notna()].copy()
    data_imo.index = np.arange(1, len(data_imo) + 1)

    data_imo['Preço'] = data_imo['Preço'].astype(int)

    return data_imo

### Imovel

In [None]:
class Imovel:
    """Per-listing container of three lists that all start out empty.

    The notebook appends day/month labels to ``datas`` and the matching
    prices to ``precos``; the forecasting cells assign their output to
    ``previsoes``.
    """

    def __init__(self):
        # Each instance gets its own three independent lists
        self.datas, self.precos, self.previsoes = [], [], []

    def getPreco(self):
        """Return the list stored in ``precos`` (the same object, not a copy)."""
        return self.precos

    def getData(self):
        """Return the list stored in ``datas`` (the same object, not a copy)."""
        return self.datas

### Get and Prepare Data

In [None]:
# Maps each listing Id to an Imovel holding the dates on which it was seen
# and the price recorded on each of those dates.
imoveis = {}

for mes in range(4,7):
    mes = str(mes).zfill(2)
    print("------ Mês:", mes)
    # The upper bound is 32 so that day 31 is probed as well; dates without a
    # CSV file (including non-existent ones such as 31/04) are skipped below.
    for dia in range(1,32):
        dia = str(dia).zfill(2)
        print("--- Dia:", dia)
        caminho = f'../dados/dados_imovirtual_{dia}_{mes}.csv'
        if not os.path.isfile(caminho):
            continue
        data_imo = pd.read_csv(caminho, engine='python', encoding='utf8')
        data_imo = preparationData(data_imo)
        data_imo = preparationPreco(data_imo)
        print(data_imo.shape)
        # Walk the two columns in lockstep instead of building a Series per row
        for imovel_id, preco in zip(data_imo['Id'], data_imo['Preço']):
            if imovel_id not in imoveis:
                imoveis[imovel_id] = Imovel()
            imoveis[imovel_id].datas.append(f'{dia}/{mes}')
            imoveis[imovel_id].precos.append(preco)

In [None]:
# Display the keys of the ``imoveis`` dictionary as the cell output
imoveis.keys()

In [None]:
# Inspect one listing: print its date labels, then show its prices as the
# cell output.
imovel_exemplo = imoveis.get(11160514)
print(imovel_exemplo.datas)
imovel_exemplo.precos

In [None]:
# Report every listing whose last recorded price differs from its first one
# by more than 10000, together with the distinct prices it had.
for imovel_id, imovel in imoveis.items():
    ultimo, primeiro = imovel.precos[-1], imovel.precos[0]
    variacao = ultimo - primeiro
    if abs(variacao) > 10000:
        print(imovel_id, variacao, set(imovel.precos))

In [None]:
# Plot the recorded prices of one listing against its date labels
plt.figure(figsize=(20, 8))
plt.xticks(rotation=90)
imovel_grafico = imoveis.get(11160514)
plt.plot(
    imovel_grafico.datas,
    imovel_grafico.precos,
    'go--',
    linewidth=2,
    markersize=12,
)
plt.show()

In [None]:
#for k,v in imoveis.items():
#    print("chave ", k, " com os valores ", v.getPreco())

In [None]:
# Print how many prices were recorded for each listing
for imovel in imoveis.values():
    print(len(imovel.precos))

# Auto Regression

In [None]:
from statsmodels.tsa.ar_model import AutoReg

# Fit a lag-1 autoregressive model per listing and store its predictions for
# positions len(precos) .. len(precos)+7 in ``previsoes``.
for imovel in imoveis.values():
    n_obs = len(imovel.precos)
    # for some reason the series must have more than 3 observations
    if n_obs <= 3:
        continue
    model = AutoReg(imovel.precos, lags=1)
    history = model.fit()
    predictions = history.predict(n_obs, n_obs + 7)
    imovel.previsoes = predictions

In [None]:
# Plot recorded prices followed by the stored predictions for one listing.
# Recorded points are labelled with their date, predicted points with 0, 1, 2...
imovel_prev = imoveis.get(11160514)
arr_preco = list(imovel_prev.precos) + list(imovel_prev.previsoes)
arr_data = list(imovel_prev.datas) + list(range(len(imovel_prev.previsoes)))

plt.figure(figsize=(20, 8))
plt.xticks(rotation=90)
plt.plot(arr_data, arr_preco, 'go--', linewidth=2, markersize=12)
plt.show()

print(imovel_prev.precos)
print(imovel_prev.previsoes)

# Simple Exponential Smoothing

In [None]:
from statsmodels.tsa.holtwinters import SimpleExpSmoothing

# Fit simple exponential smoothing per listing (series of at least two
# prices) and store its predictions for positions len(precos) .. len(precos)+7.
for imovel in imoveis.values():
    n_obs = len(imovel.precos)
    if n_obs <= 1:
        continue
    model = SimpleExpSmoothing(imovel.precos)
    model_fit = model.fit()
    # make prediction
    predictions = model_fit.predict(n_obs, n_obs + 7)
    imovel.previsoes = predictions

In [None]:
# Plot recorded prices followed by the stored predictions for one listing.
# Recorded points are labelled with their date, predicted points with 0, 1, 2...
imovel_prev = imoveis.get(11160514)
historico, previstos = imovel_prev.precos, imovel_prev.previsoes
arr_preco = list(historico) + list(previstos)
arr_data = list(imovel_prev.datas) + list(range(len(previstos)))

plt.figure(figsize=(20, 8))
plt.xticks(rotation=90)
plt.plot(arr_data, arr_preco, 'go--', linewidth=2, markersize=12)
plt.show()

print(historico)
print(previstos)

# Holt-Winters Exponential Smoothing / Triple Exponential Smoothing

In [None]:
from statsmodels.tsa.holtwinters import ExponentialSmoothing

# Fit ExponentialSmoothing per listing (series of at least two prices) and
# store its predictions for positions len(precos) .. len(precos)+7.
# NOTE(review): no trend/seasonal arguments are passed to the model here, so
# it is presumably not a triple (Holt-Winters) configuration - confirm intent.
for imovel in imoveis.values():
    n_obs = len(imovel.precos)
    if n_obs <= 1:
        continue
    model = ExponentialSmoothing(imovel.precos)
    model_fit = model.fit()
    # make prediction
    predictions = model_fit.predict(n_obs, n_obs + 7)
    imovel.previsoes = predictions

In [None]:
# Plot recorded prices followed by the stored predictions for one listing.
# Recorded points are labelled with their date, predicted points with 0, 1, 2...
imovel_prev = imoveis.get(11160514)
arr_preco = [*imovel_prev.precos]
arr_preco.extend(imovel_prev.previsoes)
arr_data = [*imovel_prev.datas]
arr_data.extend(range(0, len(imovel_prev.previsoes)))

plt.figure(figsize=(20, 8))
plt.xticks(rotation=90)
plt.plot(arr_data, arr_preco, 'go--', linewidth=2, markersize=12)
plt.show()

print(imovel_prev.precos)
print(imovel_prev.previsoes)

# Rede LSTM (não apropriada para este tipo de problema)

In [None]:
import tensorflow as tf
import pdb

# LSTM experiment on a single listing: train on sliding windows of its price
# series and store the model's predictions for those same windows.
timesteps=5 # number of past rows used to predict the next value(s)
multisteps=1 # number of rows to predict
features=1 # number of variables used to predict the next values
batch_size=8

# Each sample is `timesteps` consecutive prices; its label is the
# `multisteps` prices that follow.
X,y=list(),list()
dataset_size=len(imoveis.get(15386584).precos)
for curr_pos in range(dataset_size):
    input_index=curr_pos+timesteps
    label_index=input_index+multisteps
    # `<=`: a label slice ending exactly at dataset_size is still complete, so
    # the window whose label is the last observation must be kept.
    if label_index<=dataset_size:
        X.append(imoveis.get(15386584).precos[curr_pos:input_index])
        y.append(imoveis.get(15386584).precos[input_index:label_index])
X=np.reshape(np.array(X),(len(X),timesteps,features))
y=np.reshape(np.array(y),(len(y),multisteps))
drop=0.2
model = tf.keras.Sequential()
model.add(tf.keras.layers.LSTM(256, return_sequences=True, input_shape=(timesteps,features)))
model.add(tf.keras.layers.Dropout(drop))
model.add(tf.keras.layers.LSTM(256, return_sequences=True))
model.add(tf.keras.layers.Dropout(drop))
model.add(tf.keras.layers.LSTM(256, return_sequences=True))
#model.add(tf.keras.layers.Dropout(drop))
#model.add(tf.keras.layers.LSTM(256, return_sequences=True))
model.add(tf.keras.layers.Flatten())
model.add(tf.keras.layers.Dense(256, activation = 'relu'))
model.add(tf.keras.layers.Dense(multisteps, activation = 'relu'))
model.compile(
    loss= tf.keras.losses.mae,
    optimizer= tf.keras.optimizers.Adam(),
    metrics=['MeanSquaredError'])
# steps_per_epoch must be an integer: round the number of batches up so the
# final, partially filled batch is still used.
stepsper = int(np.ceil(X.shape[0]/batch_size))
history=model.fit(X, y, shuffle=False, epochs=200, verbose=1, steps_per_epoch = stepsper, batch_size = batch_size)
# The model is evaluated on the same windows it was trained on
predictions=model.predict(X)
imoveis.get(15386584).previsoes = predictions
print(imoveis.get(15386584).precos)
print(imoveis.get(15386584).previsoes)

In [None]:
import tensorflow as tf
import pdb

# Train one small LSTM per listing on sliding windows of its price series and
# store the model's predictions for those same windows in ``previsoes``.
timesteps=3 # number of past rows used to predict the next value(s)
multisteps=1 # number of rows to predict
features=1 # number of variables used to predict the next values
batch_size=8

for k,v in imoveis.items():
    if len(v.precos)>timesteps:
        X,y=list(),list()
        dataset_size=len(v.precos)
        for curr_pos in range(dataset_size):
            input_index=curr_pos+timesteps
            label_index=input_index+multisteps
            # `<=`: a label slice ending exactly at dataset_size is still
            # complete. With `<`, a series of timesteps+1 prices passed the
            # guard above but produced no training sample at all.
            if label_index<=dataset_size:
                X.append(v.precos[curr_pos:input_index])
                y.append(v.precos[input_index:label_index])
        X=np.reshape(np.array(X),(len(X),timesteps,features))
        y=np.reshape(np.array(y),(len(y),multisteps))
        drop=0.2
        model = tf.keras.Sequential()
        model.add(tf.keras.layers.LSTM(16, return_sequences=True, input_shape=(timesteps,features)))
        #model.add(tf.keras.layers.Dropout(drop))
        #model.add(tf.keras.layers.LSTM(16, return_sequences=True))
        model.add(tf.keras.layers.Flatten())
        model.add(tf.keras.layers.Dense(multisteps, activation = 'relu'))
        model.compile(
            loss= tf.keras.losses.mae,
            optimizer= tf.keras.optimizers.Adam(),
            metrics=['MeanSquaredError'])
        # steps_per_epoch must be an integer: round the number of batches up
        # so the final, partially filled batch is still used.
        stepsper = int(np.ceil(X.shape[0]/batch_size))
        history=model.fit(X, y, shuffle=False, epochs=50, verbose=0, steps_per_epoch = stepsper, batch_size = batch_size)
        # The model is evaluated on the same windows it was trained on
        predictions=model.predict(X)
        imoveis.get(k).previsoes = predictions

In [None]:
import tensorflow as tf
import pdb

# Train a single LSTM successively on every listing that has more than
# `batch_size` prices; after each listing's training pass, store the model's
# predictions for that listing's windows in ``previsoes``.
timesteps=3 # number of past rows used to predict the next value(s)
multisteps=1 # number of rows to predict
features=1 # number of variables used to predict the next values
batch_size=8
drop=0.3
model = tf.keras.Sequential()
model.add(tf.keras.layers.LSTM(512, return_sequences=True, input_shape=(timesteps,features)))
model.add(tf.keras.layers.Dropout(drop))
model.add(tf.keras.layers.LSTM(512, return_sequences=True))
model.add(tf.keras.layers.Flatten())
model.add(tf.keras.layers.Dense(512, activation = 'relu'))
model.add(tf.keras.layers.Dense(multisteps, activation = 'relu'))
model.compile(
    loss= tf.keras.losses.mae,
    optimizer= tf.keras.optimizers.Adam(),
    metrics=[])

for k,v in imoveis.items():
    if len(v.precos)>batch_size:
        X,y=list(),list()
        dataset_size=len(v.precos)
        for curr_pos in range(dataset_size):
            input_index=curr_pos+timesteps
            label_index=input_index+multisteps
            # `<=`: a label slice ending exactly at dataset_size is still
            # complete, so the window whose label is the last observation
            # must be kept.
            if label_index<=dataset_size:
                X.append(v.precos[curr_pos:input_index])
                y.append(v.precos[input_index:label_index])
        X=np.reshape(np.array(X),(len(X),timesteps,features))
        y=np.reshape(np.array(y),(len(y),multisteps))
        # steps_per_epoch must be an integer: round the number of batches up
        # so the final, partially filled batch is still used.
        stepsper = int(np.ceil(X.shape[0]/batch_size))
        history=model.fit(X, y, shuffle=False, epochs=10, verbose=0, steps_per_epoch = stepsper, batch_size = batch_size)
        # The model is evaluated on the same windows it was just trained on
        predictions=model.predict(X)
        imoveis.get(k).previsoes = predictions

In [None]:
# Print the recorded prices and the stored predictions of one listing
imovel_lstm = imoveis.get(15324144)
print(imovel_lstm.precos)
print(imovel_lstm.previsoes)