In [119]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, cross_val_predict
from sklearn.metrics import confusion_matrix
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import RobustScaler, StandardScaler, MinMaxScaler
from sklearn import metrics


In [107]:
# carregando a base de dados
data = pd.read_csv("imoveis.csv", sep=",", encoding="utf-8")

In [108]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8448 entries, 0 to 8447
Data columns (total 7 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   local      8448 non-null   object 
 1   preco      8448 non-null   int64  
 2   metragem   8448 non-null   int64  
 3   quartos    8435 non-null   float64
 4   garagens   8116 non-null   float64
 5   banheiros  8448 non-null   int64  
 6   bairro     8448 non-null   int64  
dtypes: float64(2), int64(4), object(1)
memory usage: 429.1+ KB


In [109]:
data = data[data['metragem'].notna()]
data = data[data['quartos'].notna()]
data = data[data['garagens'].notna()]
data = data[data['banheiros'].notna()]
data = data[data['bairro'].notna()]
data = data[data['preco'] != 'Sobconsulta']
data = data.astype({'preco':'float'})

data.head()

#data.to_csv('imoveis_media.csv', index=False)

Unnamed: 0,local,preco,metragem,quartos,garagens,banheiros,bairro
0,"Rua Maria Thereza Gonçalves, Umbara",399900.0,106,3.0,2.0,2,76
1,"Travessa Paulo Ribeiro, Sitio Cercado",765190.0,319,5.0,3.0,4,70
2,"Uberaba, Curitiba",2350000.0,500,5.0,7.0,5,75
4,"Rua João Fonseca Mercer, Atuba",1140000.0,276,5.0,3.0,5,7
5,"Uberaba, Curitiba",800000.0,133,3.0,3.0,3,75


In [110]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8106 entries, 0 to 8447
Data columns (total 7 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   local      8106 non-null   object 
 1   preco      8106 non-null   float64
 2   metragem   8106 non-null   int64  
 3   quartos    8106 non-null   float64
 4   garagens   8106 non-null   float64
 5   banheiros  8106 non-null   int64  
 6   bairro     8106 non-null   int64  
dtypes: float64(3), int64(3), object(1)
memory usage: 475.0+ KB


In [111]:
x = data.drop("preco", axis=1).drop("local", axis=1)
y = data["preco"]

In [112]:
x.fillna(0, inplace=True)

In [113]:
# recupera os valores e nomes de atributos a partir do dataframe
val_x = x.values
val_y = y.values

In [114]:
# preparando os arrays X (atributos de entrada) e y (rótulos)
X = val_x
y = val_y
print(X.shape)
print(y.shape)

(8106, 5)
(8106,)


In [115]:
#Rescaling features age, trestbps, chol, thalach, oldpeak.
scaler = StandardScaler()
features = [['preco', 'metragem', 'quartos', 'garagens', 'banheiros', 'bairro']]
for feature in features:
    data[feature] = scaler.fit_transform(data[feature])

In [116]:
# separando uma parte para base de validação (20%)
X_train, X_val, y_train, y_val = train_test_split(X, y, train_size=0.8, random_state=13)

In [117]:
neigh = KNeighborsRegressor(n_neighbors=3)
neigh.fit(X_train, y_train)

# fazendo previsões com a base de validação
y_pred = neigh.predict(X_val)

# calculando a acurácia
acc = neigh.score(X_val, y_val)
print("Acurácia: {:.2f}%".format(acc*100))


Acurácia: 61.06%


In [121]:
print(metrics.mean_absolute_error(y_val, y_pred))

0.2577106399776771


In [None]:
# parâmetros da KNN
parameters_KNN = [
  {'n_neighbors': [3, 5, 7], 
    'weights': ['uniform', 'distance'],
     'p': [1,2,3,4,5,6]}
  ]

In [None]:
clf = KNeighborsRegressor()
gs = GridSearchCV(clf, parameters_KNN, scoring = 'accuracy', cv=3, n_jobs=-1)
gs.fit(X_val, y_val)
print(gs.best_params_)

{'n_neighbors': 3, 'p': 1, 'weights': 'uniform'}


 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan]


In [None]:
# validação cruzada 10-folds
folds=10
clf=gs.best_estimator_
result = cross_val_score(clf, X_train, y_train, cv=folds, n_jobs=5)
print("\nCross Validation Results %d folds:" % folds)
print("Mean accuracy: %.2f" % (result.mean()*100))
print("Std: %.2f" % (result.std()*100))


Cross Validation Results 10 folds:
Mean accuracy: 55.56
Std: 24.42
