In [8]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, cross_val_predict
from sklearn.metrics import confusion_matrix
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor


In [9]:
# Loading dataset
data = pd.read_csv("data/usa_real_estate.csv", sep=",", encoding="utf-8")

In [10]:
# dropping columns
data.drop(['street', 'full_address', 'sold_date', 'state', 'city', 'status'], axis=1, inplace=True)

In [16]:
# Dropping null collumns
data = data[data['bed'].notna()]
data = data[data['price'].notna()]
data = data[data['bath'].notna()]
data = data[data['acre_lot'].notna()]
data = data[data['zip_code'].notna()]
data = data[data['house_size'].notna()]

In [17]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 421244 entries, 0 to 923157
Data columns (total 6 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   price       421244 non-null  float64
 1   bed         421244 non-null  float64
 2   bath        421244 non-null  float64
 3   acre_lot    421244 non-null  float64
 4   zip_code    421244 non-null  float64
 5   house_size  421244 non-null  float64
dtypes: float64(6)
memory usage: 22.5 MB


In [18]:
data.head()

Unnamed: 0,price,bed,bath,acre_lot,zip_code,house_size
0,105000.0,3.0,2.0,0.12,601.0,920.0
1,80000.0,4.0,2.0,0.08,601.0,1527.0
2,67000.0,2.0,1.0,0.15,795.0,748.0
3,145000.0,4.0,2.0,0.1,731.0,1800.0
5,179000.0,4.0,3.0,0.46,612.0,2520.0


In [19]:
x = data.drop("price", axis=1)
y = data["price"]

In [20]:
x.fillna(0, inplace=True)

In [21]:
# recupera os valores e nomes de atributos a partir do dataframe
val_x = x.values
val_y = y.values

In [22]:
# preparando os arrays X (atributos de entrada) e y (rótulos)
X = val_x
y = val_y
print(X.shape)
print(y.shape)

(421244, 5)
(421244,)


In [37]:
# separando uma parte para base de validação 
X_train, X_val, y_train, y_val = train_test_split(X, y, train_size=0.95, random_state=13)

In [38]:
neigh = KNeighborsRegressor(n_neighbors=2)
neigh.fit(X_train, y_train)

# fazendo previsões com a base de validação
y_pred = neigh.predict(X_val)

# calculando a acurácia
acc = neigh.score(X_val, y_val)
print("Acurácia: {:.2f}%".format(acc*100))


Acurácia: 98.10%


In [39]:
# parâmetros da KNN
parameters_KNN = [
  {'n_neighbors': [3, 5, 7], 
    'weights': ['uniform', 'distance'],
     'p': [2]}
  ]

In [40]:
clf = KNeighborsRegressor()
#clf = DecisionTreeRegressor()

In [41]:
gs = GridSearchCV(clf, parameters_KNN, scoring = 'accuracy', cv=3, n_jobs=-1)
gs.fit(X_val, y_val)



GridSearchCV(cv=3, estimator=KNeighborsRegressor(), n_jobs=-1,
             param_grid=[{'n_neighbors': [3, 5, 7], 'p': [2],
                          'weights': ['uniform', 'distance']}],
             scoring='accuracy')

In [42]:
print(gs.best_params_)

{'n_neighbors': 3, 'p': 2, 'weights': 'uniform'}


In [44]:
# validação cruzada 10-folds
folds=20
clf=gs.best_estimator_
result = cross_val_score(clf, X_train, y_train, cv=folds, n_jobs=5)
print("\nCross Validation Results %d folds:" % folds)
print("Mean accuracy: %.5f" % result.mean())
print("Std: %.5f" % result.std())


Cross Validation Results 20 folds:
Mean accuracy: 0.92598
Std: 0.07763
