### Classify House Prices Based on Area


- Put houses in price groups and try to predict each house's group from its Latitude and Longitude
- That will show whether the area is a good indicator of the house unit price

In [1]:
#import libraries
import os

import pandas as pd
from sklearn.metrics import accuracy_score, r2_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

In [2]:
#Read the data
#The CSV location can be overridden through the HOUSE_PRICES_CSV environment
#variable; when it is not set, the original local path is used unchanged.
DATA_PATH = os.environ.get(
    "HOUSE_PRICES_CSV",
    "C:/Users/HP/Desktop/DATA SCIENCE/Python/Learn Python With Rune/Machine Learning With Python/MachineLearningWithPython-main/files/house_prices.csv",
)
data = pd.read_csv(DATA_PATH)
data.head()

Unnamed: 0,Transaction,House age,Distance to MRT station,Number of convenience stores,Latitude,Longitude,House unit price
0,2012.917,32.0,84.87882,10,24.98298,121.54024,37.9
1,2012.917,19.5,306.5947,9,24.98034,121.53951,42.2
2,2013.583,13.3,561.9845,5,24.98746,121.54391,47.3
3,2013.5,13.3,561.9845,5,24.98746,121.54391,54.8
4,2012.833,5.0,390.5684,5,24.97937,121.54245,43.1


In [4]:
#Prepare data

#Number of equal-width price groups to cut 'House unit price' into
N_PRICE_BINS = 15

#Bin every house by its unit price; the result is a categorical of intervals
price_bins = pd.cut(data["House unit price"], bins=N_PRICE_BINS)

#Keep the interval itself ('Class') and its integer category code ('Class id');
#the integer code is what the knn classifier is trained to predict
data["Class"] = price_bins
data["Class id"] = price_bins.cat.codes
data.head()

Unnamed: 0,Transaction,House age,Distance to MRT station,Number of convenience stores,Latitude,Longitude,House unit price,Class,Class id
0,2012.917,32.0,84.87882,10,24.98298,121.54024,37.9,"(36.907, 44.233]",4
1,2012.917,19.5,306.5947,9,24.98034,121.53951,42.2,"(36.907, 44.233]",4
2,2013.583,13.3,561.9845,5,24.98746,121.54391,47.3,"(44.233, 51.56]",5
3,2013.5,13.3,561.9845,5,24.98746,121.54391,54.8,"(51.56, 58.887]",6
4,2012.833,5.0,390.5684,5,24.97937,121.54245,43.1,"(36.907, 44.233]",4


In [5]:
#Prepare training and test data

#Target: the integer id of the price group
y = data["Class id"]

#Everything except the target, removed by name rather than by position so the
#split does not depend on 'Class id' being the last column.
#NOTE: x still contains 'House unit price' and 'Class' because later cells read
#them from x_test; only Latitude/Longitude are passed to the model.
x = data.drop(columns="Class id")

x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=42, test_size=0.15)

In [6]:
#Train a k-Nearest Neighbours classifier on the location of each house
area_features = ["Latitude", "Longitude"]

model = KNeighborsClassifier()
model.fit(x_train[area_features], y_train)

#Predict the price group of every house in the test set
y_pred = model.predict(x_test[area_features])

#Calculate the accuracy of the model (share of houses put in exactly the right group).
#Comparing the categories directly does not give convincing results:
#an accuracy of 0.44 is low.
accuracy_score(y_test, y_pred)

0.4444444444444444

In [12]:
#Make prediction of categories

#Middle value of every price interval, positioned by category code
class_mids = x_test["Class"].cat.categories.mid.to_numpy()

#Convert the predicted categories to a dataframe
df_pred = pd.DataFrame(y_pred, columns=["Pred cat"])

#Map each predicted category code to the middle value of its interval
#(vectorized lookup instead of a row-by-row apply)
df_pred["Pred"] = class_mids[df_pred["Pred cat"].to_numpy()]

#Calculate the R^2 of the predicted and real house price 'House unit price' of x_test
r2_score(x_test["House unit price"], df_pred["Pred"])

0.7039083923865217