In [124]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file in it.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [125]:
# import
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error,r2_score
from sklearn.tree import DecisionTreeRegressor
import re
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor

In [126]:
# Load the Amsterdam housing CSV; the first CSV column is used as the row index.
data = pd.read_csv('/kaggle/input/amsterdam-house-price-prediction/HousingPrices-Amsterdam-August-2021.csv', index_col=0)

In [127]:
# First rows followed by the (rows, columns) shape, one per line.
print(data.head(), data.shape, sep='\n')

In [128]:
# Count the missing values in each column.
data.isna().sum()

In [129]:
# Drop every row that has at least one missing value.
# (Nothing is interpolated or filled here: the rows are removed.)
data = data.dropna()

In [130]:
# Re-count missing values per column after the rows with NaN were dropped.
data.isna().sum()

In [131]:
# Summary statistics of the cleaned frame (includes the quartiles used for the IQR rule).
data.describe()

In [132]:
# Box plot of Price before outliers are removed.
sns.boxplot(x='Price', data = data)

In [133]:
# Upper outlier fence for Price: Q3 + 1.5 * IQR.
price_summary = data['Price'].describe()
q1, q3 = price_summary['25%'], price_summary['75%']
iqr = q3 - q1
max_price = q3 + 1.5 * iqr

In [134]:
# Report the percentage of rows at or above the upper fence before removing them.
outliers = data[data['Price'] >= max_price]
outliers_count = outliers['Price'].count()
prices_count = data['Price'].count()
percent_removed = round(outliers_count/prices_count * 100, 2)
print('Percentage removed: ' + str(percent_removed) + '%')

# Keep only the rows strictly below the fence.
data = data[data['Price'] < max_price]

In [135]:
# Box plot of Price after the rows at or above the upper fence were removed.
sns.boxplot(x='Price', data = data)

In [136]:
# get street from address
def get_street(address):
    """Return the street-name part of an address string.

    The address is split on whitespace and the leading tokens are kept up to
    (not including) the first token that contains a digit, which is taken to
    be the house number. Tokens with punctuation but no digit are kept, so
    hyphenated or abbreviated names such as "Sint-Annenstraat" or "D." are
    preserved instead of cutting the name short.

    Parameters
    ----------
    address : str
        Address text, e.g. "Da Costakade 32 II".

    Returns
    -------
    str
        The street words joined by single spaces; "" when the address is
        empty or its first token already contains a digit.
    """
    street_words = []
    for token in address.split():
        # The first token carrying a digit marks the start of the house number.
        if any(char.isdigit() for char in token):
            break
        street_words.append(token)
    return ' '.join(street_words)

In [137]:
# Register progress_apply on pandas objects, then derive the location columns.
tqdm.pandas()
# 'Zip' is split on whitespace: first token -> 'Zip No', last token -> 'Zip Code'.
data['Zip No'] = data['Zip'].progress_apply(lambda zip_text: zip_text.split()[0])
data['Zip Code'] = data['Zip'].progress_apply(lambda zip_text: zip_text.split()[-1])
# Keep only the part of the address before the first comma ...
data['Address'] = data['Address'].progress_apply(lambda full_address: full_address.partition(',')[0])
# ... and extract the street name from that trimmed address.
data['Street'] = data['Address'].progress_apply(get_street)

In [138]:
# Preview the frame with the new 'Zip No', 'Zip Code' and 'Street' columns.
data.head()

In [139]:
# Drop the raw 'Address' and 'Zip' columns; 'Street', 'Zip No' and 'Zip Code'
# were derived from them earlier.
data = data.drop(columns=['Address', 'Zip'])

In [140]:
# Price per unit of area (presumably per m², going by the column name).
# NOTE: this column is computed from the target 'Price', so it is useful for
# exploration but would leak the target if used as a model feature.
data['Price_m2'] = data['Price'] / data['Area']

In [141]:
# Preview the frame with the added 'Price_m2' column.
data.head()

In [142]:
# Replace the string columns with integer labels.
# One encoder object is refitted per column, so after the loop it holds the
# classes of the last column ('Street') only.
lbe = LabelEncoder()
for column in ('Zip Code', 'Street'):
    data[column] = lbe.fit_transform(list(data[column].values))

In [143]:
# Preview the frame after 'Zip Code' and 'Street' were label-encoded.
data.head()

In [144]:
# Set the figure size (11.7 x 8.27 in) for the plots in this notebook and draw the Price distribution.
# NOTE(review): distplot is deprecated in recent seaborn releases; consider
# sns.histplot(..., kde=True) once the installed seaborn version is confirmed.
sns.set(rc={'figure.figsize':(11.7,8.27)})
sns.distplot(data['Price'], bins=30)
plt.show()

In [145]:
# Scatter each candidate feature against Price, one panel per feature.
feature_columns = ['Area', 'Room', 'Lon', 'Lat', 'Zip Code', 'Zip No', 'Street', 'Price_m2']
sns.pairplot(
    data,
    x_vars=feature_columns,
    y_vars='Price',
    kind='scatter',
    height=4,
    aspect=1,
)
plt.show()

In [146]:
# Pairwise relationship grid over the columns of the frame.
sns.pairplot(data)

In [147]:
# Correlation heatmap over the numeric columns only.
# 'Zip No' is produced by str.split and therefore holds strings; on pandas >= 2.0
# DataFrame.corr() raises on non-numeric columns instead of silently skipping
# them, so the frame is restricted to numeric dtypes first.
numeric_data = data.select_dtypes(include='number')
sns.heatmap(numeric_data.corr(), annot = True)
plt.show()

In [148]:
# get X and y
# 'Price_m2' is computed as Price / Area, i.e. directly from the target. Keeping
# it as a feature leaks the answer into the model (Price == Area * Price_m2), so
# it is dropped together with the other excluded columns.
# NOTE(review): this presumably leaves 'Area' as the only feature; consider
# reinstating 'Room', 'Lon' and 'Lat' if more signal is wanted.
X = data.drop(columns=['Price', 'Price_m2', 'Room', 'Lon', 'Lat', 'Zip Code', 'Zip No', 'Street'])
y = data.Price

In [149]:
# Display the feature matrix to confirm which columns remain.
X

In [150]:
# Hold out one third of the rows as the test set; random_state is fixed so the
# split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=1/3, random_state=1008)

In [151]:
# Standardize the features: the scaler is fitted on the training split only
# and then applied to both splits.
scaler = StandardScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

In [152]:
# Random Forest Regressor
# random_state is pinned (as for the split and the decision tree) so the
# reported scores are repeatable across runs.
random_forest = RandomForestRegressor(random_state=0)
random_forest.fit(X_train, y_train)
y_random_forest = random_forest.predict(X_test)
# np.sqrt(MSE) is the root mean squared error, so it is labelled as such.
print("Root mean squared error: ", np.sqrt(mean_squared_error(y_test, y_random_forest)))
print('Coefficient of determination: %.2f' % r2_score(y_test, y_random_forest))

In [153]:
# Linear Regression
linearreg = LinearRegression()
linearreg.fit(X_train, y_train)
y_predict = linearreg.predict(X_test)
# np.sqrt(MSE) is the root mean squared error, so it is labelled as such.
print("Root mean squared error: ", np.sqrt(mean_squared_error(y_test, y_predict)))
print('Coefficient of determination: %.2f' % r2_score(y_test, y_predict))

In [154]:
# Decision Tree Regressor
regressor = DecisionTreeRegressor(random_state = 0)
regressor.fit(X_train, y_train)
y_pred = regressor.predict(X_test)
# np.sqrt(MSE) is the root mean squared error, so it is labelled as such.
print("Root mean squared error: ", np.sqrt(mean_squared_error(y_test, y_pred)))
print('Coefficient of determination: %.2f' % r2_score(y_test, y_pred))

In [155]:
# Actual test prices against the random-forest predictions.
fig, ax = plt.subplots()
ax.scatter(y_test, y_random_forest)
ax.set_xlabel("Prices: $Y_i$")
ax.set_ylabel("Predicted prices")
ax.set_title("Prices vs Predicted prices")