In [174]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found below the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [175]:
# import
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error,r2_score
from sklearn.tree import DecisionTreeRegressor
import re
import math
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor

In [232]:
# Read the listings CSV; the file's first column is used as the row index.
DATA_PATH = '/kaggle/input/amsterdam-house-price-prediction/HousingPrices-Amsterdam-August-2021.csv'
data = pd.read_csv(DATA_PATH, index_col=0)

In [233]:
# Quick look at the raw frame: first rows, (rows, columns) and column labels.
for summary in (data.head(), data.shape, data.columns):
    print(summary)

In [178]:
# Count the missing values in each column.
data.isna().sum()

In [179]:
# Drop every row that contains a missing value (rows are removed, nothing is
# interpolated or filled in).
data = data.dropna()

In [180]:
# Re-count missing values per column to confirm none remain.
data.isna().sum()

In [181]:
# Bar chart of how many listings have each room count.
data['Room'].value_counts().plot(kind='bar')

In [182]:
# Keep only the listings with fewer than 10 rooms.
data = data.loc[data['Room'] < 10]

In [183]:
# Summary statistics of the frame after the room filter.
data.describe()

In [184]:
# Price spread before the outlier filter is applied.
sns.boxplot(data=data, x='Price')

In [185]:
# Upper outlier fence for Price: third quartile plus 1.5 times the
# inter-quartile range (only the high side is fenced).
q1, q3 = data['Price'].quantile(0.25), data['Price'].quantile(0.75)
iqr = q3 - q1
max_price = q3 + 1.5 * iqr

In [186]:
# Percentage of listings priced at or above the upper fence.
above_fence = data['Price'] >= max_price
outliers = data[above_fence]
outliers_count = outliers['Price'].count()
prices_count = data['Price'].count()
removed_pct = round(outliers_count / prices_count * 100, 2)
print('Percentage removed: ' + str(removed_pct) + '%')

# Keep only the listings priced strictly below the fence.
below_fence = data['Price'] < max_price
data = data[below_fence]

In [187]:
# Price spread once the high-price outliers have been filtered out.
sns.boxplot(data=data, x='Price')

In [188]:
# get street from address
def get_street(address):
    """Return the leading run of purely alphabetic words in ``address``.

    The address is split on whitespace and words are collected until the
    first one that is not made up of letters only (a house number, or any
    word containing digits or punctuation such as ``"P.C."``).

    Parameters
    ----------
    address : str
        Address text, e.g. ``"Kromme Leimuidenstraat 13 H"``.

    Returns
    -------
    str
        The collected words joined by single spaces; ``""`` when the first
        word is not purely alphabetic or the address is empty.
    """
    street_words = []
    for token in address.split():
        if not token.isalpha():
            # The street name ends at the first non-alphabetic token.
            break
        street_words.append(token)
    return ' '.join(street_words)

In [189]:
# Enable Series.progress_apply, then derive the postcode parts and street name.
tqdm.pandas()
# First and last whitespace-separated token of the Zip text.
data['Zip No'] = data['Zip'].progress_apply(lambda zip_text: zip_text.split()[0])
data['Zip Code'] = data['Zip'].progress_apply(lambda zip_text: zip_text.split()[-1])
# Keep only the part of the address before the first comma.
data['Address'] = data['Address'].progress_apply(lambda full_address: full_address.split(',')[0])
data['Street'] = data['Address'].progress_apply(get_street)

In [190]:
# Preview the frame with the derived 'Zip No', 'Zip Code' and 'Street' columns.
data.head()

In [191]:
# The raw text columns are no longer needed once their parts are extracted.
data = data.drop(['Address', 'Zip'], axis='columns')

In [192]:
# Preview the frame after dropping 'Address' and 'Zip'.
data.head()

In [193]:
# Replace the string columns with integer labels. One encoder object is refit
# per column, so after the loop it only holds the classes of 'Street'.
lbe = LabelEncoder()
for column in ('Zip Code', 'Street'):
    data[column] = lbe.fit_transform(list(data[column].values))

In [194]:
# Preview the frame with 'Zip Code' and 'Street' label-encoded.
data.head()

In [195]:
# Distribution of the target variable.
sns.set(rc={'figure.figsize': (11.7, 8.27)})
# sns.distplot is deprecated (seaborn 0.11+) and scheduled for removal;
# histplot with a KDE overlay on a density scale is its documented replacement.
sns.histplot(data['Price'], bins=30, kde=True, stat='density')
plt.show()

In [196]:
# Scatter every candidate feature against Price, one panel per feature.
candidate_features = ['Area', 'Room', 'Lon', 'Lat', 'Zip Code', 'Zip No', 'Street']
sns.pairplot(
    data,
    x_vars=candidate_features,
    y_vars='Price',
    height=4,
    aspect=1,
    kind='scatter',
)
plt.show()

In [197]:
# Grid of pairwise relationships between the columns of `data`.
sns.pairplot(data)

In [198]:
# Annotated heat map of the pairwise column correlations.
correlations = data.corr()
sns.heatmap(correlations, annot=True)
plt.show()

In [220]:
# Features are whatever columns remain after removing the target and the
# excluded columns below; the target is Price.
excluded_columns = ['Price', 'Room', 'Lon', 'Lat', 'Zip Code', 'Zip No', 'Street']
X = data.drop(columns=excluded_columns)
y = data['Price']

In [221]:
# Display the feature frame.
X

In [234]:
# Hold out one third of the rows for testing; the fixed random_state makes the
# split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=1/3, random_state=1008
)
for split in (X_train, X_test):
    print(split.shape)

In [223]:
# Standardise the features with statistics learned from the training split
# only, then apply the same transform to both splits.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [224]:
# Custom LMS solver: per-sample (stochastic) updates.
def stochastic_gradient_descent(X, y, eta, loop_number, random_state=None):
    """Fit linear-regression weights with the per-sample LMS update rule.

    The samples are visited in row order and, for each one, the weights are
    moved by ``eta * (y_i - h_i) * [1, x_i]`` where ``h_i`` is the prediction
    made with the current weights. The whole data set is swept
    ``loop_number`` times. Rows are not shuffled between sweeps.

    Parameters
    ----------
    X : array-like of shape (m, n)
        Feature matrix, one row per sample.
    y : array-like of length m
        Target values.
    eta : float
        Learning rate.
    loop_number : int
        Number of full passes over the data.
    random_state : int or None, default None
        Seed for the initial weights. ``None`` draws them from NumPy's global
        random state with ``np.random.rand``, as before.

    Returns
    -------
    list of float
        ``n + 1`` weights, intercept first.

    Raises
    ------
    ValueError
        If ``X`` is not 2-D or ``y`` does not have one value per row of ``X``.
    """
    X = np.asarray(X, dtype=float)
    y = np.asarray(y, dtype=float).ravel()
    if X.ndim != 2:
        raise ValueError("X must be 2-D with shape (n_samples, n_features)")
    m, n = X.shape
    if y.shape[0] != m:
        raise ValueError("y must contain exactly one target per row of X")

    # Initial weights are uniform in [0, 1).
    if random_state is None:
        theta = np.random.rand(n + 1)
    else:
        theta = np.random.default_rng(random_state).random(n + 1)

    # Prepend the constant 1 for the intercept once, instead of rebuilding
    # every row on every visit.
    design = np.hstack([np.ones((m, 1)), X])

    for _ in range(loop_number):
        for row, target in zip(design, y):
            # The prediction uses the weights from before this sample's update.
            theta += eta * (target - row @ theta) * row

    return theta.tolist()

In [225]:
def batch_gradient_descent(X, y, eta, loop_number, random_state=None):
    """Fit linear-regression weights with the batch LMS update rule.

    On every iteration the residuals of all samples are computed with the
    current weights and the weights are moved by
    ``eta * sum_i (y_i - h_i) * [1, x_i]``. The gradient is summed, not
    averaged, so the effective step grows with the number of rows.

    Parameters
    ----------
    X : array-like of shape (m, n)
        Feature matrix, one row per sample.
    y : array-like of length m
        Target values.
    eta : float
        Learning rate.
    loop_number : int
        Number of weight updates (full passes over the data).
    random_state : int or None, default None
        Seed for the initial weights. ``None`` draws them from NumPy's global
        random state with ``np.random.rand``, as before.

    Returns
    -------
    list of float
        ``n + 1`` weights, intercept first.

    Raises
    ------
    ValueError
        If ``X`` is not 2-D or ``y`` does not have one value per row of ``X``.
    """
    X = np.asarray(X, dtype=float)
    y = np.asarray(y, dtype=float).ravel()
    if X.ndim != 2:
        raise ValueError("X must be 2-D with shape (n_samples, n_features)")
    m, n = X.shape
    if y.shape[0] != m:
        raise ValueError("y must contain exactly one target per row of X")

    # Initial weights are uniform in [0, 1).
    if random_state is None:
        theta = np.random.rand(n + 1)
    else:
        theta = np.random.default_rng(random_state).random(n + 1)

    # Design matrix with a leading column of ones for the intercept.
    design = np.hstack([np.ones((m, 1)), X])

    for _ in range(loop_number):
        residual = y - design @ theta
        # design.T @ residual accumulates (y_i - h_i) * row_i over all samples.
        theta += eta * (design.T @ residual)

    return theta.tolist()


In [226]:
# predict function
def predict_lm(X, theta):
    """Predict targets of a linear model with weights ``theta``.

    Each prediction is ``theta[0] + sum_j theta[j + 1] * X[i, j]``.

    Parameters
    ----------
    X : array-like of shape (m, n)
        Feature matrix, one row per sample.
    theta : sequence of n + 1 numbers
        Model weights, intercept first.

    Returns
    -------
    list of float
        One prediction per row of ``X``.

    Raises
    ------
    ValueError
        If ``X`` is not 2-D or ``theta`` does not have ``n + 1`` entries.
    """
    X = np.asarray(X, dtype=float)
    theta = np.asarray(theta, dtype=float)
    if X.ndim != 2:
        raise ValueError("X must be 2-D with shape (n_samples, n_features)")
    if theta.shape != (X.shape[1] + 1,):
        raise ValueError("theta must hold one intercept plus one weight per column of X")

    # Intercept plus one matrix-vector product replaces the per-element loops.
    return (theta[0] + X @ theta[1:]).tolist()

In [227]:
# Decision tree regression (scikit-learn); DecisionTreeRegressor is imported
# with the other dependencies at the top of the notebook.
model = DecisionTreeRegressor()
model.fit(X_train, y_train)

tree_pred = model.predict(X_test)
# np.sqrt of the MSE is the root mean squared error, so it is labelled as such.
print("Root mean squared error: ", np.sqrt(mean_squared_error(y_test, tree_pred)))
print('Coefficient of determination: %.2f' % r2_score(y_test, tree_pred))

In [228]:
# Linear regression (scikit-learn); LinearRegression is imported with the
# other dependencies at the top of the notebook.
model = LinearRegression()
model.fit(X_train, y_train)

lm_pred = model.predict(X_test)
# np.sqrt of the MSE is the root mean squared error, so it is labelled as such.
print("Root mean squared error: ", np.sqrt(mean_squared_error(y_test, lm_pred)))
print('Coefficient of determination: %.2f' % r2_score(y_test, lm_pred))

In [229]:
# LMS - Stochastic gradient descent (SGD)
# Ten runs: run i uses learning rate 10^-(i+1) together with 10*(i+1) passes
# over the training data, so both settings change from run to run.
for i in range(10):
    print(f'Loop {i + 1}')
    eta = 1 / math.pow(10, i + 1)
    iteration = 10 * (i + 1)
    theta = stochastic_gradient_descent(X_train, y_train, eta, iteration)
    sgd_pred = predict_lm(X_test, theta)
    # np.sqrt of the MSE is the root mean squared error, so it is labelled as such.
    print("Root mean squared error: ", np.sqrt(mean_squared_error(y_test, sgd_pred)))
    print('Coefficient of determination: %.2f' % r2_score(y_test, sgd_pred))

In [230]:
# LMS - Batch Gradient descent (GD)
# Ten runs: run i uses learning rate 10^-(i+1) together with 10*(i+1) weight
# updates, so both settings change from run to run.
for i in range(10):
    print(f'Loop {i + 1}')
    eta = 1 / math.pow(10, i + 1)
    iteration = 10 * (i + 1)
    theta = batch_gradient_descent(X_train, y_train, eta, iteration)
    gd_pred = predict_lm(X_test, theta)
    # np.sqrt of the MSE is the root mean squared error, so it is labelled as such.
    print("Root mean squared error: ", np.sqrt(mean_squared_error(y_test, gd_pred)))
    print('Coefficient of determination: %.2f' % r2_score(y_test, gd_pred))

In [218]:
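# Disabled plot of actual vs. predicted prices. NOTE: `y_pred` is not defined
# at notebook level; substitute one of tree_pred / lm_pred / sgd_pred / gd_pred
# before re-enabling.
# plt.scatter(y_test, y_pred)
# plt.xlabel("Prices: $Y_i$")
# plt.ylabel("Predicted prices")
# plt.title("Prices vs Predicted prices")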