In [1]:
import pandas as pd
import numpy as np
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn import preprocessing

In [2]:
# load the data set
df = pd.read_csv('../data/residence.csv',sep=";", index_col=False)  
df

Unnamed: 0,house_type,house_zip_code,house_rooms,house_square_meters,house_year,house_price
0,Villa,4892.0,4.0,163.0,1995.0,1095000.0
1,Fritidshus,7673.0,4.0,65.0,1997.0,1145000.0
2,Villa,6920.0,4.0,150.0,1973.0,1145000.0
3,Fritidshus,9982.0,4.0,79.0,1994.0,1148000.0
4,Fritidshus,6100.0,3.0,56.0,1962.0,795000.0
...,...,...,...,...,...,...
62888,Villa,9830.0,2.0,149.0,1950.0,450000.0
62889,Villa,6950.0,6.0,260.0,2000.0,4195000.0
62890,Villa,6270.0,6.0,201.0,1992.0,745000.0
62891,Villa,6971.0,4.0,88.0,1927.0,400000.0


In [3]:
# transform the data and specify prediction parameters

# Encode the house type from string to integer
label_enc =preprocessing.LabelEncoder()
df['house_type'] = label_enc.fit_transform(df['house_type'].astype(str))

# Set zip for more precise predictions 
df = df.loc[df['house_zip_code'] == 3480]

# Set type for more precise predictions
#df = df.loc[df['house_type'] == 1]

#Data
df_x = df[['house_zip_code', 'house_rooms', 'house_square_meters',
       'house_year']]
#Target
df_y = df[['house_price']]

#df_y
df_x
#df

Unnamed: 0,house_zip_code,house_rooms,house_square_meters,house_year
651,3480.0,5.0,126.0,1988.0
1142,3480.0,6.0,174.0,1972.0
2419,3480.0,5.0,118.0,1980.0
2866,3480.0,6.0,145.0,1979.0
3222,3480.0,4.0,138.0,2008.0
...,...,...,...,...
49058,3480.0,5.0,117.0,1876.0
50372,3480.0,5.0,140.0,1954.0
52114,3480.0,5.0,140.0,1954.0
54311,3480.0,4.0,171.0,1940.0


In [4]:
# initialize the linear model
reg = linear_model.LinearRegression()

In [5]:
x_train, x_test, y_train, y_test = train_test_split(df_x, df_y, test_size=0.33, random_state=42 )

In [6]:
#Normalize

# fit scaler on training data
norm = MinMaxScaler().fit(x_train)


# transform training data
x_train_norm = norm.transform(x_train)


# transform testing dataa
x_test_norm = norm.transform(x_test)

In [7]:
# Standardize
from sklearn.preprocessing import StandardScaler

scale= StandardScaler()
 
# standardization of dependent variables
x_train_std = scale.fit_transform(x_train)
x_test_std = scale.fit_transform(x_test) 
print(x_train_std)
#print(x_test)

[[ 0.         -0.39052077  0.11787428 -0.7473402 ]
 [ 0.         -0.39052077 -0.52523765  0.65893864]
 [ 0.         -2.31058122 -1.52729576 -0.58713122]
 [ 0.         -0.39052077 -0.52523765 -0.21331026]
 [ 0.         -0.39052077 -0.19620364  1.13956558]
 [ 0.          0.24949938 -0.4355011   0.67673963]
 [ 0.          0.24949938 -0.4355011   0.67673963]
 [ 0.          0.24949938  0.43195219  0.65893864]
 [ 0.          0.88951953  0.26743519  0.83694862]
 [ 0.          0.24949938  0.01318164  0.9971576 ]
 [ 0.          2.16955984  1.19471285 -0.7473402 ]
 [ 0.          0.88951953  1.34427376  0.35632167]
 [ 0.          0.24949938 -0.37567673  0.55213265]
 [ 0.         -1.67056107 -1.21321784  0.37412267]
 [ 0.         -0.39052077  0.0879621  -3.11487293]
 [ 0.          0.88951953 -0.03168663  0.65893864]
 [ 0.         -1.67056107 -1.21321784  0.37412267]
 [ 0.         -0.39052077 -0.52523765  0.65893864]
 [ 0.         -1.03054092 -0.82435947  0.74794363]
 [ 0.         -0.39052077 -0.54

In [8]:
print(y_train)

       house_price
39582    3995000.0
21584    3695000.0
23360    1645000.0
8712     2275000.0
30718    4595000.0
17692    2995000.0
4597     3095000.0
28762    5395000.0
33006    4995000.0
9544     4500000.0
38454    6455000.0
11715    4175000.0
34013    3650000.0
7877     1595000.0
11393    5500000.0
2866     3695000.0
9000     1595000.0
21283    3695000.0
4829     2495000.0
35952    2395000.0
4191    14500000.0
35119    3645000.0
18064    2995000.0
45398    4895000.0
54668    5395000.0
28137    5395000.0
22825    2375000.0
26082    1945000.0
23683    1645000.0
54311    5395000.0
8052     2275000.0
52114    2750000.0
14928    8795000.0
37130    4995000.0
22244    2375000.0
31206    3395000.0
24740    4495000.0
41029    1995000.0
31189    2895000.0
16962    5495000.0
49058    3095000.0
31465    3395000.0
33222    4995000.0
39943    3995000.0
21213    4395000.0
15228    8795000.0
1142     4095000.0
27340   13800000.0
9707     5500000.0
2419     3095000.0
11156   10495000.0
46852    183

In [9]:
#reg.fit(x_train_norm, y_train)
reg.fit(x_train_std, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [10]:
data = [[0., -0.54772256, -0.6757862, 0.26405096]]
testing_df = pd.DataFrame (data, columns = ['house_zip_code','house_rooms', 'house_square_meters', 'house_year'])
testing_df
# result should be 2395000

Unnamed: 0,house_zip_code,house_rooms,house_square_meters,house_year
0,0.0,-0.547723,-0.675786,0.264051


In [11]:
# predictions
#pred = reg.predict(testing_df)
#pred = reg.predict(x_test_norm)
pred = reg.predict(x_test_std)
print(pred) 

[[ 5417484.68628211]
 [ 7486629.41005666]
 [ 3556935.19496616]
 [10040253.6039497 ]
 [ 5700789.37120763]
 [ 3436184.76993202]
 [ 3763795.21009992]
 [ -248506.56308432]
 [ 3675778.48757071]
 [ 8424522.95103769]
 [ 3675778.48757071]
 [ 3479006.26559561]
 [ 5417484.68628211]
 [ 6447577.30199198]
 [ 6447577.30199198]
 [ 5449108.70614558]
 [  570304.85611158]
 [ 3763795.21009992]
 [ 3927036.27978195]
 [ 3479006.26559561]
 [ 6518803.34061539]
 [ 4512839.74249193]
 [ 6159752.92425423]
 [  689761.55483404]
 [ 3218648.66328096]
 [ 6183975.11501442]
 [ 2389584.02541857]
 [ 4553331.23287511]
 [ 2293338.34624684]
 [  689761.55483404]]


In [12]:
y_test

Unnamed: 0,house_price
23036,6745000.0
27901,13800000.0
15977,3795000.0
6514,14500000.0
25057,4100000.0
651,3795000.0
33448,4295000.0
9010,1675000.0
5613,3425000.0
10092,4175000.0


In [13]:
#Standerdize the end result
y_test_std = scale.fit_transform(y_test)
y_pred_std = scale.fit_transform(pred) 

In [14]:
print(np.mean((y_pred_std - y_test_std)**2))

0.41784050885158147
