In [6]:
import pandas as pd
import numpy as np
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn import preprocessing



In [16]:
# load the data set
df = pd.read_csv('../data/residence.csv',sep=";", index_col=False)  
df

Unnamed: 0,house_type,house_zip_code,house_rooms,house_square_meters,house_year,house_price
0,Andelsbolig,9500,3,89.0,1906,148000
1,Andelsbolig,6800,3,98.0,2007,260000
2,Andelsbolig,7130,3,105.0,2007,275000
3,Fritidshus,4920,3,50.0,1870,299000
4,Villa,7660,3,100.0,1951,350000
...,...,...,...,...,...,...
33211,Fritidshus,6470,3,105.0,2010,1595000
33212,Villa,9830,2,149.0,1950,450000
33213,Villa,6950,6,260.0,2000,4195000
33214,Villa,6270,6,201.0,1992,745000


In [19]:
# transform the data and specify prediction parameters

# Encode the house type from string to integer
label_enc =preprocessing.LabelEncoder()
df['house_type'] = label_enc.fit_transform(df['house_type'].astype(str))

# Set zip for more precise predictions 
#df = df.loc[df['house_zip_code'] == 3480]

# Set type for more precise predictions
df = df.loc[df['house_type'] == 4]

#Data
df_x = df[['house_type','house_zip_code', 'house_rooms', 'house_square_meters',
       'house_year']]
#Target
df_y = df[['house_price']]

#df_y
df_x
#df

Unnamed: 0,house_type,house_zip_code,house_rooms,house_square_meters,house_year
4,4,7660,3,100.0,1951
5,4,5932,3,59.0,1895
6,4,4891,3,126.0,1900
7,4,5953,6,213.0,1900
11,4,7860,4,140.0,1900
...,...,...,...,...,...
33210,4,7742,3,75.0,1906
33212,4,9830,2,149.0,1950
33213,4,6950,6,260.0,2000
33214,4,6270,6,201.0,1992


In [20]:
# initialize the linear model
reg = linear_model.LinearRegression()

In [21]:
x_train, x_test, y_train, y_test = train_test_split(df_x, df_y, test_size=0.33, random_state=42 )

In [22]:
#Normalize

# fit scaler on training data
norm = MinMaxScaler().fit(x_train)


# transform training data
x_train_norm = norm.transform(x_train)


# transform testing dataa
x_test_norm = norm.transform(x_test)


In [9]:
# Standardize
from sklearn.preprocessing import StandardScaler

scale= StandardScaler()
 
# standardization of dependent variables
x_train_std = scale.fit_transform(x_train)
x_test_std = scale.fit_transform(x_test) 
print(x_train_std)
#print(x_test)

[[ 0.         -0.54772256  0.15089231 -1.13347065]
 [ 0.          0.18257419 -0.56305731 -1.64947862]
 [ 0.         -0.54772256  0.97757083  0.54355528]
 [ 0.         -2.00831604 -1.52125286  0.22105029]
 [ 0.          0.18257419  2.31152934 -0.01545336]
 [ 0.          0.18257419  0.62059602 -1.45597563]
 [ 0.          0.91287093 -0.60063361 -0.33795835]
 [ 0.          0.18257419  0.54544342  0.56505561]
 [ 0.          0.91287093  0.50786713  0.41455328]
 [ 0.          0.91287093  0.82726564  0.7585586 ]
 [ 0.         -1.2780193  -0.90124398 -0.703464  ]
 [ 0.         -0.54772256 -0.1685062   1.18856525]
 [ 0.          2.37346442  1.01514712 -0.29495768]
 [ 0.          0.18257419  0.01937528  0.97356192]
 [ 0.          0.18257419 -0.46911657  0.43605362]
 [ 0.         -0.54772256 -0.90124398 -0.07995436]
 [ 0.          0.91287093  0.35756194  0.56505561]
 [ 0.          0.18257419 -0.54426916  0.58655594]
 [ 0.          0.18257419 -0.78851509 -0.93996766]
 [ 0.          0.18257419 -0.22

In [28]:
print(x_train_norm)

[[0.         0.3340319  0.14       0.36014767 0.82846715]
 [0.         0.95459308 0.1        0.15978987 0.91849148]
 [0.         0.96274304 0.16       0.36402556 0.88686131]
 ...
 [0.         0.94411457 0.1        0.15720461 0.91119221]
 [0.         0.60880196 0.14       0.2903456  0.93430657]
 [0.         0.92688322 0.12       0.23088457 0.85644769]]


In [29]:
print(y_train)

       house_price
19347      1999999
29621       398000
30845      1295000
14746      1495000
12042      3798000
...            ...
19761      2495000
20871      1295000
10068       845000
1620       1695000
26013      1248000

[14303 rows x 1 columns]


In [25]:
reg.fit(x_train_norm, y_train)
#reg.fit(x_train_std, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [11]:
data = [[0,-0.54772256, 0.15089231, -1.13347065]]
testing_df = pd.DataFrame (data, columns = ['house_zip_code','house_rooms', 'house_square_meters', 'house_year'])
testing_df
# result should be 1999999

Unnamed: 0,house_zip_code,house_rooms,house_square_meters,house_year
0,0.0,-0.547723,-0.675786,0.264051


In [26]:
# predictions
#pred = reg.predict(testing_df)
pred = reg.predict(x_test_norm)
#pred = reg.predict(x_test_std)
print(pred) 

[[2881025.05203669]
 [3515373.86677263]
 [ 707055.59699286]
 ...
 [ 978965.19203833]
 [3825834.37369064]
 [1440155.93179809]]


In [27]:
y_test

Unnamed: 0,house_price
24182,595000
8738,4995000
10107,1295000
1520,895000
31176,845000
...,...
14547,1398000
24952,1775000
470,3595000
23456,1695000


In [222]:
#Standerdize the end result
y_test_std = scale.fit_transform(y_test)
y_pred_std = scale.fit_transform(pred) 

In [223]:
print(np.mean((y_pred_std - y_test_std)**2))

1.0
