In [1]:
import pandas as pd
import numpy as np
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler



In [2]:
# Read the residence listings from the semicolon-separated CSV file.
# index_col=False tells pandas not to use the first column as the row index.
df = pd.read_csv(
    '../data/residence.csv',
    sep=';',
    index_col=False,
)
df

Unnamed: 0,house_type,house_zip_code,house_rooms,house_square_meters,house_year,house_price
0,Andelsbolig,6270,3,100.0,1995,153289
1,Villa,6430,3,89.0,1900,275000
2,Fritidshus,8410,0,28.0,1970,349000
3,Fritidshus,4942,2,36.0,1970,385000
4,Villa,4900,3,75.0,1880,395000
...,...,...,...,...,...,...
6675,Villa,7800,5,166.0,1911,1395000
6676,Ejerlejlighed,4000,1,43.0,1969,1395000
6677,Fritidshus,4550,3,126.0,1981,1395000
6678,Villa,7400,3,98.0,1942,1395000


In [3]:
# Restrict the data set to a single ZIP code so that predictions are only
# calculated for listings in that ZIP code, then split the columns into model
# inputs (df_x) and the target (df_y).

# Tunable selection parameters, named so they can be changed in one place.
TARGET_ZIP_CODE = 3400
FEATURE_COLUMNS = ['house_zip_code', 'house_rooms', 'house_square_meters',
                   'house_year']
TARGET_COLUMN = 'house_price'

# Keep only the rows for the chosen ZIP code (rebinds `df` to the filtered frame).
df = df.loc[df['house_zip_code'] == TARGET_ZIP_CODE]

# Data (model inputs).
# NOTE(review): after the filter above 'house_zip_code' has the same value in
# every row, so it cannot contribute to the fit. It is kept here so the shape
# of df_x is unchanged for later cells -- consider dropping it.
df_x = df[FEATURE_COLUMNS]

# Target: the double brackets keep it a single-column DataFrame, not a Series.
df_y = df[[TARGET_COLUMN]]

df_y

Unnamed: 0,house_price
245,2995000
303,3995000
751,3795000
772,4095000
1164,3645000
1189,4495000
1191,4495000
1709,4495000
2440,3695000
2474,4195000


In [4]:
# Initialize the linear model: a scikit-learn LinearRegression estimator
# created with its default settings. It is not fitted here; fitting happens
# in a later cell via reg.fit(...).
reg = linear_model.LinearRegression()

In [5]:
# Split features and target into train and test parts. test_size=0.33 holds
# out about a third of the rows for testing; the fixed random_state makes the
# shuffle, and therefore the split, reproducible between runs.
x_train, x_test, y_train, y_test = train_test_split(
    df_x,
    df_y,
    test_size=0.33,
    random_state=42,
)

In [6]:
# Create the min-max scaler and fit it on the training features only, so the
# scaling parameters are learned without looking at the test rows.
norm = MinMaxScaler()
norm.fit(x_train)

# Apply the fitted scaler to both splits: first the training data, then the
# testing data. Both are transformed with the parameters learned above.
x_train_norm, x_test_norm = (
    norm.transform(features) for features in (x_train, x_test)
)



In [7]:
# Print the scaled training features produced by the scaler's transform
# (one row per training sample, one column per input feature).
print(x_train_norm)

[[0.         0.         0.2        0.98639456]
 [0.         0.4        0.42222222 1.        ]
 [0.         0.2        0.32222222 0.91836735]
 [0.         0.6        1.         0.6462585 ]
 [0.         0.2        0.42222222 0.7414966 ]
 [0.         0.4        0.55555556 0.68027211]
 [0.         0.4        0.63333333 0.65986395]
 [0.         0.2        0.41111111 0.78911565]
 [0.         0.2        0.12222222 0.91836735]
 [0.         0.2        0.57777778 0.        ]
 [0.         0.         0.08888889 0.80272109]
 [0.         0.         0.17777778 0.93197279]
 [0.         0.2        0.33333333 0.54421769]
 [0.         0.2        0.37777778 0.3877551 ]
 [0.         0.4        0.62222222 0.93197279]
 [0.         1.         0.9        0.28571429]
 [0.         0.         0.         1.        ]
 [0.         0.6        0.8        0.63945578]]


In [8]:
# Print the training targets (the house_price column) with their row labels.
print(y_train)

      house_price
3988      3895000
303       3995000
1164      3645000
1189      4495000
751       3795000
4004      4150000
5712      4895000
772       4095000
6463      3595000
6132      4250000
4623      2695000
6513      4995000
5076      4495000
1709      4495000
2509      5150000
3395      4995000
4631      2795000
1191      4495000


In [9]:
# Fit the linear regression on the scaled training features and the training
# prices. As the last expression of the cell, the fitted estimator is echoed.
reg.fit(x_train_norm, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [10]:
# Build a one-row frame with the same four feature columns the model uses.
# NOTE(review): the values look like already min-max scaled features -- TODO
# confirm where they come from; this frame is not passed to the model in the
# cells visible here.
feature_names = ['house_zip_code', 'house_rooms', 'house_square_meters', 'house_year']
data = [[0, 0.55555556, 0.57589286, 0.61340206]]
testing_df = pd.DataFrame(data=data, columns=feature_names)
testing_df
# Original author's note: "result should be 37" -- it is unclear what quantity
# this refers to; TODO confirm or remove.

Unnamed: 0,house_zip_code,house_rooms,house_square_meters,house_year
0,0,0.555556,0.575893,0.613402


In [19]:
# Predictions: run the fitted model on the scaled test features and print the
# predicted prices (one prediction per test row).
pred = reg.predict(x_test_norm)
print(pred)

[[4923611.71949001]
 [4155223.38411241]
 [3016208.51694467]
 [3655453.07265739]
 [3641154.19219429]
 [3788040.18572243]
 [5005782.72010536]
 [4508362.35496578]
 [3498729.67937998]
 [3749111.96020341]]


In [20]:
# Display the actual prices of the held-out test rows.
y_test

Unnamed: 0,house_price
2474,4195000
6394,2400000
2440,3695000
5286,2495000
245,2995000
2900,4495000
4009,4245000
5295,2750000
2866,3595000
3292,4450000


In [13]:
# Mean squared error of the predictions on the held-out test rows.
#
# Both operands are flattened to 1-D NumPy arrays before subtracting. Mixing
# the prediction ndarray with the y_test DataFrame directly yields a DataFrame,
# and np.mean() on a DataFrame returns either a per-column Series or a single
# scalar depending on the pandas version. Working on plain arrays always gives
# one scalar and avoids any label-based alignment.
y_true = np.asarray(y_test).ravel()
y_hat = np.asarray(pred).ravel()

# Fail loudly if the two vectors do not line up, instead of silently
# broadcasting to a wrong result.
if y_true.shape != y_hat.shape:
    raise ValueError(
        f"prediction/target length mismatch: {y_hat.shape} vs {y_true.shape}"
    )

mse = np.mean((y_hat - y_true) ** 2)
print(mse)


house_price    1.241624e+12
dtype: float64
