In [1]:
import pandas as pd
import numpy as np
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler



In [2]:
# load the data set
df = pd.read_csv('../data/residence.csv',sep=";", index_col=False)  
df

Unnamed: 0,house_type,house_zip_code,house_rooms,house_square_meters,house_year,house_price
0,Andelsbolig,6270,3,100.0,1995,15
1,Villa,6430,3,89.0,1900,27
2,Fritidshus,8410,0,28.0,1970,34
3,Fritidshus,4942,2,36.0,1970,38
4,Villa,4900,3,75.0,1880,39
...,...,...,...,...,...,...
33034,Villa,7742,3,75.0,1906,75
33035,Fritidshus,6470,3,105.0,2010,15
33036,Villa,9830,2,149.0,1950,45
33037,Villa,6950,6,260.0,2000,41


In [3]:
# transform the data so that we only calculate predictions on givin ZIP


df = df.loc[df['house_zip_code'] == 3400]

#Data
df_x = df[['house_zip_code', 'house_rooms', 'house_square_meters',
       'house_year']]
#Target
df_y = df[['house_price']]

df_y
#df_x

Unnamed: 0,house_price
245,29
303,39
751,37
772,40
1164,36
...,...
24267,59
24407,22
25263,54
25534,39


In [4]:
# initialize the linear model
reg = linear_model.LinearRegression()

In [5]:
x_train, x_test, y_train, y_test = train_test_split(df_x, df_y, test_size=0.33, random_state=42 )

In [6]:
# fit scaler on training data
norm = MinMaxScaler().fit(x_train)


# transform training data
x_train_norm = norm.transform(x_train)


# transform testing dataa
x_test_norm = norm.transform(x_test)


In [7]:
print(x_train_norm)

[[0.         0.11111111 0.18303571 0.01546392]
 [0.         0.55555556 0.5625     0.92268041]
 [0.         0.55555556 0.52232143 0.57731959]
 [0.         0.55555556 0.57589286 0.61340206]
 [0.         0.55555556 0.62946429 0.71134021]
 [0.         0.55555556 0.64732143 0.74742268]
 [0.         0.44444444 0.78125    0.99484536]
 [0.         0.55555556 0.60714286 0.95360825]
 [0.         0.33333333 0.31696429 0.71649485]
 [0.         0.44444444 0.45089286 0.7371134 ]
 [0.         0.77777778 1.         0.72680412]
 [0.         0.22222222 0.26785714 0.32474227]
 [0.         0.33333333 0.35714286 0.91752577]
 [0.         0.33333333 0.37946429 0.51546392]
 [0.         0.33333333 0.61607143 0.80412371]
 [0.         0.22222222 0.04910714 0.99484536]
 [0.         1.         0.57142857 0.68041237]
 [0.         0.22222222 0.29910714 0.92783505]
 [0.         0.22222222 0.22767857 0.97938144]
 [0.         0.         0.         0.32474227]
 [0.         0.66666667 0.55803571 0.6185567 ]
 [0.         

In [14]:
print(y_train)

       house_price
12847           31
6867            44
9561            49
15099           37
1189            44
...            ...
22215           43
3395            49
18074           32
10339           28
21545           37

[78 rows x 1 columns]


In [16]:
reg.fit(x_train_norm, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [17]:
data = [[0, 0.55555556, 0.57589286, 0.61340206]]
testing_df = pd.DataFrame (data, columns = ['house_zip_code','house_rooms', 'house_square_meters', 'house_year'])
testing_df
# result should be 37

Unnamed: 0,house_zip_code,house_rooms,house_square_meters,house_year
0,0,0.555556,0.575893,0.613402


In [10]:
# predictions
pred = reg.predict(testing_df)
print(pred) 

[[41.93582024]
 [36.81510917]
 [54.96552656]
 [15.87235331]
 [43.4849566 ]
 [24.48601444]
 [36.40698873]
 [29.46281397]
 [49.34739083]
 [30.92629723]
 [25.23286148]
 [31.43480462]
 [32.54348399]
 [43.53961656]
 [31.48063691]
 [32.82011903]
 [65.43420563]
 [34.55118849]
 [37.27086114]
 [59.71428686]
 [30.61260491]
 [25.51601649]
 [39.54693414]
 [31.89573313]
 [35.29420242]
 [43.73392316]
 [24.28929482]
 [39.56740566]
 [19.24776953]
 [42.80020111]
 [32.4477651 ]
 [64.82171281]
 [37.56976695]
 [41.86929524]
 [35.80917712]
 [19.63361921]
 [48.93100384]
 [19.67298419]
 [20.16574078]]


In [11]:
y_test

Unnamed: 0,house_price
9548,40
1164,36
10705,67
9412,17
2509,51
15701,20
14048,35
18413,31
8297,57
2866,35


In [12]:
print(np.mean((pred - y_test)**2))

house_price    204.135434
dtype: float64
