# Data Plotting


In [119]:
import importlib
import pandas as pd
import data_preprocessor as dpp
importlib.reload(dpp)

melbourne_data = dpp.process(pd.read_csv('melb_data.csv'))
melbourne_data.describe()


Unnamed: 0,rooms,price,distance,postcode,bedroom2,bathroom,car,landsize,buildingarea,yearbuilt,lattitude,longtitude,propertycount
count,6196.0,6196.0,6196.0,6196.0,6196.0,6196.0,6196.0,6196.0,6196.0,6196.0,6196.0,6196.0,6196.0
mean,2.931407,1068828.0,9.751097,3101.947708,2.902034,1.57634,1.573596,471.00694,141.568645,1964.081988,-37.807904,144.990201,7435.489509
std,0.971079,675156.4,5.612065,86.421604,0.970055,0.711362,0.929947,897.449881,90.834824,38.105673,0.07585,0.099165,4337.698917
min,1.0,131000.0,0.0,3000.0,0.0,1.0,0.0,0.0,0.0,1196.0,-38.16492,144.54237,389.0
25%,2.0,620000.0,5.9,3044.0,2.0,1.0,1.0,152.0,91.0,1940.0,-37.855438,144.926198,4383.75
50%,3.0,880000.0,9.0,3081.0,3.0,1.0,1.0,373.0,124.0,1970.0,-37.80225,144.9958,6567.0
75%,4.0,1325000.0,12.4,3147.0,3.0,2.0,2.0,628.0,170.0,2000.0,-37.7582,145.0527,10175.0
max,8.0,9000000.0,47.4,3977.0,9.0,8.0,10.0,37000.0,3112.0,2018.0,-37.45709,145.52635,21650.0


In [120]:
melbourne_data.head()

Unnamed: 0,suburb,address,rooms,type,price,method,sellerg,date,distance,postcode,...,bathroom,car,landsize,buildingarea,yearbuilt,councilarea,lattitude,longtitude,regionname,propertycount
1,Abbotsford,25 Bloomburg St,2,h,1035000.0,S,Biggin,4/02/2016,2.5,3067.0,...,1.0,0.0,156.0,79.0,1900.0,Yarra,-37.8079,144.9934,Northern Metropolitan,4019.0
2,Abbotsford,5 Charles St,3,h,1465000.0,SP,Biggin,4/03/2017,2.5,3067.0,...,2.0,0.0,134.0,150.0,1900.0,Yarra,-37.8093,144.9944,Northern Metropolitan,4019.0
4,Abbotsford,55a Park St,4,h,1600000.0,VB,Nelson,4/06/2016,2.5,3067.0,...,1.0,2.0,120.0,142.0,2014.0,Yarra,-37.8072,144.9941,Northern Metropolitan,4019.0
6,Abbotsford,124 Yarra St,3,h,1876000.0,S,Nelson,7/05/2016,2.5,3067.0,...,2.0,0.0,245.0,210.0,1910.0,Yarra,-37.8024,144.9993,Northern Metropolitan,4019.0
7,Abbotsford,98 Charles St,2,h,1636000.0,S,Nelson,8/10/2016,2.5,3067.0,...,1.0,2.0,256.0,107.0,1890.0,Yarra,-37.806,144.9954,Northern Metropolitan,4019.0


Let's confirm there are now NaN values...

In [121]:
melbourne_data.columns

Index(['suburb', 'address', 'rooms', 'type', 'price', 'method', 'sellerg',
       'date', 'distance', 'postcode', 'bedroom2', 'bathroom', 'car',
       'landsize', 'buildingarea', 'yearbuilt', 'councilarea', 'lattitude',
       'longtitude', 'regionname', 'propertycount'],
      dtype='object')

In [122]:

for col in melbourne_data.columns:
    print(f'{col} has NaN') if melbourne_data[col].hasnans  else None

## Selecting the Prediction Target

In [123]:
model_features = [ 'rooms', 
        'distance', 'postcode', 'bedroom2', 'bathroom', 'car',
       'landsize',  'lattitude',
       'longtitude',  'propertycount']


y = melbourne_data.price
X = melbourne_data[model_features]

X.describe()

Unnamed: 0,rooms,distance,postcode,bedroom2,bathroom,car,landsize,lattitude,longtitude,propertycount
count,6196.0,6196.0,6196.0,6196.0,6196.0,6196.0,6196.0,6196.0,6196.0,6196.0
mean,2.931407,9.751097,3101.947708,2.902034,1.57634,1.573596,471.00694,-37.807904,144.990201,7435.489509
std,0.971079,5.612065,86.421604,0.970055,0.711362,0.929947,897.449881,0.07585,0.099165,4337.698917
min,1.0,0.0,3000.0,0.0,1.0,0.0,0.0,-38.16492,144.54237,389.0
25%,2.0,5.9,3044.0,2.0,1.0,1.0,152.0,-37.855438,144.926198,4383.75
50%,3.0,9.0,3081.0,3.0,1.0,1.0,373.0,-37.80225,144.9958,6567.0
75%,4.0,12.4,3147.0,3.0,2.0,2.0,628.0,-37.7582,145.0527,10175.0
max,8.0,47.4,3977.0,9.0,8.0,10.0,37000.0,-37.45709,145.52635,21650.0


In [124]:
X.head()


Unnamed: 0,rooms,distance,postcode,bedroom2,bathroom,car,landsize,lattitude,longtitude,propertycount
1,2,2.5,3067.0,2.0,1.0,0.0,156.0,-37.8079,144.9934,4019.0
2,3,2.5,3067.0,3.0,2.0,0.0,134.0,-37.8093,144.9944,4019.0
4,4,2.5,3067.0,3.0,1.0,2.0,120.0,-37.8072,144.9941,4019.0
6,3,2.5,3067.0,4.0,2.0,0.0,245.0,-37.8024,144.9993,4019.0
7,2,2.5,3067.0,2.0,1.0,2.0,256.0,-37.806,144.9954,4019.0


## Normalizing our Data

In [125]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_absolute_error

scaler = MinMaxScaler()
X = pd.DataFrame(scaler.fit_transform(X))
X.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
count,6196.0,6196.0,6196.0,6196.0,6196.0,6196.0,6196.0,6196.0,6196.0,6196.0
mean,0.275915,0.205719,0.104348,0.322448,0.082334,0.15736,0.01273,0.504381,0.455122,0.331428
std,0.138726,0.118398,0.088456,0.107784,0.101623,0.092995,0.024255,0.107159,0.100779,0.204021
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.142857,0.124473,0.045036,0.222222,0.0,0.1,0.004108,0.437227,0.390077,0.187891
50%,0.285714,0.189873,0.082907,0.333333,0.0,0.1,0.010081,0.512369,0.460812,0.290579
75%,0.428571,0.261603,0.150461,0.333333,0.142857,0.2,0.016973,0.574601,0.518639,0.460279
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [126]:

train_X, val_X, train_y, val_y = train_test_split(X, y, test_size=0.8, random_state=1)
lr_model = LinearRegression()
lr_model.fit(train_X, train_y)

pred_Y = lr_model.predict(val_X)
mean_absolute_error(val_y, pred_Y)


312424.8210104571

In [127]:
train_x_df = pd.DataFrame(train_X)
train_x_df.head()


Unnamed: 0,0,1,2,3,4,5,6,7,8,9
2629,0.142857,0.080169,0.211873,0.111111,0.0,0.1,0.0,0.458048,0.402986,0.388458
4123,0.285714,0.28692,0.151484,0.333333,0.142857,0.2,0.00773,0.408883,0.562034,0.150181
5872,0.285714,0.198312,0.025589,0.333333,0.142857,0.2,0.014297,0.457525,0.304295,0.223085
2766,0.142857,0.236287,0.074719,0.222222,0.0,0.1,0.00973,0.618256,0.490894,1.0
3895,0.285714,0.172996,0.012282,0.222222,0.0,0.1,0.0,0.520633,0.330118,0.219604


In [128]:
val_x_def = pd.DataFrame(val_X)
val_x_def.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
3273,0.285714,0.280591,0.020471,0.333333,0.285714,0.2,0.005973,0.5568,0.302577,0.180048
4775,0.428571,0.130802,0.125896,0.444444,0.142857,0.2,0.01427,0.46555,0.508882,0.286581
2192,0.285714,0.059072,0.0,0.333333,0.142857,0.1,0.0,0.502126,0.433678,0.804619
3178,0.285714,0.128692,0.186285,0.333333,0.142857,0.2,0.028081,0.425413,0.446788,0.60444
1987,0.714286,0.118143,0.103378,0.666667,0.714286,0.3,0.036054,0.51145,0.492215,0.467617


In [130]:
my_house = pd.DataFrame([[3, 1, 6155, 5, 5, 3, 15000, -37.8079,	144.9934, 6]])
scaled = scaler.fit_transform(my_house) 
lr_model.predict(scaled)

array([716114.63972727])