In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [4]:
DATASET = r'C:\Users\91930\Documents\GITHUB\ArtOfAI\dataset\MelbourneHousingSnapshot\melb_data.csv'

In [5]:
melbourne_data = pd.read_csv(DATASET)

### Selecting Data for Modeling

In [6]:
melbourne_data.head()

Unnamed: 0,Suburb,Address,Rooms,Type,Price,Method,SellerG,Date,Distance,Postcode,...,Bathroom,Car,Landsize,BuildingArea,YearBuilt,CouncilArea,Lattitude,Longtitude,Regionname,Propertycount
0,Abbotsford,85 Turner St,2,h,1480000.0,S,Biggin,3/12/2016,2.5,3067.0,...,1.0,1.0,202.0,,,Yarra,-37.7996,144.9984,Northern Metropolitan,4019.0
1,Abbotsford,25 Bloomburg St,2,h,1035000.0,S,Biggin,4/02/2016,2.5,3067.0,...,1.0,0.0,156.0,79.0,1900.0,Yarra,-37.8079,144.9934,Northern Metropolitan,4019.0
2,Abbotsford,5 Charles St,3,h,1465000.0,SP,Biggin,4/03/2017,2.5,3067.0,...,2.0,0.0,134.0,150.0,1900.0,Yarra,-37.8093,144.9944,Northern Metropolitan,4019.0
3,Abbotsford,40 Federation La,3,h,850000.0,PI,Biggin,4/03/2017,2.5,3067.0,...,2.0,1.0,94.0,,,Yarra,-37.7969,144.9969,Northern Metropolitan,4019.0
4,Abbotsford,55a Park St,4,h,1600000.0,VB,Nelson,4/06/2016,2.5,3067.0,...,1.0,2.0,120.0,142.0,2014.0,Yarra,-37.8072,144.9941,Northern Metropolitan,4019.0


In [7]:
melbourne_data.columns

Index(['Suburb', 'Address', 'Rooms', 'Type', 'Price', 'Method', 'SellerG',
       'Date', 'Distance', 'Postcode', 'Bedroom2', 'Bathroom', 'Car',
       'Landsize', 'BuildingArea', 'YearBuilt', 'CouncilArea', 'Lattitude',
       'Longtitude', 'Regionname', 'Propertycount'],
      dtype='object')

In [8]:
# Your Iowa data doesn't have missing values in the columns you use. 
# So we will take the simplest option for now, and drop houses from our data. 
# Don't worry about this much for now, though the code is:

# dropna drops missing values
melbourne_data = melbourne_data.dropna(axis=0)

#### Selecting the Predicting Variable

By convention, the prediction target is called **y**.

In [9]:
y = melbourne_data['Price']

In [10]:
y.head()

1    1035000.0
2    1465000.0
4    1600000.0
6    1876000.0
7    1636000.0
Name: Price, dtype: float64

### Choosing "Features"

The columns that are inputted into our model (and later used to make predictions) are called "features".

In this case, those would be the columns used to determine the home price.

We select multiple features by providing a list of column names inside brackets. Each tiem in that list should be a string.

In [11]:
melbourne_features = ['Rooms', 'Bathroom', 'Landsize', 'Lattitude', 'Longtitude']

By convention, this data is called **X**

In [12]:
X = melbourne_data[melbourne_features]

In [13]:
X.head()

Unnamed: 0,Rooms,Bathroom,Landsize,Lattitude,Longtitude
1,2,1.0,156.0,-37.8079,144.9934
2,3,2.0,134.0,-37.8093,144.9944
4,4,1.0,120.0,-37.8072,144.9941
6,3,2.0,245.0,-37.8024,144.9993
7,2,1.0,256.0,-37.806,144.9954


In [14]:
X.describe()

Unnamed: 0,Rooms,Bathroom,Landsize,Lattitude,Longtitude
count,6196.0,6196.0,6196.0,6196.0,6196.0
mean,2.931407,1.57634,471.00694,-37.807904,144.990201
std,0.971079,0.711362,897.449881,0.07585,0.099165
min,1.0,1.0,0.0,-38.16492,144.54237
25%,2.0,1.0,152.0,-37.855438,144.926198
50%,3.0,1.0,373.0,-37.80225,144.9958
75%,4.0,2.0,628.0,-37.7582,145.0527
max,8.0,8.0,37000.0,-37.45709,145.52635


### Building Your Model

The steps to building and using a model are:

- **Define**: What type of model will it be?
- **Fit**: Capture patterns from provided data. This is the heart of modeling
- **Predict**: Just what is sounds like
- **Evaluate**: Determine how accurate the model's prediction are.

Example of defining a decision tree model with sciket-learn and fitting it with the features target variable.

In [15]:
from sklearn.tree import DecisionTreeRegressor

In [16]:
# Define model. Specify a number for random_state to ensure same results each run
melbourne_model = DecisionTreeRegressor(random_state=1)

In [18]:
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [19]:
# Fit model
melbourne_model.fit(x_train, y_train)

Many machine learning models allow some randomness in model training. Specifying a number for `random_state` ensures you get the same results in each run. This is considered a good practice. You use any number, and model quality won't depend meaningfully on exactly what value you choose.

We now have a fitted model that we can use to make predictions.

In practice, you'll want to make predictions for new houses coming on the market rather than the houses we already have prices for. But we'll make predictions for the first few rows of the training data to see how the predict function works.



In [21]:
print("Making predictions for the following 5 houses:")
print(x_train)
print("The predictions are")
print(melbourne_model.predict(x_test))

Making predictions for the following 5 houses:
       Rooms  Bathroom  Landsize  Lattitude  Longtitude
8937       5       2.0     733.0  -37.91894   145.24305
6236       4       2.0     705.0  -37.75950   145.11760
10541      3       1.0     275.0  -37.76126   145.00247
5443       4       2.0     843.0  -37.73480   145.06560
5101       2       1.0     229.0  -37.71000   145.00820
...      ...       ...       ...        ...         ...
3657       5       2.0     534.0  -37.80440   145.03860
5904       3       2.0     722.0  -37.74270   144.92260
6536       2       1.0       0.0  -37.86120   144.89710
4794       3       1.0     114.0  -37.83710   144.93900
2122       3       1.0     727.0  -37.79420   145.12180

[4956 rows x 5 columns]
The predictions are
[ 543000.  750000. 1990000. ... 1050000. 1150000. 1700000.]
