<a href="https://colab.research.google.com/github/fabiansd/AI-workshop/blob/master/HPP_Samling_at_level_assignment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# House price prediction

## initialization script

This is initialization code. Run it and ignore it for now.

In [0]:
## Import of libraries
import sklearn
import pandas as pd
import seaborn as sns
import numpy as np
from matplotlib import pyplot as plt

## Option settings
# Turn off the pandas chained-assignment warning for the whole notebook
pd.options.mode.chained_assignment = None  # default='warn'
# Do not limit the number of columns shown when a dataframe is displayed
pd.set_option('display.max_columns', None)

# Ignore FutureWarning messages so they do not clutter the cell output
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)


#############################################################################


## Function that converts categorical variables to integer-encoded variables
def encode_variables(data):
  """Integer-encode the categorical columns of the house price dataset.

  Each of the columns listed below is overwritten with the integer labels
  produced by its own, freshly created scikit-learn LabelEncoder.

  Parameters:
    data: dataframe containing the columns 'Neighborhood', 'Condition1',
      'HouseStyle', 'Fence', 'PoolQC', 'LandContour' and 'Heating'.

  Returns:
    The same dataframe object that was passed in, with the listed columns
    replaced by their encoded values.
  """
  from sklearn.preprocessing import LabelEncoder

  ## Categorical columns, encoded in this order
  categorical_columns = (
      'Neighborhood',
      'Condition1',
      'HouseStyle',
      'Fence',
      'PoolQC',
      'LandContour',
      'Heating',
  )

  ## One independent encoder per column
  for column in categorical_columns:
    data[column] = LabelEncoder().fit_transform(data[column])

  return data

## Function that plots a correlation plot over the features in the dataset
def correlation_plot(data, figsize = (18,16)):
  """Draw a lower-triangle heatmap of the pairwise correlations in `data`.

  Parameters:
    data: dataframe whose column correlations (data.corr()) are plotted.
    figsize: (width, height) of the matplotlib figure.

  Every cell is annotated with the correlation rounded to two decimals.
  The colour scale is centred on 0 and capped at 0.3.
  """
  sns.set(style="white")

  # Compute the correlation matrix
  corr = data.corr()

  # Generate a mask for the upper triangle (diagonal included).
  # The builtin bool is used because the np.bool alias was removed in
  # NumPy 1.24 and raises AttributeError there.
  mask = np.zeros_like(corr, dtype=bool)
  mask[np.triu_indices_from(mask)] = True

  # Set up the matplotlib figure
  _, ax = plt.subplots(figsize=figsize)

  # Generate a custom diverging colormap
  cmap = sns.diverging_palette(220, 10, as_cmap=True)

  # Draw the heatmap with the mask and correct aspect ratio
  sns.heatmap(corr, mask=mask, cmap=cmap, annot=True, fmt=".2f", vmax=.3, center=0,
              square=True, linewidths=.5, cbar_kws={"shrink": .5}, ax=ax)
  
def correlation_scatter(feature1, feature2, data, figsize=(15, 10), color = 'black'):
  """Scatter plot of one feature against another.

  Parameters:
    feature1: name of the column plotted on the x-axis.
    feature2: name of the column plotted on the y-axis.
    data: dataframe containing both columns.
    figsize: (width, height) of the matplotlib figure.
    color: colour of the scatter points.
  """
  plt.figure(figsize=figsize)
  # The points are labelled so that plt.legend() below has an entry to show;
  # without a labelled artist matplotlib only emits a warning and draws no legend.
  plt.scatter(data[feature1], data[feature2], color=color,
              label=feature2 + ' vs ' + feature1)
  plt.xlabel(feature1)
  plt.ylabel(feature2)
  plt.title(feature1 + ' correlation with ' + feature2)
  plt.legend(loc='upper left')
  plt.show()

def distribution_numeric_features(feature, data, figsize=(15,7)):
  """Plot a 50-bin histogram of a single numeric column.

  Parameters:
    feature: name of the column to plot.
    data: dataframe containing the column.
    figsize: (width, height) of the matplotlib figure.
  """
  # Create the figure object and set its size
  plt.figure(figsize=figsize)

  # Plot with the sns (seaborn) library: histogram only, no density curve
  values = data[feature]
  sns.distplot(values, bins=50, kde=False)

  # Name the x- and y-axis and set the figure title
  plt.xlabel(feature)
  plt.ylabel('Distribution amount')
  plt.title('Distribution of ' + feature)

  plt.show()
  
def distribution_categorical_features(feature, data, sub_feature = None, figsize=(15,7)):
  """Plot how many rows fall into each category of a feature.

  Parameters:
    feature: name of the categorical column shown on the x-axis.
    data: dataframe containing the column(s).
    sub_feature: optional name of a second column; when given, the bars of
      each category are split (coloured) by the values of this column.
    figsize: (width, height) of the matplotlib figure.
  """
  plt.figure(figsize=figsize)
  plt.xticks(rotation=90)

  # None is a singleton, so it is compared by identity rather than equality
  if sub_feature is not None:
    sns.countplot(x=feature, hue=sub_feature, data=data)
    plt.title('Distribution of ' + feature + ' within ' + sub_feature)
  else:
    sns.countplot(x=feature, data=data)
    plt.title('Distribution of ' + feature)
  
  
def split_data_function(data):
  """Split a dataset into training and test parts for input and output.

  Parameters:
    data: dataframe that contains a 'SalePrice' column.

  Returns:
    The result of train_test_split, in the order: input training data,
    input test data, output training data, output test data.
  """
  from sklearn.model_selection import train_test_split

  ## Every column except 'SalePrice' is an input feature
  feature_names = list(data.columns.values)
  feature_names.remove('SalePrice')

  ## Work on deep copies of the input columns and of the 'SalePrice' column
  features = data.loc[:, feature_names].copy(deep=True)
  target = data['SalePrice'].copy(deep=True)

  ## 20 % of the rows go to the test set; the fixed random_state keeps the split repeatable
  splits = train_test_split(features, target, test_size=0.2, random_state=1)
  return splits

def prediction_result_plot(true_saleprice, predicted_saleprice, figsize=(15, 10)):
  """Scatter plot of predicted sale prices against the true sale prices.

  Parameters:
    true_saleprice: true sale prices, plotted on the x-axis.
    predicted_saleprice: predicted sale prices, plotted on the y-axis.
    figsize: (width, height) of the matplotlib figure.
  """
  plt.figure(figsize=figsize)
  plt.scatter(x=true_saleprice, y=predicted_saleprice)

  # Title and axis names
  plt.title('Prediction result -- truth against prediction')
  plt.xlabel('True Saleprice')
  plt.ylabel('Predicted Saleprice')

  plt.show()

## Download dataset from Github into the script
# The standard-library urllib is used instead of the Python 2 compatibility
# package six, which is not needed (and may not be installed) on Python 3
import urllib.request
urllib.request.urlretrieve("https://raw.githubusercontent.com/fabiansd/AI-workshop/master/data/Melbourne_train.csv", "./Melbourne_train.csv")
data_raw = pd.read_csv('Melbourne_train.csv')

# Select a number of columns to include
aktuelle_kolonner = ['Id','LotArea','Neighborhood','WoodDeckSF','Condition1','HouseStyle','LandContour','OverallQual','OverallCond','YearBuilt','YearRemodAdd','Heating','1stFlrSF','FullBath','HalfBath','BedroomAbvGr','Fence','KitchenAbvGr','TotRmsAbvGrd','GarageCars','GarageArea','Fireplaces','PoolArea','PoolQC','MoSold','YrSold']
# Take an explicit copy so that the column assignments below work on an
# independent dataframe and not on a slice of data_raw
data = data_raw[aktuelle_kolonner].copy()

# Generating new feature based on other features: age of the house when it was sold
data['HouseAge'] = data_raw['YrSold'] - data_raw['YearBuilt']

# Fence NaN changed to the string 'None'
data['Fence'] = data['Fence'].fillna('None')

# Adding saleprice on the end
data['SalePrice'] = data_raw['SalePrice']

# Removal of extreme points: keep only rows with LotArea below 100000.
# Copied so that later column assignments do not act on a filtered slice
data = data[data['LotArea'] < 100000].copy()

## Getting started

Python allows users to use pre-made functions from various libraries. The libraries have to be **imported** in order to use them. Once imported, their functions can be used by writing the library name, followed by a dot and the function name. E.g.:

library.function()


### Running a notebook

To run a code cell: press Ctrl + Enter or press the play button in the top left corner of the code block.

See all shortcuts on "Tools" or press Ctrl + m + h

Remember that code is run sequentially. If you use a variable/object from an earlier code block, that code block must have been run first. If not, the notebook will not find the variable/object.


### Python library documentation

Pandas (data processing) <a href="https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.html">documentation</a> and <a href="https://pandas.pydata.org/pandas-docs/stable/10min.html#min">tutorial</a> <br>


Matplotlib (plot functions) <a href="https://matplotlib.org/gallery/index.html"> documentation </a> and <a href="https://www.datacamp.com/community/tutorials/matplotlib-tutorial-python">tutorial </a> <br>

Scikit-learn (machine learning library) <a href="http://scikit-learn.org/stable/" > documentation </a> and <a href="http://scikit-learn.org/stable/tutorial/index.html"> tutorial </a> <br>

In python, the **Scikit-learn** library is called **sklearn** for simplicity

Don't forget to google, often!

# Explore data

## Pandas dataframe

 <a href="https://www.kaggle.com/c/home-data-for-ml-course/data"> Description </a> of Melbourne house sale dataset on Kaggle
 
 We use Pandas to read the dataset and load it into the python script. Pandas offers a lot of functions that allows us to explore the data.
 
 Print the n first registrations in the dataset using
 
 <br>$data.head(n)$
 
 

Use the function $data.describe()$ <br> to print a statistical overview of the dataset

The function $data.info() $<br>
will print useful information about each feature in the dataset

## Plotting features

We can plot numerical features to see how they are distributed.

Use the following function to plot numerical features

$distribution\_numeric\_features(feature = 'feature\_name', data = dataset, figsize=(width, height))$

In [0]:
#Example
#distribution_numeric_features(feature = 'SalePrice', data = data, figsize=(20,7))





When plotting categorical variables we count the features rather than plotting a distribution.

$distribution\_categorical\_features(feature = 'feature\_name', data = dataset, figsize=(width, height))$ <br>

This function is used to plot categorical features. It can be modified to include subcategories as well:

$distribution\_categorical\_features(feature = 'feature\_name', sub\_feature = 'sub\_feature\_name', data = dataset, figsize=(width, height))$ <br>

In [0]:
#Example
#distribution_categorical_features(feature = 'Neighborhood', data = data, figsize=(15,7))





In [0]:
#Example
#distribution_categorical_features(feature = 'YrSold', sub_feature='MoSold', data = data, figsize=(15,7))







## Algorithms prefer numbers

Algorithms can only read numbers. Therefore, the categorical features must be encoded into numeric labels. For example, a feature containing 'Yes' and 'No' can be encoded to 0 and 1.


Use the function $ data = encode\_variables(data)$ to encode the categorical features and print the 10 first registrations in the dataset.


In [0]:
# Encode categorical features into numbers






Now print the 5 first rows of the dataset to see that the categorical features have been encoded into numbers

# Select features

## Correlation plot

A correlation plot shows to what degree features change together. If two features have a high correlation, they are related: when one changes, the other tends to change as well. In other words, they are connected.

We generally want features that have an impact on the saleprice such that we can use these features to predict the saleprice. 

If a feature has a positive correlation with the saleprice, the saleprice will be high when this feature is high (like house area). If a feature has a negative correlation with the saleprice, then the saleprice is high when the feature is low (like house age).


To plot correlation, use the function <br> $correlation\_plot(data = dataset, figsize = (width,height))$

## Create subdataset

We can select features by creating a sub dataset including only the desired features. This is obtained by creating a list of the specific features we want to use.



In [0]:
#Write down the features you want to include. E.g.: 

# data_selection = data[['LotArea','WoodDeckSF','SalePrice']]

# data_selection.head()


Print an info summary and correlation plot of the new dataset

# Machine learning modelling

We typically select a machine learning model based on the number of features and the amount of data. Scikit-learn offers a <a href = "http://scikit-learn.org/stable/tutorial/machine_learning_map/index.html">  model chart </a> that helps you select the right models to test.


## Dividing into test and train data

First, the data is split into two separate data objects <br>
-- **Input**: all the features the model is predicting on <br>
-- **Output**: the saleprice, or label <br>
<br>
Then, the data is split into a train dataset and test dataset <br>
 -- **Train dataset**: used for training or building the machine learning model <br>
 -- **Test dataset**: used for testing after machine learning model is built (model validation) <br>
 <br>
 This results in a total of 4 datasets. We use the split function:
 
 <br>
 $input\_trainingdata, input\_testdata, output\_trainingdata, output\_testdata = split\_data\_function(dataset)$

In [0]:
## The data is split into two: Trainingdata and testdata for input and output (4 data objects in total)
## NOTE: 'data_selection' is the sub-dataset you create yourself in the "Create subdataset" step;
## it must contain the 'SalePrice' column, which split_data_function uses as the output
input_trainingdata, input_testdata, output_trainingdata, output_testdata = split_data_function(data_selection)


#Print functions to reveal the shape of the data
print('Input trainingdata (samples, features): {}, Input testdata (samples, features): {}\n'.format(input_trainingdata.shape, input_testdata.shape)) 
print('Output trainingsdata (samples, label): {}, Output testdata (samples, label): {}\n'.format(output_trainingdata.shape, output_testdata.shape))


To measure the error of our predictions, we use the mean absolute error. This is simply the average of the absolute differences between our predictions and the real saleprices over the whole test dataset.

In [0]:
from sklearn.metrics import mean_absolute_error

## Training a linear regression model

<img src="https://upload.wikimedia.org/wikipedia/commons/thumb/3/3a/Linear_regression.svg/438px-Linear_regression.svg.png">


This step shows how to build a machine learning model using the training data, and then test its performance using the test data.

To build a linear regression model, go to the <a href = "http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LinearRegression.html" > sklearn documentation on linear regression </a>. The documentation explains how the model is used in the python script. The sklearn models accept a range of parameters which you can define, or leave to its defaults.



In [0]:
### TRAINING PHASE ###

## The model we want to use is imported. The model name from the documentation is used

from sklearn.linear_model import LinearRegression

## We then create a model object, which is a linear regression model in this case.
## No parameters are passed, so the model's default settings are used

model_linear_regression = LinearRegression()


## Finally, we train the model by using its built-in function 'fit'. The input and output training data are used as parameters

model_linear_regression.fit(input_trainingdata, output_trainingdata)


## We now have a trained machine learning model that is able to predict output (saleprice) from input (features)

In [0]:
### TEST PHASE ###

## We make predictions on the input from the test data by using the built-in function 'predict'. The input testdata is used as parameter
saleprice_predictions_lin_reg = model_linear_regression.predict(input_testdata)


## Finally, we calculate the mean absolute error between output testdata (true saleprice on testdata) and output predictions (predicted saleprice on testdata).
## scikit-learn metrics take their arguments in the order (true values, predicted values)
MAE_lin_reg = mean_absolute_error(output_testdata, saleprice_predictions_lin_reg)


## The test error is printed
print("MAE: {}".format(MAE_lin_reg))

In [0]:
### VISUALIZATION PHASE ###
## 'output_testdata' is the true house price (plotted on the x-axis)
## 'saleprice_predictions_lin_reg' is the predicted house prices from the linear regression model (plotted on the y-axis)
prediction_result_plot(true_saleprice = output_testdata, predicted_saleprice = saleprice_predictions_lin_reg, figsize=(15, 10))

This is the coding pipeline for creating your own machine learning model. Fill in the text strings in quotes with your own names and model choice.

In [0]:
## The text strings need to be defined by you ##

#Training
from sklearn.'Model_Type' import 'Model'

'model_name' = 'Model'

'model_name'.fit(input_trainingdata, output_trainingdata)

#Testing
'predictions_name' = 'model_name'.predict(input_testdata)

'MAE_name' = mean_absolute_error('predictions_name', output_testdata)

print("MAE: {}".format('MAE_name'))

#Visualizing
prediction_result_plot(true_saleprice = output_testdata, predicted_saleprice = 'predictions_name', figsize=(15, 10))

### Model proposals:

<a href="http://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeRegressor.html">Decision tree regression</a>

<a href="https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.Lasso.html">Lasso regression</a>

<a href="https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingRegressor.html">Gradient boosting regression</a>



In [0]:
#### Import examples ####

# from sklearn.tree import DecisionTreeRegressor
# from sklearn.linear_model import Lasso
# from sklearn.ensemble import GradientBoostingRegressor


# Custom models

Create a model that beats the linear regression score

Training phase

Testing phase

Results visualization