# HACKATHON - Goal: Create a Market Model for the price of a property in Amsterdam

Read the instructions and try to create a sharp model within your team.

In [None]:
import pandas as pd
import numpy as np

In [None]:
try:
    import folium
except:
    !pip install folium

# (!) Action Required - Importing the data

![as](https://i.imgur.com/vo4XluQ.png)

- Activate the empty cell below (cursor should blink in the empty cell below).
- Click the data symbol on the right
- Find your data set > Insert to code > Insert pandas DataFrame

# (!) Action Required - Assign the freshly imported data frame to houses_raw

Most likely DSX has imported the data as `df_data_5` or similar.
It is good practice to rename the data in the next cell, and continue from there.

* Store `df_data_X` (where X is a number) in `houses_raw` so that the next cells work, independently of the name that was chosen during import.

In [None]:
# Bind the frame created by the data-import cell to a stable name, so the
# rest of the notebook does not depend on the auto-generated identifier.
# NOTE(review): `df_data_4` must match the name your own import produced —
# adjust the number if it differs.
houses_raw = df_data_4
# Preview the first rows as a quick check of the imported data.
houses_raw.head()

# Filter Data - As the data comes from the internet, it might contain faulty rows

In [None]:
# Narrow the raw listings down in three steps; each intermediate frame keeps
# its own name so it can be inspected separately.
# 1) keep rows whose price is above 50000
filterd_on_price = houses_raw.loc[houses_raw["price"].gt(50000)]
# 2) drop rows whose type is "garage"
filtered_on_type = filterd_on_price.loc[filterd_on_price["type"].ne("garage")]
# 3) keep rows whose area is above 10
filtered_on_area = filtered_on_type.loc[filtered_on_type["area"].gt(10)]

# `houses` is the cleaned frame all following cells work with.
houses = filtered_on_area
houses.head()

In [None]:
import folium
from folium.plugins import HeatMap

# folium's HeatMap raises on NaN coordinates, so only listings with a known
# position are plotted.
located_houses = houses[["lat", "lng"]].dropna()
lat_lng_list = list(zip(located_houses["lat"], located_houses["lng"]))
# Every listing contributes the same weight (0.3) to the heat map.
house_density = [(lat, lng, 0.3) for (lat, lng) in lat_lng_list]
centre_amsterdam = (52.372842, 4.893643)

# The Stamen tile sets ("stamentoner") are no longer bundled with current
# folium releases; "cartodbpositron" is a light built-in base map that is
# available in old and new folium versions alike.
map_with_houses = folium.Map(location=centre_amsterdam, tiles="cartodbpositron", zoom_start=11)

HeatMap(house_density).add_to(map_with_houses)
# Last expression of the cell: renders the interactive map.
map_with_houses

In [None]:
from geopy.distance import great_circle

def distance_to_Dam_Square(lat, lng):
    """Return the great-circle distance, in kilometres, from a point to Dam Square.

    Args:
        lat: Latitude of the point.
        lng: Longitude of the point.

    Returns:
        The `.km` value of geopy's `great_circle` between `(lat, lng)` and the
        fixed Dam Square coordinates.
    """
    dam_square = (52.372842, 4.893643)
    origin = (lat, lng)
    return great_circle(origin, dam_square).km

# Two reference points, each a (latitude, longitude) pair, used to try out
# the distance helper.
anna_frank = (52.375239, 4.883885)
willems_huis = (51.957744, 4.553619)

print(
    "The distance (KM) from The Anna Frank house, to the Dam Square is ",
    distance_to_Dam_Square(lat=anna_frank[0], lng=anna_frank[1]),
)
print(
    "The distance (KM) from my house, to the Dam Square is ",
    distance_to_Dam_Square(lat=willems_huis[0], lng=willems_huis[1]),
)

In [None]:
import math

# Distance of every listing to Dam Square, aligned on the index of `houses`.
# Walking the two coordinate columns avoids materialising one Series per row
# (which `apply(axis=1)` does) and still produces a Series when `houses` has
# no rows.
distance_series = pd.Series(
    [distance_to_Dam_Square(lat, lng) for lat, lng in zip(houses["lat"], houses["lng"])],
    index=houses.index,
    dtype="float64",
)
# Single-column frame so it can be joined onto `houses`.
distance_df = distance_series.to_frame(name="distance_to_dam")

In [None]:
# Attach the distance column to the listings; both frames share the same
# index, so an index-based left join lines the rows up.
houses_with_distance = houses.join(other=distance_df, how="left")
houses_with_distance.head()

In [None]:
# Volume divided by (area + 1), computed column-wise instead of with a Python
# callback per row. This is faster and also yields a Series when the frame has
# no rows. The "+ 1" in the denominator is kept from the original feature
# definition.
volumne_per_square = houses_with_distance["volume"] / (houses_with_distance["area"] + 1)
# Single-column frame so it can be joined onto the listings.
volumne_per_square_df = volumne_per_square.to_frame(name="volume_per_square")
volumne_per_square_df.head()

In [None]:
# Add the volume-per-square column to the listings via an index-based left join.
houses_feature_engineered = houses_with_distance.join(other=volumne_per_square_df, how="left")
houses_feature_engineered.head()

In [None]:
def house_category( house_type ):
    """Encode a property type as a small integer category.

    Args:
        house_type: Either a type label (compared after conversion with
            ``str``) or an already-encoded integer.

    Returns:
        ``house_type`` unchanged when it is an integer (built-in ``int`` or a
        NumPy integer such as ``np.int64``), so that encoding an already
        encoded value is a no-op. Otherwise 1 for "woonhuis", 2 for
        "appartement", 3 for "parkeergelegenheid" and 0 for anything else.
    """
    # NumPy integers are not instances of `int` on Python 3; without the
    # extra check they would be stringified and silently mapped to 0.
    if isinstance( house_type, (int, np.integer) ):
        return house_type

    categories = {
        "woonhuis": 1,
        "appartement": 2,
        "parkeergelegenheid": 3,
    }
    # Unknown labels (including missing values) fall back to category 0.
    return categories.get( str(house_type), 0 )

In [None]:
# Replace the textual "type" column by its integer category, element by element.
houses_feature_engineered["type"] = houses_feature_engineered["type"].map( house_category )
houses_feature_engineered.head()

## Creation of Test/Train set - Using a seed to make sure nobody has an advantage by randomness

In [None]:
# Fixed seed, so the random draw below is the same on every run.
np.random.seed(2018)
# One uniform draw in [0, 1) per row; rows with a draw of at most 0.85 are
# flagged for training.
train_mask = np.random.uniform(0.0, 1.0, len(houses_feature_engineered)) <= 0.85

# Flagged rows form the train set, all remaining rows the test set.
train = houses_feature_engineered.loc[train_mask]
test = houses_feature_engineered.loc[~train_mask]

print("There are {} rows in the train set, There are {} rows in the test set".format(len(train), len(test)))

## Example - A Classic Linear Regression from a Text-Book, price based on square-m

In [None]:
from sklearn.linear_model import LinearRegression


# Create the linear regression object. fit_intercept=True lets the model
# estimate a constant term in addition to the slope.
regr = LinearRegression(fit_intercept=True)

In [None]:
# Single-column frames (not Series): the one feature and the target.
features = train.loc[:, ["area"]]
target = train.loc[:, ["price"]]

# Fit price as a linear function of area on the train set.
regr.fit(X=features, y=target)

In [None]:
%matplotlib inline

import matplotlib
import numpy as np
import matplotlib.pyplot as plt

# Draw on the current axes: observed train points in black, fitted line in blue.
axes = plt.gca()
axes.scatter(features.values, target.values, color='black')
axes.plot(features, regr.predict(features), color='blue', linewidth=1)
axes.set_xlabel("Living Area in Square Meter")
axes.set_ylabel(" Price in Euros")
# Plain tick labels on the y axis instead of scientific notation.
axes.ticklabel_format(style='plain', axis='y')
plt.show()

In [None]:
"""
Something is 'Quite well Predicted' if the prediction error is 10% or less. 

"""


def percentage_quite_well_predicted( actual_values, predicted_values, threshold=0.1 ):
    """
        Summary:
            Calculates a performance metric that is understandable for
            non-analysts: the proportion of predictions whose relative error
            is within the error margin (10% by default, threshold = 0.1).

        Input:
            - actual_values: A list or numpy array with the actual prices
            - predicted_values: A list or numpy array with the predictions
            - threshold: Largest relative error (|actual - predicted| / |actual|)
              that still counts as well predicted. Defaults to 0.1.

        Output:
            A single float, indicating the proportion of well predicted values.
            NaN is returned when there is nothing to evaluate.

            Example:
            0.5 - 50% of predictions are well predicted
            0.9 - 90% of predictions are well predicted

        Notes:
            - The error is taken relative to the magnitude of the actual value,
              so a negative actual value can no longer turn a large error into
              a "hit".
            - An exact prediction always counts as well predicted, also when
              the actual value is 0. Any other prediction for an actual value
              of 0 counts as a miss.
    """
    actual_values = np.asarray( actual_values, dtype=float ).ravel()
    predicted_values = np.asarray( predicted_values, dtype=float ).ravel()

    absolute_error = np.absolute( actual_values - predicted_values )
    # Division by an actual value of 0 is expected here; it is resolved right
    # below, so the floating point warnings are silenced locally.
    with np.errstate( divide="ignore", invalid="ignore" ):
        relative_error = absolute_error / np.absolute( actual_values )
    # 0/0 would be NaN; an exact prediction has no error by definition.
    relative_error = np.where( absolute_error == 0, 0.0, relative_error )

    if relative_error.size == 0:
        # Keep the "not a number" outcome for empty input, without the
        # RuntimeWarning numpy emits for the mean of an empty array.
        return float("nan")

    proportion_within_threshold = np.mean( relative_error <= threshold )
    return proportion_within_threshold


# Example call: five actual values of 100 against five predictions.
percentage_quite_well_predicted(
    actual_values=[100, 100, 100, 100, 100],
    predicted_values=[104, 104.4, 105.5, 79, 198],
)

In [None]:
# Same single feature and target as for training, now taken from the test set.
test_features = test.loc[:, ["area"]]
test_target = test.loc[:, ["price"]]

preds = regr.predict(test_features)

# The metric takes flat sequences: use the price column as a list and the
# first column of the prediction array.
probability_correct = percentage_quite_well_predicted(
    actual_values=test_target["price"].tolist(),
    predicted_values=preds[:, 0].tolist(),
)
"The model was able to predict {percentage:4.2f}% of the Houses quite well".format(percentage=(100* probability_correct) )

# Goal - Create a (better) model, Trained on the "train" set

Use the `test` pandas dataframe, and the `percentage_quite_well_predicted()` metric, to calculate the performance of your team


To give you a (poor) starting point, XGBoost has been set up to work on the data. The parameters are not chosen well.
Random Forests and Nearest Neighbors have proven to be powerful models on this data set.

In [None]:
import xgboost

In [None]:
# Show the column names of the engineered frame, i.e. the candidates for model features.
houses_feature_engineered.columns

In [None]:
# Feature matrix: the ten model inputs, taken from the train set.
X = train.loc[:, [
    'area',
    'volume',
    'floors',
    'year_build',
    'has_garden',
    'type',
    'photos',
    'rooms',
    'distance_to_dam',
    'volume_per_square',
]]
# Target, kept as a single-column frame.
y = train.loc[:, ["price"]]

In [None]:
# `sklearn.cross_validation` was removed from scikit-learn; the function now
# lives in `sklearn.model_selection`. The old location is kept as a fallback
# for installations that predate the new module.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

# Hold out 35% of the train set. A fixed random_state makes the split the
# same every time this cell is run, not only on the first run after the seed
# cell.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.35, random_state=2018)

In [None]:
from xgboost.sklearn import XGBClassifier, XGBRegressor

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# `sklearn.cross_validation` was removed from scikit-learn; prefer
# `sklearn.model_selection` and fall back to the old module only on
# installations that predate it.
try:
    from sklearn.model_selection import cross_val_score
except ImportError:
    from sklearn.cross_validation import cross_val_score
from sklearn.metrics import accuracy_score, mean_absolute_error

from xgboost import plot_importance
from matplotlib import pyplot

import pprint

In [None]:
# Two-step pipeline: standardise the features, then fit a gradient boosted
# regressor with 10 trees and a learning rate of 0.9.
pipeline = Pipeline(
    steps=[
        ('scaler', StandardScaler()),
        ('regressor', XGBRegressor(n_estimators=10, learning_rate=0.9)),
    ]
)
pipeline

In [None]:
# Fit scaler and regressor on the plain numpy arrays behind the frames.
pipeline.fit(X=X_train.values, y=y_train.values)

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt

labels_of_columns = ['area', 'volume', 'floors', 'year_build', 'has_garden', 'type', 'photos', 'rooms', 'distance_to_dam', 'volume_per_square']

# Newer xgboost releases expose the fitted booster through `get_booster()`;
# `booster()` only exists on old releases. Support both.
regressor_of_fit = pipeline.named_steps["regressor"]
booster_getter = getattr(regressor_of_fit, "get_booster", None)
xgbooster_of_fit = booster_getter() if callable(booster_getter) else regressor_of_fit.booster()
feature_scores = xgbooster_of_fit.get_fscore()

# The score keys look like "f<i>" (the code strips the first character and
# reads the rest as a column position), so translate them back to names.
# Building the two tuples explicitly also works when no feature got a score,
# where unpacking `zip(*[])` would fail.
importance_pairs = [ (labels_of_columns[int(k[1:])], imp) for (k, imp) in feature_scores.items() ]
labels = tuple(label for (label, _) in importance_pairs)
feat_importance = tuple(imp for (_, imp) in importance_pairs)

fig, ax = plt.subplots()

y_ticks = np.arange(len(feat_importance))
width = .8

ax.barh(y_ticks, feat_importance, height=width, color="blue")
# Bars are centred on `y_ticks`, so the ticks go to the same positions; the
# labels are attached after the ticks exist, otherwise they end up on the
# wrong bars.
ax.set_yticks(y_ticks)
ax.set_yticklabels(labels)
ax.set_xlabel("F score")
plt.show()

In [None]:
# The pipeline was fitted on plain numpy arrays, so predict on arrays as well
# (same columns, same order as used for training).
predictions_on_test = pipeline.predict(test[['area', 'volume', 'floors', 'year_build', 'has_garden', 'type', 'photos', 'rooms', 'distance_to_dam', 'volume_per_square']].values)
true_values_test = test[["price"]].values


# The metric measures the error relative to its first argument, so the actual
# prices go first and the predictions second (they were swapped before, which
# measured the error relative to the predictions).
probability_correct = percentage_quite_well_predicted(actual_values=true_values_test, predicted_values=predictions_on_test)
"The model was able to predict {percentage:4.2f}% of the Houses quite well".format(percentage=(100* probability_correct) )