In [188]:
from sklearn.datasets import load_boston
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

import pandas as pd
import numpy as np
import urllib

In [189]:
# Load the Boston data set and assemble the modelling inputs
boston_dataset = load_boston()
data = pd.DataFrame(boston_dataset.data, columns=boston_dataset.feature_names)

# INDUS and AGE are left out of the feature matrix
features = data.drop(columns=['INDUS', 'AGE'])

# The regression target is the natural log of the price
log_prices = np.log(boston_dataset.target)
target = pd.DataFrame({'PRICE': log_prices})


    The Boston housing prices dataset has an ethical problem. You can refer to
    the documentation of this function for further details.

    The scikit-learn maintainers therefore strongly discourage the use of this
    dataset unless the purpose of the code is to study and educate about
    ethical issues in data science and machine learning.

    In this special case, you can fetch the dataset from the original
    source::

        import pandas as pd
        import numpy as np


        data_url = "http://lib.stat.cmu.edu/datasets/boston"
        raw_df = pd.read_csv(data_url, sep="\s+", skiprows=22, header=None)
        data = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
        target = raw_df.values[1::2, 2]

    Alternative datasets include the California housing dataset (i.e.
    :func:`~sklearn.datasets.fetch_california_housing`) and the Ames housing
    dataset. You can load the datasets as follows::

        from sklearn.datasets import fetch_california_h

In [190]:
# Named positions used to address individual entries of a feature row.
# The names are listed in ascending index order and unpacked from 0..10.
(
    CRIM_IDX,
    ZN_IDX,
    CHAS_IDX,
    NOX_IDX,
    RM_IDX,
    DIS_IDX,
    RAD_IDX,
    TAX_IDX,
    PTRATIO_IDX,
    B_IDX,
    LSTAT_IDX,
) = range(11)

Since we want to predict the value of a property, we need something that looks like one more row of data, structured the same way as our features, to hold the property we are valuing. Let's create a variable for that: `property_stats`. For the first property we evaluate, we'll simply use Boston's average for every feature. `mean()` returns a pandas Series, so we take its `.values` (a 1D array) and reshape it to 2D so that it has the same shape as a single row of `features`.

In [191]:
# Template row holding the mean of every feature. Passing -1 lets NumPy infer
# the column count, so the row stays valid if the feature set changes.
property_stats = features.mean().values.reshape(1, -1)

OK, now we have a template of what we want our predictions to look like. We have populated that template with the mean of each feature. Now let's use scikit-learn to get the estimated theta values, MSE, and RMSE for our model.

In [192]:
# Fit the linear model on all features and predict on the same (training) data.
regr = LinearRegression().fit(features, target)
fitted_vals = regr.predict(features)

# RMSE is taken as the square root of MSE instead of going through the
# `squared=False` flag, which newer scikit-learn releases deprecate and remove.
MSE = mean_squared_error(target, fitted_vals)
RMSE = np.sqrt(MSE)
print('MSE =', MSE, 'RMSE =', RMSE)

MSE = 0.03516080084618688 RMSE = 0.18751213519713034


In [193]:
def get_log_estimate(nr_rooms, students_per_classroom, by_river=False, high_confidence=True):
    """Predict the log price of a property together with a prediction interval.

    A copy of the ``property_stats`` template row is taken, its RM, PTRATIO and
    CHAS entries are overridden with the arguments, and the row is passed to the
    fitted ``regr`` model. The template itself is left untouched.

    Keyword arguments:
    nr_rooms -- value written to the RM entry of the row.
    students_per_classroom -- value written to the PTRATIO entry of the row.
    by_river -- if truthy the CHAS entry is set to 1, otherwise to 0.
    high_confidence -- True for the 95% interval (estimate +/- 2*RMSE),
                       False for the 68% interval (estimate +/- RMSE).

    Returns:
    A tuple (log_estimate, upper_bound, lower_bound, interval), where interval
    is 95 or 68.
    """
    # Configure the property on a private copy so repeated calls (and other
    # cells) always start from the unmodified template.
    stats = property_stats.copy()
    stats[0][RM_IDX] = nr_rooms
    stats[0][PTRATIO_IDX] = students_per_classroom
    stats[0][CHAS_IDX] = 1 if by_river else 0

    # Make prediction: predict() yields a 2D result, [0][0] picks the single value
    log_estimate = regr.predict(stats)[0][0]

    # Calc range: two RMSEs either side for the 95% interval, one for the 68% one
    if high_confidence:
        nr_of_rmse, interval = 2, 95
    else:
        nr_of_rmse, interval = 1, 68

    upper_bound = log_estimate + nr_of_rmse * RMSE
    lower_bound = log_estimate - nr_of_rmse * RMSE

    return log_estimate, upper_bound, lower_bound, interval

**Challenge:** Write the code that converts the log price estimate, as well as its upper and lower bounds, from 1970s prices to today's prices. Round to the nearest $1,000.

In [194]:
# Account for inflation by comparing today's median price (taken from Zillow)
# with the median price in the dataset.
# NOTE(review): 583.3 is presumably expressed in thousands of dollars, the same
# unit the price conversion assumes for the dataset's target -- TODO confirm.
ZILLOW_MEDIAN_PRICE_TODAY = 583.3
SEVENTIES_MEDIAN_PRICE = np.median(boston_dataset.target)
# Multiplier that takes a dataset-era price to today's price level
scale_factor = ZILLOW_MEDIAN_PRICE_TODAY / SEVENTIES_MEDIAN_PRICE
print('Scale factor =', scale_factor)

Scale factor = 27.514150943396224


In [195]:
def get_dollar_estimate(rm, ptratio, chas=False, large_interval=True):

    """Estimate the price of a property in Boston.

    Prints the estimated price and the bounds of its prediction interval,
    scaled to today's prices with ``scale_factor`` and rounded to the nearest
    1000 dollars. Nothing is returned.

    Keyword arguments:
    rm -- number of rooms in the property.
    ptratio -- number of students per teacher in the classroom for the school in the area.
    chas -- True if the property is next to the river, False otherwise.
    large_interval -- True for a 95% prediction interval, False for a 68% interval.

    Unrealistic inputs (rejected with a printed message):
    rm < 1
    ptratio < 1 or ptratio > 50
    """

    # Reject unrealistic values
    if rm < 1 or ptratio < 1 or ptratio > 50:
        print('This is unrealistic. Try again.')
        return

    log_est, upper, lower, conf = get_log_estimate(nr_rooms=rm,
                                                   students_per_classroom=ptratio,
                                                   by_river=chas,
                                                   high_confidence=large_interval)

    def to_current_usd(log_value):
        # Undo the log, multiply by 1000 (prices are presumably modelled in
        # thousands of dollars), scale to today's level, round to nearest 1000.
        return round(np.exp(log_value) * 1000 * scale_factor, -3)

    # Each figure is converted from its own log value: the estimate, the upper
    # bound and the lower bound.
    current_price_USD = to_current_usd(log_est)
    current_upper_USD = to_current_usd(upper)
    current_lower_USD = to_current_usd(lower)

    print('Current estimated price in $:', current_price_USD)
    print('Current upper bound in $:', current_upper_USD)
    print('Current lower bound in $:', current_lower_USD)
    print('Prediction\'s confidence:', conf, '%')

In [196]:
# Unrealistic input (ptratio > 50): the function is expected to print its rejection message.
get_dollar_estimate(rm=2, ptratio=51, chas=True)

This is unrealistic. Try again.


We have now created a module from this code (called boston_valuation.py, stored in this same folder). We can now use it from any of our notebooks in this folder by importing the module and calling the function:

In [197]:
# Same kind of estimate, this time through the boston_valuation module.
# NOTE(review): the output recorded for this cell is
# "NameError: name 'boston_dataset' is not defined" -- presumably the module
# uses that name without defining it; confirm and fix in boston_valuation.py.
import boston_valuation as val
val.get_dollar_estimate(6, 12, True)

NameError: name 'boston_dataset' is not defined