In [1]:
from sklearn.linear_model import LinearRegression as lr
from sklearn.metrics import mean_squared_error as mse

import pandas as pd
import numpy as np
import boston_valuation as val

In [2]:
data_url = "http://lib.stat.cmu.edu/datasets/boston"
raw_df = pd.read_csv(data_url, sep="\s+", skiprows=22, header=None)

data = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
target = raw_df.values[1::2, 2]
df_data = pd.DataFrame(data, columns=['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 
                                      'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO','B','LSTAT'])
features = df_data.drop(['INDUS', 'AGE'], axis=1)

log_prices = np.log(target)
target = pd.DataFrame(log_prices, columns=['PRICE'])


In [3]:
CRIM_IDX = 0
ZN_IDX = 1
CHAS_IDX = 2
RM_IDX = 4
PTRATIO_IDX = 8

property_stats = np.ndarray(shape=(1,11))
property_stats[0][CRIM_IDX] = features['CRIM'].mean()
property_stats[0][ZN_IDX] = features['ZN'].mean()
property_stats[0][CHAS_IDX] = features['CHAS'].mean()

property_stats

array([[3.61352356e+000, 1.13636364e+001, 6.91699605e-002,
        1.97626258e-323, 2.47032823e-323, 3.45845952e-323,
        3.95252517e-323, 4.44659081e-323, 4.94065646e-323,
        5.43472210e-323, 5.92878775e-323]])

In [4]:
type(features.mean())

pandas.core.series.Series

In [5]:
type(features.mean().values)

numpy.ndarray

In [6]:
features.mean().values.shape

(11,)

In [7]:
# Musíme reshape na pole 1,11

features.mean().values.reshape(1,11)

array([[3.61352356e+00, 1.13636364e+01, 6.91699605e-02, 5.54695059e-01,
        6.28463439e+00, 3.79504269e+00, 9.54940711e+00, 4.08237154e+02,
        1.84555336e+01, 3.56674032e+02, 1.26530632e+01]])

In [8]:
property_stats = features.mean().values.reshape(1,11)

In [9]:
regr = lr().fit(features, target)
fitted_vals = regr.predict(features)

# Challenge: Calculate the MSE and RMSE using sklearn
MSE = mse(target,fitted_vals)

RMSE = np.sqrt(MSE)


In [10]:
# Funkcia na odhad log ceny domu

def get_log_estimate(nr_rooms,
                    students_per_classroom,
                    next_to_river=False,
                    high_confidence=True):
    # Configure property
    property_stats[0][RM_IDX] = nr_rooms
    property_stats[0][PTRATIO_IDX] = students_per_classroom
    
    if next_to_river:
        property_stats[0][CHAS_IDX] = 1
    else:
        property_stats[0][CHAS_IDX] = 0
    
    print('Property Stats: ', property_stats[0])
    print('Property Stats: ', property_stats.shape)
    # Make prediction
    log_estimate = regr.predict(property_stats)[0][0]
    print('Log estimate: ', log_estimate)
    
    # Calc Range
    if high_confidence:
        # Do X (95%)
        upper_bound = log_estimate + 2*RMSE
        lower_bound = log_estimate - 2*RMSE
        interval = 95
    else:
        # Do Y (65%)
        upper_bound = log_estimate + RMSE
        lower_bound = log_estimate - RMSE
        interval = 68
        
    return log_estimate, upper_bound, lower_bound, interval

In [11]:
get_log_estimate(5, 5, next_to_river=True, high_confidence=False)

Property Stats:  [  3.61352356  11.36363636   1.           0.55469506   5.
   3.79504269   9.54940711 408.23715415   5.         356.67403162
  12.65306324]
Property Stats:  (1, 11)
Log estimate:  3.5194923023383096




(3.5194923023383096, 3.70700443753544, 3.3319801671411793, 68)

In [12]:
np.e**np.median(target) # lebo target je v logaritmických cenách

21.2

In [14]:
# Challenge : Write the python code that converts the log price extimate using 1970s prices
# as well as upper and the lower bounds to todays´ prices? Round the values to the nearest 1000 dollars

zillow_median_price = 583.3
scale_factor = zillow_median_price / np.e**np.median(target)

log_est, upper, lower, conf = get_log_estimate(nr_rooms=5, students_per_classroom=15,
                                            next_to_river=False, high_confidence=False)

# convert to todays dollars
dollar_est = np.e**log_est * 1000 * scale_factor
dollar_hi = np.e**upper * 1000 * scale_factor
dollar_lo = np.e**lower * 1000 * scale_factor

# Round the dollar values to nearest thousand
rounded_est = np.around(dollar_est, -3)
rounded_hi = np.around(dollar_hi, -3)
rounded_lo = np.around(dollar_lo, -3)

print(f'The estimated property value is {rounded_est}')
print(f'AT {conf}% confidence the valuation range is')
print(f'USD {rounded_lo} at the lower end to USD {rounded_hi} at the high end.')

Property Stats:  [  3.61352356  11.36363636   0.           0.55469506   5.
   3.79504269   9.54940711 408.23715415  15.         356.67403162
  12.65306324]
Property Stats:  (1, 11)
Log estimate:  3.04008483550242
The estimated property value is 575000.0
AT 68% confidence the valuation range is
USD 477000.0 at the lower end to USD 694000.0 at the high end.




In [15]:
def get_dollar_estimate(rm,
                    ptratio,
                    chas=False,
                    large_range=True):
    
    """
    Estimate the price of the property in Boston.
    rm -- number of rooms in the property.
    ptratio -- number of students per teacher in the classroom for the school in the area.
    chas --True if the property is next to river, False otherwise.
    large_range -- True for 95 % prediction interval, False for a 68 % interval.
    
    """
    if rm < 1 or ptratio < 1:
        print('That is unrealistic. Try again')
        return
        
        
    log_est, upper, lower, conf = get_log_estimate(rm,
                                                    students_per_classroom=ptratio,
                                                    next_to_river=chas,
                                                    high_confidence=large_range)
    # convert to todays dollars
    dollar_est = np.e**log_est * 1000 * scale_factor
    dollar_hi = np.e**upper * 1000 * scale_factor
    dollar_lo = np.e**lower * 1000 * scale_factor

    # Round the dollar values to nearest thousand
    rounded_est = np.around(dollar_est, -3)
    rounded_hi = np.around(dollar_hi, -3)
    rounded_lo = np.around(dollar_lo, -3)

    print(f'The estimated property value is {rounded_est}')
    print(f'AT {conf}% confidence the valuation range is')
    print(f'USD {rounded_lo} at the lower end to USD {rounded_hi} at the high end.')

In [16]:
get_dollar_estimate(rm=2, ptratio=30, chas=True)

Property Stats:  [  3.61352356  11.36363636   1.           0.55469506   2.
   3.79504269   9.54940711 408.23715415  30.         356.67403162
  12.65306324]
Property Stats:  (1, 11)
Log estimate:  2.3118263814453304
The estimated property value is 278000.0
AT 95% confidence the valuation range is
USD 191000.0 at the lower end to USD 404000.0 at the high end.




In [17]:
val.get_dollar_estimate(6,12,True)

The estimated property value is 783000.0
AT 95% confidence the valuation range is
USD 538000.0 at the lower end to USD 1139000.0 at the high end.


