In [None]:
from datascience import *
import numpy as np
import matplotlib
from mpl_toolkits.mplot3d import Axes3D

%matplotlib inline
import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight')

import warnings
warnings.simplefilter("ignore")

## Review

In [None]:
def standard_units(any_numbers):
    """Convert any array of numbers to standard units.

    Each value is re-expressed as its deviation from the array's mean,
    divided by the array's standard deviation (as computed by np.std).
    """
    center = np.mean(any_numbers)
    spread = np.std(any_numbers)
    return (any_numbers - center) / spread

def correlation(t, x, y):
    """Return the correlation coefficient r between columns x and y of table t.

    r is the mean of the element-wise products of the two columns after
    each has been converted to standard units.
    """
    x_standard = standard_units(t.column(x))
    y_standard = standard_units(t.column(y))
    return np.mean(x_standard * y_standard)

In [None]:
def distance(point1, point2):
    """Returns the straight-line (Euclidean) distance between two points.

    point1, point2: arrays holding the coordinates of each point.
    """
    gap = point1 - point2
    squared_total = np.sum(gap ** 2)
    return squared_total ** 0.5

def all_distances(training, new_row, y_name):
    """Returns an array of distances
    between each point in the training set
    and the new point (which is a row of attributes).

    training: table holding the attribute columns plus the column y_name
    new_row:  the attribute values of the new point, in the same order as
              the attribute columns of training
    y_name:   label of the response column; it is dropped before the
              distances are computed
    """
    attributes = training.drop(y_name)
    # Convert the new point once, outside the per-row function, rather than
    # rebuilding the same array for every training row.
    new_point = np.array(new_row)
    def distance_from_new(row):
        # np.array(row) is a flat array of the row's values; make_array(row)
        # would treat the whole row as one element and yield a 1 x n array.
        return distance(new_point, np.array(row))
    # With no column labels, apply calls the function once per row.
    return attributes.apply(distance_from_new)


def table_with_distances(training, new_point, y_name):
    """Returns a copy of the training table with one extra column,
    'Distance', holding each row's distance from new_point."""
    dists = all_distances(training, new_point, y_name)
    return training.with_column('Distance', dists)


def nearest(training, new_point, y_name, k):
    """Returns the k training rows closest to new_point.

    The result is the training table augmented with a 'Distance' column,
    sorted by that column in increasing order and cut to its first k rows."""
    augmented = table_with_distances(training, new_point, y_name)
    first_k = np.arange(k)
    return augmented.sort('Distance').take(first_k)


## New material

In [None]:
# Load the raw housing data; 'house.csv' is resolved relative to the
# notebook's working directory.
all_sales = Table.read_table('house.csv')

In [None]:
# Keep only rows whose 'Bldg Type' is '1Fam' and whose 'Sale Condition' is
# 'Normal', then keep the sale price (the response) and nine candidate
# predictor columns.
sales = all_sales.where('Bldg Type', '1Fam').where('Sale Condition', 'Normal').select(
    'SalePrice', '1st Flr SF', '2nd Flr SF', 
    'Total Bsmt SF', 'Garage Area', 
    'Wood Deck SF', 'Open Porch SF', 'Lot Area', 
    'Year Built', 'Yr Sold')

Exploration!

In [None]:
# Number of sales left after the two filters above.
sales.num_rows

In [None]:
# Peek at six randomly chosen rows (the rows differ on every run).
sales.sample(6).show(6)

In [None]:
# Histogram of the response variable, SalePrice, using 32 bins.
sales.hist('SalePrice', bins=32, unit='$')

In [None]:
# First-floor area on the horizontal axis against sale price on the vertical axis.
sales.scatter('1st Flr SF', 'SalePrice')

In [None]:
# r for the pair plotted above; correlation is symmetric, so the argument
# order of the two labels does not matter.
correlation(sales, 'SalePrice', '1st Flr SF')

- It's always a good idea to take a look at each of the variables in some capacity.
    -  To make the demo shorter we will just look at the $r$ for each variable, but you should look at the plots as well.

In [None]:
# Print r between every column and SalePrice. sales.labels includes
# 'SalePrice' itself, so one of the printed lines is its correlation with itself.
for label in sales.labels:
    print('Correlation of', label, 'and SalePrice:\t', correlation(sales, label, 'SalePrice'))

______

To keep the computation manageable, we will use only the five variables that have the highest correlation coefficients with `SalePrice`.

In [None]:
# Keep the response plus the five predictors used by the model below.
# Selecting by label rather than by column position says which columns
# are kept, and keeps this cell safe to re-run: every label still exists in
# the narrowed table, whereas positional index 8 would not.
sales = sales.select(
    'SalePrice', '1st Flr SF', '2nd Flr SF',
    'Total Bsmt SF', 'Garage Area', 'Year Built')
sales.show(3)

### How is an MLR estimate calculated?

This is a 50/50 train-test split.

In [None]:
# Put half of the rows (rounded) in train and the remaining rows in test.
# NOTE(review): the split appears to be random and no seed is set anywhere
# in this notebook, so train/test and every number computed from them
# presumably change between runs — confirm that this is intended.
train, test = sales.split(k = round(sales.num_rows/2))

**Task**: Find the best slopes ($a$ through $e$) and intercept ($f$) for the linear regression model:

$$\text{estimate of sale price} = a*\text{1st floor SF} + b*\text{2nd floor SF} + c*\text{Bsmt SF} + d*\text{Garage Area} + e*\text{Year Built} + f$$

We are going to find the best slopes and intercept by minimizing the RMSE of the model's estimates on the training set. Let's set up a function that computes that RMSE!

In [None]:
def sales_train_rmse(a,b,c,d,e,f):
    """Return the root mean squared error, on the training set, of the
    linear model with slopes a..e and intercept f.

    a, b, c, d, e multiply '1st Flr SF', '2nd Flr SF', 'Total Bsmt SF',
    'Garage Area' and 'Year Built' respectively; f is added to every estimate.
    Reads the notebook-level table `train`.
    """
    col = train.column

    predicted = (a * col('1st Flr SF')
                 + b * col('2nd Flr SF')
                 + c * col('Total Bsmt SF')
                 + d * col('Garage Area')
                 + e * col('Year Built')
                 + f)

    errors = col('SalePrice') - predicted
    return np.mean(errors ** 2) ** 0.5

You do not need to be concerned with how to supply the `start` argument in `minimize()`. We will do this for you. In this case, we generated the starting slopes at random from a normal distribution with mean 10 and standard deviation of 1.

In [None]:
# One random starting value per model parameter. len(train.row(0)) is the
# number of columns in train (SalePrice plus the five predictors), which
# equals the parameter count: five slopes plus the intercept f.
example_slopes = np.random.normal(10, 1, len(train.row(0)))

The `smooth` argument will always read `True` for our purposes!

In [None]:
# Search, starting from example_slopes, for the six arguments of
# sales_train_rmse that give the smallest training RMSE.
# presumably the cell output lists the best values in argument order
# (a, b, c, d, e, f) — verify against the minimize documentation.
minimize(sales_train_rmse, start=example_slopes, smooth=True)

In [None]:
# Training RMSE at the fitted parameters. minimize() returns the best
# arguments as an array, so they are unpacked straight into the function
# instead of being pasted in by hand. A comment placed inside the call's
# parentheses would hide the closing ')' and stop the cell from parsing.
sales_train_rmse(*minimize(sales_train_rmse, start=example_slopes, smooth=True))

**Task**: Find the fitted values (sale price predictions on the *training* set) for our model.

In [None]:
def sales_estimate(data, a,b,c,d,e,f):
    """Return the model's estimated sale price for every row of data.

    data: table with the columns '1st Flr SF', '2nd Flr SF',
          'Total Bsmt SF', 'Garage Area' and 'Year Built'
    a, b, c, d, e: slopes for those columns, in that order
    f: intercept added to every estimate
    """
    col = data.column

    return (a * col('1st Flr SF')
            + b * col('2nd Flr SF')
            + c * col('Total Bsmt SF')
            + d * col('Garage Area')
            + e * col('Year Built')
            + f)

In [None]:
# Fitted values: the model's estimates for the training rows, using the
# parameters that minimize the training RMSE. minimize() returns them as an
# array in argument order, so they are unpacked rather than pasted in by
# hand. This also guarantees fitted_values exists for the cells below.
fitted_values = sales_estimate(
    train, *minimize(sales_train_rmse, start=example_slopes, smooth=True))

### How can we evaluate our model fit?

In [None]:
# One row per training sale: the actual price, the model's fitted price,
# and the residual (actual minus fitted).
fit_table = Table().with_columns('Actual', train.column('SalePrice'),
                                'Fitted',  fitted_values,
                                'Residuals', (train.column('SalePrice') - fitted_values))

In [None]:
# Spot-check three randomly chosen rows of the fit table.
fit_table.sample(3).show()

In [None]:
# Residuals (vertical axis) against the actual sale price (horizontal axis).
# NOTE(review): residual plots are more commonly drawn against the fitted
# values — confirm that plotting against 'Actual' is intended.
fit_table.scatter('Actual', 'Residuals')

In [None]:
# Actual price against fitted price; points near the line y = x are sales
# the model estimated well.
fit_table.scatter('Fitted','Actual')

### How do we evaluate the accuracy of an MLR model?

**Task**: Calculate the RMSE between:

- the predicted house prices of the testing data
- the actual house prices of the testing data

In [None]:
def sales_test_rmse(a,b,c,d,e,f):
    """Return the root mean squared error, on the testing set, of the
    linear model with slopes a..e and intercept f.

    The arguments have the same meaning and order as in the training
    version: a..e multiply '1st Flr SF', '2nd Flr SF', 'Total Bsmt SF',
    'Garage Area' and 'Year Built'; f is the intercept.
    Reads the notebook-level table `test`.
    """
    actual = test.column('SalePrice')

    floor_terms = a * test.column('1st Flr SF') + b * test.column('2nd Flr SF')
    predicted = (floor_terms
                 + c * test.column('Total Bsmt SF')
                 + d * test.column('Garage Area')
                 + e * test.column('Year Built')
                 + f)

    squared_errors = (actual - predicted) ** 2
    return np.mean(squared_errors) ** 0.5

In [None]:
# Testing RMSE of the model whose parameters were fitted on the TRAINING
# set: minimize() is run on sales_train_rmse, and the resulting array of
# parameters is unpacked into sales_test_rmse. A comment placed inside the
# call's parentheses would hide the closing ')' and stop the cell from parsing.
sales_test_rmse(*minimize(sales_train_rmse, start=example_slopes, smooth=True))

### We can bring nearest neighbors to linear regression, too!

In [None]:
# Attributes of the first testing row. drop(0) removes the column at
# index 0, which is 'SalePrice' given the column order selected earlier.
example_row = test.drop(0).row(0)

In [None]:
# The five training sales whose attributes are closest to example_row.
nearest(train, example_row, 'SalePrice', 5)

In [None]:
def nn_5_regression_estimate(new_point):
    """Estimate the sale price of new_point as the average 'SalePrice'
    of its 5 nearest neighbors in the notebook-level table `train`.

    new_point: a row of attributes (no sale price), in the same order as
    the attribute columns of train.
    """
    neighbors = nearest(train, new_point, 'SalePrice', 5)
    neighbor_prices = neighbors.column('SalePrice')
    return np.average(neighbor_prices)

In [None]:
# 5-nearest-neighbor price estimate for the example testing row.
nn_5_regression_estimate(example_row)

Perform a 5-nearest neighbors regression on all testing points.

In [None]:
# Apply the 5-NN estimate to every testing row's attributes (SalePrice is
# dropped first). Each call measures the distance to every training row and
# sorts them, so this cell is much slower than the regression cells above.
predictions = test.drop('SalePrice').apply(nn_5_regression_estimate)

Calculate the $\text{RMSE}$ for our model!

In [None]:
# Testing RMSE of the 5-NN estimates: root of the mean squared difference
# between actual and predicted sale prices.
(np.mean((test.column('SalePrice') - predictions) ** 2)) ** 0.5