# House Price Predictions with Linear Regression

In [None]:
%run Coding_linear_regression.ipynb 
# allows us to use the functions we wrote

import pandas

### Exploring the relationship between price and area

In [None]:
# Read the Hyderabad CSV into a pandas DataFrame, then scatter price against
# area to see how the two relate before fitting anything.
data = pandas.read_csv('Hyderabad.csv')
plot_scatter(data['Area'], data['Price'], "Housing Area", "Housing Price")
plt.show()

## With turicreate

### Testing a model with only one feature

In [None]:
import turicreate as tc

# Load the same CSV into a Turi Create SFrame. It gets its own name so the
# pandas DataFrame `data` stays available to the cells that use it.
data_tc = tc.SFrame('Hyderabad.csv')
# Preview the SFrame that was just loaded (not the pandas `data`).
data_tc

In [None]:
# Fit Price ~ Area with Turi Create's linear regression.
simple_model = tc.linear_regression.create(data_tc, features=['Area'], target='Price')
# A bare expression in the middle of a cell is evaluated and discarded, so the
# coefficient table has to be printed explicitly to show up in the output.
print(simple_model.coefficients)
# NOTE(review): this unpacking assumes the 'value' column holds exactly two
# entries, intercept first and the Area coefficient second; confirm against the
# table printed above.
b, m = simple_model.coefficients['value']
print("slope:", m)
print("y-intercept:", b)

# Same axis labels as the pandas scatter plot so the two figures are comparable.
plot_scatter(data_tc['Area'], data_tc['Price'], "Housing Area", "Housing Price")
draw_line(m, b, starting=0, ending=max(data_tc['Area']))
plt.show()

## With statsmodels

### Testing a model with only one feature

statsmodels doesn't automatically add an intercept (constant bias) column, so we have to add that ourselves.

In [None]:
import statsmodels.api as sm

# OLS fits only the columns it is given, so the intercept column is added to
# the single regressor (Area) before the model is built.
exog = sm.add_constant(data['Area'])
model_linear_regression = sm.OLS(endog=data['Price'], exog=exog)
results_regression = model_linear_regression.fit()

In [None]:
# Fit summary for the one-feature (Area) model.
results_regression.summary()

In [None]:
# Fitted coefficients of the one-feature model: the added intercept column and Area.
results_regression.params

In [None]:
# Overlay the statsmodels fit on the raw scatter. draw_line is given the slope
# first and the intercept second, looked up by name rather than by position.
plot_scatter(data['Area'], data['Price'], "Housing Area", "Housing Price")
draw_line(
    results_regression.params['Area'],   # slope
    results_regression.params['const'],  # y-intercept
    starting=0,
    ending=max(data['Area']))
plt.show()

In [None]:
# Residuals against fitted values for the one-feature model.
plot_scatter(
    results_regression.fittedvalues,
    results_regression.resid,
    x_label = "Fitted Values",
    y_label = "Residual Values")
plt.show()

# sm.qqplot returns the Figure it draws on. When that call is the last
# expression of the cell, the notebook displays the returned Figure and the
# inline backend also renders the still-open figure, so the Q-Q plot appears
# twice. Ending the cell on plt.show() (which returns None) renders it once.
sm.qqplot(results_regression.resid_pearson, line = "q")
plt.show()

### Building a model that uses all the features

`sm.OLS` doesn't encode categorical columns for us, so we need to one-hot encode them ourselves with `pandas.get_dummies()`.

In [None]:
# Build the design matrix from a copy so the raw `data` frame is not modified.
exog = data.copy()
exog = sm.add_constant(exog) # adds an intercept column
# Convert categorical columns to one-hot columns. dtype=float is explicit
# because pandas 2.x emits boolean dummies by default; mixed with the numeric
# columns they turn the design matrix into an object-dtype array, which
# statsmodels refuses to fit.
exog = pandas.get_dummies(exog, dtype=float)
# Pull the target out of the feature frame; everything left is a regressor.
endog = exog.pop("Price")

model_linear_regression = sm.OLS(
    endog = endog,
    exog = exog)
results_regression = model_linear_regression.fit()

In [None]:
# Fit summary for the all-features model.
results_regression.summary() # yields a very large printout

In [None]:
# Fitted coefficients of the all-features model, one per design-matrix column.
results_regression.params

In [None]:
# Residuals against fitted values for the all-features model.
plot_scatter(
    results_regression.fittedvalues,
    results_regression.resid,
    x_label = "Fitted Values",
    y_label = "Residual Values")
plt.show()

# sm.qqplot returns the Figure it draws on. When that call is the last
# expression of the cell, the notebook displays the returned Figure and the
# inline backend also renders the still-open figure, so the Q-Q plot appears
# twice. Ending the cell on plt.show() (which returns None) renders it once.
sm.qqplot(results_regression.resid_pearson, line = "q")
plt.show()

In [None]:
# Example house described by only two features; any feature not listed here
# simply adds nothing to the prediction computed below.
house = {
    'No. of Bedrooms': 3,
    'Area': 1000,
}

def predict_linear_regression(fitted_model, dict_features):
    """
    Evaluate a fitted linear model on a (possibly partial) set of features.

    Computes ``const + sum(coefficient * value)`` over the supplied features.

    Parameters
    ----------
    fitted_model
        Object whose ``params`` attribute maps a term name to its fitted
        coefficient and contains a ``'const'`` entry for the intercept.
    dict_features : dict
        ``{'feature name': value}``. Every key must be a term in
        ``fitted_model.params``. The intercept must NOT be listed here; it is
        always added once by this function.

    Returns
    -------
    The intercept plus the weighted sum of the given feature values.

    Notes
    -----
    Features absent from ``dict_features`` contribute nothing, so the result
    may be off when the model was fitted on more features than are supplied.
    """
    coefficients = fitted_model.params

    # Accumulate the per-feature contributions in dictionary order.
    weighted_total = 0
    for feature_name, feature_value in dict_features.items():
        weighted_total = weighted_total + coefficients[feature_name] * feature_value

    # The intercept goes in last, after every feature term.
    return weighted_total + coefficients['const']

# Price estimate for the example house from the all-features model fitted above.
prediction = predict_linear_regression(
    fitted_model=results_regression,
    dict_features=house)
print("Predicted housing price:", prediction)