In [None]:
# this block will load the housing data as we did before


import os
import tarfile
from six.moves import urllib
import numpy as np

# Remote repository root, local cache directory, and full URL of the housing archive.
DOWNLOAD_ROOT = "https://raw.githubusercontent.com/ageron/handson-ml/master/"
HOUSING_PATH = os.path.join(*("datasets", "housing"))
HOUSING_URL = "{}datasets/housing/housing.tgz".format(DOWNLOAD_ROOT)

def fetch_housing_data(housing_url=HOUSING_URL, housing_path=HOUSING_PATH):
    """Download the housing archive and unpack it into ``housing_path``.

    Parameters
    ----------
    housing_url : str
        URL of the ``.tgz`` archive to download.
    housing_path : str
        Directory that receives ``housing.tgz`` and its extracted contents.
        It is created when it does not exist yet.

    Returns
    -------
    None
    """
    if not os.path.isdir(housing_path):
        os.makedirs(housing_path)
    tgz_path = os.path.join(housing_path, "housing.tgz")
    urllib.request.urlretrieve(housing_url, tgz_path)
    # The context manager closes the archive even when extraction raises,
    # so the file handle is never leaked.
    with tarfile.open(tgz_path) as housing_tgz:
        # NOTE(review): extractall trusts the member paths stored in the archive;
        # only point housing_url at a source you trust.
        housing_tgz.extractall(path=housing_path)

import pandas as pd

def load_housing_data(housing_path=HOUSING_PATH):
    """Read ``housing.csv`` from ``housing_path`` and return it as a DataFrame."""
    return pd.read_csv(os.path.join(housing_path, "housing.csv"))

# Download only when the extracted CSV is not cached locally yet, so that
# re-running the notebook does not hit the network again (and works offline).
if not os.path.isfile(os.path.join(HOUSING_PATH, "housing.csv")):
    fetch_housing_data()
housing = load_housing_data()
# Show the first rows as a quick sanity check of the loaded frame
housing.head()

In [None]:
# here are some visualizations of the data that are in this dataset

import matplotlib.pyplot as plt
# One 50-bin histogram per numeric column, drawn on a single large figure
housing.hist(figsize=(20, 15), bins=50)
plt.show()

In [None]:
# some more code for visualizing the relationships between two attributes in the dataset

# if you want bigger figures, create another cell below this block with fewer attributes selected

from pandas.plotting import scatter_matrix

# Columns compared pairwise in the scatter matrix (order sets the grid layout)
attributes = [
    "latitude",
    "longitude",
    "total_bedrooms",
    "median_house_value",
    "median_income",
    "total_rooms",
    "housing_median_age",
    "population",
    "households",
]
scatter_matrix(housing.loc[:, attributes], figsize=(12, 8))


In [None]:
# Now let's try pulling out some items in the dataset: longitude is the single
# input feature, and the binary output is whether the median house value
# exceeds 200000. Swap in other columns to try different input/output pairs.

# scikit-learn expects a 2-D feature matrix, hence the reshape to one column
X = np.array(housing["longitude"]).reshape(-1, 1)
yval = np.array(housing["median_house_value"])
# `np.int` was removed from NumPy (1.24); the builtin `int` is the replacement
y = (yval > 200000).astype(int)

print(X.shape, y.shape)

In [None]:
# we can look at this as logistic data (1 or 0)

import matplotlib.pyplot as plt
# Red dots: the 0/1 label plotted against the input feature
plt.plot(X, y, "ro")

In [None]:
# or, we can look at this as linear data

import matplotlib.pyplot as plt
# Red dots: the continuous target plotted against the input feature
plt.plot(X, yval, "ro")

In [None]:
# let's try some logistic regression on the data

from sklearn.linear_model import LogisticRegression

# Fit a logistic regression (L-BFGS solver) on the current X and y
log_reg = LogisticRegression(solver="lbfgs")
log_reg.fit(X, y)

In [None]:
# you'll want to visualize the output of the logistic regression

# Sample the feature axis evenly so the probability curves come out smooth.
# Make sure to update the bounds to match the range of your x variable.
X_new = np.linspace(-124, -114, 1000).reshape(-1, 1)
# Predicted class probabilities for each of the sampled points
y_proba = log_reg.predict_proba(X_new)

# Observed labels (True/False for class 1) as red dots
plt.plot(X, y == 1, "r.")

# One curve per class: column 1 as a solid green line, column 0 as dashed blue
for column, line_style in ((1, "g-"), (0, "b--")):
    plt.plot(X_new, y_proba[:, column], line_style)
plt.ylabel("probability")

In [None]:
# the above graph is bad, but how bad is it? For this block, you will need to write
# some code to evaluate the quality of this logistic regression


In [None]:
# let's also try some linear regression on the data

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression

# Degree-1 polynomial expansion (no bias column) feeding an ordinary
# least-squares regressor; raise `degree` to fit curved relationships.
polynomial_features = PolynomialFeatures(include_bias=False, degree=1)
linear_regression = LinearRegression()
pipeline = Pipeline(
    steps=[
        ("polynomial_features", polynomial_features),
        ("linear_regression", linear_regression),
    ]
)
pipeline.fit(X, yval)


In [None]:
# Evaluate the fitted pipeline on an evenly spaced grid and draw it over the
# samples. Update the grid bounds to match the range of the feature in X.
X_test = np.linspace(-124, -114, 1000)
# predict expects a 2-D array, so add a trailing axis to the 1-D grid
plt.plot(X_test, pipeline.predict(X_test[:, np.newaxis]), label="Model")
plt.scatter(X, yval, edgecolor='b', s=20, label="Samples")
plt.xlabel("x")
plt.ylabel("y")
# The label= arguments above are only rendered once a legend is requested
plt.legend(loc="best")


In [None]:
# ohh, this looks bad, too! For this block, you will need to write
# some code to evaluate the quality of this linear regression (hint: use RMSE)


In [None]:
# now I'd like you to try using the example code above to find at least
# one good input/output combination for a logistic regression, and


In [None]:
# at least two good linear regression fits


In [None]:
# we might be able to do better looking at more than one feature

# Two-column feature matrix: column 0 is latitude, column 1 is longitude.
# NOTE: this rebinds X and y from the single-feature cells above, so re-run
# those cells before going back to the earlier plots.
X = np.vstack((housing["latitude"], housing["longitude"])).T
# Binary target: 1 where the district is INLAND, 0 otherwise.
# `np.int` was removed from NumPy (1.24); the builtin `int` is the replacement.
y = (housing["ocean_proximity"] == "INLAND").astype(int)
# Original categorical labels, kept for multi-class experiments
Y = housing["ocean_proximity"]


In [None]:
# Fit a logistic regression on the current X and y.
# max_iter must be an integer: recent scikit-learn releases validate
# constructor parameters at fit time and reject the float 1e5.
# NOTE(review): `multi_class` is deprecated in newer scikit-learn releases;
# confirm against the installed version before removing it.
log_reg = LogisticRegression(solver='lbfgs', multi_class='multinomial', max_iter=100000)
log_reg.fit(X, y)

In [None]:
colors = "brmgc"
c2 = ['b','r','m','g','c']  # list form of the same palette; not used below
classes = np.unique(housing["ocean_proximity"])
print(classes)


# Plot the decision boundary. For that, we will assign a color to each
# point in the mesh [x_min, x_max]x[y_min, y_max].
x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5
y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5
h = .02  # step size in the mesh
xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
Z = log_reg.predict(np.c_[xx.ravel(), yy.ravel()])

# Put the result into a color plot
Z = Z.reshape(xx.shape)
plt.figure(1, figsize=(4, 3))
plt.pcolormesh(xx, yy, Z, cmap=plt.cm.Paired)

# Overlay the training points, one colour per ocean_proximity category.
# zip stops at the shorter sequence, so categories beyond the palette length
# would be skipped silently.
for i, color in zip(classes, colors):
    # A boolean mask selects the rows of this category directly
    idx = np.asarray(housing["ocean_proximity"] == i)
    # No cmap here: matplotlib ignores it when c is a single colour string
    plt.scatter(X[idx, 0], X[idx, 1], c=color, label=i, edgecolor='black', s=20)

# Assumes X was built as (latitude, longitude) columns; update if X changes
plt.xlabel("latitude")
plt.ylabel("longitude")
# Render the label= entries; placed outside the axes because the figure is small
plt.legend(loc="center left", bbox_to_anchor=(1.0, 0.5))

    


In [None]:
# how does this prediction do?

In [None]:
# try doing some type of regression with an SGD regressor
# https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.SGDRegressor.html#sklearn.linear_model.SGDRegressor
