In [None]:
import os
import tarfile
from six.moves import urllib
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from zlib import crc32
from sklearn.model_selection import StratifiedShuffleSplit

# Base URL under which the dataset archive is hosted
DOWNLOAD_ROOT = "https://raw.githubusercontent.com/ageron/handson-ml2/master/"
# Relative local directory used as the default location for the downloaded data
HOUSING_PATH = os.path.join("datasets","housing")
# Full URL of the compressed housing archive (base URL + archive path)
HOUSING_URL = DOWNLOAD_ROOT + "datasets/housing/housing.tgz"

# Acquire housing data
def fetch_housing_data(housing_url=HOUSING_URL, housing_path=HOUSING_PATH):
    """Download the housing archive and unpack it into ``housing_path``.

    Args:
        housing_url: URL of the ``housing.tgz`` archive to download.
        housing_path: Directory that receives the archive and its extracted
            contents; it is created when it does not exist yet.
    """
    if not os.path.isdir(housing_path):
        os.makedirs(housing_path)
    tgz_path = os.path.join(housing_path, "housing.tgz")
    urllib.request.urlretrieve(housing_url, tgz_path)
    # The context manager closes the archive even when extraction raises,
    # so the file handle is never leaked.
    with tarfile.open(tgz_path) as housing_tgz:
        # NOTE(review): extractall() trusts the member paths stored in the
        # archive; only point housing_url at a source you trust.
        housing_tgz.extractall(path=housing_path)

# Load housing data into Pandas Dataframe
def load_housing_data(housing_path=HOUSING_PATH):
    """Return the housing dataset as a DataFrame, downloading it when needed.

    Args:
        housing_path: Directory that holds (or will receive) ``housing.csv``.

    Returns:
        The DataFrame read from ``housing.csv`` inside ``housing_path``.
    """
    csv_path = os.path.join(housing_path, "housing.csv")
    # Fetch into the same directory the CSV is read from, and only when the
    # CSV is not on disk yet, so repeated runs skip the download.
    if not os.path.isfile(csv_path):
        fetch_housing_data(housing_path=housing_path)
    return pd.read_csv(csv_path)

# Ensure that records from the test set are not added back into the train set via randomization
def test_set_check(identifier, test_ratio):
    return crc32(np.int64(identifier)) & 0xffffffff < test_ratio * 2**32

# Split a frame into train and test portions based on a per-row identifier
def split_train_test_by_id(data, test_ratio, id_column):
    """Split ``data`` into ``(train, test)`` using ``test_set_check``.

    Args:
        data: DataFrame to split.
        test_ratio: Ratio forwarded to ``test_set_check`` for every identifier.
        id_column: Name of the column holding the row identifiers.

    Returns:
        A tuple ``(train, test)``: the rows whose identifier is not selected
        for the test set, followed by the rows whose identifier is.
    """
    test_mask = data[id_column].apply(test_set_check, args=(test_ratio,))
    train_part = data.loc[~test_mask]
    test_part = data.loc[test_mask]
    return train_part, test_part

# Fetch the dataset and load it into a Pandas DataFrame
housing = load_housing_data()

# Quick-look helpers, left disabled; uncomment one to inspect the data
# housing.head() # First 5 records shown
# housing.info() # Info function
# housing.describe() # describe function

# Histograms of the dataset's attributes, 50 bins each
housing.hist(bins=50, figsize=(20,15))
plt.show()

In [None]:
# reset_index() adds an `index` column, used here as the row identifier
housing_with_id = housing.reset_index()
# Hash-based train/test split on that identifier with a test ratio of 0.2
train_set, test_set = split_train_test_by_id(housing_with_id, 0.2, "index")
# Report the train set size, then the test set size, one per line
print(len(train_set), len(test_set), sep="\n")

In [None]:
# The pd.cut() function is used to create an income category attribute with 5 categories (labeled from 1 to 5):
# category 1 ranges from 0 to 1.5 (i.e., less than $15,000), category 2 from 1.5 to 3, and so on;
# the last bin edge is np.inf, so category 5 holds everything above 6.
housing["income_cat"] = pd.cut(housing["median_income"], bins=[0.,1.5,3.0,4.5,6.,np.inf], labels=[1,2,3,4,5])
housing["income_cat"].hist() # Histogram of income brackets

In [None]:
# Stratified sampling for income category (making sure the sample represents the population)
split = StratifiedShuffleSplit(n_splits=1,test_size=0.2,random_state=42)
# n_splits=1, so the generator yields exactly one (train, test) pair of index arrays
train_index, test_index = next(split.split(housing, housing["income_cat"]))
# split() yields positional row numbers, so rows are selected with .iloc;
# label-based .loc is only equivalent while the frame has the default 0..n-1 index
strat_train_set = housing.iloc[train_index]
strat_test_set = housing.iloc[test_index]

# Print the share of each income category within the stratified test set
print(strat_test_set["income_cat"].value_counts() / len(strat_test_set))

# Remove "income_cat" to bring the data back to its original columns.
# Rebinding to the result of drop() (rather than inplace=True) avoids
# mutating frames that were selected out of `housing`.
strat_train_set = strat_train_set.drop("income_cat", axis=1)
strat_test_set = strat_test_set.drop("income_cat", axis=1)

In [None]:
# Work on a copy of the training set so changes made while exploring do not touch strat_train_set
housing = strat_train_set.copy()
# Scatter plot of latitude against longitude; the low alpha makes overlapping points appear darker
housing.plot(kind="scatter", x="longitude", y="latitude", alpha=0.1)

In [None]:
# Longitude/latitude scatter plot: marker size follows population (divided by 100)
# and marker color follows median_house_value on the "jet" colormap
housing.plot(
    kind="scatter",
    x="longitude",
    y="latitude",
    alpha=0.4,
    s=housing["population"] / 100,
    label="population",
    figsize=(10, 7),
    c="median_house_value",
    cmap=plt.get_cmap("jet"),
    colorbar=True,
)
plt.legend()

In [None]:
# Computing the standard correlation coefficient (Pearson's r).
# Only numeric columns are kept: the frame still contains the text column
# "ocean_proximity", which DataFrame.corr() refuses to convert on pandas >= 2.0.
corr_matrix = housing.select_dtypes(include="number").corr()

# Output how each numeric attribute correlates with the median house value
corr_matrix['median_house_value'].sort_values(ascending=False)

In [None]:
# Plot a few selected attributes pairwise against each other with pandas' scatter_matrix
from pandas.plotting import scatter_matrix
attributes = [
    "median_house_value",
    "median_income",
    "total_rooms",
    "housing_median_age",
]
scatter_matrix(housing[attributes], figsize=(12, 8))

In [None]:
# Scatter plot of median house value (y) against median income (x);
# the low alpha makes overlapping points appear darker
housing.plot(kind="scatter", x="median_income", y="median_house_value", alpha=0.1)

In [None]:
# Creating new attributes as ratios of existing columns
housing["rooms_per_household"] = housing["total_rooms"]/housing["households"]
housing["bedrooms_per_room"] = housing["total_bedrooms"]/housing["total_rooms"]
housing["population_per_household"] = housing["population"]/housing["households"]

# Another look at the correlation matrix, now including the new attributes.
# Only numeric columns are kept: the frame still contains the text column
# "ocean_proximity", which DataFrame.corr() refuses to convert on pandas >= 2.0.
corr_matrix = housing.select_dtypes(include="number").corr()
corr_matrix["median_house_value"].sort_values(ascending=False)

In [None]:
# Reverting back to a clean training set: start again from strat_train_set
# and keep every column except the target as predictors
housing = strat_train_set.drop("median_house_value", axis=1)
# The target column is kept separately as the labels (copied, so it is independent of strat_train_set)
housing_labels = strat_train_set["median_house_value"].copy()

# Dealing with the total_bedrooms attribute missing cells (different options available in notes)
from sklearn.impute import SimpleImputer

# Use SimpleImputer to fill missing values with the median strategy
imputer = SimpleImputer(strategy="median")
# Drop non-numerical data for SimpleImputer sklearn function
housing_num = housing.drop("ocean_proximity", axis=1)
# Fit imputer to housing_num to learn median values
imputer.fit(housing_num)
# print(imputer.statistics_)
# print(housing_num.median().values)
# Fill x with housing_num + median values for previously empty cells
x = imputer.transform(housing_num) # This results in numpy array
# Transforming back into a Pandas dataframe. The original index is passed along
# with the column names: housing_num carries the shuffled row labels of the
# stratified split, and without them housing_tr would be renumbered 0..n-1 and
# no longer line up with housing / housing_labels.
housing_tr = pd.DataFrame(x, columns=housing_num.columns, index=housing_num.index)

# Working with the ocean_proximity attribute
# Double brackets select a one-column DataFrame rather than a Series
housing_cat = housing[["ocean_proximity"]]
# housing_cat.head(10)

# Import OrdinalEncoder and OneHotEncoder to transform ocean_proximity into numerical data
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
# Note: We are using OneHotEncoder, not OrdinalEncoder. Reasoning in notes under 'transforming into numerical data'
# However, it is still encoded/transformed in both ways in this program for educational purposes

# Create OrdinalEncoder object
ordinal_encoder = OrdinalEncoder()
# Fit the encoder on housing_cat and encode it in one step
housing_cat_encoded = ordinal_encoder.fit_transform(housing_cat)
# print(housing_cat_encoded[:10])
# print(ordinal_encoder.categories_)

# Create OneHotEncoder object
cat_encoder = OneHotEncoder()
# Fit the encoder on housing_cat and encode it in one step
housing_cat_1hot = cat_encoder.fit_transform(housing_cat)
# print(housing_cat_1hot)
# housing_cat_1hot is currently a SciPy sparse matrix. It is useful in this format; however, if someone wants a 2D array
# instead, they can use the toarray() method