In [None]:
import sys
!{sys.executable} -m pip install pandas matplotlib zlib

In [None]:
import os
import tarfile
import urllib

DOWNLOAD_ROOT = "https://raw.githubusercontent.com/ageron/handson-ml2/master/"
HOUSING_PATH = os.path.join("datasets", "housing")
HOUSING_URL = DOWNLOAD_ROOT + "datasets/housing/housing.tgz"

# Calling this function creates a datasets/housing directory in the workspace, downloads the
# housing.tgz archive into it and extracts the archive's contents (housing.csv is read from
# this directory later on) into that same directory.

def fetch_housing_data(housing_url = HOUSING_URL, housing_path = HOUSING_PATH):
    """Download the housing archive and unpack it into ``housing_path``.

    Parameters
    ----------
    housing_url : str
        URL of the gzipped tar archive to download.
    housing_path : str
        Directory that receives both ``housing.tgz`` and the extracted files.
        It is created (including parents) if it does not exist yet.

    Returns
    -------
    None
        The function is called for its side effects on the file system.
    """
    # A bare ``import urllib`` does not guarantee that the ``request`` submodule is
    # loaded, so import it explicitly before using ``urllib.request``.
    import urllib.request

    os.makedirs(housing_path, exist_ok=True)
    tgz_path = os.path.join(housing_path, "housing.tgz")
    urllib.request.urlretrieve(housing_url, tgz_path)

    # The archive comes from the network: where the interpreter supports extraction
    # filters, use the "data" filter so members cannot escape ``housing_path``.
    extract_options = {"filter": "data"} if hasattr(tarfile, "data_filter") else {}

    # The context manager closes the archive even if extraction raises.
    with tarfile.open(tgz_path) as housing_tgz:
        housing_tgz.extractall(path=housing_path, **extract_options)
    

In [None]:
# Download and unpack the housing archive using the default URL and target directory
# (the default URL is an https address, so this needs network access).
fetch_housing_data()


In [None]:
import pandas as pd

def load_housing_data(housing_path=HOUSING_PATH):
    """Read ``housing.csv`` from ``housing_path`` and return it as a pandas DataFrame."""
    return pd.read_csv(os.path.join(housing_path, "housing.csv"))

In [None]:
housing = load_housing_data()
housing.head()
# gives the top five rows of the pandas DataFrame object
# each row represents one district


In [None]:
# Provides general information about the dataset: the total number of rows (number of districts),
# the type of each attribute and the number of non-null values. Note that the value
# "total bedrooms" is missing (null) in 207 districts.
housing.info()

In [None]:
# shows which categories occur in the "ocean_proximity" attribute and how many districts belong to each category
housing["ocean_proximity"].value_counts()


In [None]:
# summary statistics of the attributes; the individual rows are explained below
housing.describe()

* *count* describes the number of non-null values in each attribute (null values are ignored) <br />
* *std* = standard deviation, which measures how dispersed the values are <br />
* *percentiles* indicate the value below which a given percentage of the observations in a group falls,
     example: 25% of the districts have a housing median age below 18, 50% below 29, and so on. These are
     referred to as the first quartile (25th percentile), the median (50th) and the third quartile (75th).





In [None]:
%matplotlib inline 
# render figures inline, using Jupyter's own backend
import matplotlib.pyplot as plt
# draw the histograms with 50 bins each on one large figure
housing.hist(bins=50, figsize=(20,15))
plt.show() # optional in Jupyter: the inline backend displays the figure at the end of the cell anyway

**Observations from plots:** <br/>
* housing median age, median income and median house value are preprocessed. <br />
* median income is scaled: the values are expressed in tens of thousands of USD, so 6 equals 60,000 USD <br />
* median house value was capped at 500k; an ML algorithm might learn that house values never go above that price --> problem, as this is the target value of our problem. <br />
* median income also seems to be capped <br />
* the attributes have very different scales <br />
* tail heavy - histograms extend more to the right of the median than to the left <br />
* avoid snooping bias from first impressions! <br/>

<h3>Creating a test set</h3>


In [None]:
import numpy as np

#function for splitting data into train set and test set 
def split_train_test(data, test_ratio):
    """Randomly split ``data`` into a training part and a test part.

    The row positions are shuffled with ``np.random.permutation``; the first
    ``int(len(data) * test_ratio)`` shuffled positions form the test set and
    all remaining positions form the training set.

    Returns a ``(train, test)`` tuple of row selections made with ``data.iloc``.
    The result changes from call to call unless NumPy's global random state is seeded.
    """
    n_rows = len(data)
    n_test = int(n_rows * test_ratio)  # size of the test portion, rounded down
    permuted = np.random.permutation(n_rows)
    # leading slice -> test rows, everything after it -> training rows
    test_positions, train_positions = permuted[:n_test], permuted[n_test:]
    return data.iloc[train_positions], data.iloc[test_positions]

In [None]:
# hold out a random 20% of the rows as the test set
train_set, test_set = split_train_test(housing, 0.2)

In [None]:
# number of rows that ended up in the training set
len(train_set)


In [None]:
# number of rows that ended up in the test set
len(test_set)


**This split is not stable: every time we run the program it creates a different train/test split, so data that was in the training set before can end up in the test set and vice versa.**

**Possible "solutions":**
* save test set on first run and then load it in subsequent runs
* setting the seed of np.random.permutation so that it always generates the same shuffled indices

But both of these will break as soon as we fetch an updated dataset. We want a train/test split that remains stable even when the data is updated.

**A common solution for this issue:**
* use each instance's identifier to decide whether or not it should go in the test set (assuming that each instance has a unique and immutable identifier)
* we can compute a hash of each instance's identifier and decide whether it goes into train or test based on that hash, e.g. put it in the test set if the hash is lower than 20% of the maximum hash value.
* this ensures that even if we refresh the dataset, the new test set will contain about 20% of the new instances, while also guaranteeing that
    * the "old" test instances stay in the test set
    * 20% of the new instances are in the test set
    * no instance that was previously in the training set will go into the test set
    
This is what an implementation of that principle looks like:

In [None]:
from zlib import crc32

def test_set_check(identifier, test_ratio):
    return crc32(np.int64(identifier)) & 0xffffffff < test_ratio * 2**32

def split_train_test_by_id(data, test_ratio, id_column):
    """Split ``data`` into train and test parts based on a per-row identifier.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to split; must contain ``id_column``.
    test_ratio : float
        Fraction of the hash range that is assigned to the test set.
    id_column : str
        Name of the column whose values are passed to ``test_set_check``.

    Returns
    -------
    tuple
        ``(train, test)`` selections of ``data``. A row's assignment depends only
        on its identifier, so it does not change between runs.
    """
    ids = data[id_column]
    # astype(bool) keeps the mask boolean even when ``data`` has no rows
    in_test_set = ids.apply(lambda id_: test_set_check(id_, test_ratio)).astype(bool)
    return data.loc[~in_test_set], data.loc[in_test_set]


# Backward-compatible alias for the original, misspelled function name.
split_train_test_ny_id = split_train_test_by_id


In [None]:
# the housing dataset does not have an identifier column; simple solution: use the row index as the id
# caveat: new data must always be appended to the end of the dataset and no row may ever be deleted

housing_with_id = housing.reset_index() # adds an "index" column
train_set, test_set = split_train_test_by_id(housing_with_id, 0.2, "index")



In [None]:
# alternative: use the most stable features to build a unique identifier
# e.g. longitude/latitude will remain stable; we can combine them into an id like so:

housing_with_id["id"] = housing["longitude"] * 1000 + housing["latitude"]
# split on the newly built "id" column (not on the row index used in the previous cell)
train_set, test_set = split_train_test_by_id(housing_with_id, 0.2, "id")

