In [None]:
# Prints a fixed greeting (presumably a quick check that the kernel runs cells).
print('Hello world!')

In [None]:
import numpy as np
import pandas as pd

In [None]:
import os
import tarfile

In [None]:
# Relative directory in which the housing files (housing.csv / housing.tgz) are looked up.
# NOTE(review): the next cell assigns HOUSING_PATH again with the same value.
HOUSING_PATH = 'datasets/housing'

In [None]:
# Remote location of the housing archive and the local directory it is expected in.
DOWNLOAD_ROOT = "https://raw.githubusercontent.com/ageron/handson-ml/master/"
HOUSING_PATH = "datasets/housing"
HOUSING_URL = DOWNLOAD_ROOT + HOUSING_PATH + "/housing.tgz"

def fetch_housing_data(housing_url=HOUSING_URL, housing_path=HOUSING_PATH):
    """Make sure ``housing.csv`` is available inside `housing_path`.

    Nothing is downloaded: the function only inspects the local directory.

    * If ``housing.csv`` already exists, it reports that and returns.
    * Otherwise, if ``housing.tgz`` exists, it is extracted into `housing_path`.
    * Otherwise a message saying the CSV cannot be found is printed.

    Parameters
    ----------
    housing_url : str
        Accepted for interface compatibility; not used by this implementation.
    housing_path : str
        Directory that holds (or will receive) the housing files.

    Returns
    -------
    None
    """
    housing_csv_path = os.path.join(housing_path, 'housing.csv')
    housing_tgz_path = os.path.join(housing_path, 'housing.tgz')
    if os.path.isfile(housing_csv_path):
        print(f'Find {housing_csv_path}, do nothing')
        return
    if os.path.isfile(housing_tgz_path):
        print(f'Find {housing_tgz_path}, will extract it')
        # The context manager closes the archive even if extraction fails.
        with tarfile.open(housing_tgz_path) as housing_tgz:
            if hasattr(tarfile, 'data_filter'):
                # Extraction filters are available: refuse absolute paths,
                # path traversal and special files coming from the archive.
                housing_tgz.extractall(path=housing_path, filter='data')
            else:
                # Older Python without extraction filters.
                housing_tgz.extractall(path=housing_path)
        return
    print(f'Can not find {housing_csv_path}')

fetch_housing_data()

In [None]:
def load_housing_data(housing_path=HOUSING_PATH):
    """Read ``housing.csv`` located in `housing_path` and return it as a DataFrame."""
    return pd.read_csv(os.path.join(housing_path, "housing.csv"))

In [None]:
# Load housing.csv from the default HOUSING_PATH into a DataFrame.
housing = load_housing_data()

In [None]:
# Preview the first rows of the loaded data.
housing.head()

In [None]:
# Column names, non-null counts and dtypes.
housing.info()

In [None]:
# Number of rows per ocean_proximity value (column reached via attribute access).
housing.ocean_proximity.value_counts()

In [None]:
# Same counts as the attribute form, using bracket access to the column.
housing['ocean_proximity'].value_counts()

In [None]:
# Summary statistics of the DataFrame's columns.
housing.describe()

In [None]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
import matplotlib.pyplot as plt
# Histograms of the DataFrame's columns, 50 bins each, on a 20x15 figure.
housing.hist(bins=50, figsize=(20,15))
plt.show()

In [None]:
# Summary statistics of the median_income column only.
housing.median_income.describe()

In [None]:
# 15-bin histogram of the median_income column alone.
housing['median_income'].hist(bins=15)
plt.show()

In [None]:
# Divide median income by 1.5 and round up to the next whole number to form categories.
income_cat = np.ceil(housing['median_income'] / 1.5)

In [None]:
# Cap the category at 5: values below 5 are kept, everything else becomes 5.0.
# The result is rebound instead of using inplace=True, so the Series is not
# mutated behind the back of earlier cells and the cell stays safe to re-run.
income_cat = income_cat.where(income_cat < 5.0, 5.0)

In [None]:
# Alternative to the two previous cells: build the categories again and cap
# them with a boolean-mask assignment, then check both results agree.
income_cat2 = np.ceil(housing['median_income'] / 1.5)
over_cap = income_cat2 > 5.0
income_cat2[over_cap] = 5.0
(income_cat2 == income_cat).all()

In [None]:
# Summary statistics of the income categories.
income_cat.describe()

In [None]:
# Fraction of rows falling into each income category.
income_cat.value_counts().div(len(income_cat))

In [None]:
from sklearn.model_selection import StratifiedShuffleSplit

In [None]:
# A single stratified shuffle split that holds out 20% as the test set;
# random_state is fixed so the split is the same on every run.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

In [None]:
# Attach the income category so it can serve as the stratification key.
housing['income_cat'] = income_cat

# split() yields arrays of positional row numbers, so rows are selected with
# .iloc. Using .loc would treat them as index labels, which only gives the same
# rows while `housing` still has its default 0..n-1 index.
for train_index, test_index in split.split(housing, housing['income_cat']):
    strat_train_set = housing.iloc[train_index]
    strat_test_set  = housing.iloc[test_index]


In [None]:
# Share of each income category in the stratified test set and in the full data.
Stratified = strat_test_set['income_cat'].value_counts().sort_index().div(len(strat_test_set))
Overall = housing['income_cat'].value_counts().sort_index().div(len(housing))
data = pd.DataFrame({'Overall': Overall, 'Stratified' : Stratified})
# Deviation of the stratified share from the overall share, relative to the overall share, in percent.
share_gap = data['Overall'] - data['Stratified']
data['Strat. %error'] = share_gap / data['Overall'] * 100
data

## Visualizing Data

In [None]:
# Work on a copy so the exploration below does not modify strat_train_set.
strat_train_set_copy = strat_train_set.copy()

In [None]:
# Scatter plot of the rows by longitude / latitude.
housing.plot.scatter(x='longitude', y='latitude')

In [None]:
# Same scatter plot with mostly transparent markers, so overlapping points show up darker.
housing.plot.scatter(x='longitude', y='latitude', alpha=0.1)

In [None]:
# Marker size follows population (scaled down by 100); marker colour follows
# median_house_value through the "jet" colormap.
marker_sizes = strat_train_set_copy['population'] / 100
marker_colors = strat_train_set_copy['median_house_value']
strat_train_set_copy.plot(
    kind='scatter', x='longitude', y='latitude', alpha=0.4,
    s=marker_sizes, c=marker_colors, cmap=plt.get_cmap("jet"),
    label="population", figsize=(15, 15), colorbar=True,
)
plt.legend()

In [None]:
# Correlate the numeric columns only. The frame still carries the non-numeric
# ocean_proximity column, and pandas >= 2.0 raises on such columns in corr()
# instead of silently skipping them.
corr_matrix = strat_train_set_copy.select_dtypes(include='number').corr()

In [None]:
# Correlation of every column with median_house_value, strongest positive first.
corr_matrix.median_house_value.sort_values(ascending=False)

In [None]:
from pandas.plotting import scatter_matrix

In [None]:
# Pairwise scatter plots for a hand-picked subset of columns.
attributes = [
    "median_house_value",
    "median_income",
    "total_rooms",
    "housing_median_age",
]
scatter_matrix(housing[attributes], figsize=(12, 8))

In [None]:
# median_income against median_house_value, with mostly transparent markers.
strat_train_set_copy.plot.scatter(x="median_income", y="median_house_value", alpha=0.1)

### Experimenting with Attribute Combinations

In [None]:
# Derived ratio columns: new column name -> (numerator column, denominator column).
ratio_columns = {
    "rooms_per_household": ("total_rooms", "households"),
    "bedrooms_per_room": ("total_bedrooms", "total_rooms"),
    "population_per_household": ("population", "households"),
}
for new_col, (numerator, denominator) in ratio_columns.items():
    housing[new_col] = housing[numerator] / housing[denominator]

In [None]:
# Confirm the three ratio columns were added.
housing.info()

In [None]:
# Correlate the numeric columns only. `housing` still carries the non-numeric
# ocean_proximity column, and pandas >= 2.0 raises on such columns in corr()
# instead of silently skipping them.
corr_matrix = housing.select_dtypes(include='number').corr()
# Correlation of every column with median_house_value, strongest positive first.
corr_matrix['median_house_value'].sort_values(ascending=False)

## 2.5 Prepare the Data for Machine Learning Algorithms

In [None]:
# Separate the training set into predictors and the target column.
# Note: this rebinds `housing` to the training predictors.
housing_labels = strat_train_set['median_house_value'].copy()
housing = strat_train_set.drop(columns='median_house_value')

In [None]:
# Column overview of the training predictors, including non-null counts.
housing.info()

In [None]:
# Missing-value option: drop the rows where total_bedrooms is missing.
# The result is only inspected; `housing` itself is not changed.
housing.dropna(subset=['total_bedrooms']).info()

In [None]:
# Missing-value option: drop the total_bedrooms column altogether.
# The result is only inspected; `housing` itself is not changed.
housing.drop('total_bedrooms', axis=1).info()

In [None]:
# Missing-value option: fill missing total_bedrooms with the column median.
# The result is only inspected; `housing` itself is not changed.
housing['total_bedrooms'].fillna(housing['total_bedrooms'].median()).describe()

In [None]:
from sklearn.impute import SimpleImputer

# Fit a median imputer on every column except ocean_proximity.
housing_num = housing.drop(columns="ocean_proximity")
imputer = SimpleImputer(strategy='median')
imputer.fit(housing_num)
# Per-column values learned by the fit.
imputer.statistics_

In [None]:
# The strategy the imputer was configured with.
imputer.strategy

In [None]:
# Column medians computed directly with pandas, for comparison with imputer.statistics_.
housing.drop("ocean_proximity", axis=1).median().values

In [None]:
# Apply the fitted imputer to housing_num and show the result.
X = imputer.transform(housing_num)
X

In [None]:
# Rebuild a DataFrame from the imputed values. Passing index=... keeps the
# original row labels; without it the frame gets a fresh 0..n-1 index that no
# longer lines up with `housing` / `housing_labels`.
housing_tr = pd.DataFrame(X, columns=housing_num.columns, index=housing_num.index)
housing_tr.head()