In [72]:
print("Ciao mondo!")

Ciao mondo!


In [73]:
#download and unzip
import os
import tarfile
import urllib

# Location of the housing archive and the local directory it is unpacked into.
DOWNLOAD_ROOT = "https://raw.githubusercontent.com/ageron/handson-ml2/master/"
HOUSING_PATH = os.path.join("datasets", "housing")
HOUSING_URL = DOWNLOAD_ROOT + "datasets/housing/housing.tgz"

def fetch_housing_data(housing_url = HOUSING_URL , housing_path = HOUSING_PATH):
    """Download the housing archive and unpack it into ``housing_path``.

    Parameters
    ----------
    housing_url : str
        URL of the ``.tgz`` archive to download.
    housing_path : str
        Directory that receives both the downloaded ``housing.tgz`` and the
        extracted files. It is created if it does not exist.

    Raises
    ------
    urllib.error.URLError
        If the download fails (``HTTPError`` is a subclass).
    tarfile.TarError
        If the downloaded file is not a readable tar archive.
    """
    # ``import urllib`` on its own does not guarantee that the ``request``
    # submodule is loaded, so import it explicitly where it is used.
    import urllib.request

    os.makedirs(housing_path, exist_ok=True)
    tgz_path = os.path.join(housing_path, "housing.tgz")
    urllib.request.urlretrieve(housing_url, tgz_path)
    # The context manager closes the archive even when extraction fails.
    with tarfile.open(tgz_path) as housing_tgz:
        if hasattr(tarfile, "data_filter"):
            # The archive comes from the network: use the safe "data"
            # extraction filter on interpreters that provide it.
            housing_tgz.extractall(path=housing_path, filter="data")
        else:
            housing_tgz.extractall(path=housing_path)

In [76]:
#call the function to download and unzip
fetch_housing_data()

HTTPError: HTTP Error 404: Not Found

In [None]:
#function to load data into panda obj
import pandas as pd

def load_housing_data(housing_path=HOUSING_PATH):
    """Read ``housing.csv`` from ``housing_path`` and return it as a DataFrame."""
    return pd.read_csv(os.path.join(housing_path, "housing.csv"))

In [None]:
#load data into housing
housing = load_housing_data()
housing.head()

In [None]:
#display columns with info on data type per column
housing.info()

In [None]:
#check why ocean_proximity is an object type
housing["ocean_proximity"].value_counts()

In [None]:
#quick overview on data
housing.describe()

In [None]:
#plot stuff
%matplotlib inline
import matplotlib.pyplot as plt
housing.hist(bins=50, figsize=(20,15))
plt.show()

In [None]:
#function to get test set
import numpy as np

def split_train_test(data, test_ratio, random_state=None):
    """Randomly split ``data`` into a training part and a test part.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to split; rows are selected by position.
    test_ratio : float
        Fraction of the rows that goes into the test set. The test size is
        ``int(len(data) * test_ratio)``, i.e. rounded down.
    random_state : int or None, optional
        Seed for the shuffle. With a seed, the same split is produced on
        every call. With ``None`` (the default) NumPy's global random state
        is used, so each call gives a different split.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train_set, test_set)``.
    """
    n_rows = len(data)
    if random_state is None:
        # Unseeded call: keep drawing from the global NumPy RNG.
        shuffled_indices = np.random.permutation(n_rows)
    else:
        shuffled_indices = np.random.default_rng(random_state).permutation(n_rows)
    test_set_size = int(n_rows * test_ratio)
    test_indices = shuffled_indices[:test_set_size]
    train_indices = shuffled_indices[test_set_size:]
    return data.iloc[train_indices], data.iloc[test_indices]

In [None]:
#call function to split data. test set will be 20%
train_set, test_set = split_train_test(housing, 0.2)
print(len(train_set))
print(len(test_set))

In [None]:
#function to create a crc code (unique id)
#to ensure we keep the same sample inside the test set in case we add more data in the future.
from zlib import crc32

def test_set_check(identifier, test_ratio):
    return crc32(np.int64(identifier)) & 0xffffffff < test_ratio *2**32

In [None]:
#split data taking an id field into account
def split_train_test_by_id(data, test_ratio, id_column):
    """Split ``data`` into ``(train, test)`` based on each row's identifier.

    Every value of ``data[id_column]`` is passed to ``test_set_check``;
    rows for which it returns True form the test set, the others the
    training set.
    """
    def _belongs_to_test(id_):
        return test_set_check(id_, test_ratio)

    test_mask = data[id_column].apply(_belongs_to_test)
    train_part = data.loc[~test_mask]
    test_part = data.loc[test_mask]
    return train_part, test_part

In [None]:
#new dataset with index field
housing_with_id = housing.reset_index() #adds an 'index' column
train_set, test_set = split_train_test_by_id(housing_with_id, 0.2, "index")

In [None]:
#the id is built from latitude and longitude: longitude * 1000 + latitude
housing_with_id["id"] = housing["longitude"]*1000 + housing["latitude"]
train_set, test_set = split_train_test_by_id(housing_with_id, 0.2, "id")

In [None]:
#create income "categories". Why? So that the test set can contain the same % of samples from each category as the full dataset
housing["income_cat"] = pd.cut(housing["median_income"], bins=[0.,1.5,3.0,4.5,6., np.inf],labels=[1,2,3,4,5] )
housing["income_cat"].hist()

In [None]:
from sklearn.model_selection import train_test_split

train_set, test_set = train_test_split(housing, test_size = 0.2, random_state = 42)

In [None]:
from sklearn.model_selection import StratifiedShuffleSplit

split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
# n_splits=1, so there is exactly one (train, test) pair to take.
train_index, test_index = next(split.split(housing, housing["income_cat"]))
# split() yields positional indices, so select rows by position (.iloc),
# which stays correct even if the frame's index is not 0..n-1.
strat_train_set = housing.iloc[train_index]
strat_test_set = housing.iloc[test_index]

In [None]:
#cechk population origin in test set
strat_test_set["income_cat"].value_counts() / len(strat_test_set)

In [None]:
#cechk population origin in training set
strat_train_set["income_cat"].value_counts() / len(strat_train_set)

In [None]:
#remove income_cat
# Re-assign instead of dropping in place: the two sets are slices of `housing`,
# and errors="ignore" lets this cell be re-run without raising a KeyError.
strat_train_set = strat_train_set.drop(columns="income_cat", errors="ignore")
strat_test_set = strat_test_set.drop(columns="income_cat", errors="ignore")

In [None]:
#create a copy of training set
housing = strat_train_set.copy()

In [None]:
housing.info()

In [None]:
#we have longiute and latitute. let's plot in a scatterplot!
housing.plot(kind="scatter", x="longitude", y="latitude")
#and looks like california yeaaa

In [None]:
#so let's add alpha to focus on density
housing.plot(kind="scatter", x="longitude", y="latitude", alpha=0.1)

In [None]:
#let's add more info:
#size of circle = population
#color = median house pricing (from cold (low, blue) to hot(red, high))
housing.plot(kind="scatter", x="longitude", y="latitude", alpha=0.4, s=housing["population"]/1000, label="population", figsize=(10,7), c="median_house_value",cmap=plt.get_cmap("jet"), colorbar=True)
plt.legend()

In [None]:
#Looking for correlation!
# Correlation is only defined for numbers: keep the numeric columns so the
# text column ocean_proximity does not make corr() fail on recent pandas.
corr_matrix = housing.select_dtypes(include="number").corr()

In [None]:
#print correlation with median house value ...
corr_matrix["median_house_value"].sort_values(ascending=False)

In [None]:
#another example to put in play all (or a list of desired...) values
from pandas.plotting import scatter_matrix

attributes = ["median_house_value", "median_income", "total_rooms", "housing_median_age"]
scatter_matrix(housing[attributes], figsize=(12,8))

In [None]:
#closer look to median house and icome
housing.plot(kind="scatter", x="median_income", y="median_house_value", alpha=0.1)

In [None]:
#play around with data, build more useful data
housing["rooms_per_houshold"] = housing["total_rooms"]/housing["households"]
housing["bedrooms_per_room"] = housing["total_bedrooms"]/housing["total_rooms"]
housing["population_per_houshold"] = housing["population"]/housing["households"]

In [None]:
#check the correlation matrix again!
# Keep only numeric columns so the text column ocean_proximity does not make
# corr() fail on recent pandas.
corr_matrix = housing.select_dtypes(include="number").corr()
corr_matrix["median_house_value"].sort_values(ascending=False)

In [None]:
#prepare the data for the ML algorithms
#revert to a clean training set
housing = strat_train_set.drop("median_house_value", axis=1)
housing_labels = strat_train_set["median_house_value"].copy()

In [None]:
housing_labels

In [None]:
#since some attributes have some NULL values, let's deal with them. we are going to put the median in null values for total_bedrooms
#and we are going to use scikit-learn to do that
from sklearn.impute import SimpleImputer

#create an istance
imputer = SimpleImputer(strategy="median")
#butr can be done on numeric values only, so get rid for a whuile of non-num attributes
housing_num = housing.drop("ocean_proximity", axis=1)
imputer.fit(housing_num)
#the istance has calc the median and saved in a attribute

In [None]:
imputer.statistics_

In [None]:
housing_num.median().values

In [None]:
#let's transform our set by replacing missing values
X = imputer.transform(housing_num)
#the result is a plain numpy array. let's back to a pandas DataFrame please
housing_tr = pd.DataFrame(X, columns=housing_num.columns, index=housing_num.index)

In [None]:
housing[["ocean_proximity"]].head(10)

In [None]:
housing["ocean_proximity"].head(10)

In [None]:
housing_cat = housing[["ocean_proximity"]]

from sklearn.preprocessing import OrdinalEncoder
ordinal_econder = OrdinalEncoder() #get instance
housing_cat_encoded = ordinal_econder.fit_transform(housing_cat)

#print it ...
housing_cat_encoded[:10]

In [None]:
#let's print the encoded categories, saved in the istance !
ordinal_econder.categories_

In [None]:
# Problem! ML algorithms "understand" nearby values as "more similar".
# Possible solution: one-hot encoding!
# using a TRANSFORMER of the OneHotEncoder class

from sklearn.preprocessing import OneHotEncoder
cat_encoder = OneHotEncoder()
housing_cat_1hot = cat_encoder.fit_transform(housing_cat)
housing_cat_1hot
#the reuslt is a sparse matrix SciPy

In [None]:
housing_cat_1hot[1]

In [None]:
housing_cat_1hot.toarray()

In [None]:
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin

rooms_ix, bedrooms_ix, population_ix, housolds_ix = 3, 4, 5, 6

# We inherit from two base classes and need to provide a few methods:
# 1) fit(), which just returns self
# 2) transform()
# 3) fit_transform() => if you inherit from TransformerMixin, you get it for free (usually it just calls fit() and then transform())

# BaseEstimator: base class for estimators
# TransformerMixin: provides fit_transform()

class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    """Transformer that appends ratio columns to a 2-D array.

    Two columns are always appended: rooms per household and population per
    household. A third, bedrooms per room, is appended when
    ``add_bedrooms_per_room`` is true. The source columns are located with
    the module-level indices ``rooms_ix``, ``bedrooms_ix``, ``population_ix``
    and ``housolds_ix``.
    """

    def __init__(self, add_bedrooms_per_room = True):  # explicit keyword only: no *args or **kwargs
        self.add_bedrooms_per_room = add_bedrooms_per_room

    def fit(self, X, y = None):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def transform(self, X):
        """Return ``X`` with the ratio columns appended on the right."""
        households = X[:, housolds_ix]
        rooms = X[:, rooms_ix]
        extra_columns = [rooms / households, X[:, population_ix] / households]
        if self.add_bedrooms_per_room:
            extra_columns.append(X[:, bedrooms_ix] / rooms)
        # np.c_[a, b, c] is the same as np.c_[(a, b, c)]
        return np.c_[(X, *extra_columns)]

#to use it in the exercise 

attr_adder = CombinedAttributesAdder(add_bedrooms_per_room=False)
housing_extra_attribs = attr_adder.transform(housing.values)

In [None]:
#Transformation pipelines!

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

#let's build a pipeleine for our numerical columns

num_pipeline = Pipeline([('imputer', SimpleImputer(strategy="median")), ('attribs_adder', CombinedAttributesAdder()),('std_scaler', StandardScaler()),])

In [None]:
# and then let's add our categorial columns
# we have just one, and we want the 1 hot encoder

from sklearn.compose import ColumnTransformer

num_attribs = list(housing_num)   # list with the numerical attributes(columns)
cat_attribs = ["ocean_proximity"]

full_pipeline = ColumnTransformer([("num", num_pipeline, num_attribs), ("cat", OneHotEncoder(), cat_attribs),])

housing_prepared = full_pipeline.fit_transform(housing)

In [None]:
housing_prepared

In [None]:
#Training and evaluating on the Training Set!
# train a regression model!

from sklearn.linear_model import LinearRegression

lin_reg = LinearRegression() #istance
lin_reg.fit(housing_prepared, housing_labels)

In [None]:
# Done! let's try out a few istances from the training set
some_data = housing.iloc[:5]
some_labels = housing_labels.iloc[:5]
some_data_prepared = full_pipeline.transform(some_data)
print("Predictions: ", lin_reg.predict(some_data_prepared)) # print predictions for our training set
print("Labels: ", list(some_labels)) # and the actual values

In [None]:
# Lets calc the RMSE we got

from sklearn.metrics import mean_squared_error

housing_predictions = lin_reg.predict(housing_prepared)
lin_mse = mean_squared_error(housing_labels, housing_predictions)
lin_rmse = np.sqrt(lin_mse)

lin_rmse

In [None]:
# So our typical prediction error is about $68,628: quite large, since most districts have a median_house_value between $120,000 and $265,000!
# The model is UNDERFITTING the training data.
# Two possible causes:
# 1) the features do not provide enough information
# 2) the model is not powerful enough

In [None]:
#let's see with another model

from sklearn.tree import DecisionTreeRegressor

# Fixed seed (same value as the splits above) so the tree is reproducible.
tree_reg = DecisionTreeRegressor(random_state=42)
tree_reg.fit(housing_prepared, housing_labels)

In [None]:
housing_predictions = tree_reg.predict(housing_prepared)
tree_mse = mean_squared_error(housing_labels, housing_predictions)
tree_rmse = np.sqrt(tree_mse)

tree_rmse

In [None]:
# Wow, 0.0! That rocks... or is it an overfitting problem?
# Let's use scikit-learn's cross-validation feature:
# it divides the training set into 10 folds,
# then uses 9 for training and 1 for validation.
# The result is an array of scores.

In [None]:
from sklearn.model_selection import cross_val_score

scores = cross_val_score(tree_reg, housing_prepared, housing_labels, scoring="neg_mean_squared_error", cv = 10)
tree_rmse_scores = np.sqrt(-scores) #yeah, negative of it !

In [None]:
def print_scores(scores):
    """Print the scores followed by their mean and standard deviation."""
    summary = (
        ("Scores:", scores),
        ("Mean:", scores.mean()),
        ("Standard deviation:", scores.std()),
    )
    for label, value in summary:
        print(label, value)

In [None]:
print_scores(tree_rmse_scores)

In [None]:
# Wow, this seems even worse than linear regression,
# where we got an RMSE of 68628.19819848922 on the training set.
# Let's run cross-validation on the linear regression as well and compare the results.


In [None]:
lin_scores = cross_val_score(lin_reg, housing_prepared, housing_labels, scoring="neg_mean_squared_error", cv = 10)
lin_rmse_scores = np.sqrt(-lin_scores) #yeah, negative of it !
print_scores(lin_rmse_scores)

In [None]:
# Slightly better.
# The mean is lower, the standard deviation is higher.
# So let's try another model: a random forest regressor

from sklearn.ensemble import RandomForestRegressor

# Fixed seed (same value as the splits above) so the forest is reproducible.
forest_reg = RandomForestRegressor(random_state=42)
forest_reg.fit(housing_prepared, housing_labels)

In [None]:
forest_scores = cross_val_score(forest_reg, housing_prepared, housing_labels, scoring="neg_mean_squared_error", cv = 10)
forest_rmse_scores = np.sqrt(-forest_scores) #yeah, negative of it !
print_scores(forest_rmse_scores)

In [None]:
# yeah better than linear and decision tree

In [None]:
# Fine-Tune your model
# use the GridSearchCV function from scikit to test out hyperparameter

from sklearn.model_selection import GridSearchCV

# Two grids: 3 x 4 combinations with the default bootstrap setting,
# then 2 x 3 combinations with bootstrap turned off.
param_grid = [
    {'n_estimators': [3, 10, 30], 'max_features': [2, 4, 6, 8]},
    {'bootstrap': [False], 'n_estimators': [3, 10], 'max_features': [2, 3, 4]},
]

# Fixed seed so every candidate is trained reproducibly and the scores
# differ only because of the hyperparameters.
forest_reg = RandomForestRegressor(random_state=42)
grid_search = GridSearchCV(forest_reg, param_grid, cv=5, scoring='neg_mean_squared_error', return_train_score=True)
grid_search.fit(housing_prepared, housing_labels)

In [None]:
# let's get the best result:
grid_search.best_params_

In [None]:
# get best estimators
grid_search.best_estimator_

In [None]:
cvres = grid_search.cv_results_

cvres["mean_test_score"]
cvres["params"]

In [None]:
# get results for all combination

for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
    print(np.sqrt(-mean_score), params)

In [None]:
# evaluate your system on the test set

# Use the best estimator found by the grid search as the final model.
final_model = grid_search.best_estimator_

# Separate the test-set features from the target column.
X_test = strat_test_set.drop("median_house_value", axis = 1)
y_test = strat_test_set["median_house_value"].copy()

# transform() only (no fitting): the pipeline was already fitted on the training data.
X_test_prepared = full_pipeline.transform(X_test)

final_predictions = final_model.predict(X_test_prepared)

# RMSE on the test set: square root of the mean squared error.
final_mse = mean_squared_error(y_test, final_predictions)
final_rmse = np.sqrt(final_mse)

In [None]:
final_rmse