In [None]:
import os
import tarfile
import urllib
import urllib.request
# Local directory, relative to the working directory, that holds the dataset.
HOUSING_PATH = os.path.join("datasets", "housing")
# Base URL of the repository that serves the raw dataset files.
DOWNLOAD_ROOT = "https://raw.githubusercontent.com/ageron/handson-ml2/master/"
# Full URL of the compressed housing archive.
HOUSING_URL = DOWNLOAD_ROOT + "datasets/housing/housing.tgz"

In [None]:
def fetch_housing_data(housing_url=HOUSING_URL, housing_path=HOUSING_PATH):
    """Download the housing archive and unpack it into ``housing_path``.

    Parameters
    ----------
    housing_url : str
        URL of the ``housing.tgz`` archive to download.
    housing_path : str
        Directory that receives both the downloaded archive and its
        extracted contents. It is created if it does not exist yet.
    """
    os.makedirs(housing_path, exist_ok=True)
    tgz_path = os.path.join(housing_path, "housing.tgz")
    urllib.request.urlretrieve(housing_url, tgz_path)
    # The context manager closes the archive even if extraction fails.
    # NOTE: extractall() trusts the member paths stored in the archive, so
    # only point housing_url at a source you trust.
    with tarfile.open(tgz_path) as housing_tgz:
        housing_tgz.extractall(path=housing_path)

fetch_housing_data()

In [None]:
import pandas as pd
def load_housing_data(housing_path=HOUSING_PATH):
    """Read ``housing.csv`` from ``housing_path`` and return it as a DataFrame."""
    return pd.read_csv(os.path.join(housing_path, "housing.csv"))

In [None]:
housing = load_housing_data()
housing.head()

In [None]:
housing.info()

In [None]:
#Ocean proximity is a categorical attribute.
housing["ocean_proximity"].value_counts()

In [None]:
#Describe method to show summary of numerical attributes.
housing.describe()
# The count of total_bedrooms is 20,433, not 20,640: some districts are missing this attribute.

In [None]:
#Another quick way to get a feel of the data you are dealing with is to plot a histogram for each numerical attribute. 

import matplotlib.pyplot as plt 
housing.hist(bins=50, figsize=(20,15)) 
plt.show()
# Note that median_income has been scaled and capped: at 15 for the higher values and at 0.5 for the lower values.
# The housing median age and the median house value are also capped. ML algorithms may learn that those values never go beyond that limit. Either collect proper labels for the capped districts or remove those districts.
# Many histograms are tail-heavy, which can make it a bit harder for some ML algorithms to detect patterns.


In [None]:
#create a test set
import numpy as np 
def split_train_test(data, test_ratio, random_state=None):
    """Randomly split ``data`` into a training set and a test set.

    Parameters
    ----------
    data : pandas.DataFrame
        Rows to split; they are selected by position with ``iloc``.
    test_ratio : float
        Fraction of the rows, between 0 and 1, that goes to the test set.
        The test set holds ``int(len(data) * test_ratio)`` rows.
    random_state : int, optional
        Seed for the shuffle. When given, the same split is produced on
        every call; when omitted, NumPy's global random state is used, so
        the split changes from run to run.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train_set, test_set)``.

    Raises
    ------
    ValueError
        If ``test_ratio`` is outside ``[0, 1]``.
    """
    if not 0 <= test_ratio <= 1:
        raise ValueError("test_ratio must be between 0 and 1")
    if random_state is None:
        shuffled_indices = np.random.permutation(len(data))
    else:
        # A private generator keeps the global NumPy state untouched.
        rng = np.random.RandomState(random_state)
        shuffled_indices = rng.permutation(len(data))
    test_set_size = int(len(data) * test_ratio)
    test_indices = shuffled_indices[:test_set_size]
    train_indices = shuffled_indices[test_set_size:]
    return data.iloc[train_indices], data.iloc[test_indices]

train_set, test_set = split_train_test(housing, 0.2) 
len(train_set)      

In [None]:
len(test_set) 

In [None]:
from sklearn.model_selection import train_test_split
train_set, test_set = train_test_split(housing, test_size=0.2, random_state=42) 

In [None]:
#median income is a very important attribute to predict median housing prices. pd.cut() function is used to create an income category with five categories( 1 to 5)

# Bucket median_income into five bins, labelled 1 to 5, then plot how many
# districts fall into each bin.
housing["income_cat"] = pd.cut(
    housing["median_income"],
    bins=[0., 1.5, 3.0, 4.5, 6., np.inf],
    labels=[1, 2, 3, 4, 5],
)
housing["income_cat"].hist()


In [None]:
#Stratified sampling
#With stratified sampling, the researcher divides the population into separate groups, called strata. Then, a probability sample (often a simple random sample ) is drawn from each group.
from sklearn.model_selection import StratifiedShuffleSplit
# Hold out 20% of the districts, stratified on the income category.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
# n_splits=1, so the generator yields exactly one (train, test) pair.
train_index, test_index = next(split.split(housing, housing["income_cat"]))
# split() yields positional indices, so select the rows by position with
# iloc; loc would look the same numbers up as index labels instead.
strat_train_set = housing.iloc[train_index]
strat_test_set = housing.iloc[test_index]
# Share of each income category in the test set.
strat_test_set["income_cat"].value_counts() / len(strat_test_set)

In [None]:
#Remove income_cat attribute so that data is back to its original state
# Build new frames without the helper column instead of dropping in place.
# errors="ignore" lets this cell be re-run after the column is already gone.
strat_train_set = strat_train_set.drop(columns="income_cat", errors="ignore")
strat_test_set = strat_test_set.drop(columns="income_cat", errors="ignore")

In [None]:
#Visualize the data to gain Insights. Let’s create a copy so that you can play with it without harming the training set
housing = strat_train_set.copy() 


In [None]:
#Visualizing geographical data
housing.plot(kind="scatter", x="longitude", y="latitude")


In [None]:
#hard to see any particular pattern. Setting the alpha option to 0.1
housing.plot(kind="scatter", x="longitude", y="latitude", alpha=0.1)


In [None]:
#Now let’s look at the housing prices. The radius of each circle represents the district’s population (option s), and the color represents the price (option c). We will use a predefined color map (option cmap) called jet, which ranges from blue (low values) to red (high prices):
# Marker size follows the district population; marker color follows the
# median house value on the "jet" color map.
housing.plot(
    kind="scatter",
    x="longitude",
    y="latitude",
    alpha=0.4,
    s=housing["population"] / 100,
    label="population",
    figsize=(10, 7),
    c="median_house_value",
    cmap=plt.get_cmap("jet"),
    colorbar=True,
)
plt.legend()



In [None]:
#Looking for correlations
# Correlations are only defined for numeric columns. Select them explicitly
# so the text column ocean_proximity cannot make corr() fail.
corr_matrix = housing.select_dtypes(include=[np.number]).corr()
corr_matrix["median_house_value"].sort_values(ascending=False)
# the median house value tends to go up when the median income goes up

In [None]:
#Another way to check correlation
from pandas.plotting import scatter_matrix
attributes = ["median_house_value", "median_income", "total_rooms",              "housing_median_age"] 
scatter_matrix(housing[attributes], figsize=(12, 8))


In [None]:
#The most promising attribute to predict the median house value is the median income, so let’s zoom in on their correlation scatterplot 
housing.plot(kind="scatter", x="median_income", y="median_house_value",             alpha=0.1)
#the correlation is indeed very strong


In [None]:
#Experimenting with Attribute Combinations 
# Ratios that may be more informative than the raw per-district totals.
housing["rooms_per_household"] = housing["total_rooms"] / housing["households"]
housing["bedrooms_per_room"] = housing["total_bedrooms"] / housing["total_rooms"]
housing["population_per_household"] = housing["population"] / housing["households"]
# Correlations are only defined for numeric columns. Select them explicitly
# so the text column ocean_proximity cannot make corr() fail.
corr_matrix = housing.select_dtypes(include=[np.number]).corr()
corr_matrix["median_house_value"].sort_values(ascending=False)
# houses with a lower bedroom/room ratio tend to be more expensive

In [None]:
#Prepare the Data for Machine Learning Algorithms 
#separate the predictors and the labels
housing = strat_train_set.drop("median_house_value", axis=1) 
housing_labels = strat_train_set["median_house_value"].copy() 


In [None]:
#Data Cleaning
# ML algorithms cannot work with missing features, so let’s create a few functions to take care of them
# Three options for the missing total_bedrooms values:
#   1. Get rid of the corresponding districts.
#   2. Get rid of the whole attribute.
#   3. Set the missing values to some value (zero, the mean, the median, etc.).
# housing.dropna(subset=["total_bedrooms"])    # option 1
# housing.drop("total_bedrooms", axis=1)       # option 2
# median = housing["total_bedrooms"].median()  # option 3
# housing["total_bedrooms"].fillna(median, inplace=True)
# Option 3 is used here.
# The median that a manual fillna() would use. It is computed for reference
# only: no fillna() is called here, and the SimpleImputer fitted below is
# what supplies the replacement values.
median = housing["total_bedrooms"].median()  # option 3, value only
from sklearn.impute import SimpleImputer
# Imputer configured with the "median" strategy.
imputer = SimpleImputer(strategy="median")
# Fit on the data without the text column ocean_proximity.
housing_num = housing.drop("ocean_proximity", axis=1)
imputer.fit(housing_num)

In [None]:
 imputer.statistics_ 

In [None]:
 housing_num.median().values
 #

In [None]:
#Now you can use this “trained” imputer to transform the training set by replacing missing values with the learned medians
X = imputer.transform(housing_num) 
#
housing_tr = pd.DataFrame(X, columns=housing_num.columns,
                          index=housing_num.index)


In [None]:
#Handling Text and Categorical Attributes
#Lets look at text attributes

housing_cat = housing[["ocean_proximity"]]  
housing_cat.head(10)
#There is no arbitrary text. So this attribute is categorical attribute

In [None]:
# Ordinal encoding: convert each category of this attribute to a number

from sklearn.preprocessing import OrdinalEncoder
ordinal_encoder = OrdinalEncoder() 
housing_cat_encoded = ordinal_encoder.fit_transform(housing_cat)
housing_cat_encoded[:10] 

In [None]:
 ordinal_encoder.categories_ 
 # One issue: ML algorithms will assume that two nearby values are more similar than two distant values. That is fine for ordered categories (e.g. "bad", "average", "good"), but not for ocean_proximity. To fix this, create one binary attribute per category.

In [None]:
from sklearn.preprocessing import OneHotEncoder
cat_encoder = OneHotEncoder()
housing_cat_1hot = cat_encoder.fit_transform(housing_cat)
housing_cat_1hot 

In [None]:
 housing_cat_1hot.toarray() 

In [None]:
cat_encoder.categories_ 

In [None]:
#Custom Transformers
#transformer that adds the combined attributes

               

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin
# Column positions read by CombinedAttributesAdder.transform().
# NOTE(review): presumably these match the column order of the numerical
# housing attributes -- verify if the columns are ever reordered.
rooms_ix, bedrooms_ix, population_ix, households_ix = 3, 4, 5, 6


class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    """Transformer that appends ratio attributes to a 2-D data array.

    Two columns are always appended, rooms per household and population per
    household. A third one, bedrooms per room, is appended when
    ``add_bedrooms_per_room`` is true.

    Parameters
    ----------
    add_bedrooms_per_room : bool, default True
        Whether ``transform`` also appends the bedrooms-per-room ratio.
    """

    def __init__(self, add_bedrooms_per_room=True):
        self.add_bedrooms_per_room = add_bedrooms_per_room

    def fit(self, X, y=None):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def transform(self, X):
        """Return ``X`` with the ratio columns appended on the right.

        ``X`` is indexed by column position, so it must be 2-D and contain
        the columns referred to by the ``*_ix`` constants.
        """
        # Accept DataFrames and nested sequences as well as arrays; an
        # ndarray is passed through unchanged.
        X = np.asarray(X)
        rooms_per_household = X[:, rooms_ix] / X[:, households_ix]
        population_per_household = X[:, population_ix] / X[:, households_ix]
        if self.add_bedrooms_per_room:
            bedrooms_per_room = X[:, bedrooms_ix] / X[:, rooms_ix]
            return np.c_[X, rooms_per_household, population_per_household,
                         bedrooms_per_room]
        return np.c_[X, rooms_per_household, population_per_household]

attr_adder = CombinedAttributesAdder(add_bedrooms_per_room=False) 
housing_extra_attribs = attr_adder.transform(housing.values)   
     

In [None]:
#Feature Scaling
#to get all attributes to have the same scale, we choose standardization
#Transformation Pipelines - To execute data transformation steps in right order
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
# Steps run in the listed order: median imputation, then the combined
# attributes, then standardization.
num_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy="median")),
    ('attribs_adder', CombinedAttributesAdder()),
    ('std_scaler', StandardScaler()),
])
housing_num_tr = num_pipeline.fit_transform(housing_num)

In [None]:
# Handle both the numerical and the categorical columns with a single transformer
from sklearn.compose import ColumnTransformer

num_attribs = list(housing_num)
cat_attribs = ["ocean_proximity"]

full_pipeline = ColumnTransformer([
            ("num", num_pipeline, num_attribs),
            ("cat", OneHotEncoder(), cat_attribs),
        ])
housing_prepared = full_pipeline.fit_transform(housing) 


In [None]:
#Select and Train a model on Training set
#Linear Regression
from sklearn.linear_model import LinearRegression
lin_reg = LinearRegression() 
lin_reg.fit(housing_prepared, housing_labels) 

In [None]:
#Lets try few instances of linear regression on training set
some_data = housing.iloc[:5] 
some_labels = housing_labels.iloc[:5]
some_data_prepared = full_pipeline.transform(some_data)
print("Predictions:", lin_reg.predict(some_data_prepared))  

In [None]:
print("Labels:", list(some_labels)) 
# The predictions are not very accurate compared with the labels.

In [None]:
#RMSE Error
from sklearn.metrics import mean_squared_error
housing_predictions = lin_reg.predict(housing_prepared)
lin_mse = mean_squared_error(housing_labels, housing_predictions)
lin_rmse = np.sqrt(lin_mse)
lin_rmse  
# Most districts' median_housing_values range between $120,000 and $265,000, so a typical prediction error of $68,628 is not very satisfying.

In [None]:
#Decision Tree
from sklearn.tree import DecisionTreeRegressor
# Fix the seed so the fitted tree, and every score derived from it, is the
# same on each run (the data splits above already use random_state=42).
tree_reg = DecisionTreeRegressor(random_state=42)
tree_reg.fit(housing_prepared, housing_labels)

In [None]:
housing_predictions = tree_reg.predict(housing_prepared)
tree_mse = mean_squared_error(housing_labels, housing_predictions)
tree_rmse = np.sqrt(tree_mse)
tree_rmse

In [None]:
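# The error is 0, which more likely means the model has badly overfit the training data than that it is perfect. Let's evaluate it using cross-validation instead.
# Using the K-fold cross-validation feature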
from sklearn.model_selection import cross_val_score
# The scoring function is the negated MSE, so flip the sign before taking
# the square root to obtain one RMSE per fold.
scores = cross_val_score(
    tree_reg,
    housing_prepared,
    housing_labels,
    scoring="neg_mean_squared_error",
    cv=10,
)
tree_rmse_scores = np.sqrt(-scores)

In [None]:
#Lets look at the results
def display_scores(scores):
    """Print the scores followed by their mean and standard deviation.

    ``scores`` must provide ``mean()`` and ``std()`` methods, as a NumPy
    array does.
    """
    summary = (
        ("Scores:", scores),
        ("Mean:", scores.mean()),
        ("Standard deviation:", scores.std()),
    )
    for label, value in summary:
        print(label, value)

display_scores(tree_rmse_scores)
#Worse than Linear Regression

In [None]:
#scores for Linear Regression
# The scoring function is the negated MSE, so flip the sign before taking
# the square root to obtain one RMSE per fold.
lin_scores = cross_val_score(
    lin_reg,
    housing_prepared,
    housing_labels,
    scoring="neg_mean_squared_error",
    cv=10,
)
lin_rmse_scores = np.sqrt(-lin_scores)
display_scores(lin_rmse_scores)

In [None]:
#Random Forest Regressor
from sklearn.ensemble import RandomForestRegressor
# Fix the seed so the fitted forest, and every score derived from it, is the
# same on each run (the data splits above already use random_state=42).
forest_reg = RandomForestRegressor(random_state=42)
forest_reg.fit(housing_prepared, housing_labels)

In [None]:
housing_predictions = forest_reg.predict(housing_prepared)
forest_mse = mean_squared_error(housing_labels, housing_predictions)
forest_rmse = np.sqrt(forest_mse)
forest_rmse

In [57]:
# The scoring function is the negated MSE, so flip the sign before taking
# the square root to obtain one RMSE per fold.
scores = cross_val_score(
    forest_reg,
    housing_prepared,
    housing_labels,
    scoring="neg_mean_squared_error",
    cv=10,
)
forest_rmse_scores = np.sqrt(-scores)

In [None]:
display_scores(forest_rmse_scores) 