In [None]:
import pandas as pd

# Load the raw housing CSV, then print the column summary and preview the
# first rows (head() is rendered as the cell's final expression).
HOUSING_CSV = 'datasets/housing/housing.csv'
housing_df = pd.read_csv(HOUSING_CSV)

housing_df.info()
housing_df.head()

In [None]:
# A bare expression is only rendered when it is the last statement of a cell,
# so the category counts are shown explicitly with display() (provided as a
# builtin by the IPython/Jupyter kernel). describe() remains the cell's final
# expression and is rendered as before.
display(housing_df['ocean_proximity'].value_counts())
housing_df.describe()

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt

# Histogram grid of the DataFrame's columns, 50 bins each, on a 20x15 figure.
housing_df.hist(figsize=(20, 15), bins=50)
plt.show()

In [None]:
# Stratified Sampling of the "median_income"
import numpy as np

# Bucket median_income into five ordinal categories labelled 1-5. The last
# bin edge is +inf, so every income above 6 lands in category 5.
income_bin_edges = [0, 1.5, 3.0, 4.5, 6, np.inf]
income_bin_labels = [1, 2, 3, 4, 5]
housing_df['income_cat'] = pd.cut(
    housing_df['median_income'],
    bins=income_bin_edges,
    labels=income_bin_labels,
)

# Distribution of the new income categories.
housing_df['income_cat'].hist()

In [None]:
# Random Split
from sklearn.model_selection import train_test_split

# Purely random 80/20 split with a fixed seed so the result is repeatable.
train_set, test_set = train_test_split(
    housing_df,
    train_size=0.8,
    random_state=42,
)

# Income-category distribution of the randomly drawn test set.
test_set['income_cat'].hist()

In [None]:
# Stratified Split
from sklearn.model_selection import StratifiedShuffleSplit

# A single 80/20 split stratified on income_cat, with a fixed seed.
split = StratifiedShuffleSplit(n_splits=1, train_size=0.8, random_state=42)

# split() yields *positional* row indices, so they are applied with .iloc.
# Label-based .loc only gives the same rows while housing_df keeps its
# default 0..n-1 index. With n_splits=1 there is exactly one pair, so next()
# replaces the one-iteration loop.
train_index, test_index = next(split.split(housing_df, housing_df['income_cat']))
strat_train_set = housing_df.iloc[train_index]
strat_test_set = housing_df.iloc[test_index]

# Non-final bare expressions are not rendered by the notebook, so the set
# sizes are printed explicitly.
print(f"test rows: {len(strat_test_set)}, train rows: {len(strat_train_set)}")

# Income-category distributions of the two stratified sets.
strat_test_set['income_cat'].hist()
strat_train_set['income_cat'].hist()

# income_cat was only needed for stratification. Remove it by rebinding to
# new frames instead of mutating frames derived from housing_df in place.
strat_train_set = strat_train_set.drop(columns='income_cat')
strat_test_set = strat_test_set.drop(columns='income_cat')

In [None]:
# Visual Exploration

# Explore on a copy so the stratified training set itself is left untouched.
housing = strat_train_set.copy()

# Geographic scatter; the low alpha lets overlapping points show density.
housing.plot.scatter(x='longitude', y='latitude', alpha=0.2)

In [None]:
# Same geographic scatter, with marker size taken from population / 100 and
# marker colour from median_house_value on the 'jet' colormap.
bubble_sizes = housing['population'] / 100
housing.plot.scatter(
    x='longitude', y='latitude',
    s=bubble_sizes, c='median_house_value',
    cmap=plt.get_cmap('jet'), colorbar=True,
    alpha=0.5, label='population', figsize=(9, 6),
)

In [None]:
# Correlations

# housing still contains the text column ocean_proximity. Recent pandas
# versions raise when DataFrame.corr() meets non-numeric data instead of
# silently skipping it, so restrict the frame to numeric columns first.
corr_matrix = housing.select_dtypes(include='number').corr()

# Correlation of every numeric attribute with the target, ascending.
corr_matrix['median_house_value'].sort_values()

In [None]:
from pandas.plotting import scatter_matrix

# Pairwise scatter plots for the target and three selected attributes.
attributes = [
    'median_house_value',
    'median_income',
    'total_rooms',
    'housing_median_age',
]
scatter_matrix(housing.loc[:, attributes])

In [None]:
# Creating New, Useful Attributes
# Ratio features derived from the raw count columns.
housing['rooms_per_household'] = housing['total_rooms'] / housing['households']
housing['bedrooms_per_room'] = housing['total_bedrooms'] / housing['total_rooms']
housing['population_per_household'] = housing['population'] / housing['households']

# housing still contains the text column ocean_proximity, which makes
# DataFrame.corr() raise on recent pandas versions; correlate the numeric
# columns only, then list each attribute's correlation with the target.
housing.select_dtypes(include='number').corr()['median_house_value'].sort_values()

In [None]:
# DATA PREPARATION

# Split the stratified training set into the target column (labels) and the
# remaining columns (predictors). Neither statement modifies strat_train_set.
housing_labels = strat_train_set['median_house_value']
housing = strat_train_set.drop(columns='median_house_value')

In [None]:
# Filling up empty values
from sklearn.impute import SimpleImputer

# Leave out the ocean_proximity column before imputing.
housing_num = housing.drop(columns='ocean_proximity')

# Median imputation: fit on housing_num and transform it in one call.
imputer = SimpleImputer(strategy='median')
X = imputer.fit_transform(housing_num)

In [23]:
# OneHotEncoding the categorical 'ocean_proximity'
from sklearn.preprocessing import OneHotEncoder

# Double brackets select a one-column DataFrame rather than a Series.
ocean_proximity_frame = housing[["ocean_proximity"]]

oneHotEncoder = OneHotEncoder()
housing_cat_1hot = oneHotEncoder.fit_transform(ocean_proximity_frame)
# After fitting, oneHotEncoder.categories_ lists the categories the encoder found.


In [35]:
# Custom Transformer
# CombinedAttributesAdder comes from a project-local module; its
# implementation is not visible in this notebook.
from CombinedAttributesAdder import CombinedAttributesAdder
# NOTE(review): the keyword is spelled `add_bedrroms_per_room`. It looks like
# a typo, but it has to match the parameter name declared in the
# CombinedAttributesAdder module — confirm there before renaming it.
combinedAttributesAdder = CombinedAttributesAdder(add_bedrroms_per_room=True)
# housing.values still includes the ocean_proximity strings, which is why the
# recorded output below is an object-dtype array.
housing_extra_attributes = combinedAttributesAdder.fit_transform(housing.values)
# Preview the first five transformed rows.
housing_extra_attributes[:5]

array([[-121.89, 37.29, 38.0, 1568.0, 351.0, 710.0, 339.0, 2.7042,
        '<1H OCEAN', 4.625368731563422, 2.094395280235988,
        0.22385204081632654],
       [-121.93, 37.05, 14.0, 679.0, 108.0, 306.0, 113.0, 6.4214,
        '<1H OCEAN', 6.008849557522124, 2.7079646017699117,
        0.15905743740795286],
       [-117.2, 32.77, 31.0, 1952.0, 471.0, 936.0, 462.0, 2.8621,
        'NEAR OCEAN', 4.225108225108225, 2.0259740259740258,
        0.24129098360655737],
       [-119.61, 36.31, 25.0, 1847.0, 371.0, 1460.0, 353.0, 1.8839,
        'INLAND', 5.232294617563739, 4.135977337110481,
        0.20086626962642123],
       [-118.59, 34.23, 17.0, 6592.0, 1525.0, 4459.0, 1463.0, 3.0347,
        '<1H OCEAN', 4.50580997949419, 3.047846889952153,
        0.23134101941747573]], dtype=object)

In [43]:
# Transformation Pipelines
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# For the numerical data: median imputation, then the custom attribute adder,
# then standard scaling, in that order.
num_steps = [
    ('imputer', SimpleImputer(strategy='median')),
    ('attribs_adder', CombinedAttributesAdder()),
    ('std_scaler', StandardScaler()),
]
num_pipeline = Pipeline(steps=num_steps)

housing_num_tr = num_pipeline.fit_transform(housing_num)

# Column names of the numeric input frame.
list(housing_num)

['longitude',
 'latitude',
 'housing_median_age',
 'total_rooms',
 'total_bedrooms',
 'population',
 'households',
 'median_income']

In [48]:
# Numerical and Categorical Transformer together
from sklearn.compose import ColumnTransformer

cat_attributes = ['ocean_proximity']
num_attributes = list(housing_num)

# Send the numeric columns through num_pipeline and the categorical column
# through a fresh OneHotEncoder.
full_pipeline = ColumnTransformer(transformers=[
    ('num', num_pipeline, num_attributes),
    ('cat', OneHotEncoder(), cat_attributes),
])

housing_prepared = full_pipeline.fit_transform(housing)
housing_prepared

array([[-1.15604281,  0.77194962,  0.74333089, ...,  0.        ,
         0.        ,  0.        ],
       [-1.17602483,  0.6596948 , -1.1653172 , ...,  0.        ,
         0.        ,  0.        ],
       [ 1.18684903, -1.34218285,  0.18664186, ...,  0.        ,
         0.        ,  1.        ],
       ...,
       [ 1.58648943, -0.72478134, -1.56295222, ...,  0.        ,
         0.        ,  0.        ],
       [ 0.78221312, -0.85106801,  0.18664186, ...,  0.        ,
         0.        ,  0.        ],
       [-1.43579109,  0.99645926,  1.85670895, ...,  0.        ,
         1.        ,  0.        ]])