In [26]:
import os
import urllib
import tarfile

In [27]:
import pickle
import pandas as pd
import numpy as np

In [28]:
DOWNLOAD_ROOT = "https://raw.githubusercontent.com/ageron/handson-ml2/master/"
FILE_PATH = os.path.join("datasets", "housing")
FILE_URL = DOWNLOAD_ROOT + "datasets/housing/housing.tgz"
TGZ_NAME = "housing.tgz"

In [29]:
def fetch_data(file_url, file_path, file_name):
    os.makedirs(file_path, exist_ok=True)
    tgz_path = os.path.join(file_path, file_name)
    urllib.request.urlretrieve(file_url, tgz_path)
    file_tgz = tarfile.open(tgz_path)
    file_tgz.extractall(path=file_path)
    file_tgz.close()

In [30]:
fetch_data(FILE_URL, FILE_PATH, TGZ_NAME)

In [31]:
def load_data(file_path, file_name):
    csv_path = os.path.join(file_path, file_name)
    return pd.read_csv(csv_path)

In [32]:
CSV_NAME = "housing.csv"
df = load_data(FILE_PATH, CSV_NAME)
df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [33]:
# make a copy of the train set
house_df = df.copy()

In [34]:
sample = house_df.iloc[:5]

In [35]:
sample.shape

(5, 10)

In [36]:
house_labels = house_df["median_house_value"].copy()
house_df = house_df.drop("median_house_value", axis=1)

In [37]:
house_df.shape

(20640, 9)

In [48]:
house_df

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,NEAR BAY
...,...,...,...,...,...,...,...,...,...
20635,-121.09,39.48,25.0,1665.0,374.0,845.0,330.0,1.5603,INLAND
20636,-121.21,39.49,18.0,697.0,150.0,356.0,114.0,2.5568,INLAND
20637,-121.22,39.43,17.0,2254.0,485.0,1007.0,433.0,1.7000,INLAND
20638,-121.32,39.43,18.0,1860.0,409.0,741.0,349.0,1.8672,INLAND


In [38]:
house_cat = house_df[["ocean_proximity"]]
house_num = house_df.drop("ocean_proximity", axis=1)

In [40]:
house_cat.shape

(20640, 1)

In [41]:
house_num.shape

(20640, 8)

In [39]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor

In [42]:
class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    def __init__(self, add_bedrooms_per_room = True):
        self.add_bedrooms_per_room = add_bedrooms_per_room
        self.rooms_idx, self.bedrooms_idx, self.population_idx, self.households_idx = 3, 4, 5, 6
    def fit(self, X, y=None):
        return self
    def transform(self, X, y=None):
        rooms_per_household = X[:, self.rooms_idx]/X[:, self.households_idx]
        population_per_household = X[:, self.population_idx]/X[:, self.households_idx]
        if self.add_bedrooms_per_room:
            bedrooms_per_room = X[:, self.bedrooms_idx]/X[:, self.rooms_idx]
            return np.c_[X, rooms_per_household, population_per_household, bedrooms_per_room]
        else:
            return np.c_[X, rooms_per_household, population_per_household]

In [43]:
num_pl = Pipeline([
    ('imputer', SimpleImputer(strategy="median")),
    ('attrib_adder', CombinedAttributesAdder()),
    ('minmax_scaler', MinMaxScaler()),
])

In [44]:
num_attribs = list(house_num)
cat_attribs = ["ocean_proximity"]

In [45]:
full_pl = ColumnTransformer([
    ("num", num_pl, num_attribs),
    ("cat", OneHotEncoder(), cat_attribs)
])

In [46]:
house_prep = full_pl.fit_transform(house_df)

In [47]:
house_prep.shape

(20640, 16)

In [18]:
param_distribs = {
    'n_estimators': randint(low=1, high=200),
    'max_features': randint(low=1, high=8),
}

forest_reg = RandomForestRegressor(random_state=2)
rnd_search = RandomizedSearchCV(forest_reg, param_distributions=param_distribs,
                               n_iter=10, cv=3, scoring='neg_mean_squared_error',
                               random_state=2)
rnd_search.fit(house_prep, house_labels)

In [19]:
prep_select_and_predict_pl = Pipeline([
    ('preparation', full_pl),
    ('forest_reg', RandomForestRegressor(**rnd_search.best_params_)),
])

In [20]:
prep_select_and_predict_pl.fit(house_df, house_labels)

In [21]:
print("Predictions:\t", prep_select_and_predict_pl.predict(sample))

Predictions:	 [431692.72483221 384699.4295302  386686.7114094  332774.51677852
 309201.34899329]


In [22]:
with open(r'C:\Users\chonl\OneDrive\Documents\GitHub\House price predictor\ML models\rndf_house_price_estimator.pkl', 'wb') as f:
    pickle.dump(prep_select_and_predict_pl, f)

In [23]:

def check_library_versions():
    libraries = {
        'os': os,
        'urllib': urllib,
        'tarfile': tarfile,
        'pickle': pickle,
        'pandas': pd,
        'numpy': np,
        'sklearn': {
            'base': BaseEstimator,
            'preprocessing': {
                'OneHotEncoder': OneHotEncoder,
                'SimpleImputer': SimpleImputer
            },
            'model_selection': {
                'RandomizedSearchCV': RandomizedSearchCV
            },
            'ensemble': {
                'RandomForestRegressor': RandomForestRegressor
            },
            'pipeline': Pipeline,
            'compose': {
                'ColumnTransformer': ColumnTransformer
            }
        }
    }

    for library, modules in libraries.items():
        if isinstance(modules, dict):
            print(f'{library}:')
            for module, submodules in modules.items():
                if isinstance(submodules, dict):
                    print(f'  {module}:')
                    for submodule, obj in submodules.items():
                        try:
                            version = obj.__version__
                            print(f'    {submodule}: {version}')
                        except AttributeError:
                            print(f'    {submodule}: Version not found')
                else:
                    try:
                        version = submodules.__version__
                        print(f'  {module}: {version}')
                    except AttributeError:
                        print(f'  {module}: Version not found')
        else:
            try:
                version = modules.__version__
                print(f'{library}: {version}')
            except AttributeError:
                print(f'{library}: Version not found')


check_library_versions()

os: Version not found
urllib: Version not found
tarfile: Version not found
pickle: Version not found
pandas: 1.5.3
numpy: 1.23.5
sklearn:
  base: Version not found
  preprocessing:
    OneHotEncoder: Version not found
    SimpleImputer: Version not found
  model_selection:
    RandomizedSearchCV: Version not found
  ensemble:
    RandomForestRegressor: Version not found
  pipeline: Version not found
  compose:
    ColumnTransformer: Version not found


In [25]:
import sklearn
print(sklearn.__version__)

1.2.1


In [26]:
import sys
print(sys.version)

3.10.9 | packaged by Anaconda, Inc. | (main, Mar  1 2023, 18:18:15) [MSC v.1916 64 bit (AMD64)]
