# California House Pricing

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import hashlib
# from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedShuffleSplit

In [None]:
# Render matplotlib figures inline in the notebook output (IPython magic, not plain Python).
%matplotlib inline
# Load the housing data from the local CSV file into a DataFrame.
housing = pd.read_csv('./data_store/housing.csv')

In [None]:
# housing['ocean_proximity'].info()
# Show summary statistics for the dataset as the cell output.
housing.describe()

In [None]:
# housing.hist(column="median_income",by="ocean_proximity",bins=25,grid=False,figsize=(20,20))
# housing["ocean_proximity"].hist(bins=50,figsize=(20,15))

In [None]:
# Histogram overview of the dataset's columns, 50 bins each, on a 20x20 figure.
housing.hist(figsize=(20, 20), bins=50)
plt.show()

In [None]:
# # test set
# def test_train_split(*,data:pd.core.frame.DataFrame,test_ratio:int|float):
#     shuffled_indices = np.random.permutation(len(data))
#     test_set_size = int(len(data)*test_ratio)
#     test_indices = shuffled_indices[:test_set_size]
#     train_indices = shuffled_indices[test_set_size:]
#     return data.iloc[test_indices],data.iloc[train_indices]

In [None]:
# test_data,train_data = (test_train_split(data=housing,test_ratio=0.2))

#histogram
# housing_with_id.hist(bins=50,figsize=(20,15))

In [None]:
# def test_set_check(identifier,test_ratio,hash):
#     return hash(np.int64(identifier)).digest()[-1] < (256 * test_ratio )
# def test_train_split(data,test_ratio,id_column,hash=hashlib.md5):
#     ids = data[id_column]
#     in_test_set = ids.apply(lambda id_ : test_set_check(id_,test_ratio,hash))
#     return data.loc[~in_test_set],data.loc[in_test_set]

In [None]:
# #this approach uses an increasing index column (0,1,2,3,4,5,...) as the id. If we go with this, we must always ensure that new data is only appended to the dataset; otherwise the train/test assignment will not persist across runs.
# housing_with_id = housing.reset_index()
# train_set,test_set = test_train_split(housing_with_id,0.2,"index")

# #this approach uses latitude and longitude to compute a unique id
# housing_with_id = housing
# housing_with_id["id"] = housing["longitude"]*1000 + housing["latitude"]
# train_set,test_set = test_train_split(housing_with_id,0.2,"id")
# #this approach does not guarantee absolute uniqueness either

# using hashing 
# housing_with_id = housing
# housing_with_id['id'] = housing['longitude'].astype('str')+'_'+housing['latitude'].astype('str')
# housing_with_id['id'] = housing['id'].apply(lambda _id: abs(hash(_id)))
# train_set,test_set = test_train_split(housing_with_id,0.2,"id")

# using sklearn.model_selection import train_test_split
# train_set,test_set = train_test_split(housing,test_size=0.2,random_state=42)

In [None]:
# old method for categorization
# categorizer = 3
# categorizer = 1.5
# housing["income_cat"] = np.ceil(housing["median_income"]/categorizer)
# housing["income_cat"] = housing["income_cat"].where(housing["income_cat"]<5,5.0)

In [None]:
# Bucket median_income into five labelled categories (1-5); the resulting
# income_cat column is what the stratified split is keyed on.
housing["income_cat"] = pd.cut(
    housing["median_income"],
    labels=[1, 2, 3, 4, 5],
    bins=[0., 1.5, 3.0, 4.5, 6., np.inf],
)

In [None]:
# Distribution of the income categories created above.
housing["income_cat"].hist(figsize=(10, 5), bins=20)

In [None]:
# Stratified 80/20 train/test split keyed on income_cat, seeded for
# reproducibility.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
# n_splits=1, so the generator yields exactly one (train, test) pair.
train_indices, test_indices = next(split.split(housing, housing["income_cat"]))
# split() yields positional row numbers, so rows must be selected with .iloc.
# Label-based .loc only gives the same rows while the index happens to be the
# default 0..n-1 range and would silently pick wrong rows otherwise.
strat_train_set = housing.iloc[train_indices]
strat_test_set = housing.iloc[test_indices]

In [None]:
# Stratified set income range proportions
# Show train and test proportions side by side. As two bare expressions, only
# the last one is displayed and the train proportions are silently discarded.
pd.DataFrame({
    "train": strat_train_set["income_cat"].value_counts() / len(strat_train_set),
    "test": strat_test_set["income_cat"].value_counts() / len(strat_test_set),
}).sort_index()

In [None]:
# The helper column has served its purpose for stratification; remove it from
# both splits in place (errors='raise' fails loudly if it is already gone).
for split_frame in (strat_test_set, strat_train_set):
    split_frame.drop(columns="income_cat", inplace=True, errors='raise')
# Explore on a copy so the stratified training set itself stays untouched.
housing_train_set = strat_train_set.copy()

In [None]:
# Report the size of the training copy, then draw the longitude/latitude
# scatter twice: fully opaque, and with alpha=0.1 so dense areas stand out.
print(len(housing_train_set))
housing_train_set.plot(
    x="longitude",
    y="latitude",
    kind="scatter",
    color="skyblue",
    edgecolors='black',
)
housing_train_set.plot(
    x="longitude",
    y="latitude",
    kind="scatter",
    color="skyblue",
    edgecolors='black',
    alpha=0.1,
)

In [None]:
# Geographic scatter where marker size scales with population (divided by 100)
# and marker colour encodes median_house_value on the "jet" colormap.
housing_train_set.plot(
    x="longitude",
    y="latitude",
    kind="scatter",
    figsize=(10, 7),
    label="population",
    s=housing_train_set["population"] / 100,
    c="median_house_value",
    cmap=plt.get_cmap("jet"),
    alpha=0.4,
    edgecolor="navy",
)
plt.legend()

In [None]:
from pandas.plotting import scatter_matrix
from sklearn.impute import SimpleImputer

In [None]:
# print(housing_train_set.columns)
# Pairwise scatter plots for a handful of selected attributes.
attributes = [
    'housing_median_age',
    'total_rooms',
    'median_income',
    'median_house_value',
]
scatter_matrix(housing_train_set[attributes], figsize=(12, 8))

In [None]:
# Median income against median house value, semi-transparent to show density.
housing_train_set.plot(
    x="median_income",
    y="median_house_value",
    kind="scatter",
    alpha=0.2,
)

In [None]:
# Derive ratio features from raw totals that are not very informative on their
# own.
# The ratios are computed from housing_train_set itself rather than from the
# separate `housing` frame, so the exploration copy does not depend on implicit
# index alignment with (or the current binding of) another DataFrame.

housing_train_set["rooms_per_household"] = housing_train_set["total_rooms"] / housing_train_set["households"]
housing_train_set["bedrooms_per_room"] = housing_train_set["total_bedrooms"] / housing_train_set["total_rooms"]
housing_train_set["population_per_household"] = housing_train_set["population"] / housing_train_set["households"]

In [None]:
# Correlation matrix over every non-object column, then the correlations with
# median_house_value ranked from strongest positive downwards.
corr_matrix = (
    housing_train_set
    .select_dtypes(exclude=['object'])
    .corr()
)
corr_matrix["median_house_value"].sort_values(ascending=False)

In [None]:
# Split the stratified training set into the target (copied) and the
# predictors (everything except median_house_value).
housing_lables = strat_train_set["median_house_value"].copy()
housing = strat_train_set.drop(columns="median_house_value")

In [None]:
# # drops rows with null values in total_bedrooms column
# housing.dropna(subset=["total_bedrooms"])

# # drops the column total_bedrooms as a whole
# housing.drop(["total_bedrooms"],axis=1)

# # filling emptys with median in total_bedrooms column
# bedrooms_median = housing["total_bedrooms"].median()
# housing["total_bedrooms"].fillna(bedrooms_median)


In [None]:
# total_bedrooms is not guaranteed to be the only column with missing values,
# so fit sklearn's median imputer over every column except the text column
# ocean_proximity.
housing_num = housing.drop(columns="ocean_proximity")
imputer = SimpleImputer(strategy="median")

imputer.fit(housing_num)

In [None]:
# Sanity check: the statistics learned by the imputer next to the column
# medians computed directly with pandas.
print(imputer.statistics_)
print(housing_num.median().values)
# Print whether the two agree. The previous version printed the *type* of the
# element-wise comparison, which is always an ndarray and verifies nothing.
print(np.allclose(imputer.statistics_, housing_num.median().values))

In [None]:
# Fill the missing values and wrap the resulting array back into a DataFrame.
X = imputer.transform(housing_num)
# Pass the original index as well: without it the frame gets a fresh 0..n-1
# index and can no longer be aligned with `housing` / `housing_lables`, which
# still carry the row labels from the shuffled split.
housing_tr = pd.DataFrame(X, columns=housing_num.columns, index=housing_num.index)

In [None]:
# Pull out the ocean_proximity column and preview its first ten values.
housing_cat = housing["ocean_proximity"]
housing_cat.head(10)

In [None]:
# Map each ocean_proximity value to an integer code; factorize() returns the
# codes together with the distinct categories they refer to.
housing_cat_encoded, housing_categories = housing_cat.factorize()
print(housing_cat_encoded[:10])
print(housing_categories)

In [None]:
from sklearn.preprocessing import OneHotEncoder

# One-hot encode the integer category codes. The codes are reshaped to a single
# column because they are passed in as a 1-D array; toarray() then converts the
# encoder's result for display.
encoder = OneHotEncoder()
housing_cat_1hot = encoder.fit_transform(
    housing_cat_encoded.reshape(-1, 1)
)
housing_cat_1hot.toarray()

In [None]:
# import sys,os
# sys.path.append(os.getcwd())
from housing_transformers.CombinedAttributesAdder import CombinedAttributesAdder
from housing_transformers.DataFrameSelector import DataFrameSelector
from sklearn.pipeline import Pipeline,FeatureUnion
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

In [None]:
# Run the project's CombinedAttributesAdder on the raw values of `housing`,
# with add_bedrooms_per_room switched off.
# NOTE(review): the transformer is defined in housing_transformers; presumably
# it appends derived ratio columns to the input array - confirm there.
attr_adder = CombinedAttributesAdder(add_bedrooms_per_room=False)
housing_extra_attribs = attr_adder.transform(housing.values)

In [None]:
# num_pipeline = Pipeline([
#     ('imputer',SimpleImputer(strategy='median')),
#     ('attribs_adder',CombinedAttributesAdder()),
#     ('std_scaler',StandardScaler()),
# ])

# housing_num_tr = num_pipeline.fit_transform(housing_num)

In [None]:
# Column names fed to the numeric and the categorical sub-pipelines.
cat_attributes = ["ocean_proximity"]
num_attributes = list(housing_num.columns)

In [None]:
# Numeric branch: select the numeric columns, impute medians, add the combined
# attributes, then standardise.
num_pipeline = Pipeline(steps=[
    ('selector', DataFrameSelector(num_attributes)),
    ('imputer', SimpleImputer(strategy='median')),
    ('attribs_adder', CombinedAttributesAdder()),
    ('std_scaler', StandardScaler()),
])

In [None]:
# Categorical branch: select ocean_proximity and one-hot encode it as a dense
# array (sparse_output=False).
cat_pipeline = Pipeline(steps=[
    ('selector', DataFrameSelector(cat_attributes)),
    ('cat_encoder', OneHotEncoder(sparse_output=False)),
])

In [None]:
# Combine the numeric and categorical branches, then fit the whole thing on
# the training predictors and transform them in one go.
full_pipeline = FeatureUnion([
    ("num_pipeline", num_pipeline),
    ("cat_pipeline", cat_pipeline),
])
housing_prepared = full_pipeline.fit_transform(housing)

# Model training

In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
# Fit a plain linear regression on the prepared training data; the fitted
# estimator is the cell output.
lin_reg = LinearRegression()
lin_reg.fit(housing_prepared,housing_lables)

In [None]:
# Compare the linear model's predictions with the true labels on the first
# five training rows, pushed through the already fitted pipeline.
some_labeles = housing_lables.head(5)
some_data = housing.head(5)
some_data_prepared = full_pipeline.transform(some_data)
print("predictions : ", lin_reg.predict(some_data_prepared))
print("labels : ", list(some_labeles))

In [None]:
from sklearn.metrics import mean_squared_error

# RMSE of the linear regression, measured on the data it was trained on.
housing_predictions = lin_reg.predict(housing_prepared)
lin_mse = mean_squared_error(
    housing_lables,
    housing_predictions,
)
lin_rmse = np.sqrt(lin_mse)
lin_rmse

In [None]:
from sklearn.tree import DecisionTreeRegressor

# Seed the tree so repeated runs give the same model, in line with the seeded
# train/test split (random_state=42).
tree_reg = DecisionTreeRegressor(random_state=42)
tree_reg.fit(housing_prepared, housing_lables)
# Display the fitted estimator. Whether it under- or overfits is judged from
# the RMSE and cross-validation cells, not from this output.
tree_reg

In [None]:
# Decision tree RMSE, measured on the same data the tree was trained on.
housing_predictions = tree_reg.predict(housing_prepared)
tree_mse = mean_squared_error(
    housing_lables,
    housing_predictions,
)
tree_rmse = np.sqrt(tree_mse)
tree_rmse  # a very low training error here is a sign of overfitting

In [None]:
from sklearn.model_selection import cross_val_score
from housing_helper import display_scores

In [None]:
# 10-fold cross-validation of the decision tree. The scorer returns negative
# MSE, so the sign is flipped before taking the root to obtain RMSE per fold.
scores = cross_val_score(
    tree_reg,
    housing_prepared,
    housing_lables,
    cv=10,
    scoring="neg_mean_squared_error",
)
tree_rmse_scores = np.sqrt(-scores)
display_scores(tree_rmse_scores)

In [None]:
# 10-fold cross-validation of the linear regression. The scorer returns
# negative MSE, so the sign is flipped before taking the root.
lin_scores = cross_val_score(
    lin_reg,
    housing_prepared,
    housing_lables,
    cv=10,
    scoring="neg_mean_squared_error",
)
lin_rmse_scores = np.sqrt(-lin_scores)
display_scores(lin_rmse_scores)

In [None]:
from sklearn.ensemble import RandomForestRegressor

In [None]:
# Seed the forest so repeated runs give the same model, in line with the
# seeded train/test split (random_state=42).
forest_reg = RandomForestRegressor(random_state=42)
forest_reg.fit(housing_prepared, housing_lables)

In [None]:
# Random forest RMSE on the training data (author's note: fitting the forest
# was heavy and took more than 30 s).
housing_predictions = forest_reg.predict(housing_prepared)
forest_mse = mean_squared_error(
    housing_lables,
    housing_predictions,
)
forest_rmse = np.sqrt(forest_mse)
forest_rmse

In [None]:
# random forest cross validation 
# commented out because it took ~4 min to execute, which is too expensive for my system
# forest_scores = cross_val_score(forest_reg,housing_prepared,housing_lables,scoring="neg_mean_squared_error",cv=10)
# forest_rmse_scores = np.sqrt(-forest_scores)
# display_scores(forest_rmse_scores)

# Fine tuning 

In [None]:
from sklearn.model_selection import GridSearchCV

# Two grids: 3x4 combinations with the default bootstrap setting, plus 2x3
# combinations with bootstrap disabled.
param_grid = [
    {'n_estimators': [3, 10, 30], 'max_features': [2, 4, 6, 8]},
    {'bootstrap': [False], 'n_estimators': [3, 10], 'max_features': [2, 3, 4]},
]
# Seed the forest so the search result (and the model saved from it) is
# reproducible, in line with the seeded train/test split (random_state=42).
forest_reg = RandomForestRegressor(random_state=42)
grid_search = GridSearchCV(forest_reg, param_grid, cv=5, scoring='neg_mean_squared_error')

grid_search.fit(housing_prepared, housing_lables)

In [None]:
# Show the estimator that the grid search selected as best.
grid_search.best_estimator_

In [None]:
# Print the RMSE for every parameter combination tried. The mean test scores
# are negative MSE, so negate before taking the square root.
cvres = grid_search.cv_results_
for mean_score, params in zip(
    cvres["mean_test_score"],
    cvres["params"],
):
    print(np.sqrt(-mean_score), params)

In [None]:
# Build the list of feature names to pair with the best model's importances:
# numeric columns, then the combined attributes, then the one-hot categories.
# NOTE(review): the names and order of extra_attribs are assumed to match what
# CombinedAttributesAdder appends - confirm against that transformer.
feature_importances = grid_search.best_estimator_.feature_importances_
# print(len(feature_importances),feature_importances)
cat_encoder = cat_pipeline.named_steps["cat_encoder"]
cat_one_hot_attributes = list(cat_encoder.categories_[0])
extra_attribs = ["rooms_per_hhold", "pop_per_hhold", "bedrooms_per_room"]
attributes = [*num_attributes, *extra_attribs, *cat_one_hot_attributes]
# print(len(attributes),attributes)

In [None]:
# Pair each importance with its feature name and list them from most to least
# important. zip() stops at the shorter input, so a length mismatch between the
# two sequences would go unnoticed here.
sorted(zip(feature_importances,attributes),reverse=True)

# Evaluation (finally!)

In [None]:
# Evaluate the tuned model on the held-out stratified test set.
final_model = grid_search.best_estimator_

# Separate target and predictors of the test set.
y_test = strat_test_set["median_house_value"].copy()
X_test = strat_test_set.drop(columns="median_house_value")

# transform() only: the pipeline was fitted on the training data earlier.
X_test_prepared = full_pipeline.transform(X_test)
final_predictions = final_model.predict(X_test_prepared)

final_mse = mean_squared_error(
    y_test,
    final_predictions,
)
final_rmse = np.sqrt(final_mse)

print("Final Root Mean Suqared Error : ", final_rmse)

# Did I just get pranked??

# saving the Model

In [None]:
import os

import joblib

# Persist the tuned model to disk. exist_ok=True creates the output directory
# when it is missing and is a no-op when it already exists, replacing the
# separate check-then-create step.
os.makedirs('model_outcomes', exist_ok=True)

joblib.dump(final_model, "model_outcomes/housing_predictor_model.pkl")

# to use this import like
# housing_predictor_model = joblib.load("model_outcomes/housing_predictor_model.pkl")

# END