## Preprocessing

In [142]:
import pandas as pd
df=pd.read_csv("data.csv")
df.isnull().any()

longitude             False
latitude              False
housing_median_age    False
total_rooms           False
total_bedrooms         True
population            False
households            False
median_income         False
median_house_value    False
ocean_proximity       False
dtype: bool

In [193]:
median = df['total_bedrooms'].median()
df['total_bedrooms'].fillna(median,inplace=True)
df.isnull().any()

longitude             False
latitude              False
housing_median_age    False
total_rooms           False
total_bedrooms        False
population            False
households            False
median_income         False
median_house_value    False
ocean_proximity       False
dtype: bool

In [179]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
num_pipeline = Pipeline([
        ('std_scaler', StandardScaler()),
   ])

In [180]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
housing_num = df.drop("ocean_proximity", axis=1)
num_attribs = list(housing_num)
cat_attribs = ["ocean_proximity"]
full_pipeline = ColumnTransformer([
        ("num", num_pipeline, num_attribs),
        ("cat", OneHotEncoder(), cat_attribs),
    ])
X= full_pipeline.fit_transform(df)

In [181]:
y=df["median_house_value"]
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

## Prediction

In [182]:
from sklearn.tree import DecisionTreeRegressor
dtr_reg = DecisionTreeRegressor(max_depth=20)

In [183]:
from sklearn.ensemble import RandomForestRegressor
forest_reg = RandomForestRegressor(n_estimators=10, random_state=42)

In [184]:
from sklearn.ensemble import AdaBoostRegressor
ada_reg = AdaBoostRegressor(DecisionTreeRegressor(max_depth=4),n_estimators=300)

In [185]:
from sklearn.ensemble import GradientBoostingRegressor
GBR_reg = GradientBoostingRegressor(random_state=0)

In [186]:
import xgboost as xgb
XGB_reg=xgb.XGBRegressor() 

## Validation

In [187]:
from sklearn.model_selection import cross_val_score
for reg in (dtr_reg,forest_reg,ada_regr,GBR_reg,XGB_reg):
    reg.fit(X_train, y_train)
    print(reg.__class__.__name__, cross_val_score(reg, X_train, y_train, cv=2))

DecisionTreeRegressor [0.99999748 0.99998427]
RandomForestRegressor [0.99999874 0.99998295]
AdaBoostRegressor [0.99982404 0.99974743]
GradientBoostingRegressor [0.9999427  0.99993077]
XGBRegressor [0.99998599 0.99997503]


In [188]:
import numpy as np
from sklearn.metrics import mean_squared_error, mean_absolute_error
for reg in (dtr_reg,forest_reg,ada_regr,GBR_reg,XGB_reg):
    housing_predictions = reg.predict(X_train)
    forest_mse = mean_squared_error(y_train, housing_predictions)
    forest_rmse = np.sqrt(forest_mse)
    print(reg.__class__.__name__, " RMSE : ", forest_rmse)

DecisionTreeRegressor  RMSE :  0.0
RandomForestRegressor  RMSE :  56.41176140914266
AdaBoostRegressor  RMSE :  2225.7627438761638
GradientBoostingRegressor  RMSE :  832.1994884528523
XGBRegressor  RMSE :  222.17805399135497
