In [3]:
import pandas as pd

# Load the housing dataset (path is relative to the notebook's working directory).
df = pd.read_csv("datasets/housing/housing.csv")

# Target variable: the column the models below are trained to predict.
TARGET = "median_house_value"

# Split the frame into the feature matrix (everything but the target) and the target vector.
X = df.drop(columns=[TARGET])
y = df[TARGET]

print(df.head())
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() would append a stray "None" line to the cell output.
df.info()


   longitude  latitude  housing_median_age  total_rooms  total_bedrooms  \
0    -122.23     37.88                41.0        880.0           129.0   
1    -122.22     37.86                21.0       7099.0          1106.0   
2    -122.24     37.85                52.0       1467.0           190.0   
3    -122.25     37.85                52.0       1274.0           235.0   
4    -122.25     37.85                52.0       1627.0           280.0   

   population  households  median_income  median_house_value ocean_proximity  
0       322.0       126.0         8.3252            452600.0        NEAR BAY  
1      2401.0      1138.0         8.3014            358500.0        NEAR BAY  
2       496.0       177.0         7.2574            352100.0        NEAR BAY  
3       558.0       219.0         5.6431            341300.0        NEAR BAY  
4       565.0       259.0         3.8462            342200.0        NEAR BAY  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639

In [4]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer

# Numeric recipe: fill missing values with the column median, then standardize.
numeric_pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

# Categorical recipe: fill missing values with the most frequent category, then
# one-hot encode. handle_unknown="ignore" keeps transform from raising on a
# category that was not present when the encoder was fitted.
categorical_pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
])

# Route each feature column to a recipe according to its dtype.
num_features = X.select_dtypes(include=["int64", "float64"]).columns
cat_features = X.select_dtypes(include=["object"]).columns

# Apply both recipes side by side; numeric outputs come first, categorical second.
preprocessor = ColumnTransformer(transformers=[
    ("num", numeric_pipeline, num_features),
    ("cat", categorical_pipeline, cat_features),
])


In [5]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as a test set; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)


In [6]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR

# Candidate regressors to compare, keyed by the display name used in the results.
# Seeds are fixed on every estimator that accepts one so reruns are repeatable.
models = {
    "LinearRegression": LinearRegression(),
    "Ridge": Ridge(alpha=1.0),
    # With the default iteration cap the cross-validation run emitted one
    # coordinate-descent convergence warning per fold for Lasso. Raising
    # max_iter is the remedy the warning itself suggests.
    # NOTE(review): if warnings persist, increase it further or raise alpha.
    "Lasso": Lasso(alpha=0.001, max_iter=10_000),
    "DecisionTree": DecisionTreeRegressor(max_depth=10, random_state=42),
    "RandomForest": RandomForestRegressor(
        n_estimators=200, max_depth=20, random_state=42, n_jobs=-1
    ),
    "GradientBoosting": GradientBoostingRegressor(random_state=42),
    "SVR": SVR(kernel="rbf", C=100, gamma=0.1),
}


In [7]:
from sklearn.model_selection import cross_val_score
import numpy as np

# Cross-validated RMSE for each candidate, keyed by model name.
results = {}

for name, model in models.items():
    # Bundle preprocessing with the estimator so the whole chain is what gets
    # cross-validated, not just the final regressor.
    pipe = Pipeline(steps=[
        ("preprocess", preprocessor),
        ("model", model),
    ])

    scores = cross_val_score(
        pipe, X_train, y_train,
        cv=5,
        scoring="neg_root_mean_squared_error",
        n_jobs=-1,
    )

    # The scorer reports negated RMSE (so that higher is better); flip the
    # sign of the fold average to get a plain RMSE.
    rmse = results[name] = -scores.mean()
    print(f"{name:20s} RMSE: {rmse:,.2f}")


LinearRegression     RMSE: 68,622.54
Ridge                RMSE: 68,621.68


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


Lasso                RMSE: 68,622.54
DecisionTree         RMSE: 63,057.31
RandomForest         RMSE: 49,204.28
GradientBoosting     RMSE: 55,036.03
SVR                  RMSE: 97,183.88


In [9]:
# Show the raw mapping of model name -> mean cross-validated RMSE (lower is better).
results

{'LinearRegression': np.float64(68622.53528344534),
 'Ridge': np.float64(68621.68065408064),
 'Lasso': np.float64(68622.53527335648),
 'DecisionTree': np.float64(63057.306122841044),
 'RandomForest': np.float64(49204.27915995554),
 'GradientBoosting': np.float64(55036.02778321444),
 'SVR': np.float64(97183.8849955124)}

In [8]:
# Pick the entry with the smallest cross-validated RMSE; on a tie the model
# listed first in `results` wins.
best_model_name = min(results.items(), key=lambda entry: entry[1])[0]
print("\nBest Model:", best_model_name)



Best Model: RandomForest


In [10]:
# Refit the winning estimator, together with the preprocessing, on the whole
# training split. The fitted pipeline is the cell's displayed result.
best_model = models[best_model_name]
final_pipeline = Pipeline(steps=[("preprocess", preprocessor), ("model", best_model)])
final_pipeline.fit(X_train, y_train)


0,1,2
,steps,"[('preprocess', ...), ('model', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,n_estimators,200
,criterion,'squared_error'
,max_depth,20
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [36]:
from sklearn.metrics import mean_squared_error, r2_score

# Score the fitted pipeline on the held-out test split.
y_pred = final_pipeline.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
# The square root of the MSE puts the test error in the same units as the
# cross-validated RMSE values that were used to select the model, so the two
# can be compared directly.
test_rmse = mse ** 0.5
r2 = r2_score(y_test, y_pred)

print(f"\nTest MSE: {mse:,.2f}")
print(f"Test RMSE: {test_rmse:,.2f}")
print(f"Test R²  : {r2:.3f}")



Test MSE: 2,388,544,992.17
Test R²  : 0.818


In [17]:
# Inspect the true target values of the held-out test split.
y_test

20046     47700.0
3024      45800.0
15663    500001.0
20484    218600.0
9814     278000.0
           ...   
15362    263300.0
16623    266800.0
18086    500001.0
2144      72300.0
3665     151500.0
Name: median_house_value, Length: 4128, dtype: float64

In [42]:
# Inspect the pipeline's predictions for the test split.
y_pred

array([ 51192.49454034,  69784.56584697, 467048.41333333, ...,
       498241.975     ,  70265.20095338, 170223.64631624], shape=(4128,))

In [43]:
# Mean of the test targets: the value a constant "always predict the mean" baseline would output.
sum(y_test)/len(y_test)

205500.30959302327

In [44]:
# Constant "predict the mean" baseline, one prediction per test row. The mean
# is computed from y_test rather than pasted from an earlier printed value, so
# the baseline stays correct if the data or the split changes.
y_avg = [sum(y_test) / len(y_test)] * len(y_test)

In [40]:
# MSE of the constant-mean baseline against the true test targets.
mse_avg_predictor = mean_squared_error(y_test, y_avg)

In [45]:
# Show the baseline MSE.
mse_avg_predictor

13104089782.408998

In [46]:
# Show the model's test MSE for comparison with the baseline MSE.
mse

2388544992.1695766

In [47]:
# Manual R² check: 1 - (model MSE / constant-mean baseline MSE). Should agree with the r2_score value.
1 - mse/mse_avg_predictor

0.8177252268695554

In [53]:
# Only some estimators expose per-feature importances once fitted; skip the
# report for those that do not.
if hasattr(best_model, "feature_importances_"):
    fitted_model = final_pipeline.named_steps["model"]
    importances = fitted_model.feature_importances_
    print("Feature importances available", importances)


Feature importances available [1.05414983e-01 1.01193146e-01 5.20290596e-02 2.33971492e-02
 2.36247603e-02 3.20021651e-02 1.79740935e-02 4.91598712e-01
 3.45480938e-03 1.41919375e-01 3.70164005e-04 6.87584745e-04
 6.33399749e-03]


In [54]:
# Display the estimator stored under the pipeline's "model" step and its hyperparameters.
final_pipeline.named_steps["model"]

0,1,2
,n_estimators,200
,criterion,'squared_error'
,max_depth,20
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [55]:
import joblib
# Persist the whole fitted pipeline (preprocessing + model) to a single file.
# joblib.dump returns the list of filenames it wrote, shown as the cell output.
joblib.dump(final_pipeline, "california_housing_model.joblib")

['california_housing_model.joblib']

In [56]:
# https://chatgpt.com/share/6946b850-c6dc-8010-b82c-96f26cc8c54b

In [57]:
# Pull the estimator out of the pipeline's "model" step and read its
# per-feature importance scores (one value per preprocessed input column).
rf = final_pipeline.named_steps["model"]
importances = rf.feature_importances_

In [58]:
# Show the raw importance array; names are attached to these values in later cells.
importances

array([1.05414983e-01, 1.01193146e-01, 5.20290596e-02, 2.33971492e-02,
       2.36247603e-02, 3.20021651e-02, 1.79740935e-02, 4.91598712e-01,
       3.45480938e-03, 1.41919375e-01, 3.70164005e-04, 6.87584745e-04,
       6.33399749e-03])

In [59]:
# Rebuild the model's input column names in the order the fitted
# ColumnTransformer emits them: the numeric block first, then the one-hot block.
preprocess = final_pipeline.named_steps["preprocess"]

# Column selections, read back from the fitted (name, transformer, columns) triples.
num_features = preprocess.transformers_[0][2]
cat_features = preprocess.transformers_[1][2]

# The fitted categorical sub-pipeline, looked up by its step name, and the
# one-hot encoder inside it, which knows the expanded per-category names.
cat_transformer = preprocess.named_transformers_["cat"]
ohe = cat_transformer.named_steps["onehot"]
cat_feature_names = ohe.get_feature_names_out(cat_features)

# Numeric names pass through unchanged; categorical names are one per category.
feature_names = [*num_features, *cat_feature_names]

In [60]:
# Show the reconstructed column names (numeric columns, then one-hot categories).
feature_names

['longitude',
 'latitude',
 'housing_median_age',
 'total_rooms',
 'total_bedrooms',
 'population',
 'households',
 'median_income',
 'ocean_proximity_<1H OCEAN',
 'ocean_proximity_INLAND',
 'ocean_proximity_ISLAND',
 'ocean_proximity_NEAR BAY',
 'ocean_proximity_NEAR OCEAN']

In [61]:
import pandas as pd

# Pair every preprocessed column name with its importance score and rank the
# pairs from most to least important, renumbering the rows 0..n-1.
fi = pd.DataFrame(
    {"feature": feature_names, "importance": importances}
).sort_values(by="importance", ascending=False, ignore_index=True)

print(fi.head(15))


                       feature  importance
0                median_income    0.491599
1       ocean_proximity_INLAND    0.141919
2                    longitude    0.105415
3                     latitude    0.101193
4           housing_median_age    0.052029
5                   population    0.032002
6               total_bedrooms    0.023625
7                  total_rooms    0.023397
8                   households    0.017974
9   ocean_proximity_NEAR OCEAN    0.006334
10   ocean_proximity_<1H OCEAN    0.003455
11    ocean_proximity_NEAR BAY    0.000688
12      ocean_proximity_ISLAND    0.000370


In [62]:
# Display the full fitted pipeline: every step and its parameters.
final_pipeline

0,1,2
,steps,"[('preprocess', ...), ('model', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,n_estimators,200
,criterion,'squared_error'
,max_depth,20
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [73]:
# Grab the estimator of the first pipeline step (the "preprocess" ColumnTransformer) by position.
col_xform = final_pipeline.steps[0][1]

In [77]:
# Show the second (name, transformer, columns) entry, i.e. the categorical branch.
# Note this reads `transformers` (the specification passed to the constructor),
# not the fitted `transformers_` attribute.
col_xform.transformers[1]

('cat',
 Pipeline(steps=[('imputer', SimpleImputer(strategy='most_frequent')),
                 ('onehot', OneHotEncoder(handle_unknown='ignore'))]),
 Index(['ocean_proximity'], dtype='object'))

In [None]:
# The best thing to do is re-train the model from scratch


# Should I continue the training or start from the beginning?

# Why would I continue training the existing model instead of training a fresh one?
# Sometimes the data is too big: say you have 10 million records and can't load them all into memory.
# Sometimes training from scratch simply takes too much time.
# Sometimes it is both.

# mean -> rolling mean
# Combine the two per-batch means without revisiting the raw data. Each mean is
# weighted by its batch size, which reduces to the plain 0.5 / 0.5 average when
# both batches are equally large.
# NOTE(review): first_batch / second_batch are placeholders that are not
# defined anywhere in this notebook; supply them before running this cell.
n_old, n_new = len(first_batch), len(second_batch)
old_mean = np.mean(first_batch)
new_mean = np.mean(second_batch)
final_mean = (n_old * old_mean + n_new * new_mean) / (n_old + n_new)

In [None]:

# Batch-ensemble sketch: train one independent model per data batch, then
# combine their predictions for the same inputs.
# NOTE(review): model1..model3, the *_batch objects and mean_or_mode are
# placeholders that are not defined in this notebook.
model1.fit(first_batch)
model2.fit(second_batch)
model3.fit(third_batch)

y1 = model1.predict(X)
y2 = model2.predict(X)
y3 = model3.predict(X)

# Presumably the mean for regression or the mode (majority vote) for
# classification. Left as the cell's last expression so the notebook displays
# it; a bare `return` is a syntax error outside a function.
mean_or_mode(y1, y2, y3)


In [None]:
# Incremental-training sketch: feed the batches to one model in sequence so
# each call continues from the state left by the previous one.
# `continue` is a reserved word and cannot be a keyword argument; scikit-learn
# exposes this behaviour as `partial_fit`, which only some estimators implement
# (e.g. SGDRegressor).
# NOTE(review): the *_batch names are placeholders not defined in this
# notebook; a real call needs the batch's features and targets, i.e.
# partial_fit(X_batch, y_batch).
model.partial_fit(first_batch)

model.partial_fit(second_batch)

model.partial_fit(third_batch)

