In [1]:
%matplotlib inline

import numpy as np, pandas as pd
import matplotlib.pyplot as plt 
from pathlib import Path
import seaborn as sns 
import sklearn

In [2]:
NB_DIR = Path.cwd()
DATA = NB_DIR/'data'
DATA.mkdir(exist_ok=True)

In [9]:
from sklearn.datasets import fetch_california_housing
house_dataset = fetch_california_housing()

In [21]:
house_df = pd.DataFrame(house_dataset['data'], columns=house_dataset['feature_names'])
house_df.head(5)

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25


In [22]:
house_df['HousePrice'] = house_dataset['target']
house_df.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,HousePrice
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23,4.526
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22,3.585
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24,3.521
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422


In [25]:
X = house_df.drop('HousePrice', axis=1)
y = house_df['HousePrice']

In [33]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

In [28]:
rf = RandomForestRegressor(random_state=42, n_jobs=-1)

In [29]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [30]:
rf.fit(X_train, y_train)

In [31]:
y_pred = rf.predict(X_test)

In [34]:
mean_squared_error(y_test, y_pred)

0.2545599452819612

In [35]:
#skal prøve å få bedre resultater ved å bruke hyperparameter optimalisering.
#skal prøve gridsearch, randomsearch og Bayesian optimization

In [36]:
from sklearn.model_selection import GridSearchCV
param_grid = {
    'n_estimators': [10, 50, 100, 150, 200],
    'max_depth': [5, 50, None],
    'max_features': [2, 3, None]
}

In [37]:
gs_reg = GridSearchCV(estimator=rf, param_grid=param_grid, cv=3, verbose=1, n_jobs=-1)

In [38]:
gs_reg.fit(X_train, y_train)

Fitting 3 folds for each of 45 candidates, totalling 135 fits


Exception ignored in: <function ResourceTracker.__del__ at 0x1115e9bc0>
Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.13/multiprocessing/resource_tracker.py", line 82, in __del__
  File "/opt/anaconda3/lib/python3.13/multiprocessing/resource_tracker.py", line 91, in _stop
  File "/opt/anaconda3/lib/python3.13/multiprocessing/resource_tracker.py", line 116, in _stop_locked
ChildProcessError: [Errno 10] No child processes


In [39]:
best_reg = gs_reg.best_estimator_

In [40]:
gs_reg.best_params_

{'max_depth': 50, 'max_features': 2, 'n_estimators': 200}

In [41]:
y_pred_best = best_reg.predict(X_test)
mean_squared_error(y_test, y_pred_best)

0.244300301104093

In [43]:
#ser at modellen ble litt bedre etter grid search
#skal nå prøve random serach:

In [44]:
from sklearn.model_selection import RandomizedSearchCV

In [45]:
n_iter = 5

In [46]:
rs_reg = RandomizedSearchCV(estimator=rf, param_distributions=param_grid, 
                            n_iter=n_iter, cv=3, verbose=1, n_jobs=-1, random_state=42)

In [47]:
rs_reg.fit(X_train, y_train)

Fitting 3 folds for each of 5 candidates, totalling 15 fits


In [48]:
rs_reg.best_estimator_

In [49]:
mean_squared_error(y_test, rs_reg.predict(X_test))

0.24415237309057058

In [50]:
#ser at vi fikk bedre resultater med random search.
#skal nå prøve den "beste" hyperparameter optimaliseringen: Bayesian optimization:

In [51]:
from hyperopt import hp, fmin, tpe, Trials, STATUS_OK, space_eval

In [52]:
from sklearn.model_selection import cross_val_score

def objective(params):
    # Our objective is to find the hyperparameters that gives the lowest mean squared error 
    # on validation data
    
    print(f"Using parameters {params}")
    # Our model
    model = RandomForestRegressor(**params, n_jobs=-1, random_state=42)
    # The max cross-val score with current paramaters:
    score = cross_val_score(model, X_train, y_train, cv=3, scoring="neg_mean_squared_error").mean()
    # We want to minimize the mean absolute error loss, not its negative:
    loss = -score
    return {'loss': loss, 'status': STATUS_OK}

In [53]:
param_space = {
    'n_estimators': hp.choice('n_estimators', [10, 50, 100, 150, 200]),
    'max_depth': hp.choice('max_depth', [5, 50, None]),
    'max_features': hp.choice('max_features', [2, 3, None])
}

In [54]:
import hyperopt.pyll.stochastic

In [55]:
print(hyperopt.pyll.stochastic.sample(param_space))

{'max_depth': 5, 'max_features': None, 'n_estimators': 150}


In [56]:
# By using the trials object we can check what goes on in each trial:
trials = Trials()

In [57]:
best = fmin(fn=objective, space=param_space, algo=tpe.suggest, 
            max_evals=8, trials=trials)

Using parameters {'max_depth': 50, 'max_features': None, 'n_estimators': 10}    
Using parameters {'max_depth': 5, 'max_features': 3, 'n_estimators': 10}        
Using parameters {'max_depth': 5, 'max_features': None, 'n_estimators': 50}     
Using parameters {'max_depth': 50, 'max_features': None, 'n_estimators': 100}   
Using parameters {'max_depth': None, 'max_features': None, 'n_estimators': 100} 
Using parameters {'max_depth': 5, 'max_features': None, 'n_estimators': 200}    
Using parameters {'max_depth': 5, 'max_features': None, 'n_estimators': 10}     
Using parameters {'max_depth': 50, 'max_features': 2, 'n_estimators': 150}      
100%|███████████| 8/8 [00:08<00:00,  1.07s/trial, best loss: 0.2545382193649211]


In [58]:
best

{'max_depth': np.int64(1),
 'max_features': np.int64(0),
 'n_estimators': np.int64(3)}

In [59]:
space_eval(param_space, best)

{'max_depth': 50, 'max_features': 2, 'n_estimators': 150}

In [60]:
model = RandomForestRegressor(**space_eval(param_space, best), random_state=42, n_jobs=-1)

In [61]:
model.fit(X_train, y_train)
mean_squared_error(y_test, model.predict(X_test))

0.24504076769745117

In [62]:
#dette var det dårligste resultatet så langt, men det var også den raskeste. forskjellen var veldig liten
#mellom de forskjellige optimaliseringene.