In [18]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction import DictVectorizer
from sklearn.model_selection import train_test_split

data = pd.read_csv('./housing.csv')

near_ocean_houses = data[data['ocean_proximity'].isin(['<1H OCEAN', 'INLAND'])]
housing_data = near_ocean_houses

housing_data['total_bedrooms'] = housing_data['total_bedrooms'].fillna(0)
housing_data.isna().sum()

housing_data['median_house_value'] = np.log1p(housing_data['median_house_value'])

X = housing_data.drop('median_house_value', axis=1)
y = housing_data['median_house_value']

X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, random_state=1)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=1)

train_dict = X_train.to_dict(orient='records')
val_dict = X_val.to_dict(orient='records')
test_dict = X_test.to_dict(orient='records')

vectorizer = DictVectorizer(sparse=False)

X_train = vectorizer.fit_transform(train_dict)
X_val = vectorizer.fit_transform(val_dict)
X_test = vectorizer.fit_transform(test_dict)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  housing_data['total_bedrooms'] = housing_data['total_bedrooms'].fillna(0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  housing_data['median_house_value'] = np.log1p(housing_data['median_house_value'])


## Question 1

In [22]:
from sklearn.tree import DecisionTreeRegressor

regressor = DecisionTreeRegressor(max_depth=1, random_state=1)
regressor.fit(X_train, y_train)

splitting_feature_index = regressor.tree_.feature[0]  # 0 represents the root node
splitting_feature = X.columns[splitting_feature_index]

print(splitting_feature)

population


## Question 2

In [25]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import numpy as np

rf_model = RandomForestRegressor(n_estimators=10, random_state=1, n_jobs=-1)
rf_model.fit(X_train, y_train)

y_val_pred = rf_model.predict(X_val)

rmse = np.sqrt(mean_squared_error(y_val, y_val_pred))

print(rmse)

0.23550993513297871


## Question 3

In [27]:
best_rmse = float('inf')
best_n_estimators = 0
n_estimators_range = range(10, 201, 10)

for n_estimators in n_estimators_range:
    rf_model = RandomForestRegressor(n_estimators=n_estimators, random_state=1, n_jobs=-1)
    rf_model.fit(X_train, y_train)

    y_val_pred = rf_model.predict(X_val)

    rmse = np.sqrt(mean_squared_error(y_val, y_val_pred))

    if rmse < best_rmse:
        best_rmse = rmse
        best_n_estimators = n_estimators
    else:
        break

print(best_n_estimators)

80


## Question 4

In [28]:
max_depth_values = [10, 15, 20, 25]
n_estimators_range = range(10, 201, 10)

best_mean_rmse = float('inf')
best_max_depth = None

for max_depth in max_depth_values:
    rmse_values = []  
    for n_estimators in n_estimators_range:
        mean_rmse = 0
        num_iterations = 5  

        for _ in range(num_iterations):
            rf_model = RandomForestRegressor(n_estimators=n_estimators, max_depth=max_depth, random_state=1, n_jobs=-1)
            rf_model.fit(X_train, y_train)

            y_val_pred = rf_model.predict(X_val)

            rmse = np.sqrt(mean_squared_error(y_val, y_val_pred))
            mean_rmse += rmse

        mean_rmse /= num_iterations
        rmse_values.append(mean_rmse)

    best_n_estimators_index = np.argmin(rmse_values)
    best_n_estimators = n_estimators_range[best_n_estimators_index]

    best_mean_rmse = min(rmse_values)
    best_max_depth = max_depth

print(best_max_depth)

KeyboardInterrupt: 

## Question 5

In [30]:
rf_model = RandomForestRegressor(n_estimators=10, max_depth=20, random_state=1, n_jobs=-1)
rf_model.fit(X_train, y_train)

feature_importances = rf_model.feature_importances_
feature_names = X.columns
feature_importance_dict = dict(zip(feature_names, feature_importances))
most_important_feature = max(feature_importance_dict, key=feature_importance_dict.get)

print(most_important_feature)

total_bedrooms


In [None]:
import xgboost as xgb
from sklearn.metrics import mean_squared_error

# Create DMatrix for train and validation
dtrain = xgb.DMatrix(X_train, label=y_train)
dval = xgb.DMatrix(X_val, label=y_val)

# Create a watchlist
watchlist = [(dtrain, 'train'), (dval, 'validation')]

# Parameters for the XGBoost model
xgb_params = {
    'max_depth': 6,
    'min_child_weight': 1,
    'objective': 'reg:squarederror',
    'nthread': 8,
    'seed': 1,
    'verbosity': 1,
}

# List of eta values to experiment with
etas = [0.3, 0.1]

best_rmse = float('inf')
best_eta = None

for eta in etas:
    xgb_params['eta'] = eta
    num_round = 100  # Number of boosting rounds
    
    # Train the XGBoost model
    model = xgb.train(xgb_params, dtrain, num_round, evals=watchlist, early_stopping_rounds=10, verbose_eval=False)
    
    # Predict on the validation set
    y_val_pred = model.predict(dval, ntree_limit=model.best_ntree_limit)
    
    # Calculate RMSE
    rmse = np.sqrt(mean_squared_error(y_val, y_val_pred))
    
    if rmse < best_rmse:
        best_rmse = rmse
        best_eta = eta

print("The best eta for RMSE on the validation dataset is:", best_eta)
