In [1]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score
import xgboost as xgb
import matplotlib.pyplot as plt

In [2]:
# Read the housing table from a CSV file located in the working directory.
df = pd.read_csv(filepath_or_buffer="boston.csv")

In [3]:
# Print the frame's column names, non-null counts, dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 506 entries, 0 to 505
Data columns (total 14 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   CRIM     506 non-null    float64
 1   ZN       506 non-null    float64
 2   INDUS    506 non-null    float64
 3   CHAS     506 non-null    int64  
 4   NOX      506 non-null    float64
 5   RM       506 non-null    float64
 6   AGE      506 non-null    float64
 7   DIS      506 non-null    float64
 8   RAD      506 non-null    int64  
 9   TAX      506 non-null    float64
 10  PTRATIO  506 non-null    float64
 11  B        506 non-null    float64
 12  LSTAT    506 non-null    float64
 13  MEDV     506 non-null    float64
dtypes: float64(12), int64(2)
memory usage: 55.5 KB


In [4]:
# MEDV is the regression target; every other column is used as a predictor.
y = df["MEDV"]
X = df.drop(columns=["MEDV"])

# Hold out 20% of the rows for testing; the split is seeded so it repeats.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [5]:
# Baseline random forest: trees limited to depth 8, grown on bootstrap samples.
# random_state is pinned (same seed as the train/test split) so the fitted
# forest, and every score printed from it, is identical after
# Restart Kernel -> Run All; without it each run draws different bootstrap
# samples and the reported numbers drift.
regressor = RandomForestRegressor(max_depth=8, bootstrap=True, random_state=42)
regressor.fit(X_train, y_train)

In [6]:
# R^2 of the baseline forest: training split first, then the held-out split.
for features, target in ((X_train, y_train), (X_test, y_test)):
    print(regressor.score(features, target))

0.9663622022096898
0.8781136027517966


In [7]:
# Held-out predictions of the baseline forest, reported as MSE and then R^2.
y_pred = regressor.predict(X_test)
for metric in (mean_squared_error, r2_score):
    print(metric(y_test, y_pred))

8.93839062599273
0.8781136027517966


In [8]:
# Summary statistics of the target column, giving the scale of MEDV against
# which the MSE values can be read.
df["MEDV"].describe()

count    506.000000
mean      22.532806
std        9.197104
min        5.000000
25%       17.025000
50%       21.200000
75%       25.000000
max       50.000000
Name: MEDV, dtype: float64

In [9]:
# Random-forest search space: 4 split criteria x 3 max_features settings x
# 3 forest sizes = 36 candidates, each scored with 8-fold cross-validation.
# Note: the 'poisson' criterion requires non-negative target values.
param_grid = {
    'criterion': ["squared_error", "absolute_error", "friedman_mse", "poisson"],
    'max_features': ["sqrt", "log2", None],
    'n_estimators': [10, 50, 100]
}

# Search over a seeded copy of the baseline configuration. GridSearchCV clones
# its estimator for every candidate/fold, so an unseeded forest makes the CV
# ranking (and therefore best_params_) change from run to run. Copying
# regressor's parameters keeps the baseline settings (e.g. max_depth) in sync.
rf_search_base = RandomForestRegressor(**{**regressor.get_params(), "random_state": 42})

# Scores are negative MSE (higher is better); n_jobs=-1 uses all CPU cores.
grid_search_rf = GridSearchCV(estimator=rf_search_base, param_grid=param_grid, cv=8, scoring='neg_mean_squared_error', n_jobs=-1)
grid_search_rf.fit(X_train, y_train)

In [10]:
# Winning parameter combination and its mean cross-validated score
# (negative MSE, because the search uses scoring='neg_mean_squared_error').
grid_search_rf.best_params_, grid_search_rf.best_score_

({'criterion': 'friedman_mse', 'max_features': 'log2', 'n_estimators': 10},
 -12.56034377764546)

In [11]:
# Predict both splits with the estimator selected by the random-forest search.
y_pred_grid = grid_search_rf.predict(X_test)
y_pred_train_grid = grid_search_rf.predict(X_train)

# Compute the metrics up front so the report lines stay short.
rf_train_mse = mean_squared_error(y_train, y_pred_train_grid)
rf_train_r2 = r2_score(y_train, y_pred_train_grid)
rf_test_mse = mean_squared_error(y_test, y_pred_grid)
rf_test_r2 = r2_score(y_test, y_pred_grid)

print(f"Train MSE: {rf_train_mse}, Train R^2 Score: {rf_train_r2}")
print(f"Test MSE: {rf_test_mse}, Test R^2 Score: {rf_test_r2}")

Train MSE: 5.387940074362216, Train R^2 Score: 0.9379794063932123
Test MSE: 13.843695835655273, Test R^2 Score: 0.8112234874697509


In [12]:
# Training-split predictions of the searched forest, reported as MSE then R^2.
y_pred_train_grid = grid_search_rf.predict(X_train)
for metric in (mean_squared_error, r2_score):
    print(metric(y_train, y_pred_train_grid))

5.387940074362216
0.9379794063932123


In [13]:
# Hand-configured random forest; it is fitted and scored in the next cell.
# NOTE(review): these hyperparameters are hard-coded rather than read from
# grid_search_rf.best_params_ -- confirm they are the intended configuration.
best_regressor = RandomForestRegressor(
    criterion='poisson',
    max_features="log2",
    max_depth=10,
    n_estimators=50,
    # Pinned so the fit and its reported scores repeat across kernel restarts;
    # an unseeded forest gives different numbers on every run.
    random_state=42,
)

In [14]:
# Fit the hand-configured forest, then print R^2 on the training split
# followed by R^2 on the held-out split.
best_regressor.fit(X_train, y_train)
for features, target in ((X_train, y_train), (X_test, y_test)):
    print(best_regressor.score(features, target))

0.9727334288398314
0.861192324547161


In [15]:
# Base XGBoost regressor that the grid searches tune.
rgs_xgb = xgb.XGBRegressor(
    booster="gbtree",
    seed=42,
    eval_metric="rmse",
)

# Coarse grid over eta, max_depth and gamma: 3 x 3 x 3 = 27 candidates.
xgb_param_grid = dict(
    eta=[0.01, 0.05, 0.1],
    max_depth=[3, 4, 5],
    gamma=[0, 0.25, 1],
)

# 8-fold CV scored by negative MSE; error_score='raise' surfaces any failing
# fit instead of recording it as a score.
grid_search_xgb = GridSearchCV(
    estimator=rgs_xgb,
    param_grid=xgb_param_grid,
    scoring="neg_mean_squared_error",
    n_jobs=-1,
    cv=8,
    error_score="raise",
)
grid_search_xgb.fit(X_train, y_train)

In [16]:
# Best parameter combination of the first XGBoost search and its mean
# cross-validated score (negative MSE, per the search's scoring setting).
grid_search_xgb.best_params_, grid_search_xgb.best_score_

({'eta': 0.1, 'gamma': 0, 'max_depth': 3}, -11.760544974885285)

In [17]:
# Second grid over eta and max_depth only; gamma is not searched here, so the
# value configured on rgs_xgb applies. This rebinds xgb_param_grid.
xgb_param_grid = dict(
    eta=[0.1, 0.15, 0.2],
    max_depth=[1, 2, 3],
)

# Same CV protocol as the first XGBoost search: 8 folds, negative MSE.
grid_search2_xgb = GridSearchCV(
    estimator=rgs_xgb,
    param_grid=xgb_param_grid,
    scoring="neg_mean_squared_error",
    n_jobs=-1,
    cv=8,
    error_score="raise",
)
grid_search2_xgb.fit(X_train, y_train)

In [18]:
# Best parameter combination of the second XGBoost search and its mean
# cross-validated score (negative MSE, per the search's scoring setting).
grid_search2_xgb.best_params_, grid_search2_xgb.best_score_

({'eta': 0.2, 'max_depth': 3}, -11.702976667226586)

In [19]:
# Stand-alone XGBoost model with explicitly written hyperparameters, trained
# on the full training split.
xgb_best = xgb.XGBRegressor(
    booster="gbtree",
    seed=42,
    eval_metric="rmse",
    eta=0.2,
    max_depth=3,
    gamma=0,
)
xgb_best.fit(X_train, y_train)

In [20]:
# Predict both splits with the stand-alone XGBoost model.
y_pred_xgb = xgb_best.predict(X_test)
y_pred_train_xgb = xgb_best.predict(X_train)

# Compute the metrics up front so the report lines stay short.
xgb_train_mse = mean_squared_error(y_train, y_pred_train_xgb)
xgb_train_r2 = r2_score(y_train, y_pred_train_xgb)
xgb_test_mse = mean_squared_error(y_test, y_pred_xgb)
xgb_test_r2 = r2_score(y_test, y_pred_xgb)

print(f"Train MSE: {xgb_train_mse}, Train R^2 Score: {xgb_train_r2}")
print(f"Test MSE: {xgb_test_mse}, Test R^2 Score: {xgb_test_r2}")

Train MSE: 0.9712263345226365, Train R^2 Score: 0.9888202108853691
Test MSE: 5.068823737993415, Test R^2 Score: 0.9308801002818591


In [21]:
# Recompute the test and train predictions, this time through the second
# XGBoost grid search object (overwrites the earlier prediction arrays).
y_pred_xgb, y_pred_train_xgb = (
    grid_search2_xgb.predict(split) for split in (X_test, X_train)
)

In [22]:
# Training-split MSE, then R^2, for the current XGBoost predictions.
for metric in (mean_squared_error, r2_score):
    print(metric(y_train, y_pred_train_xgb))

0.9712263345226365
0.9888202108853691


In [23]:
# Held-out MSE, then R^2, for the current XGBoost predictions.
for metric in (mean_squared_error, r2_score):
    print(metric(y_test, y_pred_xgb))

5.068823737993415
0.9308801002818591


In [24]:
# Summary statistics of the target column (the same call appears earlier in
# the notebook), giving the scale of MEDV for reading the MSE values.
df["MEDV"].describe()

count    506.000000
mean      22.532806
std        9.197104
min        5.000000
25%       17.025000
50%       21.200000
75%       25.000000
max       50.000000
Name: MEDV, dtype: float64