In [1]:
import pandas as pd

# Relative path of the preprocessed regression dataset read by this notebook.
DATA_PATH = "data/regression_preprocessed.csv"
data = pd.read_csv(DATA_PATH)

In [2]:
# Preview the loaded frame; the notebook display elides the middle rows.
data

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z,price
0,0.23,5,6,2,61.5,55.0,3.95,3.98,2.43,326
1,0.21,4,6,3,59.8,61.0,3.89,3.84,2.31,326
2,0.23,2,6,5,56.9,65.0,4.05,4.07,2.31,327
3,0.29,4,2,4,62.4,58.0,4.20,4.23,2.63,334
4,0.31,2,1,2,63.3,58.0,4.34,4.35,2.75,335
...,...,...,...,...,...,...,...,...,...,...
53789,0.72,5,7,3,60.8,57.0,5.75,5.76,3.50,2757
53790,0.72,2,7,3,63.1,55.0,5.69,5.75,3.61,2757
53791,0.70,3,7,3,62.8,60.0,5.66,5.68,3.56,2757
53792,0.86,4,3,2,61.0,58.0,6.15,6.12,3.74,2757


In [3]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

In [4]:
# Separate the target from the features.
# "Unnamed: 0" is the row index that was written into the CSV, not a property
# of a diamond. In the preview above the price is non-decreasing along that
# index in the displayed head and tail rows, so keeping it as a feature would
# let the models infer the target from row order (target leakage).
# errors='ignore' keeps this cell working when the column is absent, e.g. if
# the CSV is later read with index_col=0.
X = data.drop(columns=['price', 'Unnamed: 0'], errors='ignore')
y = data['price']

# Hold out 30% of the rows for evaluation; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3, random_state=20)

In [5]:
import numpy as np
from sklearn.base import clone
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.tree import DecisionTreeRegressor

class CustomGradientBoostingRegressor:
    """Residual-fitting gradient boosting regressor with a pluggable weak learner.

    The ensemble starts from the mean of the training targets. It then fits
    ``n_estimators`` clones of ``base_estimator`` one after another, each on
    the residuals left by the stages before it. Every stage's prediction is
    scaled by ``learning_rate`` before it is added to the ensemble.

    Parameters
    ----------
    base_estimator : object with ``fit(X, y)`` and ``predict(X)``, optional
        Template estimator that is cloned for every boosting stage.
        Defaults to ``DecisionTreeRegressor(max_depth=3)``.
    n_estimators : int, default=100
        Number of boosting stages to fit.
    learning_rate : float, default=0.1
        Factor applied to each stage's prediction.

    Attributes
    ----------
    estimators : list
        Fitted stage estimators, in fitting order. Empty before ``fit``.
    initial_prediction : float or None
        Mean of the training targets; ``None`` until ``fit`` has been called.
    """

    def __init__(self, base_estimator=None, n_estimators=100, learning_rate=0.1):
        # Build the default inside the constructor so instances never share
        # one estimator object through a mutable default argument.
        self.base_estimator = base_estimator if base_estimator is not None else DecisionTreeRegressor(max_depth=3)
        self.n_estimators = n_estimators
        self.learning_rate = learning_rate
        self.estimators = []
        self.initial_prediction = None

    def fit(self, X, y):
        """Fit the boosting stages on ``X`` and ``y``.

        Any stages left over from an earlier ``fit`` call are discarded first.

        Parameters
        ----------
        X : array-like
            Training features, passed unchanged to every stage estimator.
        y : array-like
            Numeric training targets.

        Returns
        -------
        self : CustomGradientBoostingRegressor
            The fitted model, so calls can be chained.
        """
        # Reset state: without this, a second fit() would append new stages
        # to the old ones and predict() would sum both generations.
        self.estimators = []

        self.initial_prediction = np.mean(y)
        # Private float array: the in-place update below must only ever touch
        # this buffer, never the caller's target container.
        residuals = np.asarray(y, dtype=float) - self.initial_prediction

        for _ in range(self.n_estimators):
            estimator = clone(self.base_estimator)
            estimator.fit(X, residuals)
            # Remove the (shrunken) part of the residual this stage explains;
            # the next stage is trained on what is left.
            residuals -= self.learning_rate * estimator.predict(X)
            self.estimators.append(estimator)

        return self

    def predict(self, X):
        """Predict targets for ``X``.

        Parameters
        ----------
        X : array-like
            Samples to predict; the first dimension is the number of samples.

        Returns
        -------
        numpy.ndarray
            Initial prediction plus the ``learning_rate``-scaled sum of all
            stage predictions, one value per sample.

        Raises
        ------
        RuntimeError
            If the model has not been fitted yet.
        """
        if self.initial_prediction is None:
            raise RuntimeError(
                "CustomGradientBoostingRegressor is not fitted yet; call fit() before predict()."
            )

        # np.shape also accepts inputs without a .shape attribute (e.g. nested lists).
        y_pred = np.full(np.shape(X)[0], self.initial_prediction, dtype=float)
        for estimator in self.estimators:
            y_pred += self.learning_rate * estimator.predict(X)
        return y_pred


In [9]:
from sklearn.linear_model import LinearRegression

# Custom boosting with its default weak learner.
gbr_custom = CustomGradientBoostingRegressor(learning_rate=0.1, n_estimators=100)
gbr_custom.fit(X_train, y_train)
y_pred = gbr_custom.predict(X_test)
mse, r2 = mean_squared_error(y_test, y_pred), r2_score(y_test, y_pred)
print("С DecisionTreeRegressor в качестве weak learner(по умолчанию):")
print(f"mse: {mse}\nr2: {r2}\n")

# Same boosting settings, but every stage is a linear regression.
gbr_custom_linear = CustomGradientBoostingRegressor(
    base_estimator=LinearRegression(),
    n_estimators=100,
    learning_rate=0.1,
)
gbr_custom_linear.fit(X_train, y_train)
y_pred_custom = gbr_custom_linear.predict(X_test)
mse_custom, r2_custom = (
    mean_squared_error(y_test, y_pred_custom),
    r2_score(y_test, y_pred_custom),
)
print("С линейной регрессией в качестве weak learner:")
print(f"mse: {mse_custom}\nr2: {r2_custom}")

С DecisionTreeRegressor в качестве weak learner(по умолчанию):
mse: 370448.28729168524
r2: 0.9767866339254379

С линейной регрессией в качестве weak learner:
mse: 1489292.093640286
r2: 0.9066766300517353


In [11]:
from sklearn.ensemble import GradientBoostingRegressor  # for evaluation purposes

# Library baseline trained on the same split, used as a reference for the
# custom implementation.
gbr_lib = GradientBoostingRegressor(n_estimators=100).fit(X_train, y_train)
y_lib_pred = gbr_lib.predict(X_test)

r2_lib = r2_score(y_test, y_lib_pred)
mse_lib = mean_squared_error(y_test, y_lib_pred)
print(f'Библиотечная модель: \nr2: {r2_lib} \nmse: {mse_lib}')

Библиотечная модель: 
r2: 0.9767866339254379 
mse: 370448.28729168524


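Результат библиотечной модели совпадает с результатом собственной реализации (с DecisionTreeRegressor в качестве weak learner) вплоть до последних знаков MSE и R².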

In [8]:
import joblib 

In [12]:
# Persist the three fitted models under models/.
# NOTE(review): assumes the models/ directory already exists — confirm, or create it before this cell.
# NOTE(review): gbr_custom and gbr_custom_linear are instances of a class defined in this
# notebook; presumably that class must be defined/importable wherever they are loaded — verify.
joblib.dump(gbr_custom, "models/gbr_custom")
joblib.dump(gbr_custom_linear, "models/gbr_custom_linear")
# Last expression of the cell, so its return value is what the notebook displays.
joblib.dump(gbr_lib, "models/gbr_lib")

['models/gbr_lib']