In [3]:
import pandas as pd
data = pd.read_csv("data/classification_preprocessed.csv")
data

Unnamed: 0,ratio_to_median_purchase_price,distance_from_last_transaction,distance_from_home,repeat_retailer,used_chip,used_pin_number,online_order,fraud
0,1.945940,0.311140,57.877857,1,1,0,0,0
1,1.294219,0.175592,10.829943,1,0,0,0,0
2,0.427715,0.805153,5.091079,1,0,0,1,0
3,0.362663,5.600044,2.247564,1,1,0,1,0
4,2.222767,0.566486,44.190936,1,1,0,1,0
...,...,...,...,...,...,...,...,...
718745,1.626798,0.112651,2.207101,1,1,0,0,0
718746,2.778303,2.683904,19.872726,1,1,0,0,0
718747,0.218075,1.472687,2.914857,1,1,0,1,0
718748,0.475822,0.242023,4.258729,1,0,0,1,0


In [21]:
shrinked_data = data[:100_000]

In [18]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [22]:
X = shrinked_data.drop('fraud', axis=1)
y = shrinked_data['fraud']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [36]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.base import clone

class CustomGradientBoostingClassifier:
    def __init__(self, base_estimator=None, n_estimators=100, learning_rate=0.1):
        self.base_estimator = base_estimator if base_estimator is not None else DecisionTreeRegressor(max_depth=3)
        self.n_estimators = n_estimators
        self.learning_rate = learning_rate
        self.estimators = []
        self.initial_prediction = None

    def fit(self, X, y):
        self.classes_ = np.unique(y)
        assert len(self.classes_) == 2, "This implementation supports only binary classification"

        # Transform y to {-1, 1}
        y_transformed = np.where(y == self.classes_[0], -1, 1)
        
        # Initial prediction: log(odds)
        self.initial_prediction = np.log((y_transformed == 1).sum() / (y_transformed == -1).sum())
        f_m = np.full(y.shape, self.initial_prediction)
        
        for _ in range(self.n_estimators):
            # transforming into logistic function
            p_m = 1 / (1 + np.exp(-f_m)) 
            residuals = y_transformed - p_m
            estimator = clone(self.base_estimator)
            estimator.fit(X, residuals)
            self.estimators.append(estimator)
            f_m += self.learning_rate * estimator.predict(X)

    def predict_proba(self, X):
        f_m = np.full(X.shape[0], self.initial_prediction)
        for estimator in self.estimators:
            f_m += self.learning_rate * estimator.predict(X)
        # logistic f again - into probability
        p_m = 1 / (1 + np.exp(-f_m))
        return np.vstack((1 - p_m, p_m)).T

    def predict(self, X):
        proba = self.predict_proba(X)
        return np.where(proba[:, 1] > 0.5, self.classes_[1], self.classes_[0])

if __name__ == "__main__":
    gbc_custom = CustomGradientBoostingClassifier(n_estimators=100, learning_rate=0.1)
    gbc_custom.fit(X_train, y_train)
    y_pred_custom = gbc_custom.predict(X_test)
    report_custom = classification_report(y_test, y_pred_custom, zero_division=1)
    confusion_custom = confusion_matrix(y_test, y_pred_custom)
    print(f"С DecisionTreeRegressor в качестве weak learner(по умолчанию): \
          \n classification report: {report_custom} \n confusion matrix: \n{confusion_custom}")

    from sklearn.linear_model import Ridge
    gbc_custom_ridge = CustomGradientBoostingClassifier(base_estimator=Ridge(), n_estimators=100, learning_rate=0.1)
    gbc_custom_ridge.fit(X_train, y_train)
    y_pred_custom_ridge = gbc_custom_ridge.predict(X_test)


    report_custom_ridge = classification_report(y_test, y_pred_custom_ridge, zero_division=1)
    confusion_custom_ridge = confusion_matrix(y_test, y_pred_custom_ridge)
    print(f"С Ridge в качестве weak learner: \
          \n classification report: {report_custom_ridge} \n confustion matrix: \n{confusion_custom_ridge}")



С DecisionTreeRegressor в качестве weak learner(по умолчанию):           
 classification report:               precision    recall  f1-score   support

           0       1.00      1.00      1.00     19709
           1       1.00      0.98      0.99       291

    accuracy                           1.00     20000
   macro avg       1.00      0.99      0.99     20000
weighted avg       1.00      1.00      1.00     20000
 
 confusion matrix: 
[[19709     0]
 [    6   285]]
С Ridge в качестве weak learner:           
 classification report:               precision    recall  f1-score   support

           0       0.99      1.00      0.99     19709
           1       1.00      0.00      0.00       291

    accuracy                           0.99     20000
   macro avg       0.99      0.50      0.50     20000
weighted avg       0.99      0.99      0.98     20000
 
 confustion matrix: 
[[19709     0]
 [  291     0]]


In [30]:
from sklearn.ensemble import GradientBoostingClassifier 
lib_gbc = GradientBoostingClassifier()
lib_gbc.fit(X_train, y_train)

In [35]:
y_pred_lib = lib_gbc.predict(X_test)

report_lib = classification_report(y_test, y_pred_lib, zero_division=1)
confusion_lib = confusion_matrix(y_test, y_pred_lib)
print(f"Библиотечная модель: \n classification report: {report_lib} \
      \n confustion matrix: \n{confusion_lib}")

Библиотечная модель: 
 classification report:               precision    recall  f1-score   support

           0       1.00      1.00      1.00     19709
           1       1.00      1.00      1.00       291

    accuracy                           1.00     20000
   macro avg       1.00      1.00      1.00     20000
weighted avg       1.00      1.00      1.00     20000
       
 confustion matrix: 
[[19708     1]
 [    0   291]]


In [37]:
import joblib
joblib.dump(lib_gbc, "models/gbc_lib")
joblib.dump(gbc_custom, "models/gbc_custom")
joblib.dump(gbc_custom_ridge, "models/gbc_custom_ridge")

['models/gbc_custom_ridge']