In [1]:
import numpy as np

## 1. NumPy way (from-scratch implementation)

In [2]:
class SimpleDicisionTree:
    """A minimal binary classification tree that splits on Gini impurity.

    Every node of the tree is itself a ``SimpleDicisionTree``. Internal nodes
    hold a ``feature_index`` / ``threshold`` pair and two children
    (``left`` for ``x <= threshold``, ``right`` for ``x > threshold``);
    leaves hold the predicted class in ``value`` and keep
    ``feature_index`` as ``None``.

    Parameters
    ----------
    max_depth : int or None, default=None
        Maximum number of splits along any root-to-leaf path. ``None`` keeps
        splitting until a node is pure or no valid split exists.
    """

    def __init__(self, max_depth=None):
        self.max_depth = max_depth
        self.left = None
        self.right = None
        self.feature_index = None
        self.threshold = None
        self.value = None

    def fit(self, X, y, depth=0):
        """Grow the (sub)tree rooted at this node.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Feature matrix.
        y : array-like of shape (n_samples,)
            Targets. Leaf values are computed from ``y`` cast to ``int``.
        depth : int, default=0
            Depth of this node; callers normally leave it at 0, the recursion
            passes ``depth + 1`` to the children.

        Raises
        ------
        ValueError
            If ``y`` is empty.
        """
        X = np.asarray(X)
        y = np.asarray(y)
        if len(y) == 0:
            raise ValueError("cannot fit a tree on an empty dataset")

        # Forget any earlier fit so a stale split cannot leak into this one.
        self.left = None
        self.right = None
        self.feature_index = None
        self.threshold = None
        self.value = None

        depth_reached = self.max_depth is not None and depth >= self.max_depth
        if depth_reached or len(np.unique(y)) == 1:
            self.value = self._majority_class(y)
            return

        best_gini = 1.0
        best_feature = None
        best_threshold = None
        for feature in range(X.shape[1]):
            column = X[:, feature]
            # The largest unique value would send every sample to the left,
            # so it can never be a valid split point and is skipped.
            for threshold in np.unique(column)[:-1]:
                left_mask = column <= threshold
                right_mask = column > threshold
                if not left_mask.any() or not right_mask.any():
                    continue
                gini = self._calculate_gini(y[left_mask], y[right_mask])
                if gini < best_gini:
                    best_gini = gini
                    best_feature = feature
                    best_threshold = threshold

        if best_feature is None:
            # No feature separates the samples (e.g. all columns constant).
            self.value = self._majority_class(y)
            return

        self.feature_index = best_feature
        self.threshold = best_threshold
        left_mask = X[:, best_feature] <= best_threshold
        right_mask = X[:, best_feature] > best_threshold
        self.left = SimpleDicisionTree(max_depth=self.max_depth)
        self.left.fit(X[left_mask], y[left_mask], depth + 1)
        self.right = SimpleDicisionTree(max_depth=self.max_depth)
        self.right.fit(X[right_mask], y[right_mask], depth + 1)

    @staticmethod
    def _majority_class(y):
        """Return the most frequent label of ``y`` after casting to ``int``.

        Ties go to the smallest label. ``np.unique`` is used instead of
        ``np.bincount`` so that negative labels are accepted as well.
        """
        labels, counts = np.unique(np.asarray(y).astype(int), return_counts=True)
        return labels[np.argmax(counts)]

    def _calculate_gini(self, left_labels, right_labels):
        """Return the size-weighted Gini impurity of a two-way split."""
        n_left = len(left_labels)
        n_right = len(right_labels)
        total = n_left + n_right
        if total == 0:
            return 0.0

        def impurity(labels):
            # 1 - sum_c p_c^2 over the classes present in ``labels``.
            _, counts = np.unique(labels, return_counts=True)
            proportions = counts / counts.sum()
            return 1.0 - np.sum(proportions ** 2)

        weighted = n_left * impurity(left_labels) + n_right * impurity(right_labels)
        return float(weighted / total)

    # Backward-compatible alias for the original (misspelled) method name.
    _calulate_gini = _calculate_gini

    def predict(self, X):
        """Predict a class for every row of ``X``.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)

        Returns
        -------
        numpy.ndarray of shape (n_samples,)
            At a leaf, ``value`` repeated for every row; at an internal node,
            an ``int`` array assembled from the two children.
        """
        X = np.asarray(X)
        if self.feature_index is None:
            return np.full(len(X), self.value)

        column = X[:, self.feature_index]
        go_left = column <= self.threshold
        go_right = column > self.threshold
        predictions = np.zeros(len(X), dtype=int)
        predictions[go_left] = self.left.predict(X[go_left])
        predictions[go_right] = self.right.predict(X[go_right])
        return predictions

In [7]:
class SimpleCatBoost:
    """Toy additive boosting loop built on ``SimpleDicisionTree``.

    Each round fits a fresh ``SimpleDicisionTree()`` to the current residual
    ``y - y_pred`` and adds ``learning_rate * tree.predict(X)`` to the running
    prediction.

    Parameters
    ----------
    n_estimator : int, default=100
        Number of boosting rounds (trees).
    learning_rate : float, default=0.1
        Factor applied to every tree's prediction before it is added.

    Attributes
    ----------
    estimators : list
        Trees fitted by the most recent call to ``fit``, in fitting order.
    target_means : dict
        ``{feature_index: {category_value: mean target}}`` for the requested
        categorical features. It is computed in ``fit`` but is not read by
        ``predict``.
    """

    def __init__(self, n_estimator=100, learning_rate=0.1):
        self.n_estimator = n_estimator
        self.learning_rate = learning_rate
        self.estimators = []
        self.target_means = {}

    def fit(self, X, y, cat_features=None):
        """Fit the ensemble from scratch.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
        y : array-like of shape (n_samples,)
        cat_features : iterable of int or None, default=None
            Column indices for which per-category target means are recorded.
            ``None`` is treated as "no categorical features".
        """
        X = np.asarray(X)
        y = np.asarray(y)

        # Start clean: without this a second fit() would stack its trees on
        # top of the previous run's and predict() would sum both ensembles.
        self.estimators = []
        self.target_means = {}

        for feature in (cat_features if cat_features is not None else ()):
            column = X[:, feature]
            self.target_means[feature] = {
                value: np.mean(y[column == value]) for value in np.unique(column)
            }

        y_pred = np.zeros(len(y))
        for _ in range(self.n_estimator):
            residual = y - y_pred
            tree = SimpleDicisionTree()
            # NOTE(review): SimpleDicisionTree derives leaf values from the
            # targets cast to int, so each stage contributes whole numbers.
            tree.fit(X, residual)
            y_pred += self.learning_rate * tree.predict(X)
            self.estimators.append(tree)

    def predict(self, X):
        """Return the sum of ``learning_rate * tree.predict(X)`` over all trees.

        The result is a float array of shape ``(n_samples,)``; with no fitted
        trees it is all zeros.
        """
        X = np.asarray(X)
        y_pred = np.zeros(len(X))
        for tree in self.estimators:
            y_pred += self.learning_rate * tree.predict(X)
        return y_pred

In [3]:
from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [4]:
# Load the wine dataset and pull out the feature matrix and the label vector
wine = load_wine()
X = wine.data
y = wine.target

In [5]:
# Hold out 20% of the samples for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [6]:
# Column indices to treat as categorical; none are passed to the model here
cat_features = list()

In [8]:
# Build the SimpleCatBoost model: 100 boosting rounds, each scaled by 0.1
model = SimpleCatBoost(
    n_estimator=100,
    learning_rate=0.1,
)

In [9]:
# Train on the training split only
model.fit(X_train, y_train, cat_features=cat_features)

In [10]:
# Continuous boosted scores for the held-out samples
y_pred = model.predict(X=X_test)

In [11]:
# Round each prediction to the nearest integer so it can be compared with class labels
y_pred_rounded = np.round(y_pred, decimals=0)

In [12]:
# Fraction of test samples whose rounded prediction equals the true label
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred_rounded)
print(f"Accuracy: {accuracy}")

Accuracy: 0.3888888888888889


## 2. CatBoost library way (with scikit-learn utilities)

In [1]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from catboost import CatBoostClassifier

In [2]:
# Load the iris dataset and pull out the feature matrix and the label vector
iris = load_iris()
X = iris.data
y = iris.target

In [3]:
# Hold out 20% of the samples for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [4]:
# Configure the CatBoost classifier (100 trees, learning rate 0.1, depth 5), then train it
catboost_model = CatBoostClassifier(
    n_estimators=100,
    learning_rate=0.1,
    depth=5,
)
catboost_model.fit(X_train, y_train)

0:	learn: 0.9726042	total: 137ms	remaining: 13.6s
1:	learn: 0.8721248	total: 139ms	remaining: 6.8s
2:	learn: 0.7954929	total: 139ms	remaining: 4.51s
3:	learn: 0.7257164	total: 140ms	remaining: 3.36s
4:	learn: 0.6618108	total: 141ms	remaining: 2.67s
5:	learn: 0.6062682	total: 142ms	remaining: 2.22s
6:	learn: 0.5574738	total: 142ms	remaining: 1.89s
7:	learn: 0.5222422	total: 143ms	remaining: 1.65s
8:	learn: 0.4880434	total: 144ms	remaining: 1.45s
9:	learn: 0.4533366	total: 144ms	remaining: 1.3s
10:	learn: 0.4235642	total: 145ms	remaining: 1.17s
11:	learn: 0.4012245	total: 145ms	remaining: 1.06s
12:	learn: 0.3792420	total: 146ms	remaining: 974ms
13:	learn: 0.3584687	total: 146ms	remaining: 897ms
14:	learn: 0.3403932	total: 146ms	remaining: 830ms
15:	learn: 0.3229021	total: 147ms	remaining: 772ms
16:	learn: 0.3077965	total: 148ms	remaining: 721ms
17:	learn: 0.2923473	total: 148ms	remaining: 675ms
18:	learn: 0.2778841	total: 149ms	remaining: 634ms
19:	learn: 0.2638524	total: 149ms	remaining

<catboost.core.CatBoostClassifier at 0x233e3d62990>

In [5]:
# Predict labels for the held-out split with the fitted CatBoost model
y_Pred = catboost_model.predict(X_test)

In [7]:
from sklearn.metrics import accuracy_score

In [9]:
# Fraction of test samples that the CatBoost model labels correctly
accuracy = accuracy_score(y_true=y_test, y_pred=y_Pred)
print(f"Accracy: {accuracy}")

Accracy: 1.0
