In [2695]:
import enum
import typing as tp
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score

class Node:
    def __init__(self, feature_index=None, threshold=None, left=None, right=None, value=None):
        self.feature_index = feature_index
        self.threshold = threshold
        self.left = left
        self.right = right
        self.value = value

class DecisionTreeClassifier:
    def __init__(self, max_features, max_depth=None, min_samples_split=2, min_samples_leaf=1, max_arguments=None):
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.min_samples_leaf = min_samples_leaf
        self.root = None
        self.n_classes_ = None
        self.max_arguments = max_arguments
        self.max_features = max_features

    def _gini(self, y):
        if len(y) == 0:
            return 0
        classes = np.unique(y)
        gini = 1.0
        for cls in classes:
            p = np.sum(y == cls) / len(y)
            gini -= p ** 2
        return gini

    def _information_gain(self, y, y_left, y_right):
        impurity = self._gini(y)
        impurity_left = self._gini(y_left)
        impurity_right = self._gini(y_right)
        n = len(y)
        if n == 0:
            return 0
        n_left, n_right = len(y_left), len(y_right)
        gain = impurity - (n_left / n * impurity_left + n_right / n * impurity_right)
        return gain

    def _best_split(self, X, y):
        best_gain = -1
        best_feature_idx = None
        best_threshold = None

        n_samples, n_features = X.shape

        if n_samples <= 1:
            return None, None

        for feature_idx in np.random.choice(n_features, self.max_features, replace=False):
            feature_values = X[:, feature_idx]
            unique_values = np.unique(feature_values)

            for threshold in unique_values:
                left_mask = feature_values <= threshold
                right_mask = feature_values > threshold

                if np.sum(left_mask) == 0 or np.sum(right_mask) == 0:
                    continue

                if len(y[left_mask]) == 0 or len(y[right_mask]) == 0:
                    continue

                gain = self._information_gain(y, y[left_mask], y[right_mask])

                if gain > best_gain:
                    best_gain = gain
                    best_feature_idx = feature_idx
                    best_threshold = threshold

        return best_feature_idx, best_threshold

    def _build_tree(self, X, y, depth=0):
        n_samples, n_features = X.shape
        n_classes = len(np.unique(y))

        if (self.max_depth is not None and depth >= self.max_depth) or \
           n_samples < self.min_samples_split or \
           n_classes == 1:
            leaf_value = self._most_common_label(y)
            return Node(value=leaf_value)
        feature_idx, threshold = self._best_split(X, y)

        if feature_idx is None:
            leaf_value = self._most_common_label(y)
            return Node(value=leaf_value)

        left_mask = X[:, feature_idx] <= threshold
        right_mask = X[:, feature_idx] > threshold

        if np.sum(left_mask) < self.min_samples_leaf or np.sum(right_mask) < self.min_samples_leaf:
            leaf_value = self._most_common_label(y)
            return Node(value=leaf_value)

        left_subtree = self._build_tree(X[left_mask], y[left_mask], depth + 1)
        right_subtree = self._build_tree(X[right_mask], y[right_mask], depth + 1)

        return Node(feature_index=feature_idx, threshold=threshold,
                    left=left_subtree, right=right_subtree)

    def _most_common_label(self, y):
        if len(y) == 0:
            return 0
        unique, counts = np.unique(y, return_counts=True)
        return unique[np.argmax(counts)]

    def fit(self, X, y):
        self.n_classes_ = len(np.unique(y))
        self.root = self._build_tree(X, y)

    def predict(self, X):
        return np.array([self._predict_single(x, self.root) for x in X])

    def _predict_single(self, x, node):
        if node.value is not None:
            return node.value
        if x[node.feature_index] <= node.threshold:
            return self._predict_single(x, node.left)
        else:
            return self._predict_single(x, node.right)

In [2696]:
class RandomForestClassifier:
    def __init__(self, n_estimators=100, max_depth=None, min_samples_split=2,
                 min_samples_leaf=1, random_state=None, max_features = 'sqrt'):
        self.n_estimators = n_estimators
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.min_samples_leaf = min_samples_leaf
        self.random_state = random_state
        self.trees = []
        self.feature_importances_ = None
        self.n_classes_ = None
        self.max_features = max_features

    def _bootstrap_samples(self, X, y):
        n_samples = X.shape[0]
        indices = np.random.choice(n_samples, n_samples, replace=True)
        return X[indices], y[indices]

    def fit(self, X, y):
        self.trees = []
        self.n_classes_ = len(np.unique(y))
        n_features = X.shape[1]
        if self.max_features == 'sqrt':
            max_features = int(np.sqrt(n_features))
        else:
            max_features = n_features
        for i in range(self.n_estimators):
            X_boot, y_boot = self._bootstrap_samples(X, y)
            tree = DecisionTreeClassifier(max_features=max_features,
                max_depth=self.max_depth,
                min_samples_split=self.min_samples_split,
                min_samples_leaf=self.min_samples_leaf,
            )
            tree.fit(X_boot, y_boot)
            self.trees.append(tree)

    def predict(self, X):
        if not self.trees:
            raise ValueError("Модель не обучена. Сначала вызовите fit().")
        tree_preds = []
        for tree in self.trees:
            pred = tree.predict(X)
            tree_preds.append(pred)

        tree_preds = np.array(tree_preds).T
        y_pred = [np.bincount(preds).argmax() for preds in tree_preds]
        return np.array(y_pred)
    
        
    def _stratified_bootstrap(self, X, y):
        n_samples = len(y)
        class_indices = {}
        for class_label in np.unique(y):
            class_indices[class_label] = np.where(y == class_label)[0]
        bootstrap_indices = []
        for class_label, indices in class_indices.items():
            n_class_samples = len(indices)
            class_bootstrap = np.random.choice(
                indices, 
                n_class_samples, 
                replace=True
            )
            bootstrap_indices.extend(class_bootstrap)
        bootstrap_indices = np.array(bootstrap_indices)
        np.random.shuffle(bootstrap_indices)
        return X[bootstrap_indices] , y[bootstrap_indices]
    
    def stratified_fit(self, X, y):
        self.trees = []
        self.n_classes_ = len(np.unique(y))
        n_features = X.shape[1]        
        if self.max_features == 'sqrt':
            max_features = int(np.sqrt(n_features))
        else:
            max_features = n_features
        for i in range(self.n_estimators):
            X_boot, y_boot = self._stratified_bootstrap(X, y)
            tree = DecisionTreeClassifier(max_features=max_features,
                max_depth=self.max_depth,
                min_samples_split=self.min_samples_split,
                min_samples_leaf=self.min_samples_leaf
            )
            tree.fit(X_boot, y_boot)
            self.trees.append(tree)


In [2697]:
df = pd.read_csv('./datasets/famcs_students.csv')
df.head()

Unnamed: 0,cource,group,stream,sex,age,ss,interest,os,weekend_study,bad_sleep,...,social,sport,miss,study_form,foot_size,eye_color,score,retake,hostel,literature
0,4,11,Прикладная информатика,М,20.0,Нет,Математика,MacOS,Да,Да,...,Экстраверт,"Редко, легкая физкультура",3.0,Бюджет,48.0,Карие,9.2,0,"Нет, я из Минска",Да
1,4,11,Прикладная информатика,Ж,20.0,Нет,Программирование,MacOS,Нет,Нет,...,Экстраверт,"Да, я спортсмен",5.0,Бюджет,39.0,Зеленые,8.8,0,"Нет, я из Минска",Да
2,4,11,Прикладная информатика,Ж,19.0,Нет,Программирование,MacOS,Да,Нет,...,Экстраверт,Вообще нет,10.0,Бюджет,41.0,Карие,8.8,0,"Нет, я из Минска",Да
3,4,11,Прикладная информатика,Ж,20.0,Нет,Математика,MacOS,Да,Да,...,Экстраверт,"Да, я спортсмен",3.0,Бюджет,36.0,Карие,8.8,0,"Нет, я из Минска",Да
4,4,11,Прикладная информатика,М,20.0,Нет,Математика,Windows,Да,Нет,...,Интроверт,Вообще нет,1.0,Бюджет,46.0,Зеленые,9.0,0,"Нет, я из Минска",Да


In [2698]:
print(df.columns)

Index(['cource', 'group', 'stream', 'sex', 'age', 'ss', 'interest', 'os',
       'weekend_study', 'bad_sleep', 'glasses', 'work_experience', 'ai',
       'height', 'anime', 'social', 'sport', 'miss', 'study_form', 'foot_size',
       'eye_color', 'score', 'retake', 'hostel', 'literature'],
      dtype='object')


In [2699]:
binary_cols = ['ss', 'interest', 'weekend_study', 'bad_sleep', 'glasses', 'anime', 'study_form', 'literature', 'sex']
N = 8
target_col = binary_cols[N % 9]

In [2700]:
for col in binary_cols:
    le_target = LabelEncoder()
    if df[col].dtype == 'object' or not np.issubdtype(df[col].dtype, np.number):
        df[col] = le_target.fit_transform(df[col].astype(str))
    else:
        df[col] = pd.to_numeric(df[col], errors='coerce').fillna(0).astype(int)

x_raw = df.drop(columns=[target_col]).copy()
y = df[target_col].values

cat_cols = x_raw.select_dtypes(include=['object', 'category']).columns.tolist()
if len(cat_cols) > 0:
    x_encoded = pd.get_dummies(x_raw, columns=cat_cols, drop_first=True)
else:
    x_encoded = x_raw.copy()

for c in x_encoded.columns:
    x_encoded[c] = pd.to_numeric(x_encoded[c], errors='coerce')
x_encoded = x_encoded.fillna(x_encoded.median())

print("после кодирования shape x:", x_encoded.shape)

после кодирования shape x: (127, 45)


In [2701]:
x = x_encoded.copy()
x = np.asarray(x)
y = np.asarray(y)
x_train, x_temp, y_train, y_temp = train_test_split(
    x, y, test_size=0.30, stratify=y
)
x_val, x_test, y_val, y_test = train_test_split(
    x_temp, y_temp, test_size=0.5, stratify=y_temp
)

print("sizes: train", x_train.shape, "val", x_val.shape, "test", x_test.shape)

sizes: train (88, 45) val (19, 45) test (20, 45)


In [2702]:
forest = RandomForestClassifier(100, 4, max_features='sqrt')
forest.fit(x_train, y_train)

In [2703]:
y_pred = forest.predict(x_test)
print(confusion_matrix(y_test, y_pred))
print(roc_auc_score(y_test, y_pred))

[[ 4  3]
 [ 0 13]]
0.7857142857142857


In [2704]:
y_res = forest.predict(x_val)
print(confusion_matrix(y_val, y_res))
print(roc_auc_score(y_val, y_res))

[[ 6  0]
 [ 0 13]]
1.0
