# Random Forest

In [None]:
import numpy as np
from collections import Counter
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Simple Decision Tree for classification with max depth
class DecisionTree:
    """Binary decision tree classifier grown greedily by information gain.

    Every internal node tests ``x[feature] <= threshold``.  The fitted tree
    is stored in ``self.tree`` as nested dicts:

    * ``{'type': 'leaf', 'class': label}``
    * ``{'type': 'node', 'feature': int, 'threshold': value,
      'left': subtree, 'right': subtree}``

    Labels may be any values ``np.unique`` can sort (ints of either sign,
    strings, ...), not only non-negative integers.

    Args:
        max_depth: Maximum number of splits between the root and any leaf.
    """

    def __init__(self, max_depth=3):
        self.max_depth = max_depth
        self.tree = None

    def fit(self, X, y):
        """Grow the tree on a 2-D sample matrix ``X`` and 1-D labels ``y``.

        Raises:
            ValueError: If ``y`` is empty.
        """
        X = np.asarray(X)
        y = np.asarray(y)
        if y.shape[0] == 0:
            raise ValueError("Cannot fit a DecisionTree on an empty training set.")
        self.tree = self._build_tree(X, y, depth=0)

    def _make_leaf(self, y):
        """Return a leaf predicting the most frequent label in ``y``."""
        # Counter breaks ties by first occurrence in y.
        return {'type': 'leaf', 'class': Counter(y).most_common(1)[0][0]}

    def _build_tree(self, X, y, depth):
        """Recursively build the subtree for the samples ``(X, y)``."""
        n_samples, n_features = X.shape
        num_classes = len(np.unique(y))

        # Stop criteria: depth budget used up, pure node, or nothing to split.
        if depth >= self.max_depth or num_classes == 1 or n_samples <= 1:
            return self._make_leaf(y)

        # The parent entropy is the same for every candidate split of this
        # node, so compute it once instead of once per threshold.
        parent_entropy = self._entropy(y)

        # Find best split (the first candidate wins on equal gain).
        best_feat, best_thresh, best_gain = None, None, 0
        for feature in range(n_features):
            column = X[:, feature]
            # The largest value sends every sample to the left child, which
            # always yields zero gain, so it is never a useful threshold.
            for t in np.unique(column)[:-1]:
                gain = self._information_gain(column, y, t, parent_entropy)
                if gain > best_gain:
                    best_gain = gain
                    best_feat = feature
                    best_thresh = t

        # No split improves purity: stop here.
        if best_feat is None:
            return self._make_leaf(y)

        left_idx = X[:, best_feat] <= best_thresh
        right_idx = X[:, best_feat] > best_thresh
        left_tree = self._build_tree(X[left_idx], y[left_idx], depth + 1)
        right_tree = self._build_tree(X[right_idx], y[right_idx], depth + 1)
        return {'type': 'node', 'feature': best_feat, 'threshold': best_thresh,
                'left': left_tree, 'right': right_tree}

    def _entropy(self, y):
        """Return the Shannon entropy (base 2) of the label array ``y``."""
        # np.unique only reports labels that occur, so every count is > 0 and
        # log2 is always defined; unlike np.bincount it also accepts
        # negative and non-integer labels.
        _, counts = np.unique(y, return_counts=True)
        probs = counts / len(y)
        return -np.sum(probs * np.log2(probs))

    def _information_gain(self, feature_col, y, threshold, parent_entropy=None):
        """Return the entropy reduction of splitting at ``threshold``.

        Args:
            feature_col: 1-D array with one feature value per sample.
            y: Labels aligned with ``feature_col``.
            threshold: Samples with ``value <= threshold`` go left.
            parent_entropy: Optional precomputed entropy of ``y``.

        Returns:
            The gain, or 0 when one side of the split would be empty.
        """
        left_idx = feature_col <= threshold
        right_idx = feature_col > threshold
        n_left = np.count_nonzero(left_idx)
        n_right = np.count_nonzero(right_idx)
        if n_left == 0 or n_right == 0:
            return 0
        if parent_entropy is None:
            parent_entropy = self._entropy(y)
        n = len(y)
        child_entropy = ((n_left / n) * self._entropy(y[left_idx])
                         + (n_right / n) * self._entropy(y[right_idx]))
        return parent_entropy - child_entropy

    def predict(self, X):
        """Return an array with one predicted label per row of ``X``.

        Raises:
            ValueError: If the tree has not been fitted yet.
        """
        if self.tree is None:
            raise ValueError("DecisionTree is not fitted yet; call fit() first.")
        return np.array([self._predict_sample(x, self.tree) for x in np.asarray(X)])

    def _predict_sample(self, x, node):
        """Walk from ``node`` down to a leaf for sample ``x``; return its class."""
        while node['type'] != 'leaf':
            branch = 'left' if x[node['feature']] <= node['threshold'] else 'right'
            node = node[branch]
        return node['class']

# Random Forest Classifier
class RandomForest:
    """Bagged ensemble of ``DecisionTree`` classifiers with majority voting.

    Each tree is trained on a bootstrap sample of the rows and on one random
    subset of the columns (chosen once per tree, not per split).

    Args:
        n_trees: Number of trees to grow.
        max_features: Number of columns given to each tree.  ``None`` (or 0)
            means ``int(sqrt(n_features))``, but at least 1.
        max_depth: Depth limit passed to every tree.
        random_state: Optional seed.  When given, the forest draws from its
            own ``np.random.RandomState(random_state)`` so results are
            reproducible.  When ``None`` (default), the global ``np.random``
            state is used, so ``np.random.seed`` still controls the run.
    """

    def __init__(self, n_trees=10, max_features=None, max_depth=3, random_state=None):
        self.n_trees = n_trees
        self.max_features = max_features
        self.max_depth = max_depth
        self.random_state = random_state
        self.trees = []
        self._rng = self._make_rng()

    def _make_rng(self):
        """Return the random source implied by ``self.random_state``."""
        if self.random_state is None:
            # The np.random module itself exposes choice() backed by the
            # global state, which keeps np.random.seed() workflows intact.
            return np.random
        return np.random.RandomState(self.random_state)

    def bootstrap_sample(self, X, y):
        """Return ``(X, y)`` resampled row-wise with replacement, same size."""
        n_samples = X.shape[0]
        idx = self._rng.choice(n_samples, n_samples, replace=True)
        return X[idx], y[idx]

    def fit(self, X, y):
        """Train ``n_trees`` trees on the sample matrix ``X`` and labels ``y``.

        Raises:
            ValueError: If the resolved ``max_features`` is not between 1 and
                the number of columns of ``X``.
        """
        X = np.asarray(X)
        y = np.asarray(y)
        # Restart the random stream so refitting with the same random_state
        # reproduces the same forest.
        self._rng = self._make_rng()
        self.trees = []

        n_features = X.shape[1]
        max_features = self.max_features or max(1, int(np.sqrt(n_features)))
        if not 1 <= max_features <= n_features:
            raise ValueError(
                f"max_features must be between 1 and n_features={n_features}, "
                f"got {max_features}."
            )

        for _ in range(self.n_trees):
            X_sample, y_sample = self.bootstrap_sample(X, y)
            feature_idx = self._rng.choice(n_features, max_features, replace=False)
            tree = DecisionTree(max_depth=self.max_depth)
            tree.fit(X_sample[:, feature_idx], y_sample)
            # Remember which columns this tree saw; predict() selects them again.
            tree.feature_idx_map = feature_idx
            self.trees.append(tree)

    def predict(self, X):
        """Return the majority-vote label for each row of ``X``.

        Ties go to the label that appears first in tree order.

        Raises:
            ValueError: If the forest contains no fitted trees.
        """
        if not self.trees:
            raise ValueError("RandomForest is not fitted yet; call fit() first.")
        X = np.asarray(X)
        # Shape (n_trees, n_samples); transposing yields one row of votes per sample.
        tree_preds = np.array(
            [tree.predict(X[:, tree.feature_idx_map]) for tree in self.trees]
        )
        return np.array(
            [Counter(sample_preds).most_common(1)[0][0] for sample_preds in tree_preds.T]
        )

# Example usage
if __name__ == "__main__":
    from sklearn.datasets import load_iris
    from sklearn.model_selection import train_test_split

    # RandomForest draws its bootstrap rows and feature subsets from NumPy's
    # global random generator; seed it so the reported metrics are the same
    # on every run.
    np.random.seed(42)

    data = load_iris()
    X, y = data.data, data.target
    # Hold out 30% of the samples for evaluation (fixed split).
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

    rf = RandomForest(n_trees=15, max_features=2, max_depth=3)
    rf.fit(X_train, y_train)
    y_pred = rf.predict(X_test)

    print("Accuracy:", accuracy_score(y_test, y_pred))
    print("\nClassification Report:\n", classification_report(y_test, y_pred))
    print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))


Accuracy: 1.0

Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00        19
           1       1.00      1.00      1.00        13
           2       1.00      1.00      1.00        13

    accuracy                           1.00        45
   macro avg       1.00      1.00      1.00        45
weighted avg       1.00      1.00      1.00        45

Confusion Matrix:
 [[19  0  0]
 [ 0 13  0]
 [ 0  0 13]]


# AdaBoost

In [2]:
import numpy as np
from collections import Counter
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# -----------------------------
# Simple Decision Stump (weak learner)
# -----------------------------
class DecisionStump:
    """Single-feature threshold classifier that outputs -1.0 or +1.0.

    With ``polarity == 1`` rows whose feature value is ``<= threshold`` are
    labelled -1; with any other polarity rows whose value is ``> threshold``
    are labelled -1.  All remaining rows are labelled +1.
    """

    def __init__(self):
        # All fields are placeholders until a trainer fills them in.
        self.feature_index = None
        self.threshold = None
        self.polarity = 1
        self.alpha = None

    def predict(self, X):
        """Return a float array of -1.0 / +1.0, one entry per row of ``X``."""
        column = X[:, self.feature_index]
        if self.polarity == 1:
            is_negative = column <= self.threshold
        else:
            is_negative = column > self.threshold
        return np.where(is_negative, -1.0, 1.0)


# -----------------------------
# AdaBoost Classifier
# -----------------------------
class AdaBoost:
    """Binary AdaBoost built from ``DecisionStump`` weak learners.

    Any two distinct label values are supported.  ``fit`` records them in
    ``classes_`` (sorted); internally the smaller one is encoded as -1 and
    the larger as +1, and ``predict`` maps the result back to the original
    labels.  For the usual {0, 1} labels the predicted values are unchanged.

    Args:
        n_estimators: Number of boosting rounds (stumps) to train.
    """

    def __init__(self, n_estimators=50):
        self.n_estimators = n_estimators
        self.stumps = []
        self.classes_ = None

    def fit(self, X, y):
        """Train the ensemble on a 2-D sample matrix ``X`` and labels ``y``.

        Raises:
            ValueError: If the training set is empty or ``y`` holds more than
                two distinct labels.
        """
        X = np.asarray(X)
        y = np.asarray(y)
        n_samples, n_features = X.shape
        if n_samples == 0:
            raise ValueError("Cannot fit AdaBoost on an empty training set.")

        classes = np.unique(y)
        if len(classes) > 2:
            raise ValueError(
                f"AdaBoost supports binary targets only, got {len(classes)} classes."
            )
        self.classes_ = classes

        # Encode labels as {-1, +1}: the largest label is the positive class.
        y = np.where(y == classes[-1], 1, -1)

        # Initialize uniform weights
        w = np.full(n_samples, (1 / n_samples))

        self.stumps = []
        for _ in range(self.n_estimators):
            stump = DecisionStump()
            min_error = float("inf")

            # Exhaustive search for the stump with the lowest weighted error;
            # the first candidate wins on ties.
            for feature_i in range(n_features):
                feature_values = X[:, feature_i]
                for threshold in np.unique(feature_values):
                    for polarity in [1, -1]:
                        preds = np.ones(n_samples)
                        if polarity == 1:
                            preds[feature_values <= threshold] = -1
                        else:
                            preds[feature_values > threshold] = -1

                        error = np.sum(w[y != preds])

                        if error < min_error:
                            stump.polarity = polarity
                            stump.threshold = threshold
                            stump.feature_index = feature_i
                            min_error = error

            # Compute alpha (learner weight); EPS keeps the ratio finite when
            # the stump is perfect (error 0).
            EPS = 1e-10
            stump.alpha = 0.5 * np.log((1.0 - min_error + EPS) / (min_error + EPS))

            # Up-weight misclassified samples, down-weight correct ones.
            preds = stump.predict(X)
            w *= np.exp(-stump.alpha * y * preds)
            w /= np.sum(w)  # normalize

            self.stumps.append(stump)

    def predict(self, X):
        """Return the predicted original label for each row of ``X``.

        The alpha-weighted stump votes are summed; a strictly positive score
        yields the positive class, anything else (including an exact tie)
        yields the negative class.

        Raises:
            ValueError: If no stumps have been trained.
        """
        if not self.stumps:
            raise ValueError("AdaBoost is not fitted yet; call fit() first.")
        X = np.asarray(X)
        scores = np.sum(
            [stump.alpha * stump.predict(X) for stump in self.stumps], axis=0
        )
        if self.classes_ is None:
            # Stumps were assigned by hand: fall back to the {0, 1} labels.
            negative, positive = 0, 1
        else:
            negative, positive = self.classes_[0], self.classes_[-1]
        return np.where(scores > 0, positive, negative)


# -----------------------------
# Example usage
# -----------------------------
if __name__ == "__main__":
    # Two-class task on Iris: setosa keeps label 0, the other species become 1.
    iris = load_iris()
    X, y = iris.data, (iris.target != 0).astype(int)

    # Stratified 70/30 split with a fixed seed.
    X_train, X_test, y_train, y_test = train_test_split(
        X,
        y,
        test_size=0.3,
        random_state=42,
        stratify=y,
    )

    clf = AdaBoost(n_estimators=20)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    # Compute the metrics first, then print them in the original order.
    accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred)
    matrix = confusion_matrix(y_test, y_pred)

    print("Accuracy:", accuracy)
    print("\nClassification Report:\n", report)
    print("Confusion Matrix:\n", matrix)


Accuracy: 1.0

Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00        15
           1       1.00      1.00      1.00        30

    accuracy                           1.00        45
   macro avg       1.00      1.00      1.00        45
weighted avg       1.00      1.00      1.00        45

Confusion Matrix:
 [[15  0]
 [ 0 30]]


# Voting Classifier

In [1]:
# Voting Classifier Example with Scikit-learn
import numpy as np
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Base models
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import VotingClassifier

# Load the Iris data and make a stratified 70/30 split with a fixed seed.
iris = load_iris()
X = iris.data
y = iris.target
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
    stratify=y,
)

# Three different base learners that the ensembles combine.
log_clf = LogisticRegression(max_iter=200, random_state=42)
knn_clf = KNeighborsClassifier(n_neighbors=5)
dt_clf = DecisionTreeClassifier(max_depth=4, random_state=42)
base_estimators = [("lr", log_clf), ("knn", knn_clf), ("dt", dt_clf)]

# Hard voting counts predicted labels; soft voting uses predicted
# probabilities.  Each ensemble receives its own copy of the estimator list.
hard_voting_clf = VotingClassifier(estimators=list(base_estimators), voting="hard")
soft_voting_clf = VotingClassifier(estimators=list(base_estimators), voting="soft")

# Fit each ensemble and report its metrics on the held-out split.
for label, clf in (
    ("Hard Voting", hard_voting_clf),
    ("Soft Voting", soft_voting_clf),
):
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    print("=" * 40)
    print(label)
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print("Classification Report:\n", classification_report(y_test, y_pred))
    print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))


Hard Voting
Accuracy: 0.9555555555555556
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00        15
           1       0.93      0.93      0.93        15
           2       0.93      0.93      0.93        15

    accuracy                           0.96        45
   macro avg       0.96      0.96      0.96        45
weighted avg       0.96      0.96      0.96        45

Confusion Matrix:
 [[15  0  0]
 [ 0 14  1]
 [ 0  1 14]]
Soft Voting
Accuracy: 0.9333333333333333
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00        15
           1       0.93      0.87      0.90        15
           2       0.88      0.93      0.90        15

    accuracy                           0.93        45
   macro avg       0.93      0.93      0.93        45
weighted avg       0.93      0.93      0.93        45

Confusion Matrix:
 [[15  0  0]
 [ 0 13  2]
 [ 0  1 1