In [None]:
import numpy as np

class Node:
    """A single node of a binary decision tree.

    An internal node carries a ``feature`` index and a ``threshold`` to test
    against; a leaf carries a ``label`` instead. Both child links start out
    as ``None`` and are attached by whoever builds the tree.
    """

    def __init__(self, feature=None, threshold=None, label=None):
        # Split description (internal nodes) and prediction (leaves).
        self.feature, self.threshold, self.label = feature, threshold, label
        # Children are linked after construction.
        self.left = self.right = None

class DecisionTree:
    """Binary classification tree grown greedily with the Gini impurity criterion.

    Every internal node tests ``x[feature] <= threshold``: samples that pass go
    to the left child, the others to the right child. Leaves store the majority
    class of the training samples that reached them.

    Parameters
    ----------
    max_depth : int or None, default=None
        Depth at which growth stops (the root is at depth 0). With ``None`` the
        tree grows until every leaf is pure or cannot be split any further.
    """

    def __init__(self, max_depth=None):
        self.max_depth = max_depth
        # Defined up front so predict() before fit() can fail with a clear
        # message instead of an AttributeError on a missing attribute.
        self.root = None

    def _calculate_gini_index(self, labels):
        """Return the Gini impurity ``1 - sum(p_k ** 2)`` of ``labels``.

        An empty label set is treated as pure (impurity 0).
        """
        if len(labels) == 0:
            return 0.0
        _, counts = np.unique(labels, return_counts=True)
        probabilities = counts / len(labels)
        return 1 - np.sum(probabilities ** 2)

    def _majority_label(self, labels):
        """Return the most frequent value in ``labels``.

        Ties are resolved in favour of the smallest label, because
        ``np.unique`` returns sorted values and ``np.argmax`` picks the first
        maximum. Unlike ``np.bincount`` this works for any label dtype.
        """
        values, counts = np.unique(labels, return_counts=True)
        return values[np.argmax(counts)]

    def _split_dataset(self, X, y, feature, threshold):
        """Partition ``(X, y)`` on ``X[:, feature] <= threshold``.

        Returns ``(X_left, y_left, X_right, y_right)`` where the left part
        holds the rows satisfying the test and the right part the rows whose
        value is strictly greater than ``threshold``.
        """
        left_mask = X[:, feature] <= threshold
        right_mask = X[:, feature] > threshold
        return X[left_mask], y[left_mask], X[right_mask], y[right_mask]

    def _find_best_split(self, X, y):
        """Search every (feature, observed value) pair for the lowest weighted Gini.

        Returns
        -------
        (feature, threshold)
            The best split found, or ``(None, None)`` when no candidate
            separates the samples into two non-empty groups.
        """
        best_gini_index = float('inf')
        best_feature = None
        best_threshold = None

        for feature in range(X.shape[1]):
            for threshold in np.unique(X[:, feature]):
                _, y_left, _, y_right = self._split_dataset(X, y, feature, threshold)

                # A split that leaves one side empty separates nothing, and
                # recursing on it would never terminate; skip it.
                if len(y_left) == 0 or len(y_right) == 0:
                    continue

                gini_index = (len(y_left) * self._calculate_gini_index(y_left) +
                              len(y_right) * self._calculate_gini_index(y_right)) / len(y)
                # Strict comparison keeps the first of several equally good splits.
                if gini_index < best_gini_index:
                    best_gini_index = gini_index
                    best_feature = feature
                    best_threshold = threshold

        return best_feature, best_threshold

    def _build_tree(self, X, y, depth):
        """Recursively grow the subtree for the samples ``(X, y)`` at ``depth``."""
        # Stop when the depth budget is used up or the node is already pure.
        if depth == self.max_depth or len(np.unique(y)) == 1:
            return Node(label=self._majority_label(y))

        feature, threshold = self._find_best_split(X, y)
        if feature is None:
            # No usable split (e.g. identical rows with different labels):
            # fall back to a majority-vote leaf.
            return Node(label=self._majority_label(y))

        X_left, y_left, X_right, y_right = self._split_dataset(X, y, feature, threshold)

        node = Node(feature=feature, threshold=threshold)
        node.left = self._build_tree(X_left, y_left, depth + 1)
        node.right = self._build_tree(X_right, y_right, depth + 1)

        return node

    def fit(self, X, y):
        """Grow the tree from training data.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
        y : array-like of shape (n_samples,)

        Raises
        ------
        ValueError
            If ``X`` is not 2-D, ``y`` is not 1-D, their lengths differ, or
            there are no samples.
        """
        X = np.asarray(X)
        y = np.asarray(y)
        if X.ndim != 2:
            raise ValueError("X must be a 2-D array of shape (n_samples, n_features).")
        if y.ndim != 1:
            raise ValueError("y must be a 1-D array of shape (n_samples,).")
        if X.shape[0] != y.shape[0]:
            raise ValueError("X and y must contain the same number of samples.")
        if X.shape[0] == 0:
            raise ValueError("Cannot fit a decision tree on an empty dataset.")

        self.root = self._build_tree(X, y, 0)

    def _predict_instance(self, x, node):
        """Walk from ``node`` down to a leaf for sample ``x`` and return its label."""
        # A node is a leaf exactly when it carries a label.
        while node.label is None:
            node = node.left if x[node.feature] <= node.threshold else node.right
        return node.label

    def predict(self, X):
        """Return the predicted label of every row of ``X`` as a NumPy array.

        Raises
        ------
        RuntimeError
            If the tree has not been fitted yet.
        """
        if self.root is None:
            raise RuntimeError("DecisionTree must be fitted before calling predict().")
        return np.array([self._predict_instance(x, self.root) for x in X])

In [1]:
#Importing required libraries
import pandas as pd
import numpy as np
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split

In [2]:
# Load the Iris dataset and show the names of the classes to be predicted.
data = load_iris()
print('Classes to predict: ', data.target_names)

Classes to predict:  ['setosa' 'versicolor' 'virginica']


In [3]:
# Feature matrix: one row per example.
X = data.data
# Target vector: the class label of each example.
y = data.target

# The number of rows in X is the number of examples in the dataset.
print('Number of examples in the data:', X.shape[0])

Number of examples in the data: 150


In [5]:
# Preview the first four rows of the feature matrix X.
X[:4]

array([[5.1, 3.5, 1.4, 0.2],
       [4.9, 3. , 1.4, 0.2],
       [4.7, 3.2, 1.3, 0.2],
       [4.6, 3.1, 1.5, 0.2]])

In [6]:
# Split the data into train and test sets: test_size=0.25 holds out a quarter
# of the examples for testing, and the fixed random_state makes the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 47, test_size = 0.25)

In [7]:
# Create a decision tree classifier that uses entropy as its split criterion.
# DecisionTreeClassifier is already imported in the imports cell above, so this
# re-import is redundant but harmless.
# NOTE(review): no random_state is passed here, so the fitted tree may not be
# identical between runs — consider fixing a seed; confirm against sklearn docs.
from sklearn.tree import DecisionTreeClassifier
clf = DecisionTreeClassifier(criterion = 'entropy')

In [9]:
# Fit the decision tree classifier on the training split.
clf.fit(X_train, y_train)

In [10]:
# Predict class labels for the held-out test split.
y_pred =  clf.predict(X_test)

In [11]:
# Import the accuracy metric and report accuracy on both splits. Train accuracy
# is computed from fresh predictions on X_train; test accuracy reuses y_pred.

from sklearn.metrics import accuracy_score
print('Accuracy Score on train data: ', accuracy_score(y_true=y_train, y_pred=clf.predict(X_train)))
print('Accuracy Score on test data: ', accuracy_score(y_true=y_test, y_pred=y_pred))

Accuracy Score on train data:  1.0
Accuracy Score on test data:  0.9736842105263158


In [12]:
# Retrain with min_samples_split=50, which requires at least 50 samples in a
# node before it may be split and therefore limits how far the tree grows.
# This rebinds `clf`, replacing the unconstrained tree fitted earlier.
clf = DecisionTreeClassifier(criterion='entropy', min_samples_split=50)
clf.fit(X_train, y_train)
# Report accuracy of the constrained tree on both splits.
print('Accuracy Score on train data: ', accuracy_score(y_true=y_train, y_pred=clf.predict(X_train)))
print('Accuracy Score on the test data: ', accuracy_score(y_true=y_test, y_pred=clf.predict(X_test)))


Accuracy Score on train data:  0.9553571428571429
Accuracy Score on the test data:  0.9736842105263158


In [None]:
import numpy as np

class DecisionTree:
    """Placeholder for the Decision Tree class used by the random forest.

    Only the constructor is defined here; ``fit`` and ``predict`` are not part
    of this stub.

    NOTE(review): if this cell runs in the same kernel as the full
    ``DecisionTree`` implementation defined earlier in the notebook, this
    definition replaces it, and any code calling ``fit``/``predict`` on a tree
    will fail until the remaining methods are supplied.
    """
    # Implementation of Decision Tree class

    def __init__(self, max_depth=None):
        # Depth limit as given by the caller; the root starts out unset.
        self.max_depth, self.root = max_depth, None

    # Rest of the Decision Tree implementation...


class RandomForest:
    """Bagged ensemble of ``DecisionTree`` classifiers with per-tree feature subsampling.

    Each tree is trained on a bootstrap sample of the rows and on a random
    subset of ``max(1, floor(sqrt(n_features)))`` columns. Predictions are
    combined by majority vote.

    Parameters
    ----------
    num_trees : int, default=100
        Number of trees in the forest.
    max_depth : int or None, default=None
        Passed through to every ``DecisionTree``.
    random_state : int or None, default=None
        Seed for the row and column sampling. With ``None`` the global
        ``np.random`` state is used, as in the unseeded behaviour.

    Attributes
    ----------
    trees : list
        The fitted trees.
    feature_indices : list of ndarray
        For every tree, the column indices of ``X`` it was trained on.
    """

    def __init__(self, num_trees=100, max_depth=None, random_state=None):
        self.num_trees = num_trees
        self.max_depth = max_depth
        self.random_state = random_state
        self.trees = []
        self.feature_indices = []

    @staticmethod
    def _majority_vote(votes):
        """Return the most frequent value in ``votes`` (ties -> smallest value)."""
        values, counts = np.unique(votes, return_counts=True)
        return values[np.argmax(counts)]

    def fit(self, X, y):
        """Train ``num_trees`` trees, replacing any previously fitted forest.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
        y : array-like of shape (n_samples,)

        Raises
        ------
        ValueError
            If ``X`` is not a non-empty 2-D array or ``y`` does not provide one
            label per row of ``X``.
        """
        X = np.asarray(X)
        y = np.asarray(y)
        if X.ndim != 2 or X.shape[0] == 0 or X.shape[1] == 0:
            raise ValueError("X must be a non-empty 2-D array of shape (n_samples, n_features).")
        if y.ndim != 1 or y.shape[0] != X.shape[0]:
            raise ValueError("y must be a 1-D array with one label per row of X.")

        num_samples, num_features = X.shape
        # At least one feature per tree, even when sqrt() would round down to 0.
        num_selected = max(1, int(np.sqrt(num_features)))

        # np.random (module) and RandomState both expose .choice().
        rng = np.random if self.random_state is None else np.random.RandomState(self.random_state)

        # Build into locals and assign at the end, so that refitting replaces
        # the forest instead of appending to it.
        trees = []
        feature_indices = []
        for _ in range(self.num_trees):
            # Randomly select a subset of samples with replacement
            indices = rng.choice(num_samples, num_samples, replace=True)

            # Randomly select a subset of features
            selected_features = rng.choice(num_features, num_selected, replace=False)

            X_subset = X[indices][:, selected_features]
            y_subset = y[indices]

            # Create and train a decision tree on the subset of data
            tree = DecisionTree(max_depth=self.max_depth)
            tree.fit(X_subset, y_subset)

            # Remember which columns this tree saw; predict() must present the
            # same columns, because the tree's feature indices refer to them.
            trees.append(tree)
            feature_indices.append(selected_features)

        self.trees = trees
        self.feature_indices = feature_indices

    def predict(self, X):
        """Return the majority-vote label for every row of ``X``.

        Raises
        ------
        RuntimeError
            If the forest has not been fitted yet.
        """
        if not self.trees:
            raise RuntimeError("RandomForest must be fitted before calling predict().")

        X = np.asarray(X)
        # Each tree only receives the columns it was trained on.
        predictions = np.array([
            tree.predict(X[:, features])
            for tree, features in zip(self.trees, self.feature_indices)
        ])

        # Rows of `predictions` are trees, columns are samples: vote per sample.
        return np.array([self._majority_vote(votes) for votes in predictions.T])


In [1]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Load the Iris dataset
# (X and y are rebound here, replacing the arrays loaded earlier in the notebook.)
iris = load_iris()
X = iris.data
y = iris.target

# Split the dataset into training and testing sets
# (test_size=0.2 / random_state=42 — a different split from the earlier
# test_size=0.25 / random_state=47 one, whose variables are overwritten.)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a Random Forest classifier with 100 trees and a fixed seed
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the classifier on the training split
rf_classifier.fit(X_train, y_train)

# Make predictions on the test set
y_pred = rf_classifier.predict(X_test)

# Evaluate the accuracy of the classifier on the test set
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 1.0
