In [1]:
# Step 1: Import required libraries
import numpy as np
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

# Step 2: Custom Decision Tree class
class CustomDecisionTree:
    """Small decision-tree classifier that chooses splits by information gain.

    The fitted tree is stored in ``self.tree`` as nested dicts. A leaf is
    ``{'class': label}``; an internal node has the keys ``'feature_idx'``,
    ``'threshold'``, ``'left_tree'`` and ``'right_tree'``. Samples with
    ``x[feature_idx] <= threshold`` go to ``left_tree``; all others go to
    ``right_tree``.

    Class labels are counted with ``np.unique``, so they may be any sortable
    values (non-negative ints, negative ints, strings, ...).
    """

    def __init__(self, max_depth=None):
        self.max_depth = max_depth  # limit tree depth (None = no limit)
        self.tree = None            # store trained tree

    def fit(self, X, y):
        """Build the tree from training data.

        Args:
            X: 2-D array-like of shape (n_samples, n_features).
            y: array-like holding one class label per row of X.

        Returns:
            self, so that calls can be chained.

        Raises:
            ValueError: if X is not 2-D or contains no samples.
        """
        # Step 3: Train the model
        # Coerce first so plain lists work as well as ndarrays.
        X = np.asarray(X)
        y = np.asarray(y)
        if X.ndim != 2:
            raise ValueError("X must be 2-D with shape (n_samples, n_features)")
        if X.shape[0] == 0:
            raise ValueError("cannot fit a tree on zero samples")

        self.tree = self._build_tree(X, y)
        return self

    def _build_tree(self, X, y, depth=0):
        """Recursively grow the subtree for the samples ``(X, y)``.

        Args:
            X: 2-D ndarray with the samples that reached this node.
            y: ndarray with the labels of those samples.
            depth: depth of this node (the root is 0).

        Returns:
            A nested dict: either a leaf ``{'class': label}`` or an internal
            node with 'feature_idx', 'threshold', 'left_tree', 'right_tree'.
        """
        # Step 4: Get number of features and the distinct labels with counts
        num_features = X.shape[1]
        unique_classes, class_counts = np.unique(y, return_counts=True)

        # Step 5: Stop if only one class left
        if len(unique_classes) == 1:
            return {'class': unique_classes[0]}

        # Majority label of this node. np.unique returns the labels sorted and
        # argmax picks the first maximum, so ties go to the smallest label.
        majority_class = unique_classes[class_counts.argmax()]

        # Step 6: Stop if max depth reached
        if self.max_depth is not None and depth >= self.max_depth:
            return {'class': majority_class}

        # Start below zero so that a zero-gain split is still accepted.
        best_gain = -1
        best_split = None

        # Step 7: Try every feature and threshold
        for feature in range(num_features):
            column = X[:, feature]
            # The last (largest) unique value cannot separate anything, so skip it.
            thresholds = np.unique(column)[:-1]

            for threshold in thresholds:
                # Step 8: Split the data
                left = column <= threshold
                if not left.any() or left.all():
                    continue
                # Use the complement so every sample lands in exactly one
                # child, mirroring the `<=` / otherwise routing in _predict.
                right = ~left

                # Step 9: Calculate information gain
                gain = self._information_gain(y, y[left], y[right])

                # Step 10: Store best split
                if gain > best_gain:
                    best_gain = gain
                    best_split = (feature, threshold, left, right)

        # Step 11: If no usable split exists, return majority class
        if best_split is None:
            return {'class': majority_class}

        feature, threshold, left, right = best_split

        # Step 12: Recursively build left and right tree
        return {
            'feature_idx': feature,
            'threshold': threshold,
            'left_tree': self._build_tree(X[left], y[left], depth + 1),
            'right_tree': self._build_tree(X[right], y[right], depth + 1)
        }

    def _information_gain(self, parent, left, right):
        """Return the entropy reduction achieved by splitting parent into left/right.

        Args:
            parent: labels of the node before the split.
            left: labels routed to the left child.
            right: labels routed to the right child.

        Returns:
            Parent entropy minus the size-weighted entropies of the children,
            in bits.
        """
        # Step 13: Entropy formula
        def entropy(labels):
            # Only labels that actually occur are counted, so every
            # probability is > 0 and log2 needs no epsilon fudge.
            _, counts = np.unique(labels, return_counts=True)
            probs = counts / counts.sum()
            return -np.sum(probs * np.log2(probs))

        # Step 14: Calculate Information Gain
        total = len(parent)
        weighted_children = (
            (len(left) / total) * entropy(left)
            + (len(right) / total) * entropy(right)
        )
        return entropy(parent) - weighted_children

    def predict(self, X):
        """Predict a label for every row of X.

        Args:
            X: iterable of samples, each indexable by feature position.

        Returns:
            A list with one predicted label per sample.
        """
        # Step 15: Predict for all samples
        return [self._predict(x, self.tree) for x in X]

    def _predict(self, x, tree):
        """Walk ``tree`` from the given node down to a leaf for one sample.

        Args:
            x: a single sample, indexable by feature position.
            tree: node dict to start from (normally ``self.tree``).

        Returns:
            The label stored in the leaf that x reaches.
        """
        node = tree
        # Step 16/17: Descend left or right until a leaf node is reached
        while 'class' not in node:
            if x[node['feature_idx']] <= node['threshold']:
                node = node['left_tree']
            else:
                node = node['right_tree']
        return node['class']

In [2]:
# Step 18: Load the Iris dataset and pull out the feature matrix and labels
data = load_iris()
X, y = data.data, data.target

# Step 19: Hold out 20% of the samples for testing (seeded split)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [3]:
# Step 20: Build a depth-limited custom tree and fit it on the training split
custom_tree = CustomDecisionTree(max_depth=3)
custom_tree.fit(X_train, y_train)

# Step 21: Predict labels for the held-out samples
y_pred_custom = custom_tree.predict(X_test)

# Step 22: Score the predictions against the true test labels
accuracy_custom = accuracy_score(y_true=y_test, y_pred=y_pred_custom)
print("Custom Tree Accuracy:", accuracy_custom)


Custom Tree Accuracy: 1.0


In [4]:
# Step 23: Fit scikit-learn's tree with the same depth limit (fit returns the estimator)
sk_tree = DecisionTreeClassifier(max_depth=3, random_state=42).fit(X_train, y_train)

# Step 24: Predict labels for the held-out samples
y_pred_sklearn = sk_tree.predict(X_test)

# Step 25: Score the predictions against the true test labels
accuracy_sklearn = accuracy_score(y_true=y_test, y_pred=y_pred_sklearn)
print("Sklearn Tree Accuracy:", accuracy_sklearn)


Sklearn Tree Accuracy: 1.0


In [5]:
# Step 26: Print the two accuracies one after the other
for model_label, model_accuracy in (("Custom:", accuracy_custom), ("Sklearn:", accuracy_sklearn)):
    print(model_label, model_accuracy)


Custom: 1.0
Sklearn: 1.0


In [6]:
# Step 27: Import ensemble models
from sklearn.datasets import load_wine
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score


In [7]:
# Step 28: Load the Wine dataset and pull out the feature matrix and labels
wine = load_wine()
X, y = wine.data, wine.target

# Step 29: Hold out 20% of the samples for testing (seeded split)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [8]:
# Step 30: Fit a single decision tree (seeded)
dt = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)

# Step 31: Fit a random forest with the same seed
rf = RandomForestClassifier(random_state=42).fit(X_train, y_train)

# Step 32: Predict the held-out samples with both models
dt_preds, rf_preds = dt.predict(X_test), rf.predict(X_test)

# Step 33: Report the weighted F1 score of each model
print("Decision Tree F1:", f1_score(y_true=y_test, y_pred=dt_preds, average='weighted'))
print("Random Forest F1:", f1_score(y_true=y_test, y_pred=rf_preds, average='weighted'))


Decision Tree F1: 0.9439974457215836
Random Forest F1: 1.0


In [9]:
# Step 34: Import GridSearchCV
from sklearn.model_selection import GridSearchCV


In [10]:
# Step 35: Hyperparameter values the grid search will try exhaustively
param_grid = dict(
    n_estimators=[50, 100],
    max_depth=[None, 5, 10],
    min_samples_split=[2, 5],
)


In [11]:
# Step 36: Set up a 5-fold grid search over the forest, scored by weighted F1
grid = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid=param_grid,
    scoring='f1_weighted',
    cv=5,
)

# Step 37: Run the search on the training split
grid.fit(X_train, y_train)

# Step 38: Report the best parameter combination found
print("Best Parameters:", grid.best_params_)


Best Parameters: {'max_depth': None, 'min_samples_split': 2, 'n_estimators': 100}


In [12]:
# Step 39: Import regression models
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV


In [13]:
# Step 40: Create regression target (use alcohol as output)
# Read from `wine.data` rather than the global `X`: `X` is assigned for both
# Iris and Wine in this notebook, so slicing it here would silently use
# whichever dataset was loaded last if cells are run out of order.
assert wine.feature_names[0] == 'alcohol', "expected alcohol to be the first Wine feature"
y_reg = wine.data[:, 0]   # target: first column
X_reg = wine.data[:, 1:]  # predictors: every remaining column

# Step 41: Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X_reg, y_reg, test_size=0.2, random_state=42
)


In [14]:
# Step 42: Fit a single regression tree and a random forest (both seeded)
dt_reg = DecisionTreeRegressor(random_state=42).fit(X_train, y_train)
rf_reg = RandomForestRegressor(random_state=42).fit(X_train, y_train)

# Step 43: Report each model's R2 on the held-out split
print("Decision Tree R2:", dt_reg.score(X_test, y_test))
print("Random Forest R2:", rf_reg.score(X_test, y_test))


Decision Tree R2: 0.4774648003821349
Random Forest R2: 0.7416122628458712


In [15]:
# Step 44: Candidate values for the randomized hyperparameter search
param_dist = dict(
    n_estimators=[50, 100, 200],
    max_depth=[None, 5, 10],
    min_samples_split=[2, 5, 10],
)

# Sample 10 combinations, each evaluated with 5-fold CV (seeded sampling)
random_search = RandomizedSearchCV(
    estimator=RandomForestRegressor(random_state=42),
    param_distributions=param_dist,
    n_iter=10,
    cv=5,
    random_state=42,
)

# Step 45: Run the search on the training split
random_search.fit(X_train, y_train)

# Step 46: Report the best parameter combination found
print("Best Parameters:", random_search.best_params_)


Best Parameters: {'n_estimators': 200, 'min_samples_split': 10, 'max_depth': 5}
