In [78]:
from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split, RandomizedSearchCV, ShuffleSplit
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from scipy.stats import randint
import numpy as np

# Step 1: Load wine dataset (feature matrix in .data, class labels in .target)
wine = load_wine()
X, y = wine.data, wine.target

# Step 2: Split the dataset into train and test
# 20% of the samples are held out for testing; the fixed seed keeps the
# split identical across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Step 3: Hyperparameter tune the Decision Tree using RandomizedSearchCV
# The base estimator is seeded: a tree that considers only a subset of the
# features at each split picks that subset at random, so without a seed the
# cross-validation scores and the selected model change from run to run.
dt_classifier = DecisionTreeClassifier(random_state=42)
param_dist = {
    'criterion': ['gini', 'entropy'],
    # scipy's randint(low, high) samples integers in [low, high), so the
    # upper bounds below are exclusive.
    'max_depth': randint(1, 10),
    'min_samples_split': randint(2, 20),
    'min_samples_leaf': randint(1, 20),
    # 'auto' is intentionally absent: it was deprecated in scikit-learn 1.1
    # and removed in 1.3, where it makes every candidate that samples it fail
    # parameter validation. For classifiers it was only an alias of 'sqrt'.
    # None (use all features) is offered as a genuine third option instead.
    'max_features': ['sqrt', 'log2', None],
}

# 100 random candidates, each scored by 5-fold cross-validated accuracy on the
# training set only; the test set is not touched during the search.
random_search = RandomizedSearchCV(
    dt_classifier,
    param_distributions=param_dist,
    n_iter=100,
    cv=5,
    scoring='accuracy',
    random_state=42,
)
random_search.fit(X_train, y_train)

# Get the best hyperparameters
best_params = random_search.best_params_
print("Best Hyperparameters:", best_params)

# Use the best model for predictions
best_dt_model = random_search.best_estimator_
y_pred_dt = best_dt_model.predict(X_test)

# Evaluate the Decision Tree model on the held-out test set
accuracy_dt = accuracy_score(y_test, y_pred_dt)
print("Accuracy of Decision Tree:", accuracy_dt)

# Step 4: Grow a Random Forest
# Create 10 subsets using ShuffleSplit. Each split keeps a random 80% of the
# training rows (test_size=0.2); the held-out 20% of each split is ignored.
ss = ShuffleSplit(n_splits=10, test_size=0.2, random_state=42)

# Train 1 Decision Tree on each subset
forest = []
for seed, (train_index, _) in enumerate(ss.split(X_train)):
    subset_X_train, subset_y_train = X_train[train_index], y_train[train_index]
    # Give every tree its own fixed seed so the forest is reproducible while
    # the trees still differ from one another. Merging into a dict (rather
    # than passing random_state as an extra keyword) also avoids a duplicate
    # keyword error should best_params ever contain 'random_state'.
    tree_params = {**best_params, 'random_state': seed}
    tree = DecisionTreeClassifier(**tree_params)
    tree.fit(subset_X_train, subset_y_train)
    forest.append(tree)

# Evaluate all the trees on the test dataset
# forest_predictions has one row per tree and one column per test sample.
forest_predictions = np.array([tree.predict(X_test) for tree in forest])
# Majority vote per test sample (column). np.bincount requires the labels to
# be non-negative integers; on a tie argmax returns the smallest label.
forest_predictions_majority_vote = np.apply_along_axis(
    lambda votes: np.bincount(votes).argmax(),
    axis=0,
    arr=forest_predictions,
)

# Evaluate the Random Forest model
accuracy_rf = accuracy_score(y_test, forest_predictions_majority_vote)
print("Accuracy of Random Forest:", accuracy_rf)


Best Hyperparameters: {'criterion': 'entropy', 'max_depth': 8, 'max_features': 'sqrt', 'min_samples_leaf': 3, 'min_samples_split': 2}
Accuracy of Decision Tree: 0.9166666666666666
Accuracy of Random Forest: 0.9722222222222222
