In [5]:
from sklearn.datasets import load_wine
import numpy as np

dataset = load_wine()
X = dataset["data"]
y = dataset["target"]
feature_names = dataset["feature_names"]

In [6]:
print(f"There are {X.shape[0]} available records")
print(f"Class subdivision: {np.unique(y, return_counts=True)}")

There are 178 available records
Class subdivision: (array([0, 1, 2]), array([59, 71, 48], dtype=int64))


In [58]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import ParameterGrid
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold

params ={ 
            "max_depth": [None, 2, 4, 6, 8, 10],
            "splitter": ["best", "random"],
            "min_samples_split": [0.1, 0.2, 0.3, 0.4],
            "min_samples_leaf": [0.02, 0.05, 0.1]
        }


X_train_valid, X_test, y_train_valid, y_test = train_test_split(X, y, test_size=0.2, random_state = 42)
kf = KFold(12)

In [66]:
config_scores = []
for config in ParameterGrid(params):
    clf = DecisionTreeClassifier(**config)
    acc_scores = []
    
    for train_index, valid_index in kf.split(X_train_valid):
        X_train = X_train_valid[train_index]
        y_train = y_train_valid[train_index]
        X_valid = X_train_valid[valid_index]
        y_valid = y_train_valid[valid_index]

        clf = clf.fit(X_train,y_train)
        y_pred = clf.predict(X_valid)
        accuracy = accuracy_score(y_valid,y_pred)
        acc_scores.append(accuracy)
     
    avg_accuracy = np.mean(acc_scores)
    config_scores.append((config, avg_accuracy))

best_config = max(config_scores, key = lambda x : x[1])[0]
best_clf = DecisionTreeClassifier(**best_config).fit(X_train_valid, y_train_valid)
best_accuracy = max(config_scores, key = lambda x : x[1])[1]

print(f"Best configuration: {best_config}")
print(f"Accuracy: {best_accuracy}")

Best configuration: {'max_depth': 8, 'min_samples_leaf': 0.02, 'min_samples_split': 0.1, 'splitter': 'random'}
Accuracy: 0.9166666666666666


In [67]:
y_pred_new_data = best_clf.predict(X_test)
accuracy_on_new_data = accuracy_score(y_test,y_pred_new_data)
print(f"Accuracy on new data: {accuracy_on_new_data}")

Accuracy on new data: 0.9166666666666666


In [68]:
import pydot
from IPython.display import Image
from sklearn.tree import export_graphviz

dot_code = export_graphviz(best_clf, feature_names = feature_names, rounded=True)
with open("dot_code.txt", "w") as f:
    f.write(dot_code)
