In [16]:
import pandas as pd

# Load the data from the CSV file into a DataFrame using the pandas library
data = pd.read_csv("mushrooms.csv")

In [17]:
from numpy import ndarray

def prepare_data_sets(number_of_used_attributes: int) -> tuple[ndarray, ndarray]:
    """Build the attribute matrix and the label vector from the global ``data`` frame.

    Column 0 of ``data`` is used as the label. Columns ``1`` up to (but not
    including) ``number_of_used_attributes`` are used as attributes, so the
    returned matrix has at most ``number_of_used_attributes - 1`` columns.

    Both arrays are copies. The attribute matrix is later encoded in place, and
    writing through a view of ``data.values`` could either modify ``data``
    itself or fail when pandas hands out a read-only array.

    Args:
        number_of_used_attributes (int): exclusive upper column index of the
            attribute slice taken from ``data``.

    Returns:
        tuple[ndarray, ndarray]: ``(dataset, target)`` - the attribute matrix
        and the label vector.

    Raises:
        Exception: any error raised while slicing is reported on standard
            output and re-raised, so callers never receive ``None`` in place
            of the two arrays.
    """
    try:
        values = data.values
        # Copy so that later in-place edits never reach the source frame.
        dataset = values[:, 1:number_of_used_attributes].copy()
        target = values[:, 0].copy()
        return dataset, target
    except Exception as error:
        print(f"An unexpected error: {error} while preparing data sets.")
        raise

In [18]:
from sklearn import preprocessing

def convert_labels_and_attributes_to_numeric_values(number_of_used_attributes: int, dataset: ndarray, target: ndarray) -> tuple[ndarray, ndarray]:
    """Encode the labels and the attribute columns as integers using LabelEncoder.

    The attribute columns are encoded in place, so ``dataset`` itself is
    modified. ``target`` is left untouched; its encoded version is only
    available through the return value.

    Args:
        number_of_used_attributes (int): the value that was used to build
            ``dataset``. The first ``number_of_used_attributes - 1`` columns
            are encoded, limited to the columns ``dataset`` really has.
        dataset (ndarray): two-dimensional attribute array, encoded in place.
        target (ndarray): label array.

    Returns:
        tuple[ndarray, ndarray]: the same ``dataset`` object and a new array
        holding the encoded labels.

    Raises:
        Exception: any encoding error is reported on standard output and
            re-raised.
    """
    try:
        encoded_target = preprocessing.LabelEncoder().fit_transform(target)
        # Never index past the last real column, even if the requested count is too large.
        column_count = min(number_of_used_attributes - 1, dataset.shape[1])
        for column in range(column_count):
            # A fresh encoder per column: every column has its own, unrelated set of codes.
            dataset[:, column] = preprocessing.LabelEncoder().fit_transform(dataset[:, column])

        return dataset, encoded_target
    except Exception as error:
        print(f"An unexpected error: {error} while converting labels to numeric values.")
        raise

In [19]:
from sklearn.model_selection import train_test_split

def splits_data_for_testing(_test_size: float, dataset: ndarray, target: ndarray, random_state=None) -> tuple:
    """Split the data into a training set and a test set.

    Args:
        _test_size (float): forwarded to ``train_test_split`` as ``test_size``,
            i.e. the proportion of the samples that goes into the test split.
        dataset (ndarray): attribute array.
        target (ndarray): label array; it is reshaped to a single column
            before splitting.
        random_state: optional seed forwarded to ``train_test_split``. The
            default ``None`` keeps the split random; pass an integer to get
            the same split on every run.

    Returns:
        tuple: ``(training_data, testing_data, training_target, testing_target)``.

    Raises:
        Exception: any error raised while splitting is reported on standard
            output and re-raised, so callers never receive ``None`` in place
            of the four arrays.
    """
    try:
        # Labels are handed over as a column vector, as this function always did.
        label_column = target.reshape(-1, 1)
        training_data, testing_data, training_target, testing_target = train_test_split(
            dataset, label_column, test_size=_test_size, random_state=random_state
        )
        return training_data, testing_data, training_target, testing_target
    except Exception as error:
        print(f"An unexpected error: {error} while splitting data for testing.")
        raise

In [20]:
from sklearn.tree import DecisionTreeClassifier

def train_decision_tree_model(_max_depth: int, training_data: ndarray, training_target: ndarray, testing_data: ndarray, criterion: str = 'entropy', random_state=None) -> "tuple[ndarray, DecisionTreeClassifier]":
    """Create a decision tree, fit it and predict the test samples.

    Args:
        _max_depth (int): max depth of the decision tree.
        training_data (ndarray): training data.
        training_target (ndarray): training target.
        testing_data (ndarray): testing data.
        criterion (str): split quality criterion passed to
            ``DecisionTreeClassifier`` (``'entropy'`` by default).
        random_state: optional seed passed to ``DecisionTreeClassifier``. The
            default ``None`` leaves the classifier unseeded; pass an integer
            to get the same tree on every run.

    Returns:
        tuple: ``(predictions, decision_tree)`` - the predicted class for each
        sample of ``testing_data`` and the fitted classifier.

    Raises:
        Exception: any error raised while fitting or predicting is reported on
            standard output and re-raised, so callers never receive ``None``
            in place of the result pair.
    """
    try:
        decision_tree = DecisionTreeClassifier(criterion=criterion, max_depth=_max_depth, random_state=random_state)
        decision_tree.fit(training_data, training_target)
        predictions = decision_tree.predict(testing_data)
        return predictions, decision_tree
    except Exception as error:
        print(f"An unexpected error: {error} while training decision tree model.")
        raise

In [21]:
from sklearn.metrics import confusion_matrix

def print_confusion_matrix(decision_predict_tree_array: ndarray, testing_target: ndarray) -> None:
    """Show the confusion matrix of the predictions against the true labels.

    Problems are reported on standard output instead of being raised.

    Args:
        decision_predict_tree_array (ndarray): predicted class of every test sample.
        testing_target (ndarray): true class of every test sample.
    """
    try:
        print("Confusion Matrix:")
        # True labels go first, predictions second.
        matrix = confusion_matrix(testing_target, decision_predict_tree_array)
        print(f"{matrix}")
    except Exception as error:
        failure_note = f"An unexpected error: {error} while printing confusion matrix."
        print(failure_note)

In [22]:
from sklearn.metrics import accuracy_score

def print_accuracy_score(decision_predict_tree_array: ndarray, testing_target: ndarray) -> None:
    """Show the accuracy of the predictions against the true labels.

    Problems are reported on standard output instead of being raised.

    Args:
        decision_predict_tree_array (ndarray): predicted class of every test sample.
        testing_target (ndarray): true class of every test sample.
    """
    try:
        print("Accuracy score:")
        # True labels go first, predictions second.
        score = accuracy_score(testing_target, decision_predict_tree_array)
        print(score)
    except Exception as error:
        failure_note = f"An unexpected error: {error} while printing accuracy score."
        print(failure_note)

In [23]:
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.tree import DecisionTreeClassifier
from numpy import ndarray

def print_cross_val_score_with_grid_search(decision_tree: DecisionTreeClassifier, param_grid: dict, dataset: ndarray, target: ndarray) -> None:
    """Run a grid search with 10-fold cross-validation and print its outcome.

    Prints the best parameter combination followed by the mean test score of
    every combination in the grid. Problems are reported on standard output
    instead of being raised.

    Args:
        decision_tree (DecisionTreeClassifier): estimator handed to the grid search.
        param_grid (dict): hyperparameter names mapped to the values to try.
        dataset (ndarray): attribute array the search is fitted on.
        target (ndarray): label array the search is fitted on.
    """
    try:
        # Every candidate from the grid is evaluated with cv=10.
        search = GridSearchCV(decision_tree, param_grid, cv=10)
        search.fit(dataset, target)

        best_parameters = search.best_params_
        print("Best Parameters: ", best_parameters)

        print("Cross Validation Scores:")
        mean_scores = search.cv_results_['mean_test_score']
        print(mean_scores)
    except Exception as error:
        failure_note = f"An unexpected error occurred: {error} while printing cross-validation scores with grid search."
        print(failure_note)


In [24]:
def print_cross_val_score(decision_tree: DecisionTreeClassifier, dataset: ndarray, target: ndarray) -> None:
    """Print the scores of a cross-validation run with ``cv=10``.

    Problems are reported on standard output instead of being raised.

    Args:
        decision_tree (DecisionTreeClassifier): estimator to evaluate.
        dataset (ndarray): attribute array used for the evaluation.
        target (ndarray): label array used for the evaluation.
    """
    try:
        print("Cross Validation Scores:")
        # The scores are computed on the complete data set passed in.
        fold_scores = cross_val_score(decision_tree, dataset, target, cv=10)
        print(fold_scores)
    except Exception as error:
        failure_note = f"An unexpected error occurred: {error} while printing cross-validation scores."
        print(failure_note)

In [25]:
def test_data_accuracy_in_case_test_size(number_of_used_attributes: int, tree_depth: int, criterion: str = 'entropy') -> None:
    """Tests data accuracy in case max depth tree.

    Args:
        number_of_used_attributes (int): number of attributes from we'll create datasets.
        tree_depth (int): max depth of the decision tree.
    """
    try:
        dataset, target = prepare_data_sets(number_of_used_attributes)
        convert_labels_and_attributes_to_numeric_values(number_of_used_attributes, dataset, target)
        
        for _ in range(8):
            training_data, testing_data, training_target, testing_target = \
                splits_data_for_testing(0.1 * (_ + 1), dataset, target)
                
            print(f"\nTree max depth: {tree_depth}, test_size: {0.1 * (_ + 1)}, number of used attributes: {number_of_used_attributes}:")
            decision_predict_tree_array, tree = train_decision_tree_model(tree_depth - _, training_data, training_target, testing_data, criterion)
            print_confusion_matrix(decision_predict_tree_array, testing_target)
            # # To jest w przypadku gdy chcemy sprawdzić jakie parametry są najlepsze dla naszego modelu
            # param_grid = {
            #     'criterion': ['gini', 'entropy'],
            #     'max_depth': [None, 5, 10, 15, 20],
            #     'min_samples_split': [2, 5, 10, 15],
            #     'min_samples_leaf': [1, 2, 4, 8],
            #     'max_features': [None, 'sqrt', 'log2'],
            #     'splitter': ['best', 'random']
            # }
            # print_cross_val_score_with_grid_search(tree, param_grid, dataset, target)
            
            print_cross_val_score(tree, dataset, target)
    except Exception as error:
        print(f"An unexpected error: {error} while testing data accuracy in case of max tree depth.")

In [26]:
def test_data_accuracy_in_case_max_depth_tree(number_of_used_attributes: int, _max_depth: int, _test_size: float, criterion: str = 'entropy') -> None:
    """Tests data accuracy in case max depth tree.

    Args:
        number_of_used_attributes (int): number of attributes from we'll create datasets.
        _max_depth (int): max depth of the decision tree.
        _test_size (float): test_size.
    """
    try:
        dataset, target = prepare_data_sets(number_of_used_attributes)
        convert_labels_and_attributes_to_numeric_values(number_of_used_attributes, dataset, target)
        
        training_data, testing_data, training_target, testing_target = \
            splits_data_for_testing(_test_size, dataset, target)
            
        for _ in range(_max_depth):
            print(f"\nTree max depth: {_max_depth - _}, test_size: {_test_size}, number of used attributes: {number_of_used_attributes}:")
            decision_predict_tree_array, tree = train_decision_tree_model(_max_depth - _, training_data, training_target, testing_data, criterion)
            print_confusion_matrix(decision_predict_tree_array, testing_target)
            print_accuracy_score(decision_predict_tree_array, testing_target)
            
            # To jest w przypadku gdy chcemy sprawdzić jakie parametry są najlepsze dla naszego modelu
            # param_grid = {
            #     'criterion': ['gini', 'entropy'],
            #     'max_depth': [None, 5, 10, 15, 20],
            #     'min_samples_split': [2, 5, 10, 15],
            #     'min_samples_leaf': [1, 2, 4, 8],
            #     'max_features': [None, 'sqrt', 'log2'],
            #     'splitter': ['best', 'random']
            # }
            # print_cross_val_score_with_grid_search(tree, param_grid, dataset, target)
            
            # To po prostu wyświetla wyniki cross validation score
            print_cross_val_score(tree, dataset, target)
    except Exception as error:
        print(f"An unexpected error: {error} while testing data accuracy in case of max tree depth.")

In [27]:
def test_data_accuracy_in_case_number_of_attributes(number_of_used_attributes: int, _max_depth: int, _test_size: float, criterion: str = 'entropy') -> None:
    """Tests data accuracy in case max depth tree.

    Args:
        number_of_used_attributes (int): number of attributes from we'll create datasets.
        _max_depth (int): max depth of the decision tree.
        _test_size (float): test_size.
    """
    try:
        for _ in range(number_of_used_attributes - 1):
            print(f"\nTree max depth: {_max_depth}, test_size: {_test_size}, number of used attributes: {_ + 2}:")
            
            dataset, target = prepare_data_sets(_ + 2)
            convert_labels_and_attributes_to_numeric_values(_ + 2, dataset, target)
            training_data, testing_data, training_target, testing_target = \
                splits_data_for_testing(_test_size, dataset, target)
                
            decision_predict_tree_array, tree = train_decision_tree_model(_max_depth, training_data, training_target, testing_data, criterion)
            print_confusion_matrix(decision_predict_tree_array, testing_target)
            print_accuracy_score(decision_predict_tree_array, testing_target)
            # To jest w przypadku gdy chcemy sprawdzić jakie parametry są najlepsze dla naszego modelu
            
            # param_grid = {
            #     'criterion': ['gini', 'entropy'],
            #     'max_depth': [None, 5, 10, 15, 20],
            #     'min_samples_split': [2, 5, 10, 15],
            #     'min_samples_leaf': [1, 2, 4, 8],
            #     'max_features': [None, 'sqrt', 'log2'],
            #     'splitter': ['best', 'random']
            # }
            # print_cross_val_score_with_grid_search(tree, param_grid, dataset, target)
            
            # To po prostu wyświetla wyniki cross validation score
            print_cross_val_score(tree, dataset, target)
    except Exception as error:
        print(f"An unexpected error: {error} while testing data accuracy in case of max tree depth.")

In [28]:
def test_data_accuracy(number_of_used_attributes: int, _test_size: float, _max_depth: int, test_type: str, criterion: str = 'entropy') -> None:
    """Test data accuracy in case of some variables.

    Args:
        number_of_used_attributes (int): number of attributes from we'll create datasets.
        _test_size (float): proportion of the dataset to include in the train split.
        _max_depth (int): max depth of the decision tree.
        test_type (str): test type in case of: test size or tree max depth or number of used attributes.
    """
    try:
        if test_type == "max_depth":
            test_data_accuracy_in_case_max_depth_tree(number_of_used_attributes, _max_depth, _test_size, criterion)
        elif test_type == "test_size":
            test_data_accuracy_in_case_test_size(number_of_used_attributes, _max_depth, criterion)
        elif test_type == "number_of_attributes":
            test_data_accuracy_in_case_number_of_attributes(number_of_used_attributes, _max_depth, _test_size, criterion)
    except IndexError as index_error:
        print(f"IndexError: {index_error} while testing data.")
    except Exception as error:
        print(f"An unexpected error: {error} while testing data.")

In [29]:
def test_best_fit_tree(number_of_used_attributes: int, _test_size: float, _max_depth: int) -> None:
    """Test best fit tree.

    Args:
        number_of_used_attributes (int): number of attributes from we'll create datasets.
        _test_size (float): proportion of the dataset to include in the train split.
        _max_depth (int): max depth of the decision tree.
    """
    try:
        dataset, target = prepare_data_sets(number_of_used_attributes)
        convert_labels_and_attributes_to_numeric_values(number_of_used_attributes, dataset, target)
        
        training_data, testing_data, training_target, testing_target = \
            splits_data_for_testing(_test_size, dataset, target)
            
        decision_predict_tree_array, tree = train_decision_tree_model(_max_depth, training_data, training_target, testing_data)
        print_confusion_matrix(decision_predict_tree_array, testing_target)
        print_accuracy_score(decision_predict_tree_array, testing_target)
        # To jest w przypadku gdy chcemy sprawdzić jakie parametry są najlepsze dla naszego modelu
        param_grid = {
            'criterion': ['gini', 'entropy'],
            'max_depth': [None, 5, 10, 15, 20],
            'min_samples_split': [2, 5, 10, 15],
            'min_samples_leaf': [1, 2, 4, 8],
            'max_features': [None, 'sqrt', 'log2'],
            'splitter': ['best', 'random']
        }
        print_cross_val_score_with_grid_search(tree, param_grid, dataset, target)
    except IndexError as index_error:
        print(f"IndexError: {index_error} while testing data.")
    except Exception as error:
        print(f"An unexpected error: {error} while testing data.")

In [30]:
# Depth experiment: for each split criterion, evaluate trees from max depth 8 down to 1,
# with number_of_used_attributes=22 and a test size of 0.1.
for criterion in ['gini', 'entropy']:
    print(f"\nCriterion: {criterion}")
    test_data_accuracy(22, 0.1, 8, "max_depth", criterion)



Criterion: gini

Tree max depth: 8, test_size: 0.1, number of used attributes: 22:
Confusion Matrix:
[[431   0]
 [  0 382]]
Accuracy score:
1.0
Cross Validation Scores:
[0.68511685 0.9397294  1.         1.         1.         1.
 1.         1.         0.91133005 1.        ]

Tree max depth: 7, test_size: 0.1, number of used attributes: 22:
Confusion Matrix:
[[431   0]
 [  0 382]]
Accuracy score:
1.0
Cross Validation Scores:
[0.68511685 0.9397294  1.         1.         1.         1.
 1.         1.         0.89778325 1.        ]

Tree max depth: 6, test_size: 0.1, number of used attributes: 22:
Confusion Matrix:
[[431   0]
 [  1 381]]
Accuracy score:
0.998769987699877
Cross Validation Scores:
[0.68511685 0.88068881 0.99753998 0.99261993 0.96921182 0.97167488
 1.         1.         0.8226601  1.        ]

Tree max depth: 5, test_size: 0.1, number of used attributes: 22:
Confusion Matrix:
[[426   5]
 [ 16 366]]
Accuracy score:
0.974169741697417
Cross Validation Scores:
[0.68511685 0.880688

In [31]:
# Test-size experiment: for each split criterion, evaluate test sizes from 0.1 to 0.8,
# with number_of_used_attributes=22 and tree_depth=8.
# The 0.1 argument is not forwarded in this mode; the experiment generates its own test sizes.
for criterion in ['gini', 'entropy']:
    print(f"\nCriterion: {criterion}")
    test_data_accuracy(22, 0.1, 8, "test_size", criterion)


Criterion: gini

Tree max depth: 8, test_size: 0.1, number of used attributes: 22:
Confusion Matrix:
[[417   0]
 [  0 396]]
Cross Validation Scores:
[0.68511685 0.9397294  1.         1.         1.         1.
 1.         1.         0.8817734  1.        ]

Tree max depth: 8, test_size: 0.2, number of used attributes: 22:
Confusion Matrix:
[[841   0]
 [  0 784]]
Cross Validation Scores:
[0.68511685 0.9397294  1.         1.         1.         1.
 1.         1.         0.8682266  1.        ]

Tree max depth: 8, test_size: 0.30000000000000004, number of used attributes: 22:
Confusion Matrix:
[[1256    0]
 [  21 1161]]
Cross Validation Scores:
[0.68511685 0.9397294  0.99753998 0.99261993 0.96921182 0.97167488
 1.         1.         0.86699507 1.        ]

Tree max depth: 8, test_size: 0.4, number of used attributes: 22:
Confusion Matrix:
[[1633   38]
 [  18 1561]]
Cross Validation Scores:
[0.68511685 0.88068881 0.99507995 0.99138991 0.96551724 0.96305419
 1.         1.         0.86699507 1. 

In [32]:
# Attribute-count experiment: for each split criterion, let number_of_used_attributes
# run from 2 to 22, with max depth 8 and a test size of 0.1.
for criterion in ['gini', 'entropy']:
    print(f"\nCriterion: {criterion}")
    test_data_accuracy(22, 0.1, 8, "number_of_attributes", criterion)


Criterion: gini

Tree max depth: 8, test_size: 0.1, number of used attributes: 2:
Confusion Matrix:
[[412  26]
 [316  59]]
Accuracy score:
0.5793357933579336
Cross Validation Scores:
[0.51783518 0.51783518 0.51906519 0.51906519 0.52586207 0.52463054
 0.48891626 0.37561576 0.62561576 0.77832512]

Tree max depth: 8, test_size: 0.1, number of used attributes: 3:
Confusion Matrix:
[[260 161]
 [156 236]]
Accuracy score:
0.6100861008610086
Cross Validation Scores:
[0.63345633 0.45141451 0.46125461 0.43788438 0.70073892 0.70812808
 0.74384236 0.53694581 0.59605911 0.70320197]

Tree max depth: 8, test_size: 0.1, number of used attributes: 4:
Confusion Matrix:
[[327 113]
 [104 269]]
Accuracy score:
0.7330873308733087
Cross Validation Scores:
[0.50922509 0.52152522 0.59532595 0.59532595 0.64039409 0.62561576
 0.62315271 0.54064039 0.64039409 0.75      ]

Tree max depth: 8, test_size: 0.1, number of used attributes: 5:
Confusion Matrix:
[[354  74]
 [ 55 330]]
Accuracy score:
0.8413284132841329
C

In [33]:
# Uncomment to train a single tree and run the hyperparameter grid search:
# test_best_fit_tree(22, 0.1, 8)