In [1]:
import pandas as pd

# Load the data from the CSV file into a DataFrame object using the pandas library
data = pd.read_csv("mushrooms.csv")

In [2]:
from numpy import ndarray

def prepare_data_sets(number_of_used_attributes: int) -> tuple[ndarray, ndarray]:
    """Builds the feature matrix and the label vector from the global ``data`` frame.

    Column 0 of ``data`` is used as the label. The features are taken with the
    slice ``1:number_of_used_attributes``; the slice end is exclusive, so a
    value of ``n`` yields ``n - 1`` feature columns (and ``1`` yields none).

    Both returned arrays are independent copies, so the in-place label
    encoding applied to them afterwards never writes back into ``data``.

    Args:
        number_of_used_attributes (int): exclusive end index of the column
            slice taken from ``data``.

    Returns:
        tuple[ndarray, ndarray]: ``(dataset, target)``. If anything fails, the
        error is printed and ``None`` is returned instead.
    """
    try:
        values = data.values
        # ``.values`` may hand out a view of the frame's storage (or a read-only
        # array when pandas Copy-on-Write is active). Copy the slices so callers
        # can overwrite them without touching the global frame.
        dataset = values[:, 1:number_of_used_attributes].copy()
        target = values[:, 0].copy()
        return dataset, target
    except Exception as error:
        print(f"An unexpected error: {error} while preparing data sets.")
        return None

In [3]:
# attributes: int = 22  # maximum value that we can test.
# dataset, target = prepare_data_sets(attributes)

In [4]:
from sklearn import preprocessing

def convert_labels_and_attributes_to_numeric_values(number_of_used_attributes: int, dataset: ndarray, target: ndarray) -> tuple[ndarray, ndarray]:
    """Converts labels (edible/non-edible) and attributes to numeric values using LabelEncoder.

    The first ``number_of_used_attributes - 1`` columns of ``dataset`` are
    encoded column by column and written back into ``dataset`` itself, so the
    caller's array is modified in place. ``target`` is not modified; its
    encoded form is only available through the return value.

    Args:
        number_of_used_attributes (int): one more than the number of leading
            columns to encode. Values larger than the available column count
            are clamped to the columns that exist.
        dataset (ndarray): 2-D dataset array; overwritten in place.
        target (ndarray): target array.

    Returns:
        tuple[ndarray, ndarray]: ``(dataset, encoded_target)`` where ``dataset``
        is the same array object that was passed in. If anything fails, the
        error is printed and ``None`` is returned instead.
    """
    try:
        label_encoder = preprocessing.LabelEncoder()
        encoded_target = label_encoder.fit_transform(target)

        # Never index past the columns that are actually present.
        column_count = min(number_of_used_attributes - 1, dataset.shape[1])
        for column in range(column_count):
            # fit_transform refits the encoder, so every column gets its own mapping.
            dataset[:, column] = label_encoder.fit_transform(dataset[:, column])

        return dataset, encoded_target
    except Exception as error:
        print(f"An unexpected error: {error} while converting labels to numeric values.")
        return None

In [5]:
# convert_labels_and_attributes_to_numeric_values(attributes, dataset, target)

In [6]:
from sklearn.model_selection import train_test_split

def splits_data_for_testing(_test_size: float, dataset: ndarray, target: ndarray, random_state=None) -> tuple:
    """Splits the data into a training set and a test set.

    Args:
        _test_size (float): proportion of the dataset to include in the test split.
        dataset (ndarray): dataset array.
        target (ndarray): target array; it is reshaped to a single column
            before splitting.
        random_state (int | None): seed forwarded to ``train_test_split``.
            The default ``None`` keeps the original unseeded behaviour; pass
            an integer to make the split repeatable.

    Returns:
        tuple: ``(training_data, testing_data, training_target, testing_target)``.
        If anything fails, the error is printed and ``None`` is returned instead.
    """
    try:
        # The targets are handed over as an (n, 1) column, exactly as before.
        target_column = target.reshape(-1, 1)
        training_data, testing_data, training_target, testing_target = train_test_split(
            dataset,
            target_column,
            test_size=_test_size,
            random_state=random_state,
        )
        return training_data, testing_data, training_target, testing_target
    except Exception as error:
        print(f"An unexpected error: {error} while splitting data for testing.")
        return None

In [7]:
# training_data, testing_data, training_target, testing_target = \
#     splits_data_for_testing(0.1, dataset, target)

In [8]:
from sklearn.tree import DecisionTreeClassifier

def train_decision_tree_model(_max_depth: int, training_data: ndarray, training_target: ndarray, testing_data: ndarray) -> ndarray:
    """Fits an entropy-based decision tree and predicts the test samples.

    Args:
        _max_depth (int): maximum depth passed to the classifier.
        training_data (ndarray): samples used for fitting.
        training_target (ndarray): labels used for fitting.
        testing_data (ndarray): samples to predict.

    Returns:
        ndarray: predictions for ``testing_data``. When any step fails, the
        error is printed and ``None`` is returned.
    """
    try:
        classifier = DecisionTreeClassifier(max_depth=_max_depth, criterion='entropy')
        classifier.fit(training_data, training_target)
        predictions = classifier.predict(testing_data)
    except Exception as error:
        print(f"An unexpected error: {error} while training decision tree model.")
    else:
        return predictions

In [9]:
# decision_predict_tree_array = train_decision_tree_model(8, training_data, training_target, testing_data)

In [10]:
from sklearn.metrics import confusion_matrix

def print_confusion_matrix(decision_predict_tree_array: ndarray, testing_target: ndarray) -> None:
    """Writes a heading followed by the confusion matrix to stdout.

    The heading is printed before the matrix is computed, so it appears even
    when the computation fails; in that case the error is printed after it.

    Args:
        decision_predict_tree_array (ndarray): predicted classes.
        testing_target (ndarray): true classes of the test samples.
    """
    try:
        print("Confusion Matrix:")
        matrix = confusion_matrix(testing_target, decision_predict_tree_array)
        print(f"{matrix}")
    except Exception as error:
        print(f"An unexpected error: {error} while printing confusion matrix.")

In [11]:
from sklearn.metrics import accuracy_score

def print_accuracy_score(decision_predict_tree_array: ndarray, testing_target: ndarray) -> None:
    """Writes a heading followed by the accuracy score to stdout.

    The heading is printed before the score is computed, so it appears even
    when the computation fails; in that case the error is printed after it.

    Args:
        decision_predict_tree_array (ndarray): predicted classes.
        testing_target (ndarray): true classes of the test samples.
    """
    try:
        print("Accuracy score:")
        score = accuracy_score(testing_target, decision_predict_tree_array)
        print(score)
    except Exception as error:
        print(f"An unexpected error: {error} while printing accuracy score.")

In [12]:
def test_data_accuracy_in_case_test_size(number_of_used_attributes: int, tree_depth: int, ) -> None:
    """Tests data accuracy in case max depth tree.

    Args:
        number_of_used_attributes (int): number of attributes from we'll create datasets.
        tree_depth (int): max depth of the decision tree.
    """
    try:
        dataset, target = prepare_data_sets(number_of_used_attributes)
        convert_labels_and_attributes_to_numeric_values(number_of_used_attributes, dataset, target)
        
        for _ in range(8):
            training_data, testing_data, training_target, testing_target = \
                splits_data_for_testing(0.1 * (_ + 1), dataset, target)
                
            print(f"\nTree max depth: {tree_depth}, test_size: {0.1 * (_ + 1)}, number of used attributes: {number_of_used_attributes}:")
            decision_predict_tree_array = train_decision_tree_model(tree_depth - _, training_data, training_target, testing_data)
            print_confusion_matrix(decision_predict_tree_array, testing_target)
            print_accuracy_score(decision_predict_tree_array, testing_target)
    except Exception as error:
        print(f"An unexpected error: {error} while testing data accuracy in case of max tree depth.")

In [13]:
def test_data_accuracy_in_case_max_depth_tree(number_of_used_attributes: int, _max_depth: int, _test_size: float) -> None:
    """Tests data accuracy in case max depth tree.

    Args:
        number_of_used_attributes (int): number of attributes from we'll create datasets.
        _max_depth (int): max depth of the decision tree.
        _test_size (float): test_size.
    """
    try:
        dataset, target = prepare_data_sets(number_of_used_attributes)
        convert_labels_and_attributes_to_numeric_values(number_of_used_attributes, dataset, target)
        
        training_data, testing_data, training_target, testing_target = \
            splits_data_for_testing(_test_size, dataset, target)
            
        for _ in range(_max_depth):
            print(f"\nTree max depth: {_max_depth - _}, test_size: {_test_size}, number of used attributes: {number_of_used_attributes}:")
            decision_predict_tree_array = train_decision_tree_model(_max_depth - _, training_data, training_target, testing_data)
            print_confusion_matrix(decision_predict_tree_array, testing_target)
            print_accuracy_score(decision_predict_tree_array, testing_target)
    except Exception as error:
        print(f"An unexpected error: {error} while testing data accuracy in case of max tree depth.")

In [14]:
def test_data_accuracy_in_case_number_of_attributes(number_of_used_attributes: int, _max_depth: int, _test_size: float) -> None:
    """Tests data accuracy in case max depth tree.

    Args:
        number_of_used_attributes (int): number of attributes from we'll create datasets.
        _max_depth (int): max depth of the decision tree.
        _test_size (float): test_size.
    """
    try:
        for _ in range(number_of_used_attributes):
            print(f"\nTree max depth: {_max_depth}, test_size: {_test_size}, number of used attributes: {_ + 1}:")
            
            dataset, target = prepare_data_sets(_ + 1)
            convert_labels_and_attributes_to_numeric_values(_ + 1, dataset, target)
            training_data, testing_data, training_target, testing_target = \
                splits_data_for_testing(_test_size, dataset, target)
                
            decision_predict_tree_array = train_decision_tree_model(_max_depth, training_data, training_target, testing_data)
            print_confusion_matrix(decision_predict_tree_array, testing_target)
            print_accuracy_score(decision_predict_tree_array, testing_target)
    except Exception as error:
        print(f"An unexpected error: {error} while testing data accuracy in case of max tree depth.")

In [15]:
def test_data_accuracy(number_of_used_attributes: int, _test_size: float, _max_depth: int, test_type: str) -> None:
    """Test data accuracy in case of some variables.

    Args:
        number_of_used_attributes (int): number of attributes from we'll create datasets.
        _test_size (float): proportion of the dataset to include in the train split.
        _max_depth (int): max depth of the decision tree.
        test_type (str): test type in case of: test size or tree max depth or number of used attributes.
    """
    try:
        if test_type == "max_depth":
            test_data_accuracy_in_case_max_depth_tree(number_of_used_attributes, _max_depth, _test_size)
        elif test_type == "test_size":
            test_data_accuracy_in_case_test_size(number_of_used_attributes, _max_depth)
        elif test_type == "number_of_attributes":
            test_data_accuracy_in_case_number_of_attributes(number_of_used_attributes, _max_depth, _test_size)
    except IndexError as index_error:
        print(f"IndexError: {index_error} while testing data.")
    except Exception as error:
        print(f"An unexpected error: {error} while testing data.")

In [16]:
# Depth experiment: one split with test_size 0.1, then one tree per depth from 8 down to 1.
test_data_accuracy(22, 0.1, 8, "max_depth")



Tree max depth: 8, test_size: 0.1, number of used attributes: 22:
Confusion Matrix:
[[401   0]
 [  0 412]]
Accuracy score:
1.0

Tree max depth: 7, test_size: 0.1, number of used attributes: 22:
Confusion Matrix:
[[400   1]
 [  0 412]]
Accuracy score:
0.998769987699877

Tree max depth: 6, test_size: 0.1, number of used attributes: 22:
Confusion Matrix:
[[400   1]
 [  1 411]]
Accuracy score:
0.997539975399754

Tree max depth: 5, test_size: 0.1, number of used attributes: 22:
Confusion Matrix:
[[393   8]
 [  6 406]]
Accuracy score:
0.982779827798278

Tree max depth: 4, test_size: 0.1, number of used attributes: 22:
Confusion Matrix:
[[401   0]
 [ 37 375]]
Accuracy score:
0.9544895448954489

Tree max depth: 3, test_size: 0.1, number of used attributes: 22:
Confusion Matrix:
[[378  23]
 [ 10 402]]
Accuracy score:
0.959409594095941

Tree max depth: 2, test_size: 0.1, number of used attributes: 22:
Confusion Matrix:
[[389  12]
 [ 58 354]]
Accuracy score:
0.9138991389913899

Tree max depth: 1

In [17]:
# Test-size experiment: test_size is stepped from 0.1 to 0.8; the 0.1 passed here is not forwarded to it.
test_data_accuracy(22, 0.1, 8, "test_size")


Tree max depth: 8, test_size: 0.1, number of used attributes: 22:
Confusion Matrix:
[[438   0]
 [  0 375]]
Accuracy score:
1.0

Tree max depth: 8, test_size: 0.2, number of used attributes: 22:
Confusion Matrix:
[[810   2]
 [  0 813]]
Accuracy score:
0.9987692307692307

Tree max depth: 8, test_size: 0.30000000000000004, number of used attributes: 22:
Confusion Matrix:
[[1236    7]
 [   1 1194]]
Accuracy score:
0.9967186218211649

Tree max depth: 8, test_size: 0.4, number of used attributes: 22:
Confusion Matrix:
[[1663   17]
 [  36 1534]]
Accuracy score:
0.9836923076923076

Tree max depth: 8, test_size: 0.5, number of used attributes: 22:
Confusion Matrix:
[[2116    0]
 [ 185 1761]]
Accuracy score:
0.9544559330379123

Tree max depth: 8, test_size: 0.6000000000000001, number of used attributes: 22:
Confusion Matrix:
[[2433  142]
 [  73 2227]]
Accuracy score:
0.9558974358974359

Tree max depth: 8, test_size: 0.7000000000000001, number of used attributes: 22:
Confusion Matrix:
[[2883   6

In [18]:
# Attribute experiment: the reported number of used attributes is stepped from 1 to 22 (test_size 0.1, max depth 8).
test_data_accuracy(22, 0.1, 8, "number_of_attributes")


Tree max depth: 8, test_size: 0.1, number of used attributes: 1:
An unexpected error: Found array with 0 feature(s) (shape=(7311, 0)) while a minimum of 1 is required by DecisionTreeClassifier. while training decision tree model.
Confusion Matrix:
An unexpected error: The 'y_pred' parameter of confusion_matrix must be an array-like. Got None instead. while printing confusion matrix.
Accuracy score:
An unexpected error: The 'y_pred' parameter of accuracy_score must be an array-like or a sparse matrix. Got None instead. while printing accuracy score.

Tree max depth: 8, test_size: 0.1, number of used attributes: 2:
Confusion Matrix:
[[406  15]
 [329  63]]
Accuracy score:
0.5768757687576875

Tree max depth: 8, test_size: 0.1, number of used attributes: 3:
Confusion Matrix:
[[272 152]
 [146 243]]
Accuracy score:
0.6334563345633456

Tree max depth: 8, test_size: 0.1, number of used attributes: 4:
Confusion Matrix:
[[290 127]
 [124 272]]
Accuracy score:
0.6912669126691267

Tree max depth: 8