In [3]:
from sklearn import datasets
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

# Load the iris dataset shipped with scikit-learn
iris = datasets.load_iris()

# Build a DataFrame with one column per feature, named after iris.feature_names
iris_df = pd.DataFrame(data=iris.data, columns=iris.feature_names)

# Attach the numeric label and the matching class name for readability
iris_df['target'] = iris.target
iris_df['class'] = iris.target_names[iris.target]

# Show the first rows of the DataFrame
print(iris_df.head())

# One seed shared by every stochastic step, so that a fresh
# Restart & Run All reproduces the same split, tree and scores.
SEED = 42

# Split the data into a training set (80%) and a test set (20%).
# Both label columns are dropped from X so the model cannot see the answer.
X = iris_df.drop(['target', 'class'], axis=1)
Y = iris_df['target']
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=SEED)

# Initialize a Decision Tree classifier.
# random_state is fixed because scikit-learn randomly permutes the features
# at each split; without a seed the fitted tree (and therefore the reported
# accuracy) can differ from one run to the next.
dt = DecisionTreeClassifier(random_state=SEED)

# Fit the model to the training data
dt.fit(X_train, Y_train)

# Predict the labels of the held-out test set
Y_pred = dt.predict(X_test)

# Print the accuracy of the model on the test set
print('Accuracy: ', accuracy_score(Y_test, Y_pred))


   sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)  \
0                5.1               3.5                1.4               0.2   
1                4.9               3.0                1.4               0.2   
2                4.7               3.2                1.3               0.2   
3                4.6               3.1                1.5               0.2   
4                5.0               3.6                1.4               0.2   

   target   class  
0       0  setosa  
1       0  setosa  
2       0  setosa  
3       0  setosa  
4       0  setosa  
Accuracy:  1.0


The accuracy score is perfect. A perfect score on a small test set (30 samples) might indicate overfitting, or simply a favourable split, so let's check with cross-validation.

In [4]:
from sklearn.model_selection import cross_val_score

# Score the decision tree with 5-fold cross-validation over the full
# feature matrix X and label vector Y (one accuracy value per fold).
scores = cross_val_score(estimator=dt, X=X, y=Y, cv=5)

# Report the per-fold accuracies
print('Scores: ', scores)

# Report the average accuracy across the 5 folds
mean_score = scores.mean()
print('Mean score: ', mean_score)


Scores:  [0.96666667 0.96666667 0.9        0.96666667 1.        ]
Mean score:  0.9600000000000002


The 5-fold cross-validation result indicates that the model's performance is quite good, with an average accuracy of 96% (individual folds range from 90% to 100%). While this is slightly lower than the perfect score (100%) we got on the single test split, it is still very high. This suggests that the Decision Tree classifier is doing a good job of classifying the Iris species and that the perfect test score was not just the result of overfitting to the data.