In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Fetch the Iris measurements through seaborn's dataset loader.
data = sns.load_dataset(name='iris')
# Preview the first five rows.
data.head(5)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [3]:
# Report the frame's dimensions as a (rows, columns) tuple.
frame_shape = data.shape
print("SHAPE: ", frame_shape)

# Visual divider between the two reports.
print('*******************************')

# Per-column flag: True when the column contains at least one missing value.
column_has_missing = data.isna().any()
print(column_has_missing)

SHAPE:  (150, 5)
*******************************
sepal_length    False
sepal_width     False
petal_length    False
petal_width     False
species         False
dtype: bool


In [4]:
# Share of missing entries per column, expressed as a percentage of all rows.
data.isna().sum().div(len(data)).mul(100)

sepal_length    0.0
sepal_width     0.0
petal_length    0.0
petal_width     0.0
species         0.0
dtype: float64

In [5]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
numeric_summary = data.describe()
numeric_summary

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
count,150.0,150.0,150.0,150.0
mean,5.843333,3.057333,3.758,1.199333
std,0.828066,0.435866,1.765298,0.762238
min,4.3,2.0,1.0,0.1
25%,5.1,2.8,1.6,0.3
50%,5.8,3.0,4.35,1.3
75%,6.4,3.3,5.1,1.8
max,7.9,4.4,6.9,2.5


In [6]:
# Show the storage type of every column.
column_types = data.dtypes
column_types

sepal_length    float64
sepal_width     float64
petal_length    float64
petal_width     float64
species          object
dtype: object

In [7]:
# Analysing the species column: list its distinct values.
species_column = data.loc[:, 'species']
species_column.unique()

array(['setosa', 'versicolor', 'virginica'], dtype=object)

In [8]:
# Partition the rows into a training set and a held-out test set.
from sklearn.model_selection import train_test_split

# Target: the species label. Features: every remaining column.
y = data['species']
X = data.drop(columns='species')

# Hold out 20% of the rows; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [9]:
from sklearn.tree import DecisionTreeClassifier

# Pin the estimator's seed: the tree permutes candidate features at every split,
# so without a fixed random_state two fits on the same data can break ties
# differently and give different predictions from one run to the next.
Dtc = DecisionTreeClassifier(random_state=42)

# Train the model on the training partition
Dtc.fit(X_train, y_train)

# Predict species labels for the held-out test rows
y_pred = Dtc.predict(X_test)

In [10]:
from sklearn.metrics import accuracy_score, classification_report

# Evaluate the model on the held-out test rows.
accuracy = accuracy_score(y_test, y_pred)
# Keep the report text under its own name: assigning it to `classification_report`
# would replace the imported function with a string, so any later call to
# classification_report(...) without a fresh import would raise TypeError.
report = classification_report(y_test, y_pred)

# Print the evaluation metrics
print("Accuracy:", accuracy)
print("Classification Report:\n", report)

Accuracy: 1.0
Classification Report:
               precision    recall  f1-score   support

      setosa       1.00      1.00      1.00        10
  versicolor       1.00      1.00      1.00         9
   virginica       1.00      1.00      1.00        11

    accuracy                           1.00        30
   macro avg       1.00      1.00      1.00        30
weighted avg       1.00      1.00      1.00        30



In [None]:
# ## Because the dataset is small, the accuracy comes out at 100%. We therefore raise test_size to 0.4 (40%)
#  to check whether the accuracy changes.

In [11]:
# NOTE: this cell is disabled (every line is commented out), so running it does nothing.
# If re-enabled, it re-splits X and y with 40% of the rows held out, refits Dtc on the
# new training rows, and rebinds X_train, X_test, y_train, y_test and y_pred.
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42)
# # Train the model
# Dtc.fit(X_train, y_train)

# # Make predictions on the test set
# y_pred = Dtc.predict(X_test)

In [12]:
# NOTE: this cell is disabled (every line is commented out), so running it prints nothing.
# NOTE(review): the saved output under this cell reports a 60-row test set, so it was
# presumably produced by an earlier run with this code enabled after a test_size=0.4
# split - confirm, and re-enable or clear the stale output.
# If re-enabled as written, the `classification_report = ...` line rebinds that name
# from the imported function to the report text.
# from sklearn.metrics import accuracy_score, classification_report

# # Evaluate the model
# accuracy = accuracy_score(y_test, y_pred)
# classification_report = classification_report(y_test, y_pred)

# # Print the evaluation metrics
# print("Accuracy:", accuracy)
# print("Classification Report:\n", classification_report)

Accuracy: 0.9833333333333333
Classification Report:
               precision    recall  f1-score   support

      setosa       1.00      1.00      1.00        23
  versicolor       0.95      1.00      0.97        19
   virginica       1.00      0.94      0.97        18

    accuracy                           0.98        60
   macro avg       0.98      0.98      0.98        60
weighted avg       0.98      0.98      0.98        60

