In [1]:
# Import the pandas library for data manipulation and analysis
import pandas as pd

# Import the train_test_split function to split the dataset into training and testing sets
from sklearn.model_selection import train_test_split

# Import the DecisionTreeClassifier for building a decision tree model
from sklearn.tree import DecisionTreeClassifier

# Import the metrics module for evaluating the performance of the model
from sklearn import metrics

## Loading Data

In [6]:
# Define the file path for the Iris dataset
address = '..\data\iris.data.csv'

# Read the CSV file into a pandas DataFrame
# The file has no header row, and values are separated by commas
df = pd.read_csv(filepath_or_buffer=address, header=None, sep=',')

# Assign meaningful column names to the DataFrame
df.columns = ['Sepal Length', 'Sepal Width', 'Petal Length', 'Petal Width', 'Species']

In [90]:
# Display the first five rows of the DataFrame to get an overview of the data
df.head()

Unnamed: 0,Sepal Length,Sepal Width,Petal Length,Petal Width,Species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


## Split into features and target variable

In [91]:
# Retrieve the unique values from the 'Species' column to identify the different species in the dataset
df.Species.unique()

array(['setosa', 'versicolor', 'virginica'], dtype=object)

In [92]:
# Select the first four columns of the DataFrame to use as *features* for the model
X = df.iloc[:, 0:4]

# Display the selected feature set
X

Unnamed: 0,Sepal Length,Sepal Width,Petal Length,Petal Width
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2
...,...,...,...,...
145,6.7,3.0,5.2,2.3
146,6.3,2.5,5.0,1.9
147,6.5,3.0,5.2,2.0
148,6.2,3.4,5.4,2.3


In [93]:
# Select the fifth column of the DataFrame to use as the *target variable* for the model
Y = df.iloc[:, 4]

# Display the selected target variable
Y

0         setosa
1         setosa
2         setosa
3         setosa
4         setosa
         ...    
145    virginica
146    virginica
147    virginica
148    virginica
149    virginica
Name: Species, Length: 150, dtype: object

## Train & Test

In [94]:
# Split the dataset into *training* and *testing* sets
# X and Y are the *features* and target *variable* respectively
# test_size=0.3 indicates that 30% of the data will be used for testing
# random_state=0 ensures reproducibility of the split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=0)

In [95]:
# Initialize a Decision Tree Classifier model
clf = DecisionTreeClassifier()

# Train the Decision Tree Classifier using the training data
clf.fit(X_train, Y_train)

In [96]:
# Use the trained Decision Tree Classifier to make predictions on the test data
Y_predict = clf.predict(X_test)

# Display the predicted values for the test data
Y_predict

array(['virginica', 'versicolor', 'setosa', 'virginica', 'setosa',
       'virginica', 'setosa', 'versicolor', 'versicolor', 'versicolor',
       'virginica', 'versicolor', 'versicolor', 'versicolor',
       'versicolor', 'setosa', 'versicolor', 'versicolor', 'setosa',
       'setosa', 'virginica', 'versicolor', 'setosa', 'setosa',
       'virginica', 'setosa', 'setosa', 'versicolor', 'versicolor',
       'setosa', 'virginica', 'versicolor', 'setosa', 'virginica',
       'virginica', 'versicolor', 'setosa', 'virginica', 'versicolor',
       'versicolor', 'virginica', 'setosa', 'virginica', 'setosa',
       'setosa'], dtype=object)

In [97]:
# Calculate the accuracy of the model by comparing the actual and predicted values
accuracy = metrics.accuracy_score(Y_test, Y_predict)

# Print the accuracy of the model in a formatted string
print(f"Accuracy: {accuracy}")

Accuracy: 0.9777777777777777


In [98]:
# Create a new DataFrame using the test features for better visualization
data = pd.DataFrame(X_test, columns=['Sepal Length', 'Sepal Width', 'Petal Length', 'Petal Width'])

# Add a column for the predicted species
data['Prediction'] = Y_predict

# Add a column for the actual species from the test set
data['Original'] = Y_test

# Display the DataFrame to compare predicted and actual values
data

Unnamed: 0,Sepal Length,Sepal Width,Petal Length,Petal Width,Prediction,Original
114,5.8,2.8,5.1,2.4,virginica,virginica
62,6.0,2.2,4.0,1.0,versicolor,versicolor
33,5.5,4.2,1.4,0.2,setosa,setosa
107,7.3,2.9,6.3,1.8,virginica,virginica
7,5.0,3.4,1.5,0.2,setosa,setosa
100,6.3,3.3,6.0,2.5,virginica,virginica
40,5.0,3.5,1.3,0.3,setosa,setosa
86,6.7,3.1,4.7,1.5,versicolor,versicolor
76,6.8,2.8,4.8,1.4,versicolor,versicolor
71,6.1,2.8,4.0,1.3,versicolor,versicolor


In [99]:
# Filter the DataFrame to include only the rows where the prediction does not match the actual value
filtered_data = data[data['Prediction'] != data['Original']]

# Display the filtered DataFrame to see the instances where the model made incorrect predictions
filtered_data

Unnamed: 0,Sepal Length,Sepal Width,Petal Length,Petal Width,Prediction,Original
83,6.0,2.7,5.1,1.6,virginica,versicolor


## Using the Tukey's Method

In [100]:
# Generate a statistical summary of the DataFrame's numerical columns
# The summary includes metrics like mean, standard deviation, min, max, and quartiles
# Round the results to two decimal places for better readability
df.describe().round(2)

Unnamed: 0,Sepal Length,Sepal Width,Petal Length,Petal Width
count,150.0,150.0,150.0,150.0
mean,5.84,3.06,3.76,1.2
std,0.83,0.44,1.77,0.76
min,4.3,2.0,1.0,0.1
25%,5.1,2.8,1.6,0.3
50%,5.8,3.0,4.35,1.3
75%,6.4,3.3,5.1,1.8
max,7.9,4.4,6.9,2.5


In [101]:
# Tukey method for identifying outliers
# Example calculations:
# IQR (Interquartile Range) = Q3 - Q1 = 6.4 - 5.1 = 1.3
# Calculate 1.5 times the IQR: 1.5 * IQR = 1.95
# Determine the lower bound: Q1 - 1.5 * IQR = 5.1 - 1.95 = 3.15
# Determine the upper bound: Q3 + 1.5 * IQR = 6.4 + 1.95 = 8.35
# The function returns data points that are considered outliers based on the calculated bounds
def Tukey_method_outliners(data):
    # Calculate the first quartile (Q1)
    Q1 = data.quantile(0.25)
    
    # Calculate the third quartile (Q3)
    Q3 = data.quantile(0.75)
    
    # Compute the interquartile range (IQR)
    IQR = Q3 - Q1
    
    # Calculate the lower bound for outliers
    lower_bound = Q1 - (1.5 * IQR)
    
    # Calculate the upper bound for outliers
    upper_bound = Q3 + (1.5 * IQR)
    
    # Return data points that are outside the bounds (outliers)
    # Note: The logical operator should be corrected to identify outliers
    return data[(data > upper_bound) | (data < lower_bound)]

In [102]:
# Apply the Tukey method to identify outliers
Tukey_method_outliners(df['Sepal Length'])

Series([], Name: Sepal Length, dtype: float64)

In [103]:
# Apply the Tukey method to identify outliers
Tukey_method_outliners(df['Sepal Width'])

15    4.4
32    4.1
33    4.2
60    2.0
Name: Sepal Width, dtype: float64

In [104]:
# Apply the Tukey method to identify outliers
Tukey_method_outliners(df['Petal Length'])

Series([], Name: Petal Length, dtype: float64)

In [105]:
# Apply the Tukey method to identify outliers
Tukey_method_outliners(df['Petal Width'])

Series([], Name: Petal Width, dtype: float64)

In [106]:
outliers = Tukey_method_outliners(df['Sepal Width'])
mask = df['Sepal Width'].isin(outliers)
df[mask]

Unnamed: 0,Sepal Length,Sepal Width,Petal Length,Petal Width,Species
15,5.7,4.4,1.5,0.4,setosa
32,5.2,4.1,1.5,0.1,setosa
33,5.5,4.2,1.4,0.2,setosa
60,5.0,2.0,3.5,1.0,versicolor


In [107]:
df = df[~mask]


In [108]:
X = df.iloc[:, 0:4]
Y = df.iloc[:, 4]

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=0)

clf = DecisionTreeClassifier()
clf.fit(X_train, Y_train)

Y_predict = clf.predict(X_test)

accuracy = metrics.accuracy_score(Y_test, Y_predict)
print(f"Accuracy: {accuracy}")


# Create a new DataFrame using the test features for better visualization
data = pd.DataFrame(X_test, columns=['Sepal Length', 'Sepal Width', 'Petal Length', 'Petal Width'])

# Add a column for the predicted species
data['Prediction'] = Y_predict

# Add a column for the actual species from the test set
data['Original'] = Y_test

# Filter the DataFrame to include only the rows where the prediction does not match the actual value
filtered_data = data[data['Prediction'] != data['Original']]

# Display the filtered DataFrame to see the instances where the model made incorrect predictions
filtered_data

Accuracy: 0.9545454545454546


Unnamed: 0,Sepal Length,Sepal Width,Petal Length,Petal Width,Prediction,Original
134,6.1,2.6,5.6,1.4,versicolor,virginica
77,6.7,3.0,5.0,1.7,virginica,versicolor


## Explanation of Accuracy Decrease with Tukey's Method

1. **Loss of Informative Data**: Outliers can contain valuable information that contributes to model predictive power. Removing them may lead to the loss of this information.

2. **Model Sensitivity**: Some models are more sensitive to outliers. Removing outliers can affect performance, especially in models like linear regression, which can be skewed by outliers.

3. **Data Distribution Changes**: Removing outliers alters the underlying data distribution, which can mismatch training and testing datasets and thus affect model performance.

4. **Overfitting and Underfitting**: Outliers can prevent overfitting by providing a range of examples for the model. Their removal can lead to overly simplistic models that do not capture data complexity.

5. **Impact on Variance and Bias**: Removing outliers reduces variance but can increase bias, leading to decreased accuracy if test data contains similar outliers.