# Advanced Certification Programme in AI and MLOps
## A programme by IISc and TalentSprint
### Ungraded Additional Notebook: Decision Tree

## Learning Objective

At the end of the experiment, you will be able to:

* Split the data into train and test sets
* Apply decision tree classifier with varying max_depth
* Visualize the decision tree
* Understand various performance metrics

### Importing Libraries

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from IPython.display import Image
import pydotplus

### Download Iris Dataset

<img src='https://drive.google.com/uc?id=1Z7BNkBDwojR2Ei3hoNMKnhGTowEULXei'>

In [None]:
#@title Download dataset
# Fetch iris.csv into the current working directory using shell commands
# (lines starting with `!` are IPython shell escapes, not Python).
from IPython.display import clear_output
!wget https://cdn.iisc.talentsprint.com/AIandMLOps/Datasets/iris.csv
# Remove the wget output from the cell so only the messages below remain
clear_output()
# NOTE(review): this message is printed unconditionally, even if wget failed;
# the `ls` listing below is the only actual check that a .csv file exists.
print("Dataset Downloaded!")
!ls | grep ".csv"

In [None]:
# Load the downloaded CSV into a DataFrame and preview its first rows
data = pd.read_csv('iris.csv')
data.head()

In [None]:
# Features: the first four columns of the DataFrame, as a NumPy array
X = data.iloc[:, :4].values

# Target: the 'Name' column (the class label of each sample), as a NumPy array
Y = data['Name'].values

In [None]:
# Display the raw (un-encoded) target labels
Y

#### Encoding the categorical label

In [None]:
# Encode the categorical target labels in Y as integers

enc = LabelEncoder()
# fit() returns the encoder itself, so `enc` and `label_encoder` refer to the same fitted object
label_encoder = enc.fit(Y)
y = label_encoder.transform(Y)
# Display the integer-encoded labels
y

In [None]:
# Unique categories found in the target column when the encoder was fitted
enc.classes_

In [None]:
# Reverse the encoding: map the integer labels back to the original class names
enc.inverse_transform(y)

##### Displaying the classes and their equivalent encoded values

In [None]:
# Show each class name next to the integer it is encoded as
print ("Categorical classes:", label_encoder.classes_)

# Encoding the class names themselves yields the integer assigned to each one
integer_classes = label_encoder.transform(label_encoder.classes_)
print ("Integer classes:", integer_classes)

### **Training a  Classifier**

* Training decision tree classifiers with `max_depth` set to 1, 2, 3 and 5
* For each depth, fitting the classifier and displaying the resulting tree and its decision boundary

#### **Training and plotting the tree with Depth=1**

In [None]:
# Training and testing set ratio is 67 : 33
# stratify=y keeps the class proportions the same in both splits;
# random_state=42 makes the split reproducible across runs.

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42, stratify=y)

# Display the shapes of the four resulting arrays
X_train.shape, X_test.shape, y_train.shape, y_test.shape

##### Training

In [None]:
# Decision tree limited to a single level of splits, using entropy as the split criterion
clf = DecisionTreeClassifier(max_depth=1, criterion='entropy')

# Fitting the data (all four features of the training split)
clf.fit(X_train, y_train)

In [None]:
# Predicting on test set; the predictions are integer-encoded class labels
pred = clf.predict(X_test)
print("Prediction of test set : \n", pred)

In [None]:
# Same predictions, decoded back to the original class names
print("Prediction of test set : \n", label_encoder.inverse_transform(pred))

In [None]:
# Accuracy on test set, as reported by the classifier's score() method
print("\nAccuracy score on test set : ", clf.score(X_test, y_test))

In [None]:
# Display the column names of the dataset (used below to label the tree's features)
data.columns.values

##### Plotting the tree

In [None]:
# Names used to label the tree nodes: the first four column names are the
# features; the class names are listed explicitly.
feature_names = data.columns.values[:4]
target_names = np.array(['Iris-setosa', 'Iris-versicolor', 'Iris-virginica'])

# Create DOT data (out_file=None returns the DOT source instead of writing a file)
dot_data = export_graphviz(clf, out_file=None, feature_names=feature_names,  class_names=target_names)

# Draw graph
graph = pydotplus.graph_from_dot_data(dot_data)

# Show graph: render to PNG bytes and display them inline
Image(graph.create_png())

##### **Plotting Decision Boundary**

In [None]:
# Consider only 2 features, so that they can be visualized in a 2D plot
reduced_features = X_train[:, [2,3]]          # petal length,  petal width

In [None]:
# Plot data points: the two selected training features, coloured by decoded class name

sns.scatterplot(x=reduced_features[:, 0], y=reduced_features[:, 1], hue = label_encoder.inverse_transform(y_train))
plt.xlabel('Petal length (cm)')
plt.ylabel('Petal width (cm)')
plt.show()

In [None]:
# Train model using only 2 features
# (this rebinds `clf`, replacing the four-feature classifier trained earlier)
clf = DecisionTreeClassifier(max_depth=1, criterion='entropy')

# Fitting the data
clf.fit(reduced_features, y_train)

In [None]:
# Draw the decision regions of the two-feature tree over a dense grid and
# overlay the training points on top.

# Create a meshgrid of points to plot: pad the observed range of each feature
# by 1 on both sides and sample it with a step of 0.01
x_min, x_max = reduced_features[:, 0].min() - 1, reduced_features[:, 0].max() + 1
y_min, y_max = reduced_features[:, 1].min() - 1, reduced_features[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.01), np.arange(y_min, y_max, 0.01))

# Plot the decision boundary: predict a class for every grid point, then
# reshape the flat predictions back to the grid so contourf can fill regions
Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)
plt.contourf(xx, yy, Z, alpha=0.6)

# Plot the training data, coloured by decoded class name
sns.scatterplot(x=reduced_features[:,0], y=reduced_features[:,1], hue = label_encoder.inverse_transform(y_train))

# Set the title and labels; the plotted columns are petal length and petal
# width (columns 2 and 3 of X_train), so the axes are labelled accordingly
plt.title('Decision Boundary of Decision Tree (Depth=1)')
plt.xlabel('Petal length (cm)')
plt.ylabel('Petal width (cm)')
plt.legend()

# Show the plot
plt.show()

#### **Training and plotting the tree with Depth=2**

##### Training

In [None]:
# Decision tree with at most two levels of splits, using entropy as the split criterion
clf = DecisionTreeClassifier(max_depth=2, criterion='entropy', min_samples_split=2)

# Fitting the data (all four features of the training split)
clf.fit(X_train,y_train)

# Predicting on test set and printing the decoded class names
pred = clf.predict(X_test)
print("Prediction of test set : \n", label_encoder.inverse_transform(pred))

# Accuracy on test set
print("\nAccuracy score on test set : ", clf.score(X_test, y_test))

##### Plotting the tree

In [None]:
# Names used to label the tree nodes: the first four column names are the
# features; the class names are listed explicitly.
feature_names = data.columns.values[:4]
target_names = np.array(['Iris-setosa', 'Iris-versicolor', 'Iris-virginica'])

# Create DOT data (out_file=None returns the DOT source instead of writing a file)
dot_data = export_graphviz(clf, out_file=None, feature_names=feature_names,  class_names=target_names)

# Draw graph
graph = pydotplus.graph_from_dot_data(dot_data)

# Show graph: render to PNG bytes and display them inline
Image(graph.create_png())

##### **Plotting Decision Boundary**

In [None]:
# Depth-2 tree retrained on the two selected features only, for the 2D boundary plot
# (this rebinds `clf`, replacing the four-feature classifier)
clf = DecisionTreeClassifier(max_depth=2, criterion='entropy', min_samples_split=2)

# Fitting the data
clf.fit(reduced_features, y_train)

In [None]:
# Draw the decision regions of the two-feature tree over a dense grid and
# overlay the training points on top.

# Create a meshgrid of points to plot: pad the observed range of each feature
# by 1 on both sides and sample it with a step of 0.01
x_min, x_max = reduced_features[:, 0].min() - 1, reduced_features[:, 0].max() + 1
y_min, y_max = reduced_features[:, 1].min() - 1, reduced_features[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.01), np.arange(y_min, y_max, 0.01))

# Plot the decision boundary: predict a class for every grid point, then
# reshape the flat predictions back to the grid so contourf can fill regions
Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)
plt.contourf(xx, yy, Z, alpha=0.6)

# Plot the training data, coloured by decoded class name
sns.scatterplot(x=reduced_features[:,0], y=reduced_features[:,1], hue = label_encoder.inverse_transform(y_train))

# Set the title and labels; the plotted columns are petal length and petal
# width (columns 2 and 3 of X_train), so the axes are labelled accordingly
plt.title('Decision Boundary of Decision Tree (Depth=2)')
plt.xlabel('Petal length (cm)')
plt.ylabel('Petal width (cm)')
plt.legend()

# Show the plot
plt.show()

#### **Training and plotting the tree with Depth=3**

##### Training

In [None]:
# Decision tree with at most three levels of splits, using entropy as the split criterion
clf = DecisionTreeClassifier(max_depth=3, criterion='entropy')

# Fitting the data (all four features of the training split)
clf.fit(X_train,y_train)

# Predicting on test set and printing the decoded class names
pred = clf.predict(X_test)
print("Prediction of test set : \n", label_encoder.inverse_transform(pred))

# Accuracy on test set
print("\nAccuracy score on test set : ", clf.score(X_test, y_test))

##### Plotting the tree

In [None]:
# Names used to label the tree nodes: the first four column names are the
# features; the class names are listed explicitly.
feature_names = data.columns.values[:4]
target_names = np.array(['Iris-setosa', 'Iris-versicolor', 'Iris-virginica'])

# Create DOT data (out_file=None returns the DOT source instead of writing a file)
dot_data = export_graphviz(clf, out_file=None, feature_names=feature_names,  class_names=target_names)

# Draw graph
graph = pydotplus.graph_from_dot_data(dot_data)

# Show graph: render to PNG bytes and display them inline
Image(graph.create_png())

##### **Plotting Decision Boundary**

In [None]:
# Depth-3 tree retrained on the two selected features only, for the 2D boundary plot
# (this rebinds `clf`, replacing the four-feature classifier)
clf = DecisionTreeClassifier(max_depth=3, criterion='entropy')

# Fitting the data
clf.fit(reduced_features, y_train)

In [None]:
# Draw the decision regions of the two-feature tree over a dense grid and
# overlay the training points on top.

# Create a meshgrid of points to plot: pad the observed range of each feature
# by 1 on both sides and sample it with a step of 0.01
x_min, x_max = reduced_features[:, 0].min() - 1, reduced_features[:, 0].max() + 1
y_min, y_max = reduced_features[:, 1].min() - 1, reduced_features[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.01), np.arange(y_min, y_max, 0.01))

# Plot the decision boundary: predict a class for every grid point, then
# reshape the flat predictions back to the grid so contourf can fill regions
Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)
plt.contourf(xx, yy, Z, alpha=0.6)

# Plot the training data, coloured by decoded class name
sns.scatterplot(x=reduced_features[:,0], y=reduced_features[:,1], hue = label_encoder.inverse_transform(y_train))

# Set the title and labels; the plotted columns are petal length and petal
# width (columns 2 and 3 of X_train), so the axes are labelled accordingly
plt.title('Decision Boundary of Decision Tree (Depth=3)')
plt.xlabel('Petal length (cm)')
plt.ylabel('Petal width (cm)')
plt.legend()

# Show the plot
plt.show()

#### **Training and plotting the tree with Depth=5**

##### Training

In [None]:
# Decision tree with at most five levels of splits, using entropy as the split criterion
clf = DecisionTreeClassifier(max_depth=5, criterion='entropy')

# Fitting the data (all four features of the training split)
clf.fit(X_train,y_train)

# Predicting on test set and printing the decoded class names
# (note: this cell stores the predictions in `y_pred`, not `pred` as the earlier cells do)
y_pred = clf.predict(X_test)
print("prediction of test set : \n", label_encoder.inverse_transform(y_pred))

# Accuracy on test set
print("\nAccuracy score on test set : ", clf.score(X_test, y_test))

##### Plotting the tree

In [None]:
# Names used to label the tree nodes: the first four column names are the
# features; the class names are listed explicitly.
feature_names = data.columns.values[:4]
target_names = np.array(['Iris-setosa', 'Iris-versicolor', 'Iris-virginica'])

# Create DOT data (out_file=None returns the DOT source instead of writing a file)
dot_data = export_graphviz(clf, out_file=None, feature_names=feature_names,  class_names=target_names)

# Draw graph
graph = pydotplus.graph_from_dot_data(dot_data)

# Show graph: render to PNG bytes and display them inline
Image(graph.create_png())

##### Plotting Decision Boundary

In [None]:
# Depth-5 tree retrained on the two selected features only, for the 2D boundary plot
# (this rebinds `clf`, replacing the four-feature classifier)
clf = DecisionTreeClassifier(max_depth=5, criterion='entropy')

# Fitting the data
clf.fit(reduced_features, y_train)

In [None]:
# Draw the decision regions of the two-feature tree over a dense grid and
# overlay the training points on top.

# Create a meshgrid of points to plot: pad the observed range of each feature
# by 1 on both sides and sample it with a step of 0.01
x_min, x_max = reduced_features[:, 0].min() - 1, reduced_features[:, 0].max() + 1
y_min, y_max = reduced_features[:, 1].min() - 1, reduced_features[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.01), np.arange(y_min, y_max, 0.01))

# Plot the decision boundary: predict a class for every grid point, then
# reshape the flat predictions back to the grid so contourf can fill regions
Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)
plt.contourf(xx, yy, Z, alpha=0.6)

# Plot the training data, coloured by decoded class name
sns.scatterplot(x=reduced_features[:,0], y=reduced_features[:,1], hue = label_encoder.inverse_transform(y_train))

# Set the title and labels; the plotted columns are petal length and petal
# width (columns 2 and 3 of X_train), so the axes are labelled accordingly
plt.title('Decision Boundary of Decision Tree (Depth=5)')
plt.xlabel('Petal length (cm)')
plt.ylabel('Petal width (cm)')
plt.legend()

# Show the plot
plt.show()

### **Confusion Matrix**

In [None]:
from sklearn.metrics import classification_report, confusion_matrix

In [None]:
# Reload the dataset into a separate DataFrame for the binary-classification example
df = pd.read_csv('iris.csv')
df.head()

In [None]:
# Boolean mask: True for every sample whose class is NOT 'Iris-setosa'
df['Name'] != 'Iris-setosa'

In [None]:
# Filter dataset to keep only samples of classes other than 'Iris-setosa',
# then display the shape of the filtered DataFrame
df_b = df.loc[ df['Name'] != 'Iris-setosa']
df_b.shape

In [None]:
# Features: the first four columns of the filtered DataFrame, as a NumPy array
X_b = df_b.iloc[:, :4].values

# Target: the 'Name' column, kept as class-name strings (no label encoding here)
Y_b = df_b['Name'].values

In [None]:
# Training and testing set ratio is 70 : 30 (test_size=0.3); random_state=42 makes the split reproducible
X_train_b, X_test_b, y_train_b, y_test_b = train_test_split(X_b, Y_b, test_size=0.3, random_state=42)

In [None]:
# Depth-1 tree for the two-class problem; the targets are class-name strings
clf = DecisionTreeClassifier(max_depth=1, criterion='entropy')

# Fitting the data
clf.fit(X_train_b,y_train_b)

# Predicting on test set (predictions are class-name strings, like the targets)
y_pred_b = clf.predict(X_test_b)

# Accuracy on test set
print("Accuracy score on test set : ", clf.score(X_test_b, y_test_b))

In [None]:
# Confusion matrix of true labels (first argument) versus predicted labels (second argument)
mat = confusion_matrix(y_test_b, y_pred_b)
mat

In [None]:
# Plot the confusion matrix as an annotated heatmap.
import seaborn as sns
# Tick labels for the two remaining classes (rebinds `target_names` from the tree-plot cells)
target_names = np.array(['Iris-versicolor', 'Iris-virginica'])
# The matrix is transposed so that, matching the axis labels below, the
# x-axis corresponds to true labels and the y-axis to predicted labels
sns.heatmap(mat.T, square=True, annot=True, fmt='d', cbar=False, xticklabels=target_names, yticklabels=target_names)
plt.xlabel('True Labels')
plt.ylabel('Predicted Labels')
plt.show()

#### Precision-Recall Metrics

<img src='https://drive.google.com/uc?id=1pEQdkP-trVknuCImQbN3-XziyTPRYQIC' width=400px>

* **Precision:** The precision is calculated as the ratio of the number of Positive samples correctly classified to the total number of samples classified as Positive (either correctly or incorrectly).

    Precision = $\mathbf{\frac{TruePositive}{TruePositive + FalsePositive}}$

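* **Recall:** The recall is calculated as the ratio of the number of Positive samples correctly classified to the total number of actual Positive samples. It tells us how many of the actual positives were recalled (found) by our model.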

   Recall = $\mathbf{\frac{TruePositive}{TruePositive + FalseNegative}}$

* **F1-score:** precision and recall can be combined into a single score that seeks to balance both concerns, called the F-score or the F-measure.
  
   F1-score = $\mathbf{\frac{2*Precision*Recall}{Precision+Recall}}$

#### Precision

In [None]:
from sklearn.metrics import precision_score

# Macro average: precision is computed per class and the per-class values are averaged with equal weight
precision_score(y_test_b, y_pred_b, average="macro")

#### Recall

In [None]:
from sklearn.metrics import recall_score
# Macro average: recall is computed per class and the per-class values are averaged with equal weight
recall_score(y_test_b, y_pred_b, average="macro")

#### F1-score

In [None]:
from sklearn.metrics import f1_score
# Macro average: F1 is computed per class and the per-class values are averaged with equal weight
f1_score(y_test_b, y_pred_b, average="macro")

### ROC-AUC Score and Curve

<center><img src='https://drive.google.com/uc?id=1_4LKC-FKTq1zdxsdZIS5eJnqR9KGbafb' width=400px></center>

In [None]:
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score

In [None]:
def mapper(x):
    """Return the binary code for an Iris class name.

    "Iris-virginica" is coded as 0 and "Iris-versicolor" as 1. Any other
    value is not in the lookup table and raises KeyError.
    """
    return {"Iris-versicolor": 1, "Iris-virginica": 0}[x]

In [None]:
# Convert the class-name strings of the true and predicted labels to 0/1 codes
y_test_b2 = list(map(mapper, y_test_b))
y_pred_b2 = list(map(mapper, y_pred_b))

In [None]:
# False Positive Rate and True Positive Rate
fpr, tpr, _ = roc_curve(y_test_b2, y_pred_b2)
fpr, tpr

In [None]:
# Diagonal reference line from (0, 0) to (1, 1), drawn dashed in black
plt.plot([0,1], [0,1], linestyle='--', label='Untrained model', color='k')
# ROC points of the trained classifier computed above
plt.plot(fpr, tpr, marker='.', label='Trained DecisionTree')
plt.title('ROC Curve')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend()
plt.show()