## Decision Tree - Classification

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
df = pd.read_csv("penguins_size.csv")
df.head()

Unnamed: 0,species,island,culmen_length_mm,culmen_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,MALE
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,FEMALE
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,FEMALE
3,Adelie,Torgersen,,,,,
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,FEMALE


## Exploratory Data Analysis and Visualization

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 344 entries, 0 to 343
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   species            344 non-null    object 
 1   island             344 non-null    object 
 2   culmen_length_mm   342 non-null    float64
 3   culmen_depth_mm    342 non-null    float64
 4   flipper_length_mm  342 non-null    float64
 5   body_mass_g        342 non-null    float64
 6   sex                334 non-null    object 
dtypes: float64(4), object(3)
memory usage: 18.9+ KB


In [5]:
df.describe()

Unnamed: 0,culmen_length_mm,culmen_depth_mm,flipper_length_mm,body_mass_g
count,342.0,342.0,342.0,342.0
mean,43.92193,17.15117,200.915205,4201.754386
std,5.459584,1.974793,14.061714,801.954536
min,32.1,13.1,172.0,2700.0
25%,39.225,15.6,190.0,3550.0
50%,44.45,17.3,197.0,4050.0
75%,48.5,18.7,213.0,4750.0
max,59.6,21.5,231.0,6300.0


In [6]:
df.isnull().sum()

species               0
island                0
culmen_length_mm      2
culmen_depth_mm       2
flipper_length_mm     2
body_mass_g           2
sex                  10
dtype: int64

In [7]:
# Share of missing values per column, rounded to three decimals.
row_count = df.shape[0]
for column in df.columns:
    missing_share = df[column].isnull().sum() / row_count
    print(column, ":", round(missing_share, 3))

species : 0.0
island : 0.0
culmen_length_mm : 0.006
culmen_depth_mm : 0.006
flipper_length_mm : 0.006
body_mass_g : 0.006
sex : 0.029


In [8]:
# Drop every row that has at least one missing value; per the ratios printed above,
# no column is missing in more than ~3% of the rows.
df = df.dropna()

In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 334 entries, 0 to 343
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   species            334 non-null    object 
 1   island             334 non-null    object 
 2   culmen_length_mm   334 non-null    float64
 3   culmen_depth_mm    334 non-null    float64
 4   flipper_length_mm  334 non-null    float64
 5   body_mass_g        334 non-null    float64
 6   sex                334 non-null    object 
dtypes: float64(4), object(3)
memory usage: 20.9+ KB


In [10]:
df.head()

Unnamed: 0,species,island,culmen_length_mm,culmen_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,MALE
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,FEMALE
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,FEMALE
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,FEMALE
5,Adelie,Torgersen,39.3,20.6,190.0,3650.0,MALE


In [11]:
df["sex"].unique()

array(['MALE', 'FEMALE', '.'], dtype=object)

In [12]:
df["island"].unique()

array(['Torgersen', 'Biscoe', 'Dream'], dtype=object)

In [13]:
df["species"].unique()

array(['Adelie', 'Chinstrap', 'Gentoo'], dtype=object)

In [14]:
df[df["sex"]== "."]

Unnamed: 0,species,island,culmen_length_mm,culmen_depth_mm,flipper_length_mm,body_mass_g,sex
336,Gentoo,Biscoe,44.5,15.7,217.0,4875.0,.


In [15]:
df[df.species =="Gentoo"].groupby("sex").describe().T

Unnamed: 0,sex,.,FEMALE,MALE
culmen_length_mm,count,1.0,58.0,61.0
culmen_length_mm,mean,44.5,45.563793,49.47377
culmen_length_mm,std,,2.051247,2.720594
culmen_length_mm,min,44.5,40.9,44.4
culmen_length_mm,25%,44.5,43.85,48.1
culmen_length_mm,50%,44.5,45.5,49.5
culmen_length_mm,75%,44.5,46.875,50.5
culmen_length_mm,max,44.5,50.5,59.6
culmen_depth_mm,count,1.0,58.0,61.0
culmen_depth_mm,mean,15.7,14.237931,15.718033


In [16]:
# Row 336 carries the placeholder "." in `sex`; overwrite it with "MALE".
# NOTE(review): the choice of MALE is a judgment call based on the per-sex Gentoo
# statistics shown above — confirm it is the intended imputation.
df.loc[336, "sex"] = "MALE"

In [17]:
df.loc[336, "sex"]

'MALE'

## Visualization

In [None]:
df["species"].value_counts()

In [None]:
sns.countplot(x="species", data = df);

In [None]:
sns.catplot(x='species', data=df, kind='count',col='sex')

In [None]:
# pairplot is a figure-level function: it always builds its own figure and grid, so a
# preceding plt.figure() call has no effect on it and only leaves an empty figure behind.
# The trailing semicolon hides the PairGrid repr.
sns.pairplot(df, hue='species', palette='Dark2');

In [None]:
df.species.unique()

In [None]:
# !pip install plotly

In [None]:
import plotly.express as px

In [None]:
# Interactive 3D scatter of culmen length, flipper length and culmen depth, coloured by species.
fig = px.scatter_3d(
    df,
    x='culmen_length_mm',
    y='flipper_length_mm',
    z='culmen_depth_mm',
    color='species',
)
fig.show()

In [None]:
# Pairwise correlations between the numeric columns, drawn as an annotated heatmap.
numeric_corr = df.select_dtypes("number").corr()
heatmap_fig, heatmap_ax = plt.subplots(figsize=(8, 6))
sns.heatmap(numeric_corr, annot=True, cmap='viridis', ax=heatmap_ax)
heatmap_ax.set_title("Correlation Matrix")

plt.show()

## Feature Engineering

In [None]:
df.head()

In [None]:
X = df.drop('species',axis=1)
y = df['species']

## Train | Test Split

In [None]:
# One-hot encode the categorical columns of X; drop_first=True omits the first level of
# each categorical so the remaining indicator columns are not redundant.
X=pd.get_dummies(X, drop_first=True)

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=101, stratify=y)

## Modelling - Decision Tree Classifier

In [None]:
from sklearn.tree import DecisionTreeClassifier

In [None]:
DT_model = DecisionTreeClassifier(random_state=101)

In [None]:
DT_model.fit(X_train,y_train)

In [None]:
y_pred = DT_model.predict(X_test)

## Model Performance on Classification Tasks

In [None]:
from sklearn.metrics import confusion_matrix,classification_report,plot_confusion_matrix

In [None]:
confusion_matrix(y_test, y_pred)

In [None]:
plot_confusion_matrix(DT_model,X_test,y_test)

In [None]:
from yellowbrick.classifier import ClassPredictionError

visualizer = ClassPredictionError(DT_model)

# Fit the training data to the visualizer
visualizer.fit(X_train, y_train)

# Evaluate the model on the test data
visualizer.score(X_test, y_test)

# Draw visualization. `show()` is the current name of this method; `poof()` has been
# deprecated since Yellowbrick 1.0.
visualizer.show();

In [None]:
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

## Cross Validate

In [None]:
from sklearn.model_selection import cross_val_score, cross_validate
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score
from sklearn.metrics import make_scorer

In [None]:
# 10-fold cross-validation of an unrestricted tree on the training split, scored with
# accuracy and macro-averaged precision / recall / F1.
model = DecisionTreeClassifier(max_depth=None, random_state=101)

scores = cross_validate(
    model,
    X_train,
    y_train,
    scoring=["accuracy", "precision_macro", "recall_macro", "f1_macro"],
    cv=10,
)
# One row per fold, labelled 1..10.
df_scores = pd.DataFrame(scores, index=range(1, 11))
df_scores

In [None]:
# Average every column over the folds and skip the first two entries
# (cross_validate's timing columns), leaving only the metric means.
df_scores.mean().iloc[2:]

## Feature Importances

In [None]:
DT_model.feature_importances_

In [None]:
# Importance of each feature in DT_model as a one-column frame, sorted in ascending order.
importance_col = "Feature Importance"
df_f_i = pd.DataFrame(
    data=DT_model.feature_importances_,
    index=X.columns,
    columns=[importance_col],
)
df_f_i = df_f_i.sort_values(importance_col)
df_f_i

In [None]:
# Bar chart of the feature importances (one bar per feature, in df_f_i's sorted order).
sns.barplot(x = df_f_i.index, y = 'Feature Importance', data = df_f_i)
# Rotate the feature names so the long column labels do not overlap.
plt.xticks(rotation = 90)
plt.tight_layout()

In [None]:
# A single feature that dominates the splits can sometimes cause overfitting. As a check,
# the most important feature is dropped and the model is re-trained and re-scored below.
# NOTE(review): flipper_length_mm is hard-coded as that feature — confirm it is the top row
# of the importance table above.
X2 = X.drop(columns = ["flipper_length_mm"])

In [None]:
# Stratify on y, exactly as the first split does, so class proportions are preserved in
# both partitions and the scores of the two models remain comparable.
X_train2, X_test2, y_train2, y_test2 = train_test_split(X2, y, test_size=0.3, random_state=101, stratify=y)

In [None]:
DT2_model = DecisionTreeClassifier(random_state=101)

In [None]:
DT2_model.fit(X_train2, y_train2) 

In [None]:
y_pred2 = DT2_model.predict(X_test2)

In [None]:
print(confusion_matrix(y_test2, y_pred2))
print(classification_report(y_test2, y_pred2))

In [None]:
# Same 10-fold cross-validation as before, now on the training split without flipper_length_mm.
model = DecisionTreeClassifier(random_state=101)

scores = cross_validate(
    model,
    X_train2,
    y_train2,
    scoring=["accuracy", "precision_macro", "recall_macro", "f1_macro"],
    cv=10,
)
# One row per fold, labelled 1..10.
df_scores = pd.DataFrame(scores, index=range(1, 11))
df_scores

In [None]:
# Fold-averaged metrics; the first two entries (cross_validate's timing columns) are skipped.
df_scores.mean().iloc[2:]

In [None]:
from sklearn.metrics import make_scorer

In [None]:
# Per-class scorers for cross-validation, keyed "<metric>-<species>".
#
# `pos_label` only takes effect with average='binary'. With any other average scikit-learn
# ignores it (and warns), so combining it with average='weighted' made every entry compute
# the same overall weighted score regardless of the species in its name.
# Restricting `labels` to a single class and averaging over that one label returns that
# class's own precision / recall / F1 as a scalar, which is what cross_val_score needs.
scoring = {
    f"{metric_name}-{species_name}": make_scorer(metric_func, average="macro", labels=[species_name])
    for species_name in ("Adelie", "Chinstrap", "Gentoo")
    for metric_name, metric_func in (("precision", precision_score),
                                     ("recall", recall_score),
                                     ("f1", f1_score))
}

In [None]:
# Cross-validate a fresh tree once per scorer and print the mean and standard deviation
# of its 10 fold scores as percentages.
for scorer_name, scorer in scoring.items():
    model = DecisionTreeClassifier(random_state=101)
    scores = cross_val_score(model, X_train2, y_train2, cv=10, scoring=scorer, n_jobs=-1)
    print(f" {scorer_name:20} : %{scores.mean()*100:.2f}, std : %{scores.std()*100:.3f} \n")

## Visualize the Tree

In [None]:
from sklearn.tree import plot_tree

In [None]:
plt.figure(figsize=(12,8))
plot_tree(DT_model);

In [None]:
plt.figure(figsize=(12,8),dpi=150)
# plot_tree expects class names in ascending class order, which is exactly the order of the
# fitted model's `classes_`. df.species.unique() is in order of first appearance in the
# data and only matches by coincidence.
# Plain lists are passed because some scikit-learn releases (1.2.x) reject a pandas Index
# or ndarray for these two arguments.
plot_tree(DT_model, filled=True, feature_names=list(X.columns), class_names=[str(c) for c in DT_model.classes_]);

## Understanding Hyperparameters

### Max Depth

In [None]:
def report_model(model):
    """Print test- and train-set metrics for a fitted tree, then plot the tree.

    For the test split and then the train split, prints the confusion matrix and the
    classification report of ``model``'s predictions, so the two can be compared.
    Finally draws the tree with feature and class labels.

    Reads the notebook-level ``X_train``, ``X_test``, ``y_train``, ``y_test`` and ``X``.

    Parameters
    ----------
    model : fitted tree classifier
        Must expose ``predict`` and ``classes_`` and be accepted by ``plot_tree``.
    """
    model_pred = model.predict(X_test)
    model_train_pred = model.predict(X_train)

    # Same report layout for both splits: blank line, title, matrix, blank line, report.
    for split_name, y_true, y_hat in (("Test Set", y_test, model_pred),
                                      ("Train Set", y_train, model_train_pred)):
        print('\n')
        print(split_name)
        print(confusion_matrix(y_true, y_hat))
        print('\n')
        print(classification_report(y_true, y_hat))

    plt.figure(figsize=(12,8),dpi=150)
    # Class names come from the model itself: `classes_` is in the ascending order that
    # plot_tree expects, independent of the row order of any DataFrame.
    # Plain lists are passed because some scikit-learn releases (1.2.x) reject a pandas
    # Index or ndarray for these two arguments.
    plot_tree(model, filled=True, feature_names=list(X.columns),
              class_names=[str(c) for c in model.classes_]);

In [None]:
pruned_tree = DecisionTreeClassifier(max_depth=3, random_state=101)
pruned_tree.fit(X_train,y_train)

In [None]:
report_model(pruned_tree)

### Max Leaf Nodes

In [None]:
pruned_tree_2 = DecisionTreeClassifier(max_leaf_nodes=5, random_state=101)
pruned_tree_2.fit(X_train,y_train)

In [None]:
report_model(pruned_tree_2)

### Criterion

In [None]:
entropy_tree = DecisionTreeClassifier(criterion='entropy', random_state=101)
entropy_tree.fit(X_train,y_train)

In [None]:
report_model(entropy_tree)

### Max_features, Splitter

In [None]:
# max_features=3 makes every split consider a randomly drawn subset of the features, so the
# seed is fixed (as for every other model in this notebook) to keep the tree reproducible.
tree = DecisionTreeClassifier(splitter = "best", max_features=3, random_state=101)
tree.fit(X_train,y_train)

In [None]:
report_model(tree)

## Find Best Parameters

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
model = DecisionTreeClassifier(random_state=101)

In [None]:
# Search space for the grid search.
# "sqrt" replaces "auto": for classifiers "auto" meant sqrt(n_features); it was deprecated
# in scikit-learn 1.1 and removed in 1.3, where every candidate using it fails to fit.
param_grid = {"splitter":["best", "random"],
              "max_features":[None, "sqrt", "log2", 2, 3, 4, 5, 6],
              "max_depth": [None, 2,3, 4, 5, 6, 7],
              "min_samples_leaf": [1, 2, 3, 4],
              "min_samples_split": [2, 3, 5, 6, 7, 8]}

In [None]:
# Exhaustive search over every combination in param_grid, evaluated with 10-fold
# cross-validation and ranked by macro-averaged F1. n_jobs=-1 uses all available cores.
grid_model = GridSearchCV(estimator=model,
                          param_grid=param_grid,
                          scoring='f1_macro',
                          cv=10,
                          n_jobs = -1)

In [None]:
grid_model.fit(X_train, y_train)

In [None]:
grid_model.best_params_

In [None]:
# Predict with the fitted grid search object (this rebinds y_pred from the earlier model).
y_pred = grid_model.predict(X_test)
# NOTE(review): y_train_pred is not used by any cell visible in this section — either
# report train-set metrics with it (to compare against the test set) or drop it.
y_train_pred = grid_model.predict(X_train)

In [None]:
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))