# Exercise 4

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from decision_tree import *
from classification_metrics import *

In [2]:
# The data file has no header row (hence header=None below), so the column
# names are supplied here; the first column is the class label.
header = [
    "Class",
    "Alcohol", "Malic acid", "Ash", "Alcalinity of ash", "Magnesium",
    "Total phenols", "Flavanoids", "Nonflavanoid phenols", "Proanthocyanins",
    "Color intensity", "Hue", "OD280/OD315 of diluted wines", "Proline",
]
df = pd.read_csv("../data/wine.data", sep=",", header=None)
df.columns = header
df

Unnamed: 0,Class,Alcohol,Malic acid,Ash,Alcalinity of ash,Magnesium,Total phenols,Flavanoids,Nonflavanoid phenols,Proanthocyanins,Color intensity,Hue,OD280/OD315 of diluted wines,Proline
0,1,14.23,1.71,2.43,15.6,127,2.80,3.06,0.28,2.29,5.64,1.04,3.92,1065
1,1,13.20,1.78,2.14,11.2,100,2.65,2.76,0.26,1.28,4.38,1.05,3.40,1050
2,1,13.16,2.36,2.67,18.6,101,2.80,3.24,0.30,2.81,5.68,1.03,3.17,1185
3,1,14.37,1.95,2.50,16.8,113,3.85,3.49,0.24,2.18,7.80,0.86,3.45,1480
4,1,13.24,2.59,2.87,21.0,118,2.80,2.69,0.39,1.82,4.32,1.04,2.93,735
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
173,3,13.71,5.65,2.45,20.5,95,1.68,0.61,0.52,1.06,7.70,0.64,1.74,740
174,3,13.40,3.91,2.48,23.0,102,1.80,0.75,0.43,1.41,7.30,0.70,1.56,750
175,3,13.27,4.28,2.26,20.0,120,1.59,0.69,0.43,1.35,10.20,0.59,1.56,835
176,3,13.17,2.59,2.37,20.0,120,1.65,0.68,0.53,1.46,9.30,0.60,1.62,840


In [3]:
# Split the frame into the feature columns and the label column.
X = df.iloc[:, 1:]
y = df.iloc[:, 0] - 1  # attention: classes in the file start at 1; shift them to start at 0
feature_names = list(X.columns)
print("feature_names:", feature_names)
# Sort the distinct labels explicitly: the iteration order of a set is not
# guaranteed, so list(set(y)) could yield the class names in arbitrary order.
target_names = [str(label) for label in sorted(set(y))]
print("target_names:", target_names)
# Convert to NumPy arrays for the positional/boolean indexing used in later cells.
X = np.asarray(X)
y = np.asarray(y)
print("X.shape:", X.shape)

feature_names: ['Alcohol', 'Malic acid', 'Ash', 'Alcalinity of ash', 'Magnesium', 'Total phenols', 'Flavanoids', 'Nonflavanoid phenols', 'Proanthocyanins', 'Color intensity', 'Hue', 'OD280/OD315 of diluted wines', 'Proline']
target_names: ['0', '1', '2']
X.shape: (178, 13)


In [4]:
# Draw 80% of the row indices (without replacement) for training; the remaining
# rows form the test set. Seeding the global RNG makes the split reproducible.
np.random.seed(777)
ind_train = np.random.choice(X.shape[0], size=int(X.shape[0] * 0.8), replace=False)
# Boolean row mask: True for rows drawn into the training set. Indexing with the
# mask (instead of ind_train itself) keeps both subsets in original row order.
# np.isin is used because np.in1d is deprecated in recent NumPy releases.
bool_ind_train = np.isin(np.arange(X.shape[0]), ind_train)
X_train = X[bool_ind_train]
y_train = y[bool_ind_train]
X_test = X[~bool_ind_train]
y_test = y[~bool_ind_train]
print("X_train.shape:", X_train.shape)
print("y_train.shape:", y_train.shape)
print("X_test.shape:", X_test.shape)
print("y_test.shape:", y_test.shape)

X_train.shape: (142, 13)
y_train.shape: (142,)
X_test.shape: (36, 13)
y_test.shape: (36,)


In [5]:
def majority_voting(yHats): # size of yHats is (K, M)
    """Combine K per-model predictions into one prediction per sample.

    Parameters
    ----------
    yHats : 2-D array of shape (K, M)
        Row k holds the labels predicted by model k for the M samples.

    Returns
    -------
    list of int
        For each of the M columns, the most frequent label. On a tie the
        smallest label wins, because np.unique returns the labels sorted and
        argmax picks the first maximum.
    """
    def _most_frequent(column):
        labels, frequencies = np.unique(column, return_counts=True)
        return int(labels[np.argmax(frequencies)])

    return [_most_frequent(yHats[:, col]) for col in range(yHats.shape[1])]

def random_forest(X, y, K, max_depth=100):
    """Train K decision trees, each on its own bootstrap sample of (X, y).

    Parameters
    ----------
    X : array indexed by rows (one row per sample)
    y : array of labels aligned with the rows of X
    K : int
        Number of trees to build.
    max_depth : int, default 100
        Forwarded unchanged to build_tree.

    Returns
    -------
    list
        The K objects returned by build_tree, in training order.
    """
    n_samples = X.shape[0]
    # Count the distinct labels once on the full label vector. A bootstrap
    # sample can miss a class entirely, so counting per sample could undercount.
    # NOTE(review): assumes build_tree's third argument is the number of
    # classes -- confirm against the decision_tree module.
    n_classes = len(set(y))
    decision_trees = []
    for _ in range(K):
        # sampling with replacement = bootstrapping
        ind = np.random.choice(n_samples, size=n_samples, replace=True)
        decision_trees.append(
            build_tree(X[ind], y[ind], n_classes, max_features="sqrt", max_depth=max_depth)
        )
    return decision_trees

def random_forest_predict(decision_trees, X):
    """Predict labels for the rows of X by majority vote over all trees.

    Each tree's predictions (obtained via predict) fill one row of a
    (number of trees, number of rows in X) float array, which is then handed
    to majority_voting; that function's result is returned as-is.
    """
    votes = np.zeros((len(decision_trees), X.shape[0]))
    for row, tree in enumerate(decision_trees):
        votes[row,] = predict(tree, X)
    return majority_voting(votes)

In [6]:
np.random.seed(777)
decision_trees = random_forest(X_train, y_train, 100)
decision_trees[0]

<decision_tree.Node at 0x7f6e637f3bb0>

In [7]:
def get_accuracies_single(decision_trees):
    """Score every tree on its own against the notebook's test split.

    Reads the module-level X_test / y_test. For each tree, the predictions are
    passed to confusion_matrix, whose second return value is fed to accuracy.

    Returns a list with one accuracy value per tree, in input order.
    """
    def _score(tree):
        predictions = predict(tree, X_test)
        _, matrix = confusion_matrix(y_test, predictions)
        return accuracy(matrix)

    return [_score(tree) for tree in decision_trees]

In [8]:
# Per-tree test accuracies: once for the first ten trees, once for the whole forest.
accuracies_firstTen = get_accuracies_single(decision_trees[:10])
accuracies_All = get_accuracies_single(decision_trees)

# Report the average accuracy of an individual tree in each group.
print("First 10: Mean accuracy on test data set:", np.mean(accuracies_firstTen))
print("All 100: Mean accuracy on test data set:", np.mean(accuracies_All))

First 10: Mean accuracy on test data set: 0.861111111111111
All 100: Mean accuracy on test data set: 0.8450000000000002


In [9]:
def get_accuracies_ensemble(decision_trees):
    """Score the majority-vote ensemble of the given trees on the test split.

    Reads the module-level X_test / y_test, prints the ensemble's predicted
    labels, and returns accuracy(...) of the confusion matrix built from them.
    """
    ensemble_labels = random_forest_predict(decision_trees, X_test)
    print(ensemble_labels)  # the voted labels are echoed into the cell output
    _, matrix = confusion_matrix(y_test, ensemble_labels)
    return accuracy(matrix)

In [10]:
# Ensemble (majority-vote) accuracy on the test split, first for a forest made of
# the first ten trees and then for all trees. Each get_accuracies_ensemble call
# also prints the voted labels, so the calls and prints must stay interleaved.
# Note: despite the plural names, each variable here holds the single value
# returned by get_accuracies_ensemble.
accuracies_firstTen = get_accuracies_ensemble(decision_trees[:10])
print("First 10: Random forest accuracy on test data set:", accuracies_firstTen)
accuracies_All = get_accuracies_ensemble(decision_trees)
print("All 100: Random forest accuracy on test data set:", accuracies_All)

[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2]
First 10: Random forest accuracy on test data set: 0.9722222222222222
[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2]
All 100: Random forest accuracy on test data set: 1.0
