<a href="https://colab.research.google.com/github/fboldt/aulasml/blob/master/aula05a_%C3%A1rvore_de_decis%C3%A3o_atributos_discretos.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install ucimlrepo

Collecting ucimlrepo
  Downloading ucimlrepo-0.0.7-py3-none-any.whl.metadata (5.5 kB)
Downloading ucimlrepo-0.0.7-py3-none-any.whl (8.0 kB)
Installing collected packages: ucimlrepo
Successfully installed ucimlrepo-0.0.7


In [7]:
from ucimlrepo import fetch_ucirepo

car_evaluation = fetch_ucirepo(id=19)

X = car_evaluation.data.features.to_numpy()
y = car_evaluation.data.targets.to_numpy()[:,0]

print(car_evaluation.variables)

       name     role         type demographic  \
0    buying  Feature  Categorical        None   
1     maint  Feature  Categorical        None   
2     doors  Feature  Categorical        None   
3   persons  Feature  Categorical        None   
4  lug_boot  Feature  Categorical        None   
5    safety  Feature  Categorical        None   
6     class   Target  Categorical        None   

                                         description units missing_values  
0                                       buying price  None             no  
1                           price of the maintenance  None             no  
2                                    number of doors  None             no  
3              capacity in terms of persons to carry  None             no  
4                           the size of luggage boot  None             no  
5                        estimated safety of the car  None             no  
6  evaulation level (unacceptable, acceptable, go...  None             no  

In [97]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [98]:
set(y)

{'acc', 'good', 'unacc', 'vgood'}

In [99]:
combinations = 1
for i in range(X.shape[1]):
  values = set(X[:,i])
  combinations *= len(values)
  print(f"{i} - {car_evaluation.variables['name'][i]}:\t{values}")
print(combinations)

0 - buying:	{'high', 'vhigh', 'low', 'med'}
1 - maint:	{'high', 'vhigh', 'low', 'med'}
2 - doors:	{'4', '2', '5more', '3'}
3 - persons:	{'4', '2', 'more'}
4 - lug_boot:	{'small', 'big', 'med'}
5 - safety:	{'high', 'low', 'med'}
1728


In [100]:
len(y)

1728

In [101]:
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.metrics import accuracy_score
from collections import Counter

def most_common(lst):
  data = Counter(lst)
  return data.most_common(1)[0][0]

class ZeroR(BaseEstimator, ClassifierMixin):
  def fit(self, X, y):
    self.answer = most_common(y)
    return self
  def predict(self, X):
    return [self.answer] * len(X)

model = ZeroR()
model.fit(X, y)
y_pred = model.predict(X)
print(accuracy_score(y, y_pred))

0.7002314814814815


In [102]:
model = ZeroR()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print(accuracy_score(y_test, y_pred))

0.6994219653179191


In [103]:
most_common(y)

'unacc'

In [104]:
car_evaluation.variables['description'][6]

'evaulation level (unacceptable, acceptable, good, very good)'

In [105]:
import numpy as np

# apenas uma característica
class DecisionTree(BaseEstimator, ClassifierMixin):
  def __init__(self, feature=0):
    self.feature = feature

  def fit(self, X, y):
    self.value = list(set(X[:,self.feature]))[0]
    equals = X[:,self.feature] == self.value
    if sum(equals)>0 and sum(~equals)>0:
      self.equals_tree = DecisionTree(self.feature).fit(X[equals], y[equals])
      self.not_equals_tree = DecisionTree(self.feature).fit(X[~equals], y[~equals])
    else:
      self.answer = most_common(y)
    return self

  def predict(self, X):
    if hasattr(self, 'answer'):
      return np.array([self.answer] * len(X))
    else:
      equals = X[:,self.feature] == self.value
      return np.where(equals, self.equals_tree.predict(X), self.not_equals_tree.predict(X))

model = DecisionTree(0)
model.fit(X, y)
y_pred = model.predict(X)
print(accuracy_score(y, y_pred))

0.7002314814814815


In [106]:
def print_tree(tree, depth=0):
  if hasattr(tree, 'answer'):
    print('\t' * depth + str(tree.answer))
  else:
    print('\t' * depth + str(tree.value))
    print_tree(tree.equals_tree, depth+1)
    print_tree(tree.not_equals_tree, depth+1)

print_tree(model)

high
	unacc
	low
		unacc
		vhigh
			unacc
			unacc


In [107]:
def gini(y):
  labels = np.unique(y)
  tmp = 0
  for label in labels:
    p_label = np.mean(y == label)
    tmp += p_label**2
  return 1 - tmp

y_tmp = y[:]
gini_value = gini(y_tmp)
print(gini_value)

0.457283763074417


In [108]:
def impurity_value(x, y, value, impurity_function=gini):
  equals = x == value
  imp_equals = impurity_function(y[equals])
  imp_not_equals = impurity_function(y[~equals])
  return (sum(equals) / len(y)) * imp_equals + (sum(~equals) / len(y)) * imp_not_equals

impurity_value(X[:,0], y, 'vhigh')

np.float64(0.44934645776177407)

In [109]:
def best_split(x, y, impurity_function=gini):
  best_value = None
  best_impurity = float('inf')
  for value in set(x):
    impurity = impurity_value(x, y, value, impurity_function)
    if impurity < best_impurity:
      best_impurity = impurity
      best_value = value
  return best_value

best_split(X[:,0], y)

'vhigh'

In [110]:
def best_feature(X, y, impurity_function=gini):
  best_feature = None
  best_impurity = float('inf')
  for feature in range(X.shape[1]):
    value = best_split(X[:,feature], y, impurity_function)
    impurity = impurity_value(X[:,feature], y, value, impurity_function)
    if impurity < best_impurity:
      best_impurity = impurity
      best_feature = feature
      best_value = value
  return best_feature, best_value, best_impurity

best_feature(X, y)

(3, '2', np.float64(0.38615712609310704))

In [111]:
class DecisionTree(BaseEstimator, ClassifierMixin):
  def fit(self, X, y):
    self.feature, self.value, self.impurity = best_feature(X, y)
    equals = X[:,self.feature] == self.value
    if sum(equals)>0 and sum(~equals)>0:
      self.equals_tree = DecisionTree().fit(X[equals], y[equals])
      self.not_equals_tree = DecisionTree().fit(X[~equals], y[~equals])
    else:
      self.answer = most_common(y)
    return self

  def predict(self, X):
    if hasattr(self, 'answer'):
      return np.array([self.answer] * len(X))
    else:
      equals = X[:,self.feature] == self.value
      return np.where(equals, self.equals_tree.predict(X), self.not_equals_tree.predict(X))

model = DecisionTree()
model.fit(X, y)
y_pred = model.predict(X)
print(accuracy_score(y, y_pred))

1.0


In [112]:
# print_tree(model)

In [113]:
model = DecisionTree()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print(accuracy_score(y_test, y_pred))

0.9710982658959537


In [114]:
from sklearn.model_selection import cross_validate, StratifiedShuffleSplit

scores = cross_validate(model, X, y, cv=StratifiedShuffleSplit(n_splits=5, random_state=42), scoring='accuracy')
print(scores['test_score'])
print(np.mean(scores['test_score']))

[0.97687861 0.98265896 0.96531792 0.97109827 0.97687861]
0.9745664739884393


In [115]:
class DecisionTree(BaseEstimator, ClassifierMixin):
  def __init__(self, max_depth=None):
    self.max_depth = max_depth

  def max_depth_reached(self):
    return not (self.max_depth is None or self.max_depth > 0)

  def fit(self, X, y):
    self.feature, self.value, self.impurity = best_feature(X, y)
    equals = X[:,self.feature] == self.value
    if sum(equals)>0 and sum(~equals)>0 and not self.max_depth_reached():
      max_depth = None if self.max_depth is None else self.max_depth - 1
      self.equals_tree = DecisionTree(max_depth).fit(X[equals], y[equals])
      self.not_equals_tree = DecisionTree(max_depth).fit(X[~equals], y[~equals])
    else:
      self.answer = most_common(y)
    return self

  def predict(self, X):
    if hasattr(self, 'answer'):
      return np.array([self.answer] * len(X))
    else:
      equals = X[:,self.feature] == self.value
      return np.where(equals, self.equals_tree.predict(X), self.not_equals_tree.predict(X))

model = DecisionTree(3)
model.fit(X, y)
y_pred = model.predict(X)
print(accuracy_score(y, y_pred))

0.8055555555555556


In [116]:
print_tree(model)

2
	high
		unacc
		low
			unacc
			unacc
	low
		high
			unacc
			unacc
		vhigh
			unacc
			acc


In [117]:
scores = cross_validate(model, X, y, cv=StratifiedShuffleSplit(n_splits=5, random_state=42), scoring='accuracy')
print(scores['test_score'])
print(np.mean(scores['test_score']))

[0.79768786 0.76878613 0.8150289  0.79768786 0.79190751]
0.7942196531791907


In [118]:
class DecisionTree(BaseEstimator, ClassifierMixin):
  def __init__(self, max_depth=None, min_samples_split=2):
    self.max_depth = max_depth
    self.min_samples_split = min_samples_split

  def max_depth_reached(self):
    return not (self.max_depth is None or self.max_depth > 0)

  def fit(self, X, y):
    self.feature, self.value, self.impurity = best_feature(X, y)
    equals = X[:,self.feature] == self.value
    if sum(equals)>self.min_samples_split and sum(~equals)>self.min_samples_split and not self.max_depth_reached():
      max_depth = None if self.max_depth is None else self.max_depth - 1
      self.equals_tree = DecisionTree(max_depth, self.min_samples_split).fit(X[equals], y[equals])
      self.not_equals_tree = DecisionTree(max_depth, self.min_samples_split).fit(X[~equals], y[~equals])
    else:
      self.answer = most_common(y)
    return self

  def predict(self, X):
    if hasattr(self, 'answer'):
      return np.array([self.answer] * len(X))
    else:
      equals = X[:,self.feature] == self.value
      return np.where(equals, self.equals_tree.predict(X), self.not_equals_tree.predict(X))

model = DecisionTree(None, 10)
model.fit(X, y)
y_pred = model.predict(X)
print(accuracy_score(y, y_pred))

0.9502314814814815


In [119]:
# print_tree(model)

In [120]:
scores = cross_validate(model, X, y, cv=StratifiedShuffleSplit(n_splits=5, random_state=42), scoring='accuracy')
print(scores['test_score'])
print(np.mean(scores['test_score']))

[0.95953757 0.93641618 0.93641618 0.94797688 0.91907514]
0.9398843930635838


In [121]:
model = DecisionTree()
scores = cross_validate(model, X, y, cv=StratifiedShuffleSplit(n_splits=5, random_state=42), scoring='accuracy')
print(scores['test_score'])
print(np.mean(scores['test_score']))

[0.98265896 0.97687861 0.97687861 0.94219653 0.93641618]
0.9630057803468208
