<a href="https://colab.research.google.com/github/fboldt/postre/blob/main/aula6a_%C3%81rvores_de_decis%C3%A3o_atributos_discretos.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Árvore de decisão para atributos discretos

In [1]:
!pip install ucimlrepo

Collecting ucimlrepo
  Downloading ucimlrepo-0.0.3-py3-none-any.whl (7.0 kB)
Installing collected packages: ucimlrepo
Successfully installed ucimlrepo-0.0.3


In [3]:
from ucimlrepo import fetch_ucirepo

# Download the Car Evaluation dataset (UCI repository id 19).
car_evaluation = fetch_ucirepo(id=19)
dados = car_evaluation.data

# Work on plain NumPy arrays: X keeps every feature column, y keeps only the
# first (and, per the metadata printed below, only) target column.
X = dados.features.to_numpy()
y = dados.targets.to_numpy()[:, 0]

# Dataset description followed by the per-variable table.
print(car_evaluation.metadata)
print(car_evaluation.variables)

{'uci_id': 19, 'name': 'Car Evaluation', 'repository_url': 'https://archive.ics.uci.edu/dataset/19/car+evaluation', 'data_url': 'https://archive.ics.uci.edu/static/public/19/data.csv', 'abstract': 'Derived from simple hierarchical decision model, this database may be useful for testing constructive induction and structure discovery methods.', 'area': 'Other', 'tasks': ['Classification'], 'characteristics': ['Multivariate'], 'num_instances': 1728, 'num_features': 6, 'feature_types': ['Categorical'], 'demographics': [], 'target_col': ['class'], 'index_col': None, 'has_missing_values': 'no', 'missing_values_symbol': None, 'year_of_dataset_creation': 1988, 'last_updated': 'Thu Aug 10 2023', 'dataset_doi': '10.24432/C5JP48', 'creators': ['Marko Bohanec'], 'intro_paper': {'title': 'Knowledge acquisition and explanation for multi-attribute decision making', 'authors': 'M. Bohanec, V. Rajkovič', 'published_in': '8th Intl Workshop on Expert Systems and their Applications, Avignon, France', 'yea

In [4]:
# Distinct class labels present in y, and the number of samples.
set(y), len(y)

({'acc', 'good', 'unacc', 'vgood'}, 1728)

In [6]:
# Show the distinct values of every feature column and multiply their counts
# to get the number of possible attribute combinations.
combinacoes = 1
for coluna in X.T:  # rows of the transpose are the feature columns of X
  valores = set(coluna)
  print(valores)
  combinacoes *= len(valores)
print(combinacoes)

{'high', 'vhigh', 'low', 'med'}
{'high', 'vhigh', 'low', 'med'}
{'4', '5more', '3', '2'}
{'more', '4', '2'}
{'big', 'small', 'med'}
{'high', 'low', 'med'}
1728


In [7]:
from sklearn.base import BaseEstimator, ClassifierMixin
from collections import Counter
import numpy as np
from sklearn.model_selection import cross_validate

def maisFrequente(y):
  """Return the most frequent label in ``y``.

  Args:
    y: Array-like of hashable labels (NumPy array of any shape, list or
      tuple). It is flattened before counting.

  Returns:
    The label with the highest count. When several labels share the highest
    count, the one met first while scanning ``y`` wins, because
    ``Counter.most_common`` keeps first-encountered order among equal counts.

  Raises:
    IndexError: If ``y`` has no elements.
  """
  # np.asarray lets plain Python sequences through; ``.flat`` alone exists
  # only on ndarrays, so lists and tuples used to raise AttributeError.
  rotulos = np.asarray(y).flat
  return Counter(rotulos).most_common(1)[0][0]

class ZeroR(BaseEstimator, ClassifierMixin):
  """Baseline classifier that always answers the majority training label."""

  def fit(self, X, y):
    """Store the most frequent label of ``y`` in ``self.resposta``.

    Args:
      X: Training features; not used by this model.
      y: Training labels.

    Returns:
      self, so calls can be chained.
    """
    self.resposta = maisFrequente(y)
    return self

  def predict(self, X, y=None):
    """Predict the stored majority label for every row of ``X``.

    Args:
      X: Array with a ``shape`` attribute; only ``X.shape[0]`` is read.
      y: Ignored; kept for signature compatibility.

    Returns:
      1-D array of length ``X.shape[0]`` filled with ``self.resposta``.
    """
    # np.full takes its dtype from the stored label. The previous fixed
    # '<U5' buffer silently cut labels longer than five characters and
    # turned non-string labels into strings.
    return np.full(X.shape[0], self.resposta)

# Cross-validate the baseline, then show the per-fold test scores and their mean.
scores = cross_validate(ZeroR(), X, y)
pontuacoesTeste = scores['test_score']
pontuacoesTeste, np.mean(pontuacoesTeste)

(array([0.69942197, 0.69942197, 0.69942197, 0.70144928, 0.70144928]),
 0.7002328893356791)

In [8]:
# fit() returns the estimator itself, so creation and training are chained.
modelo = ZeroR().fit(X, y)
# The label this baseline will answer for every sample.
modelo.resposta

'unacc'

In [9]:
# Accuracy of the baseline on the very data it was trained on.
modelo = ZeroR().fit(X, y)
ypred = modelo.predict(X)
# Mean of the boolean match mask == fraction of correct predictions.
np.mean(y == ypred)

0.7002314814814815

In [18]:
def impureza(y):
  """Gini impurity of a collection of labels: ``1 - sum_k p_k ** 2``.

  ``p_k`` is the fraction of elements of ``y`` equal to label ``k``.

  Args:
    y: Array-like of labels (NumPy array, list or tuple). It is flattened
      before counting.

  Returns:
    The impurity: 0 when every element carries the same label, growing
    towards 1 as the labels spread over more classes. An empty ``y`` yields 1,
    because the sum over zero classes is 0.
  """
  rotulos = np.asarray(y).ravel()
  # A single vectorised pass yields the count of each distinct label (in
  # sorted label order) instead of one interpreted ``sum(y == k)`` per label.
  _, contagens = np.unique(rotulos, return_counts=True)
  probabilidades = contagens / len(rotulos)
  # Builtin sum keeps the left-to-right accumulation over the sorted labels.
  result = 1 - sum(probabilidades ** 2)
  return result

# Gini impurity of the complete label vector, before any split.
impureza(y[:])

0.457283763074417

In [21]:
def impurezaValor(x, y, valor):
  """Weighted impurity of splitting the samples by the test ``x == valor``.

  The samples are divided into the group where ``x == valor`` and the group
  where it does not; the impurity of each group (see ``impureza``) is weighted
  by the fraction of samples that fell into it.

  Args:
    x: 1-D array-like with one attribute value per sample.
    y: 1-D array-like with the label of each sample, aligned with ``x``.
    valor: Attribute value that defines the split.

  Returns:
    ``p_eq * impureza(y[eq]) + p_ne * impureza(y[~eq])``.

  Raises:
    ZeroDivisionError: If ``y`` is empty.
  """
  x = np.asarray(x)
  y = np.asarray(y)
  iguais = x == valor
  total = len(y)
  # count_nonzero counts the True entries in one vectorised call; the builtin
  # ``sum`` walked the mask element by element, twice, for every candidate
  # split evaluated while growing the tree.
  quantidadeIguais = np.count_nonzero(iguais)
  proporcaoIguais = quantidadeIguais / total
  proporcaoDiferentes = (total - quantidadeIguais) / total
  impurezaIguais = impureza(y[iguais])
  impurezaDiferentes = impureza(y[~iguais])
  impurezaTotal = proporcaoIguais*impurezaIguais + proporcaoDiferentes*impurezaDiferentes
  return impurezaTotal

# Weighted impurity when the samples are split by "column 3 equals '2'" versus the rest.
impurezaValor(X[:,3], y, '2')

0.38615712609310704

In [22]:
def impurezaMinima(X, y):
  """Find the equality test (feature, value) with the lowest split impurity.

  Every distinct value of every column of ``X`` is tried as a candidate test
  ``X[:, feature] == value`` and scored with ``impurezaValor``.

  Args:
    X: 2-D array, one row per sample and one column per feature.
    y: Labels aligned with the rows of ``X``.

  Returns:
    Tuple ``(impurity, feature_index, value)`` of the best candidate. Among
    equally good candidates the first one wins, candidates being ordered by
    feature index and then by sorted value.
  """
  # Candidate tests, enumerated column by column with values in sorted order.
  candidatos = [
    [coluna, valor]
    for coluna in range(X.shape[1])
    for valor in sorted(set(X[:, coluna]))
  ]
  impurezas = np.array([
    impurezaValor(X[:, coluna], y, valor) for coluna, valor in candidatos
  ])
  melhor = np.argmin(impurezas)  # index of the first minimum
  caracteristica, valor = candidatos[melhor]
  return impurezas[melhor], caracteristica, valor

# Best single split over the whole dataset: (impurity, feature index, value).
impurezaMinima(X, y)

(0.38615712609310704, 3, '2')

In [26]:
class Arvore(BaseEstimator, ClassifierMixin):
  """Binary decision tree for discrete attributes.

  An internal node tests ``X[:, caracteristica] == valor``: matching rows go
  to the ``iguais`` subtree and the other rows to the ``diferentes`` subtree.
  A leaf has no subtrees and stores the majority label of its training rows
  in ``resposta``.
  """

  def fit(self, X, y):
    """Grow the tree recursively from the training data.

    Args:
      X: 2-D array-like of discrete feature values, one row per sample.
      y: 1-D array-like with one label per row of ``X``.

    Returns:
      self, so calls can be chained.
    """
    X = np.asarray(X)
    y = np.asarray(y)
    # Forget the node state of any previous fit. predict() recognises a leaf
    # by the presence of ``resposta``, so a leftover attribute would make a
    # refitted instance keep answering from the old tree.
    for atributo in ('resposta', 'iguais', 'diferentes'):
      self.__dict__.pop(atributo, None)
    # Distinct labels reaching this node. Their dtype sizes the prediction
    # buffer, so labels are not cut by a fixed string width.
    rotulos = np.array(list(set(y.flat)))
    self.dtypeRotulos = rotulos.dtype
    self.impureza, self.caracteristica, self.valor = impurezaMinima(X, y)
    iguais = X[:,self.caracteristica] == self.valor
    # The split is only useful when both sides receive at least one row.
    divisaoUtil = iguais.any() and not iguais.all()
    # A node whose rows all share one label becomes a leaf immediately:
    # every leaf grown below it would predict that same label anyway.
    if len(rotulos) > 1 and divisaoUtil:
      self.iguais = Arvore()
      self.iguais.fit(X[iguais,:], y[iguais])
      self.diferentes = Arvore()
      self.diferentes.fit(X[~iguais,:], y[~iguais])
    else:
      self.resposta = maisFrequente(y)
    return self

  def predict(self, X, y=None):
    """Predict one label per row of ``X`` by walking down the tree.

    Args:
      X: 2-D array-like with the same column layout used in ``fit``.
      y: Ignored; kept for signature compatibility.

    Returns:
      1-D array with ``X.shape[0]`` predicted labels.
    """
    X = np.asarray(X)
    if hasattr(self, 'resposta'):
      # Leaf: the same answer for every row.
      return np.full(X.shape[0], self.resposta, dtype=self.dtypeRotulos)
    previsoes = np.empty((X.shape[0]), dtype=self.dtypeRotulos)
    iguais = X[:,self.caracteristica] == self.valor
    # Route each row to the subtree matching the outcome of the node test.
    previsoes[iguais] = self.iguais.predict(X[iguais,:])
    previsoes[~iguais] = self.diferentes.predict(X[~iguais,:])
    return previsoes

# Accuracy of the tree on the very data it was trained on.
modelo = Arvore().fit(X, y)  # fit() returns the estimator itself
ypred = modelo.predict(X)
# Mean of the boolean match mask == fraction of correct predictions.
np.mean(y == ypred)

1.0

In [27]:
# Cross-validate the tree, then show the per-fold test scores and their mean.
scores = cross_validate(Arvore(), X, y)
pontuacoesTeste = scores['test_score']
pontuacoesTeste, np.mean(pontuacoesTeste)

(array([0.62716763, 0.73121387, 0.75144509, 0.75362319, 0.8057971 ]),
 0.7338493758900897)