<a href="https://colab.research.google.com/github/fboldt/aulasml/blob/master/aula6a_arvore_atributos_discretos.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install ucimlrepo



In [2]:
from ucimlrepo import fetch_ucirepo

# Fetch the dataset registered under id 19 in the UCI repository
# ("Car Evaluation", according to the metadata printed below).
car_evaluation = fetch_ucirepo(id=19)

# Features and targets converted to NumPy arrays; [:,0] keeps the first
# column of the targets table so that y is one-dimensional.
X = car_evaluation.data.features.to_numpy()
y = car_evaluation.data.targets.to_numpy()[:,0]

# Dataset-level metadata.
print(car_evaluation.metadata)

# Per-variable information.
print(car_evaluation.variables)


{'uci_id': 19, 'name': 'Car Evaluation', 'repository_url': 'https://archive.ics.uci.edu/dataset/19/car+evaluation', 'data_url': 'https://archive.ics.uci.edu/static/public/19/data.csv', 'abstract': 'Derived from simple hierarchical decision model, this database may be useful for testing constructive induction and structure discovery methods.', 'area': 'Other', 'tasks': ['Classification'], 'characteristics': ['Multivariate'], 'num_instances': 1728, 'num_features': 6, 'feature_types': ['Categorical'], 'demographics': [], 'target_col': ['class'], 'index_col': None, 'has_missing_values': 'no', 'missing_values_symbol': None, 'year_of_dataset_creation': 1988, 'last_updated': 'Thu Aug 10 2023', 'dataset_doi': '10.24432/C5JP48', 'creators': ['Marko Bohanec'], 'intro_paper': {'title': 'Knowledge acquisition and explanation for multi-attribute decision making', 'authors': 'M. Bohanec, V. Rajkovič', 'published_in': '8th Intl Workshop on Expert Systems and their Applications, Avignon, France', 'yea

In [3]:
# Distinct class labels and total number of instances.
set(y), len(y)

({'acc', 'good', 'unacc', 'vgood'}, 1728)

In [4]:
# Count the distinct values of every feature column and multiply the counts,
# which gives the number of possible attribute combinations.
# NOTE(review): the recorded product equals the number of instances, which
# suggests the dataset enumerates the whole attribute space; confirm.
combinacoes = 1
for i in range(X.shape[1]):
  valores = set(X[:,i])
  combinacoes *= len(valores)
  print(valores)
print(combinacoes)

{'low', 'vhigh', 'med', 'high'}
{'low', 'vhigh', 'med', 'high'}
{'4', '5more', '3', '2'}
{'4', '2', 'more'}
{'med', 'big', 'small'}
{'high', 'med', 'low'}
1728


In [5]:
from sklearn.base import BaseEstimator, ClassifierMixin
import numpy as np
from collections import Counter
from sklearn.model_selection import cross_validate

def maisFrequente(y):
  """Return the most frequent value in ``y``.

  Parameters
  ----------
  y : array-like
    Labels of any shape. NumPy arrays as well as plain sequences
    (lists, tuples) are accepted.

  Returns
  -------
  The value with the highest count. When several values share the highest
  count, the one encountered first wins.

  Raises
  ------
  IndexError
    If ``y`` has no elements.
  """
  # np.asarray lets plain sequences through; .flat visits every element
  # whatever the shape of the array.
  return Counter(np.asarray(y).flat).most_common(1)[0][0]

# Mixins go to the left of BaseEstimator, as the scikit-learn developer guide
# requires, so that the classifier behaviour takes part in the MRO.
class ZeroR(ClassifierMixin, BaseEstimator):
  """Baseline classifier that always predicts the most frequent training label."""
  def fit(self, X, y):
    """Memorise the most frequent label of ``y``; ``X`` is ignored.

    Returns ``self`` so that calls can be chained, following the
    scikit-learn estimator convention.
    """
    y = np.asarray(y)
    # Keep the label dtype so predictions are neither truncated to a fixed
    # string width nor converted to text when labels are numeric.
    self.dtype_ = y.dtype
    self.resposta = maisFrequente(y)
    return self
  def predict(self, X):
    """Return one copy of the memorised label for each row of ``X``."""
    y = np.empty((X.shape[0]), dtype=self.dtype_)
    y[:] = self.resposta
    return y

# Baseline: cross-validate ZeroR on the whole dataset with the default settings.
results = cross_validate(ZeroR(), X, y)
# Per-fold test scores followed by their mean.
results['test_score'], np.mean(results['test_score'])

(array([0.69942197, 0.69942197, 0.69942197, 0.70144928, 0.70144928]),
 0.7002328893356791)

In [6]:
def impureza(y): #Gini
  """Gini impurity of a collection of labels.

  Computes ``1 - sum(p_k ** 2)``, where ``p_k`` is the relative frequency of
  each distinct label in ``y``.

  Parameters
  ----------
  y : array-like
    One-dimensional collection of labels.

  Returns
  -------
  float
    0.0 when all labels are equal (or when ``y`` is empty) and larger values
    as the labels become more mixed.
  """
  y = np.asarray(y)
  # An empty partition carries no impurity; returning early also avoids a
  # division by zero below.
  if y.size == 0:
    return 0.0
  # A single vectorised pass yields the count of every distinct label.
  _, contagens = np.unique(y, return_counts=True)
  probabilidades = contagens / y.size
  return 1.0 - np.sum(probabilidades**2)

# Gini impurity of the complete label vector.
impureza(y[:])

0.457283763074417

In [7]:
def impurezaValor(x, y, valor):
  """Weighted impurity of splitting on the test ``x == valor``.

  The labels are divided into those whose attribute equals ``valor`` and the
  remaining ones; the impurity of each side is weighted by the fraction of
  samples that falls on it.

  Parameters
  ----------
  x : array-like
    Attribute values, one per sample.
  y : array-like
    Labels, aligned with ``x``.
  valor
    Attribute value that defines the split.

  Returns
  -------
  float
    Sum of the two partition impurities, each multiplied by its proportion.

  Raises
  ------
  ZeroDivisionError
    If ``y`` is empty.
  """
  x = np.asarray(x)
  y = np.asarray(y)
  iguais = x == valor
  total = len(y)
  # Count the mask once, vectorised; the other side is the complement.
  quantidadeIguais = int(iguais.sum())
  quantidadeDiferentes = total - quantidadeIguais
  proporcaoIguais = quantidadeIguais/total
  proporcaoDiferentes = quantidadeDiferentes/total
  impurezaIguais = impureza(y[iguais])
  impurezaDiferentes = impureza(y[~iguais])
  return proporcaoIguais*impurezaIguais + proporcaoDiferentes*impurezaDiferentes

# Weighted impurity of splitting the data on whether column 3 equals '2'.
impurezaValor(X[:,3], y, '2')

0.38615712609310704

In [8]:
def impurezaMinima(X, y):
  """Find the (feature, value) equality test with the lowest weighted impurity.

  Every distinct value of every column of ``X`` is a candidate; candidates are
  enumerated by column index and, within a column, in sorted value order.
  When several candidates share the lowest impurity, the first one in that
  order is returned.

  Returns
  -------
  tuple
    ``(impurity, feature index, value)`` of the selected candidate.
  """
  candidatos = [
    [coluna, valor]
    for coluna in range(X.shape[1])
    for valor in sorted(set(X[:,coluna]))
  ]
  pontuacoes = np.array([
    impurezaValor(X[:,coluna], y, valor)
    for coluna, valor in candidatos
  ])
  # argmin returns the first position that holds the minimum.
  melhor = np.argmin(pontuacoes)
  caracteristica, valor = candidatos[melhor]
  return pontuacoes[melhor], caracteristica, valor
# Best (impurity, feature index, value) split for the whole dataset.
impurezaMinima(X, y)

(0.38615712609310704, 3, '2')

In [9]:
# Mixins go to the left of BaseEstimator, as the scikit-learn developer guide
# requires, so that the classifier behaviour takes part in the MRO.
class Arvore(ClassifierMixin, BaseEstimator):
  """Binary decision tree for discrete attributes.

  An internal node tests ``X[:, caracteristica] == valor``: rows that pass
  are handled by the ``iguais`` subtree and the others by ``diferentes``.
  A leaf stores the most frequent label of its training rows in ``resposta``.
  """
  def fit(self, X, y):
    """Grow the tree recursively from ``X`` (2-D) and ``y`` (1-D); returns ``self``.

    A node becomes a leaf when all of its labels are equal or when the
    selected test leaves one side without rows.
    """
    X = np.asarray(X)
    y = np.asarray(y)
    # Forget the node state of a previous fit: predict() identifies leaves by
    # the presence of `resposta`, so a stale value would turn a refitted
    # internal node into a leaf.
    for atributo in ('resposta', 'iguais', 'diferentes'):
      vars(self).pop(atributo, None)
    # Keep the label dtype so predictions are neither truncated to a fixed
    # string width nor converted to text when labels are numeric.
    self.dtype_ = y.dtype
    self.impureza, self.caracteristica, self.valor = impurezaMinima(X, y)
    iguais = X[:,self.caracteristica] == self.valor
    # Splitting a node whose labels already agree cannot change any
    # prediction, so it only costs time and memory.
    puro = (y == y[0]).all()
    separa = iguais.any() and not iguais.all()
    if separa and not puro:
      self.iguais = Arvore()
      self.iguais.fit(X[iguais,:], y[iguais])
      self.diferentes = Arvore()
      self.diferentes.fit(X[~iguais,:], y[~iguais])
    else:
      self.resposta = maisFrequente(y)
    return self
  def predict(self, X):
    """Return one predicted label for each row of ``X``.

    Values not seen during training follow the ``diferentes`` branch,
    because they fail every equality test.
    """
    X = np.asarray(X)
    y = np.empty((X.shape[0]), dtype=self.dtype_)
    if hasattr(self, 'resposta'):
      y[:] = self.resposta
    else:
      iguais = X[:, self.caracteristica] == self.valor
      y[iguais] = self.iguais.predict(X[iguais,:])
      y[~iguais] = self.diferentes.predict(X[~iguais,:])
    return y

# Cross-validate the tree on the whole dataset with the default settings.
results = cross_validate(Arvore(), X, y)

# Per-fold test scores followed by their mean.
results['test_score'], np.mean(results['test_score'])

(array([0.62716763, 0.73121387, 0.75144509, 0.75362319, 0.8057971 ]),
 0.7338493758900897)

In [10]:
# Sanity check: fit on the whole dataset and predict the same rows.
modelo = Arvore()
modelo.fit(X, y)
y_pred = modelo.predict(X)
# Fraction of matching labels; the array's own mean() avoids the Python-level
# loop that the builtin sum() performs over a NumPy array.
(y==y_pred).mean()

1.0