In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from math import log
from collections import Counter
from sklearn.model_selection import train_test_split

In [2]:
def preprocess(dataset):
    noCol = len(dataset.columns)-1
    for column in dataset:
        col = dataset[column]
        if (col.name == 'Class'):
            break
        else:
            name = "new" + col.name
            data = col.values
            temp = pd.cut(data, bins = 5)
            dataset[name] = temp
        
    dataset.drop (columns = dataset.columns[:noCol], axis = 1, inplace = True)


In [3]:
class Node:
       
  def __init__(self, label):
 
    self.label = label   
    self.attribute = None  
    self.values = []  
    self.children = {}  
    
    self.pruned = False  
    self.instances_labeled = []
     
    self.pAttribute = None
    self.pValue = None
 

In [4]:
def most_informative_attribute(instances):

    chosen = None
    max_ratio = -1000
 
    attributes = [key for key, value in instances[0].items()]

    attributes.remove('Class')
 
    for attribute in attributes:
        gain = gain_ratio(instances, attribute)
 
        if gain > max_ratio:
            max_ratio = gain
            chosen = attribute
 
    return chosen

In [5]:
def mode_class(instances):
 
    classes = [] 
 
    for instance in instances:
        classes.append(instance[0])
 
    return Counter(classes).most_common(1)[0][0]

In [6]:
def prior_entropy(instances):
 
    classes = []  
 
    for inst in instances:
        classes.append(inst[0])
    cnt = Counter(classes)
 
    if len(cnt) == 1:
        return 0
    else:
        entropy = 0
        for c, count_of_c in cnt.items():
            prob = count_of_c / len(classes)
            entropy += prob * (log(prob, 2))
        return -entropy

In [7]:
def entropy(instances, attribute, value):
    
  classes = [] 
  for inst in instances:
      if inst[attribute] == value:
          classes.append(inst[0])
  cnt = Counter(classes)

  if len(cnt) == 1:
      return 0
  else:
      entropy = 0
      for c, count_of_c in cnt.items():
          prob = count_of_c / len(classes)
          entropy += prob * (log(prob, 2))
      return -entropy

In [8]:
def gain_ratio(instances, attribute):
      
  priorentropy = prior_entropy(instances)

  values = []

  for inst in instances:
      values.append(inst[attribute])
  cnt = Counter(values) 
  
  remaining = 0

  split = 0

  for value, value_count in cnt.items():
      prob = value_count/len(values)
      remaining += (prob * entropy(instances, attribute, value))
      split += prob * (log(prob, 2))

  information_gain = priorentropy - remaining

  split = -split

  gainratio = None

  if split != 0:
      gainratio = information_gain / split
  else:
      gainratio = -1000

  return gainratio

In [9]:
def accuracy(trained_tree, test_instances):
    right_pred = 0

    for test_instance in test_instances:
        if predict(trained_tree, test_instance) == test_instance['Class']:
            right_pred += 1

    return right_pred / len(test_instances)

In [10]:
def predict(node, test_instance):

    if len(node.children) == 0:
        return node.label
    else:
        
        value = test_instance[node.attribute]
 
        if value in node.children and node.children[value].pruned == False:
            return predict(node.children[value], test_instance)
 
        else:
            instances = []
            for attr_value in node.values:
                instances += node.children[attr_value].instances_labeled
            return mode_class(instances)

In [11]:
TREE = None
def prune(node, val_instances):
    global TREE
    TREE = node
 
    def prune_node(node, val_instances):
        if len(node.children) == 0:
            accuracy_before = accuracy(TREE, val_instances)
            node.pruned = True
 
            if accuracy_before >= accuracy(TREE, val_instances):
                node.pruned = False
            return
 
        for value, child in node.children.items():
            prune_node(child, val_instances)
 
        accuracy_before = accuracy(TREE, val_instances)
        node.pruned = True
 
        if accuracy_before >= accuracy(TREE, val_instances):
            node.pruned = False
 
    prune_node(TREE, val_instances)

In [12]:
def ID3(instances, default):

    classes = []
    
    if len(instances) == 0:
        return Node(default)
 
    for inst in instances:
        classes.append(inst[0])
 
    if len(Counter(classes)) == 1 or len(classes) == 1:
        tree = Node(mode_class(instances))
        return tree
    else:
        best = most_informative_attribute(instances)
        tree = Node(mode_class(instances))
 
        tree.attribute = best
 
        best_values = []
 
        for inst in instances:
            try:
                best_values.append(inst[best])
            except:
                no_best = True

        tree.attribute_values = list(set(best_values))
 
        for best_attr_value_i in tree.attribute_values:
 
            i = []

            for inst in instances:

                if inst[best] == best_attr_value_i:
                    i.append(inst) 
 
            subtree = ID3(i, mode_class(instances))
 
            subtree.instances_labeled = i
            
            subtree.pAttribute = best 
            subtree.pValue = best_attr_value_i 
 
            tree.children[best_attr_value_i] = subtree
 
        return tree


In [13]:
irisdata = pd.read_csv('iris.data', sep=",", names=['SepalLength', 'SepalWidth', 'PetalLength', 'PetalWidth', 'Class'])
winedata = pd.read_csv('wine.data', sep=",", names=['Alcohol','MalicAcid','Ash','AlcalinityOfAsh','Magnesium','Total phenols','Flavanoids','NonflavanoidPhenols','Proanthocyanins','ColorIntensity','Hue','OD280/OD315OfDilutedWines','Proline'  ])

In [14]:
preprocess(irisdata)


validation_set = irisdata[: 1*len(irisdata)//10]
print(irisdata)

irisTrain, irisTest= train_test_split(irisdata, test_size = 0.33)
print (irisTrain)

pruned_accuracies_avgs = []
unpruned_accuracies_avgs = []

upper_limit = (round(len(irisTrain) * 0.9 * 0.8) - round(len(irisTrain) * 0.9 * 0.8) % 10) + 10
print(str(upper_limit)) 

if upper_limit <= 10:
    upper_limit = 50
    
default = mode_class(irisTrain)

print(len(irisTrain))
step_size = len(irisTrain)//20
print(step_size)
 
for length in range(10, upper_limit, step_size):
    print('Number of Training Instances:', length)

    pruned_accuracies = []
    unpruned_accuracies = []

    tree = ID3(irisTrain, default)
    ID3.prune(tree, validation_set)
    acc = ID3.accuracy(tree, irisTest)
    pruned_accuracies.append(acc)

    tree = ID3(irisTrain, default)
    acc = ID3.accuracy(tree, irisTest)
    unpruned_accuracies.append(acc) 
        
    avg_pruned_accuracies = sum(pruned_accuracies) / len(pruned_accuracies)
    avg_unpruned_accuracies = sum(unpruned_accuracies) / len(unpruned_accuracies)

    print("Classification Accuracy for Pruned Tree:", avg_pruned_accuracies) 
    print("Classification Accuracy for Unpruned Tree:", avg_unpruned_accuracies)
    print()

    pruned_accuracies_avgs.append(avg_pruned_accuracies)
    unpruned_accuracies_avgs.append(avg_unpruned_accuracies) 

              Class newSepalLength newSepalWidth newPetalLength  \
0       Iris-setosa   (5.02, 5.74]  (3.44, 3.92]  (0.994, 2.18]   
1       Iris-setosa  (4.296, 5.02]  (2.96, 3.44]  (0.994, 2.18]   
2       Iris-setosa  (4.296, 5.02]  (2.96, 3.44]  (0.994, 2.18]   
3       Iris-setosa  (4.296, 5.02]  (2.96, 3.44]  (0.994, 2.18]   
4       Iris-setosa  (4.296, 5.02]  (3.44, 3.92]  (0.994, 2.18]   
..              ...            ...           ...            ...   
145  Iris-virginica   (6.46, 7.18]  (2.96, 3.44]   (4.54, 5.72]   
146  Iris-virginica   (5.74, 6.46]  (2.48, 2.96]   (4.54, 5.72]   
147  Iris-virginica   (6.46, 7.18]  (2.96, 3.44]   (4.54, 5.72]   
148  Iris-virginica   (5.74, 6.46]  (2.96, 3.44]   (4.54, 5.72]   
149  Iris-virginica   (5.74, 6.46]  (2.96, 3.44]   (4.54, 5.72]   

      newPetalWidth  
0    (0.0976, 0.58]  
1    (0.0976, 0.58]  
2    (0.0976, 0.58]  
3    (0.0976, 0.58]  
4    (0.0976, 0.58]  
..              ...  
145     (2.02, 2.5]  
146    (1.54, 2.02]

KeyError: 0

In [None]:
# preprocess(irisdata)
# preprocess(winedata)

# irisTrain, irisTest, wineTrain, wineTest = train_test_split(irisdata, winedata, test_size = 0.33)

# pruned_accuracies_avgs = []
# unpruned_accuracies_avgs = []

TypeError: ufunc 'isinf' not supported for the input types, and the inputs could not be safely coerced to any supported types according to the casting rule ''safe''