In [None]:
#implementation of ID3 Algorithm for classification of dataset

#importing the libraries
import pandas as pd
import math
from collections import Counter

#reading the dataset (a CSV file hosted on GitHub) into a DataFrame
DATA_URL = 'https://raw.githubusercontent.com/darshanjoshi16/DataMiningPractical/main/Practical%209/data.csv'
df = pd.read_csv(DATA_URL)

#defining the function which will calculate the entropy using the formula sum(-p * log2(p))
def entropy(probs):
    """Return the Shannon entropy, in bits, of a set of probabilities.

    Args:
        probs: iterable of probabilities.

    Returns:
        The sum of ``-p * log2(p)`` over ``probs``. A probability of exactly
        zero contributes nothing (the usual ``0 * log(0) = 0`` convention)
        instead of making ``math.log`` raise ``ValueError``. An empty input
        gives 0.

    Raises:
        ValueError: if a probability is negative (raised by ``math.log``).
    """
    # Only exact zeros are skipped, so invalid negative inputs still fail loudly.
    return sum(-prob * math.log(prob, 2) for prob in probs if prob != 0)

#defining the function which will calculate the entropy of a list by counting its classes and turning the counts into probabilities
def entropy_of_list(a_list):
    """Return the entropy, in bits, of the class labels in ``a_list``.

    Also prints the number of instances, the smallest and largest class
    label (by ordinary comparison of the labels) and the probability of
    each of those two classes.

    Args:
        a_list: sized iterable of class labels (``len()`` is taken), for
            example a pandas Series column.

    Returns:
        ``entropy`` of the per-class relative frequencies, or 0.0 when
        ``a_list`` holds no labels.
    """
    cnt = Counter(x for x in a_list)
    num_instances = len(a_list)*1.0
    print("\n Number of Instances of the Current Sub Class is {0}:".format(num_instances ))

    if not cnt:
        # No labels at all: min()/max() below would raise, and there is
        # no uncertainty to measure.
        return 0.0

    probs = [count / num_instances for count in cnt.values()]
    first_class, last_class = min(cnt), max(cnt)
    print("\n Classes:", first_class, last_class)
    # Each probability is looked up from the class's own count. Pairing
    # min(cnt) with min(probs) reported the wrong value whenever the
    # alphabetically first class was the more frequent one.
    print(" \n Probabilities of Class {0} is {1}:".format(first_class, cnt[first_class] / num_instances))
    print(" \n Probabilities of Class {0} is {1}:".format(last_class, cnt[last_class] / num_instances))
    return entropy(probs)

#showing the target column, then its class statistics and its overall entropy
play_tennis_labels = df['PlayTennis']
print("\n  INPUT DATA SET FOR ENTROPY CALCULATION:\n", play_tennis_labels)

total_entropy = entropy_of_list(play_tennis_labels)

print("\n Total Entropy of PlayTennis Data Set:", total_entropy)


  INPUT DATA SET FOR ENTROPY CALCULATION:
 0      No
1      No
2     Yes
3     Yes
4     Yes
5      No
6     Yes
7      No
8     Yes
9     Yes
10    Yes
11    Yes
12    Yes
13     No
Name: PlayTennis, dtype: object

 Number of Instances of the Current Sub Class is 14.0:

 Classes: No Yes
 
 Probabilities of Class No is 0.35714285714285715:
 
 Probabilities of Class Yes is 0.6428571428571429:

 Total Entropy of PlayTennis Data Set: 0.9402859586706309


In [None]:
#defining the information gain function with dataframe,splitting attribute and target attribute as the input parameters
def information_gain(df, split_attribute_name, target_attribute_name, trace=0):
    """Return the information gain of splitting ``df`` on one attribute.

    Takes a DataFrame of attributes, and quantifies the entropy of a target
    attribute after performing a split along the values of another attribute.

    Args:
        df: DataFrame containing both columns.
        split_attribute_name: name of the column whose values define the groups.
        target_attribute_name: name of the column whose entropy is measured.
        trace: currently unused; kept so existing callers keep working.

    Returns:
        The entropy of the target column over all rows, minus the sum over
        the groups of (group entropy * share of all rows in that group).
    """
    print("Information Gain Calculation of ",split_attribute_name)

    #splitting the rows by the values of the splitting attribute
    df_split = df.groupby(split_attribute_name)
    nobs = len(df.index) * 1.0

    #for every group: the entropy of the target and the share of all rows that fall in the group
    df_agg_ent = df_split.agg({target_attribute_name : [entropy_of_list, lambda x: len(x)/nobs] })[target_attribute_name]

    df_agg_ent.columns = ['Entropy', 'PropObservations']

    # Calculate Information Gain:
    new_entropy = sum( df_agg_ent['Entropy'] * df_agg_ent['PropObservations'] )
    old_entropy = entropy_of_list(df[target_attribute_name])
    return old_entropy - new_entropy


#reporting the information gain of each candidate attribute; every message prefix is kept exactly as it was printed before
gain_reports = [
    ('Info-gain for Outlook is :', 'Outlook'),
    ('\n Info-gain for Humidity is: ', 'Humidity'),
    ('\n Info-gain for Wind is:', 'Wind'),
    ('\n Info-gain for Temperature is:', 'Temperature'),
]
for message_prefix, split_attr in gain_reports:
    gain = information_gain(df, split_attr, 'PlayTennis')
    print(message_prefix + str(gain), "\n")

Information Gain Calculation of  Outlook

 Number of Instances of the Current Sub Class is 4.0:

 Classes: Yes Yes
 
 Probabilities of Class Yes is 1.0:
 
 Probabilities of Class Yes is 1.0:

 Number of Instances of the Current Sub Class is 5.0:

 Classes: No Yes
 
 Probabilities of Class No is 0.4:
 
 Probabilities of Class Yes is 0.6:

 Number of Instances of the Current Sub Class is 5.0:

 Classes: No Yes
 
 Probabilities of Class No is 0.4:
 
 Probabilities of Class Yes is 0.6:

 Number of Instances of the Current Sub Class is 14.0:

 Classes: No Yes
 
 Probabilities of Class No is 0.35714285714285715:
 
 Probabilities of Class Yes is 0.6428571428571429:
Info-gain for Outlook is :0.2467498197744391 

Information Gain Calculation of  Humidity

 Number of Instances of the Current Sub Class is 7.0:

 Classes: No Yes
 
 Probabilities of Class No is 0.42857142857142855:
 
 Probabilities of Class Yes is 0.5714285714285714:

 Number of Instances of the Current Sub Class is 7.0:

 Classes:

In [None]:
#defining the id3 implementation function which takes the dataframe,target attribute,candidate attribute names and default class as parameters
def id3(df, target_attribute_name, attribute_names, default_class=None):
    """Build a decision tree for ``target_attribute_name`` with ID3.

    Args:
        df: DataFrame holding the target column and the attribute columns.
        target_attribute_name: name of the column to predict.
        attribute_names: list of column names that may still be split on.
        default_class: label returned when ``df`` has no rows.

    Returns:
        A class label when the node is a leaf, otherwise a nested dict of
        the form ``{best_attribute: {attribute_value: subtree, ...}}``.
    """
    cnt = Counter(x for x in df[target_attribute_name])

    #every row has the same class: the node is a leaf carrying that class
    if len(cnt) == 1:
        return next(iter(cnt))

    #no rows at all: fall back to the class handed down by the caller
    if df.empty:
        return default_class

    #most frequent class among the current rows; on equal counts the largest
    #label wins, which is what the earlier max(cnt.keys()) picked in a tie
    majority_class = max(cnt, key=lambda label: (cnt[label], label))

    #no attribute left to split on: answer with the majority of the rows
    #that actually reached this node
    if not attribute_names:
        return majority_class

    #choosing the attribute with the highest information gain (first one wins a tie)
    gainz = [information_gain(df, attr, target_attribute_name) for attr in attribute_names]
    best_attr = attribute_names[gainz.index(max(gainz))]

    #building one branch per observed value of the chosen attribute
    tree = {best_attr: {}}
    remaining_attribute_names = [i for i in attribute_names if i != best_attr]

    for attr_val, data_subset in df.groupby(best_attr):
        tree[best_attr][attr_val] = id3(data_subset,
                                        target_attribute_name,
                                        remaining_attribute_names,
                                        majority_class)
    return tree

In [None]:

#listing every column, then dropping the target so only the predicting attributes remain
attribute_names = df.columns.tolist()
print("List of Attributes:", attribute_names)
attribute_names.remove('PlayTennis')
print("Predicting Attributes:", attribute_names)

List of Attributes: ['PlayTennis', 'Outlook', 'Temperature', 'Humidity', 'Wind']
Predicting Attributes: ['Outlook', 'Temperature', 'Humidity', 'Wind']


In [None]:
#pprint pretty-prints nested Python data structures, which suits the nested dict returned by id3
from pprint import pprint

tree = id3(df, 'PlayTennis', attribute_names)
print("\n\nThe Resultant Decision Tree is :\n")
pprint(tree)

#the first key of the tree is the attribute chosen for the first split
attribute = next(iter(tree))
print("Best Attribute :\n", attribute)
print("Tree Keys:\n", tree[attribute].keys())

Information Gain Calculation of  Outlook

 Number of Instances of the Current Sub Class is 4.0:

 Classes: Yes Yes
 
 Probabilities of Class Yes is 1.0:
 
 Probabilities of Class Yes is 1.0:

 Number of Instances of the Current Sub Class is 5.0:

 Classes: No Yes
 
 Probabilities of Class No is 0.4:
 
 Probabilities of Class Yes is 0.6:

 Number of Instances of the Current Sub Class is 5.0:

 Classes: No Yes
 
 Probabilities of Class No is 0.4:
 
 Probabilities of Class Yes is 0.6:

 Number of Instances of the Current Sub Class is 14.0:

 Classes: No Yes
 
 Probabilities of Class No is 0.35714285714285715:
 
 Probabilities of Class Yes is 0.6428571428571429:
Information Gain Calculation of  Temperature

 Number of Instances of the Current Sub Class is 4.0:

 Classes: No Yes
 
 Probabilities of Class No is 0.25:
 
 Probabilities of Class Yes is 0.75:

 Number of Instances of the Current Sub Class is 4.0:

 Classes: No Yes
 
 Probabilities of Class No is 0.5:
 
 Probabilities of Class Ye

In [None]:
def classify(instance, tree, default=None):
    """Predict the class of ``instance`` by walking a decision tree.

    Args:
        instance: object indexable by attribute name (a DataFrame row or a
            dict, for example).
        tree: nested dict of the form ``{attribute: {value: subtree_or_label}}``,
            or a bare label when the whole tree is a single leaf.
        default: value returned when, at any depth, the tree has no branch
            for the instance's value of the attribute being tested.

    Returns:
        The label stored in the leaf that is reached, or ``default``.
    """
    #a tree that is not a dict is already a leaf, so it is the answer itself
    if not isinstance(tree, dict):
        return tree

    #print("Instance:",instance)
    attribute = next(iter(tree))
    print("Key:",tree.keys())
    print("Attribute:",attribute)

    branches = tree[attribute]
    value = instance[attribute]
    if value not in branches:
        return default

    result = branches[value]
    print("Instance Attribute:",value,"TreeKeys :",branches.keys())
    if isinstance(result, dict):
        #default is passed down so an unseen value deeper in the tree
        #falls back to it as well instead of yielding None
        return classify(instance, result, default)
    return result

In [None]:
# classify accepts a default argument: when the tree has no answer for a particular
# combination of attribute values, 'No' is used as the fallback guess
df['predicted'] = df.apply(classify, axis=1, args=(tree, 'No'))

print(df['predicted'])

n_correct = sum(df['PlayTennis'] == df['predicted'])
print('\n Accuracy is:\n' + str(n_correct / (1.0*len(df.index))))


df[['PlayTennis', 'predicted']]

Key: dict_keys(['Outlook'])
Attribute: Outlook
Instance Attribute: Sunny TreeKeys : dict_keys(['Overcast', 'Rain', 'Sunny'])
Key: dict_keys(['Humidity'])
Attribute: Humidity
Instance Attribute: High TreeKeys : dict_keys(['High', 'Normal'])
Key: dict_keys(['Outlook'])
Attribute: Outlook
Instance Attribute: Sunny TreeKeys : dict_keys(['Overcast', 'Rain', 'Sunny'])
Key: dict_keys(['Humidity'])
Attribute: Humidity
Instance Attribute: High TreeKeys : dict_keys(['High', 'Normal'])
Key: dict_keys(['Outlook'])
Attribute: Outlook
Instance Attribute: Overcast TreeKeys : dict_keys(['Overcast', 'Rain', 'Sunny'])
Key: dict_keys(['Outlook'])
Attribute: Outlook
Instance Attribute: Rain TreeKeys : dict_keys(['Overcast', 'Rain', 'Sunny'])
Key: dict_keys(['Wind'])
Attribute: Wind
Instance Attribute: Weak TreeKeys : dict_keys(['Strong', 'Weak'])
Key: dict_keys(['Outlook'])
Attribute: Outlook
Instance Attribute: Rain TreeKeys : dict_keys(['Overcast', 'Rain', 'Sunny'])
Key: dict_keys(['Wind'])
Attribute: W

Unnamed: 0,PlayTennis,predicted
0,No,No
1,No,No
2,Yes,Yes
3,Yes,Yes
4,Yes,Yes
5,No,No
6,Yes,Yes
7,No,No
8,Yes,Yes
9,Yes,Yes


In [None]:
# Hold out the last four rows for testing and train on everything before them.
# The slice starts at the first row; the earlier `1:-4` also dropped row 0.
training_data = df.iloc[:-4]       # all but last four instances
# .copy() gives an independent frame, so adding a column below does not
# trigger pandas' SettingWithCopyWarning.
test_data = df.iloc[-4:].copy()    # just the last four
train_tree = id3(training_data, 'PlayTennis', attribute_names)

# Classify the held-out rows with the tree built from the training rows only;
# 'Yes' is the fallback when the tree has no matching branch.
test_data['predicted2'] = test_data.apply(classify, axis=1, args=(train_tree, 'Yes'))


print ('\n\n Accuracy is : ' + str( sum(test_data['PlayTennis']==test_data['predicted2'] ) / (1.0*len(test_data.index)) ))

Information Gain Calculation of  Outlook

 Number of Instances of the Current Sub Class is 2.0:

 Classes: Yes Yes
 
 Probabilities of Class Yes is 1.0:
 
 Probabilities of Class Yes is 1.0:

 Number of Instances of the Current Sub Class is 4.0:

 Classes: No Yes
 
 Probabilities of Class No is 0.25:
 
 Probabilities of Class Yes is 0.75:

 Number of Instances of the Current Sub Class is 3.0:

 Classes: No Yes
 
 Probabilities of Class No is 0.3333333333333333:
 
 Probabilities of Class Yes is 0.6666666666666666:

 Number of Instances of the Current Sub Class is 9.0:

 Classes: No Yes
 
 Probabilities of Class No is 0.3333333333333333:
 
 Probabilities of Class Yes is 0.6666666666666666:
Information Gain Calculation of  Temperature

 Number of Instances of the Current Sub Class is 4.0:

 Classes: No Yes
 
 Probabilities of Class No is 0.25:
 
 Probabilities of Class Yes is 0.75:

 Number of Instances of the Current Sub Class is 2.0:

 Classes: No Yes
 
 Probabilities of Class No is 0.5

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
