In [2]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
from pandas import DataFrame 
df_tennis = pd.read_csv('tennis.csv')
print("\n Given Play Tennis Data Set:\n\n",df_tennis)
df_tennis.keys()[0]

def entropy(probs):  
    import math
    return sum( [-prob*math.log(prob, 2) for prob in probs] )

def entropy_of_list(a_list):  
    from collections import Counter
    cnt = Counter(x for x in a_list)   
    num_instances = len(a_list)*1.0  
    print("\n Number of Instances of the Current Sub Class is {0}:".format(num_instances ))
    probs = [x / num_instances for x in cnt.values()]
    print("\n Classes:",min(cnt),max(cnt))
    print(" \n Probabilities of Class {0} is {1}:".format(min(cnt),min(probs)))
    print(" \n Probabilities of Class {0} is {1}:".format(max(cnt),max(probs)))
    return entropy(probs) 
print("\n  INPUT DATA SET FOR ENTROPY CALCULATION:\n", df_tennis['PlayTennis'])
total_entropy = entropy_of_list(df_tennis['PlayTennis'])
print("\n Total Entropy of PlayTennis Data Set:",total_entropy)

def information_gain(df, split_attribute_name, target_attribute_name, trace=0):
    print("Information Gain Calculation of ",split_attribute_name)
    df_split = df.groupby(split_attribute_name)
    for name,group in df_split:
            print("Name:\n",name)
            print("Group:\n",group)
    nobs = len(df.index) * 1.0
    print("NOBS",nobs)
    df_agg_ent = df_split.agg({target_attribute_name : [entropy_of_list, lambda x: len(x)/nobs] })[target_attribute_name]
    print([target_attribute_name])
    print(" Entropy List ",entropy_of_list)
    print("DFAGGENT",df_agg_ent)
    df_agg_ent.columns = ['Entropy', 'PropObservations']
    if trace: 
        print(df_agg_ent)
    new_entropy = sum( df_agg_ent['Entropy'] * df_agg_ent['PropObservations'] )
    old_entropy = entropy_of_list(df[target_attribute_name])
    return old_entropy - new_entropy
print('Info-gain for Outlook is :'+str( information_gain(df_tennis, 'Outlook', 'PlayTennis')),"\n")
print('\n Info-gain for Humidity is: ' + str( information_gain(df_tennis, 'Humidity', 'PlayTennis')),"\n")
print('\n Info-gain for Wind is:' + str( information_gain(df_tennis, 'Wind', 'PlayTennis')),"\n")
print('\n Info-gain for Temperature is:' + str( information_gain(df_tennis, 'Temperature','PlayTennis')),"\n")

def id3(df, target_attribute_name, attribute_names, default_class=None):
    from collections import Counter
    cnt = Counter(x for x in df[target_attribute_name])
    if len(cnt) == 1:
        return next(iter(cnt)) 
    elif df.empty or (not attribute_names):
        return default_class  
    else:
        default_class = max(cnt.keys()) 
        gainz = [information_gain(df, attr, target_attribute_name) for attr in attribute_names] #
        index_of_max = gainz.index(max(gainz))
        best_attr = attribute_names[index_of_max]
        tree = {best_attr:{}}  
        remaining_attribute_names = [i for i in attribute_names if i != best_attr]
        for attr_val, data_subset in df.groupby(best_attr):
            subtree = id3(data_subset,
                        target_attribute_name,
                        remaining_attribute_names,
                        default_class)
            tree[best_attr][attr_val] = subtree
        return tree
attribute_names = list(df_tennis.columns)
print("List of Attributes:", attribute_names) 
attribute_names.remove('PlayTennis')  
print("Predicting Attributes:", attribute_names)
from pprint import pprint
tree = id3(df_tennis,'PlayTennis',attribute_names)
print("\n\nThe Resultant Decision Tree is :\n")
pprint(tree)
attribute = next(iter(tree))
print("Best Attribute :\n",attribute)
print("Tree Keys:\n",tree[attribute].keys())

def classify(instance, tree, default=None): 
    print("Instance:",instance)
    attribute = next(iter(tree))       
    print("Attribute:",attribute) 
    if instance[attribute] in tree[attribute].keys():  
        result = tree[attribute][instance[attribute]]
        print("Instance Attribute:",instance[attribute],"TreeKeys :",tree[attribute].keys())
        if isinstance(result, dict):
            return classify(instance, result)
        else:
            return result 
    else:
        return default
df_tennis['predicted'] = df_tennis.apply(classify, axis=1, args=(tree,'No') )  
df_tennis[['PlayTennis', 'predicted']]
training_data = df_tennis.iloc[1:-4] 
test_data  = df_tennis.iloc[-4:] 
train_tree = id3(training_data, 'PlayTennis', attribute_names)

test_data['predicted2'] = test_data.apply( 
                                          classify, 
                                          axis=1, 
                                          args=(train_tree,'Yes') )


print ('\n\n Accuracy is : ' + str( sum(test_data['PlayTennis']==test_data['predicted2'] ) / (1.0*len(test_data.index)) ))


 Given Play Tennis Data Set:

      Outlook Temperature Humidity    Wind PlayTennis
0      Sunny         Hot     High    Weak         No
1      Sunny         Hot     High  Strong         No
2   Overcast         Hot     High    Weak        Yes
3       Rain        Mild     High    Weak        Yes
4       Rain        Cool   Normal    Weak        Yes
5       Rain        Cool   Normal  Strong         No
6   Overcast        Cool   Normal  Strong        Yes
7      Sunny        Mild     High    Weak         No
8      Sunny        Cool   Normal    Weak        Yes
9       Rain        Mild   Normal    Weak        Yes
10     Sunny        Mild   Normal  Strong        Yes
11  Overcast        Mild     High  Strong        Yes
12  Overcast         Hot   Normal    Weak        Yes
13      Rain        Mild     High  Strong         No

  INPUT DATA SET FOR ENTROPY CALCULATION:
 0      No
1      No
2     Yes
3     Yes
4     Yes
5      No
6     Yes
7      No
8     Yes
9     Yes
10    Yes
11    Yes
12    Yes


 Classes: No Yes
 
 Probabilities of Class No is 0.35714285714285715:
 
 Probabilities of Class Yes is 0.6428571428571429:
Information Gain Calculation of  Humidity
Name:
 High
Group:
      Outlook Temperature Humidity    Wind PlayTennis
0      Sunny         Hot     High    Weak         No
1      Sunny         Hot     High  Strong         No
2   Overcast         Hot     High    Weak        Yes
3       Rain        Mild     High    Weak        Yes
7      Sunny        Mild     High    Weak         No
11  Overcast        Mild     High  Strong        Yes
13      Rain        Mild     High  Strong         No
Name:
 Normal
Group:
      Outlook Temperature Humidity    Wind PlayTennis
4       Rain        Cool   Normal    Weak        Yes
5       Rain        Cool   Normal  Strong         No
6   Overcast        Cool   Normal  Strong        Yes
8      Sunny        Cool   Normal    Weak        Yes
9       Rain        Mild   Normal    Weak        Yes
10     Sunny        Mild   Normal  Strong        Y

Attribute: Wind
Instance Attribute: Strong TreeKeys : dict_keys(['Strong', 'Weak'])
Information Gain Calculation of  Outlook
Name:
 Overcast
Group:
     Outlook Temperature Humidity    Wind PlayTennis predicted
2  Overcast         Hot     High    Weak        Yes       Yes
6  Overcast        Cool   Normal  Strong        Yes       Yes
Name:
 Rain
Group:
   Outlook Temperature Humidity    Wind PlayTennis predicted
3    Rain        Mild     High    Weak        Yes       Yes
4    Rain        Cool   Normal    Weak        Yes       Yes
5    Rain        Cool   Normal  Strong         No        No
9    Rain        Mild   Normal    Weak        Yes       Yes
Name:
 Sunny
Group:
   Outlook Temperature Humidity    Wind PlayTennis predicted
1   Sunny         Hot     High  Strong         No        No
7   Sunny        Mild     High    Weak         No        No
8   Sunny        Cool   Normal    Weak        Yes       Yes
NOBS 9.0

 Number of Instances of the Current Sub Class is 2.0:

 Classes: Yes Yes
 

Weak                0.0        0.75

 Number of Instances of the Current Sub Class is 4.0:

 Classes: No Yes
 
 Probabilities of Class No is 0.25:
 
 Probabilities of Class Yes is 0.75:
Information Gain Calculation of  Temperature
Name:
 Cool
Group:
   Outlook Temperature Humidity  Wind PlayTennis predicted
8   Sunny        Cool   Normal  Weak        Yes       Yes
Name:
 Hot
Group:
   Outlook Temperature Humidity    Wind PlayTennis predicted
1   Sunny         Hot     High  Strong         No        No
Name:
 Mild
Group:
   Outlook Temperature Humidity  Wind PlayTennis predicted
7   Sunny        Mild     High  Weak         No        No
NOBS 3.0

 Number of Instances of the Current Sub Class is 1.0:

 Classes: Yes Yes
 
 Probabilities of Class Yes is 1.0:
 
 Probabilities of Class Yes is 1.0:

 Number of Instances of the Current Sub Class is 1.0:

 Classes: No No
 
 Probabilities of Class No is 1.0:
 
 Probabilities of Class No is 1.0:

 Number of Instances of the Current Sub Class is 1.0