Q1

In [31]:
import numpy as np
import math
from scipy.stats import entropy

def entropy(target_col):
    """
    Calculate the Shannon entropy (in bits) of a column of labels.

    Parameters
    ----------
    target_col : array-like
        The target column; any sequence accepted by ``np.unique``.

    Returns
    -------
    float
        ``-sum(p * log2(p))`` where ``p`` is the relative frequency of each
        distinct value in ``target_col``. An empty column yields 0.0.

    Notes
    -----
    This function has the same name as ``scipy.stats.entropy``, which is
    imported earlier in this file; this definition replaces that binding.
    """
    _, counts = np.unique(target_col, return_counts=True)

    # No observations means no uncertainty; also avoids dividing by zero.
    if counts.size == 0:
        return 0.0

    # Every count is >= 1, so each probability is > 0 and log2 is finite.
    probabilities = counts / counts.sum()
    return -np.sum(probabilities * np.log2(probabilities))

def InfoGain(data, split_attribute_name, target_name="result"):
    """
    Calculate the information gain of splitting a dataset on one feature.

    Parameters
    ----------
    data : pandas.DataFrame
        The dataset holding both the feature column and the target column.
    split_attribute_name : str
        Name of the feature for which the information gain is calculated.
    target_name : str, optional
        Name of the target column. The default is 'result'.

    Returns
    -------
    float
        Entropy of the whole target column minus the weighted average of the
        target entropy within each distinct value of the split feature, where
        each weight is that value's share of the rows.
    """
    target = data[target_name]
    split_column = data[split_attribute_name]

    # Entropy of the target before any split
    total_entropy = entropy(target)

    # Distinct feature values and how many rows carry each of them
    vals, counts = np.unique(split_column, return_counts=True)
    n_rows = counts.sum()

    # Pick each partition's target values with a boolean mask. Using
    # data.where(mask).dropna() instead would also discard rows that have a
    # missing value in any unrelated column, so the partition would no longer
    # match the row count used as its weight.
    weighted_entropy = np.sum([
        (count / n_rows) * entropy(target[split_column == val])
        for val, count in zip(vals, counts)
    ])

    return total_entropy - weighted_entropy

# Example usage
import pandas as pd

# Example DataFrame
data = pd.DataFrame({
    'status': ['N', 'Y', 'Y', 'N', 'N', 'Y', 'Y', 'N', 'N', 'Y'],
    'background': ['M', 'M', 'C', 'O', 'O', 'M', 'C', 'M', 'C', 'M'],
    'online': ['Y', 'N', 'Y', 'Y', 'N', 'Y', 'Y', 'Y', 'N', 'N'],
    'result': ['P', 'P', 'F', 'P', 'F', 'F', 'P', 'P', 'F', 'P'],
})

# Calculate and report the Information Gain of each feature against 'result'
for feature in ('status', 'background', 'online'):
    info_gain = InfoGain(data, feature, 'result')
    print(f"Information Gain for '{feature}': {info_gain}")


Information Gain for 'status': 0.0
Information Gain for 'background': 0.13449779679464058
Information Gain for 'online': 0.01997309402197489
