
You will be given a toy dataset called 'Car Evaluation Data Set' (see: http://archive.ics.uci.edu/ml/datasets/Car+Evaluation for details).
You are not required to, but advised to test your code with the toy dataset, or any other dataset that contains categorical variables.

The given dataset contains six descriptive features and a target variable. Each of those are ordinal scale, categorical variables. The name of the target feature is 'evaluation'. 




In [6]:
%matplotlib inline
import pandas as pd
import math
import numpy as np
import matplotlib
import matplotlib.pyplot as plt

### Read the dataset

In [7]:
edf = pd.read_csv('careval.csv')
edf.head()

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,evaluation
0,vhigh,vhigh,2,2,small,low,unacc
1,vhigh,vhigh,2,2,small,med,unacc
2,vhigh,vhigh,2,2,small,high,unacc
3,vhigh,vhigh,2,2,med,low,unacc
4,vhigh,vhigh,2,2,med,med,unacc


In [8]:
edf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1728 entries, 0 to 1727
Data columns (total 7 columns):
buying        1728 non-null object
maint         1728 non-null object
doors         1728 non-null object
persons       1728 non-null object
lug_boot      1728 non-null object
safety        1728 non-null object
evaluation    1728 non-null object
dtypes: object(7)
memory usage: 94.6+ KB


You will create a method called IUFS (impurity-based univariate feature selection), which will select the most informative features with a univariate feature selection schema. This feature selection method will take the dataset, name of the target variable, number of features to be selected (k) and the measure of impurity as an input, and will output the names of k best features based on the information gain. You are expected to implement information gain, entropy and Gini index functions. Note here that this will be a univariate selection, which means that you need to test the features individually.

In [9]:
def entropy(target_col):
    p_x = target_col.value_counts(normalize = True)
    return -np.sum( p_x * np.log2(p_x))
    
entropy(edf['doors'])

2.0

In [10]:
def entropy(target_col):
    #count of the unique elements in the specified column
    elements,counts = np.unique(target_col,return_counts = True)
    #calculatign the entropy using the formula and looping thru the elements to perform operation
    entropy = np.sum([(-counts[i]/np.sum(counts))*np.log2(counts[i]/np.sum(counts)) for i in range(len(elements))])
    return entropy
pass
entropy(edf['doors'])

2.0

In [11]:
def gini_index(feature, dataset):
    # count all samples at split point
    n_instances = float(sum([len(element) for element in feature]))
    # sum weighted Gini index for each faeture
    gini = 0.0
    for element in feature:
        size = float(len(element))
# check if we are dividing by zero in case we have a empty column
        if size == 0:
            continue
        score = 0.0
        # score the group based on the score for each class
        for row in dataset:
            p = [row[-1] for row in element].count(row) / size
            score += p * p
# weight the group score by its relative size
        gini = np.sum ( (1.0 - score) * (size / n_instances) )
    return gini

gini_index(edf['lug_boot'], edf ) 

0.0004734848484848485

In [13]:
def gini(feature, dataset):
    """Calculates the gini index of a feature in a given dataset.
    
    Parameters
    ----------
    feature: str
        name of the feature
    dataset: pd.DataFrame
        dataframe for the dataset
    Returns
    -------
    float
        gini index for the feature in the dataset
    """
    vc = dataset[feature].value_counts(normalize=True, sort=False)
    return 1.0 - np.power(vc, 2.0).sum()
    
    
gini('evaluation', edf) 
gini('buying', edf) 

0.75

In [120]:
def InfoGain(data,split_col,target_col="class"):
       
    #entropy of the entire dataset
    total_entropy = entropy(data[target_col])
    
    
    #Calculate the values and the corresponding counts for the split attribute 
    values,counts= np.unique(data[split_col],return_counts=True)
    
    #Calculate the weighted entropy
    Weighted_Entropy = np.sum([(counts[i]/np.sum(counts))*entropy(data.where(data[split_col]==values[i]).dropna()[target_col]) for i in range(len(values))])
    
    #evaluation of the information gain
    Information_Gain = total_entropy - Weighted_Entropy
    return Information_Gain

InfoGain(edf,'buying', 'evaluation')

0.09644896916961376

In [None]:
def IUFS(target, dataset, k, measure='entropy'):
    """Finds k most informative features in the given dataset based on the target variable
        using information gain with the selected measure.
        
    Parameters
    ----------
    target: str
        name of the target variable
    dataset: pd.DataFrame
        dataframe for the dataset
    k: int
        number of features to return, must be less than or equal to number of descriptive features in dataset.
        in other words, 0 < k < len(dataset.columns).
    measure: str, 'entropy' or 'gini'
        measure of impurity
    Returns
    -------
    list
        returns a list of k feature names, selected based on univariate selection schema
    """
    ranking = {}
    for feature in dataset.columns:
        if feature==target:
            continue
        IG_f = IG(feature, target, edf, measure)
        ranking[feature]=IG_f
    
    return sorted(ranking, key=ranking.get, reverse=True)[:k]

IUFS('evaluation', edf, 2, measure='entropy')

### Bonus
Improve the IUFS by including an option for gain ratio. Gain ratio is an alternative to information gain and can be used with either of the Gini index or entropy measures.  

In [None]:
def GR(feature, target, dataset, measure):
    """Calculates the gain ratio of a feature for a given target variable and a dataset.
    
    Parameters
    ----------
    feature: str
        name of the feature
    target: str
        name of the target variable
    dataset: pd.DataFrame
        dataframe for the dataset
    measure: str ('entropy' or 'gini')
        measure of impurity to be used
    Returns
    -------
    float
        gain ratio for the feature in the dataset for a given target variable
    """
    ##your implementation goes here
    pass


# GR('buying','evaluation', edf, 'gini') 

In [None]:
def IUFS2(target, dataset, k, measure='entropy', gain='IG'):
    """Finds k most informative features in the given dataset based on the target variable
        using information gain with the selected measure.
        
    Parameters
    ----------
    target: str
        name of the target variable
    dataset: pd.DataFrame
        dataframe for the dataset
    k: int
        number of features to return, must be less than or equal to number of descriptive features in dataset.
        in other words, 0 < k < len(dataset.columns).
    measure: str, 'entropy' or 'gini'
        measure of impurity
    gain: str, 'IG' or 'GR'
        feature selection metric ('IG' for information gain, 'GR' for gain ratio)
    Returns
    -------
    list
        returns a list of k feature names, selected based on univariate selection schema
    """
    ##your implementation goes here
    pass

# IUFS2('evaluation', edf, 2, measure='gini', gain='GR')