# Feature filtering based on Mutual Information for classification

In [11]:
# import libraries and data

import pandas as pd
import os
import numpy as np
from sklearn.feature_selection import mutual_info_classif

# Build the workbook path from its components so the same code runs on every
# platform. The previous os.name branching repeated the read call and left
# `data` undefined whenever os.name was neither 'nt' nor 'posix'.
data_path = os.path.join('..', 'data', 'nutrient-file-release2-jan22.xlsx')
data = pd.read_excel(data_path, sheet_name='All solids & liquids per 100g')

# show the first 5 rows.
data.head()

Unnamed: 0,Public Food Key,Classification,Food Name,"Energy with dietary fibre, equated \n(kJ)","Energy, without dietary fibre, equated \n(kJ)",Moisture (water) \n(g),Protein \n(g),Nitrogen \n(g),"Fat, total \n(g)",Ash \n(g),...,Leucine \n(mg),Lysine \n(mg),Methionine \n(mg),Phenylalanine \n(mg),Proline \n(mg),Serine \n(mg),Threonine \n(mg),Tyrosine \n(mg),Tryptophan \n(mg),Valine \n(mg)
0,F002258,31302.0,"Cardamom seed, dried, ground",1236,1012,8.3,10.8,1.72,6.7,5.8,...,,,,,,,,,155,
1,F002893,31302.0,"Chilli (chili), dried, ground",1280,1002,10.8,13.4,2.14,14.3,11.8,...,,,,,,,,,69,
2,F002963,31302.0,"Cinnamon, dried, ground",1004,579,10.6,4.0,0.64,1.2,3.6,...,,,,,,,,,49,
3,F002970,31302.0,"Cloves, dried, ground",1389,1118,9.9,6.0,0.96,13.0,5.6,...,,,,,,,,,30,
4,F003190,31302.0,"Coriander seed, dried, ground",1344,1009,8.9,12.4,1.98,17.8,6.0,...,,,,,,,,,178,


Most of the nutrient columns in this dataset are continuous, so we discretise them before computing mutual information with `discrete_features=True`.

## Variable discretisation

There are different methods to discretise continuous variables:

- **Equal-width binning**: divides the range of possible values into N bins of the same width.
- **Equal-frequency binning**: divides the range of possible values into N bins, each containing approximately the same number of samples.
- **Domain knowledge binning**: divides the range of possible values into bins chosen according to domain knowledge.
<!-- There are other methods too listed below.

- ChiMerge: merges the bins using the Chi2 test to evaluate the statistical dependence of the classes and the feature.
- Entropy-based binning: merges the bins using the entropy of the classes and the feature.
- K-means binning: merges the bins using the K-means algorithm.
- Gaussian mixture binning: merges the bins using a Gaussian Mixture Model.
- Quantile binning: merges the bins so that each bin contains the same number of samples.
- Uniform binning: merges the bins so that each bin contains the same width.
- Recursive partitioning: merges the bins using a decision tree.
- Discretisation using decision trees: merges the bins using a decision tree.
- Discretisation using clustering: merges the bins using a clustering algorithm.
- Discretisation using support vector machines: merges the bins using a support vector machine.
- Discretisation using linear models: merges the bins using a linear model.
- Discretisation using nearest neighbours: merges the bins using a nearest neighbours algorithm.
- Discretisation using kernel density estimation: merges the bins using a kernel density estimation.
- Discretisation using fuzzy logic: merges the bins using a fuzzy logic algorithm.
- Discretisation using genetic algorithms: merges the bins using a genetic algorithm.
- Discretisation using simulated annealing: merges the bins using a simulated annealing algorithm.
- Discretisation using a neural network: merges the bins using a neural network.
- Discretisation using a random forest: merges the bins using a random forest.
- Discretisation using a linear discriminant analysis: merges the bins using a linear discriminant analysis.
- Discretisation using a quadratic discriminant analysis: merges the bins using a quadratic discriminant analysis.
- Discretisation using a principal component analysis: merges the bins using a principal component analysis.
- Discretisation using a factor analysis: merges the bins using a factor analysis.
- Discretisation using a canonical correlation analysis: merges the bins using a canonical correlation analysis.
- Discretisation using a partial least squares regression: merges the bins using a partial least squares regression.
- Discretisation using a ridge regression: merges the bins using a ridge regression. -->

**Warning:** the choice of binning method and number of bins influences the mutual information scores, and therefore which features pass the filter.

In [12]:
# variable discretisation using pandas.cut (equal-width binning)

energy_col = 'Energy with dietary fibre, equated \n(kJ)'
discretised_col = 'Discretised Energy with dietary fibre, equated \n(kJ)'

# add new column with discretised values: 20 equal-width bins; labels=False
# makes pd.cut return the integer bin index rather than an interval label
data[discretised_col] = pd.cut(data[energy_col], 20, labels=False)

# show the first few rows of the data for the two columns; only a cell's last
# expression is rendered automatically, so this table needs an explicit
# display() (made available by the IPython kernel) or it is silently discarded
display(data[[energy_col, discretised_col]].head(10))

# show the ten highest-energy rows for the same two columns
data[[energy_col, discretised_col]].sort_values(by=energy_col, ascending=False).head(10)

Unnamed: 0,"Energy with dietary fibre, equated \n(kJ)","Discretised Energy with dietary fibre, equated \n(kJ)"
535,3703,19
536,3702,19
506,3701,19
520,3700,19
534,3700,19
518,3700,19
519,3700,19
533,3700,19
523,3700,19
532,3700,19


In [13]:
# Discretise all columns of data

N_BINS = 20 # number of equal-width bins used for every nutrient column

ignored_columns = ['Public Food Key', 'Classification', 'Food Name']
label = 'Classification' # label to test

# Helper column created by the single-column discretisation demo. It is a
# binned copy of an energy column that is already a feature, so it is kept out
# of the feature set to avoid scoring the same nutrient twice.
derived_columns = ['Discretised Energy with dietary fibre, equated \n(kJ)']

data = data.fillna(0) # having values of NaN prevents calculation of MI scores.

# names of the columns used as features: everything except the identifier /
# label columns and the derived demo column
test_col = [
    nutrient for nutrient in data.columns
    if nutrient not in ignored_columns and nutrient not in derived_columns
]

for nutrient in test_col:
    # discretise each column so data is discrete and not continuous
    # (issues with pd.qcut relating to size of bins, proceeded with pd.cut)
    data[nutrient] = pd.cut(data[nutrient], N_BINS, labels=False)

# as follows in Week 9 Workshop - Feature filtering based on Mutual Information for classification
# NaNs were filled before binning, so no second fillna is needed on the features
features = data[test_col]
class_label = data[label]

data.head()

Unnamed: 0,Public Food Key,Classification,Food Name,"Energy with dietary fibre, equated \n(kJ)","Energy, without dietary fibre, equated \n(kJ)",Moisture (water) \n(g),Protein \n(g),Nitrogen \n(g),"Fat, total \n(g)",Ash \n(g),...,Lysine \n(mg),Methionine \n(mg),Phenylalanine \n(mg),Proline \n(mg),Serine \n(mg),Threonine \n(mg),Tyrosine \n(mg),Tryptophan \n(mg),Valine \n(mg),"Discretised Energy with dietary fibre, equated \n(kJ)"
0,F002258,31302.0,"Cardamom seed, dried, ground",6,5,1,2,2,1,1,...,0,0,0,0,0,0,0,3,0,6
1,F002893,31302.0,"Chilli (chili), dried, ground",6,5,2,3,2,2,2,...,0,0,0,0,0,0,0,1,0,6
2,F002963,31302.0,"Cinnamon, dried, ground",5,3,2,0,0,0,0,...,0,0,0,0,0,0,0,1,0,5
3,F002970,31302.0,"Cloves, dried, ground",7,6,1,1,1,2,1,...,0,0,0,0,0,0,0,0,0,7
4,F003190,31302.0,"Coriander seed, dried, ground",7,5,1,2,2,3,1,...,0,0,0,0,0,0,0,4,0,7


In [14]:


# Mutual-information feature filter, following the Week 9 Workshop -
# Feature filtering based on Mutual Information for classification
THRESHOLD = 0.2 # threshold value not fixed

# one MI score per column of `features`, in column order
mi_arr = mutual_info_classif(X=features, y=class_label, discrete_features=True)

# report the score of every feature
for feature, mi in zip(features.columns, mi_arr):
    print(f'MI value for feature "{feature}": {mi:.4f}')

# keep only the features whose score reaches the threshold
filtered_features = [
    name for name, score in zip(features.columns, mi_arr) if score >= THRESHOLD
]

print('\nFeature set after filtering with MI:', filtered_features)

MI value for feature "Energy with dietary fibre, equated 
(kJ)": 1.5167
MI value for feature "Energy, without dietary fibre, equated 
(kJ)": 1.5030
MI value for feature "Moisture (water) 
(g)": 1.6912
MI value for feature "Protein 
(g)": 1.1684
MI value for feature "Nitrogen 
(g)": 1.1189
MI value for feature "Fat, total 
(g)": 0.8504
MI value for feature "Ash 
(g)": 0.1743
MI value for feature "Total dietary fibre 
(g)": 0.4059
MI value for feature "Alcohol 
(g)": 0.1091
MI value for feature "Fructose 
(g)": 0.3985
MI value for feature "Glucose 
(g)": 0.4286
MI value for feature "Sucrose
(g)": 0.4097
MI value for feature "Maltose 
(g)": 0.3054
MI value for feature "Lactose 
(g)": 0.1810
MI value for feature "Galactose 
(g)": 0.0293
MI value for feature "Maltotrios 
(g)": 0.0308
MI value for feature "Total sugars (g)": 0.8123
MI value for feature "Added sugars (g)": 0.5332
MI value for feature "Free sugars 
(g)": 0.5571
MI value for feature "Starch 
(g)": 0.9812
MI value for feature "D