# Feature filtering based on Mutual Information for classification

In [31]:
# import libraries and data

import pandas as pd
import os
import numpy as np

# depending on the OS the path to the data file is different
if os.name == 'nt':
    data = pd.read_excel(r'..\data\nutrient-file-release2-jan22.xlsx', sheet_name='All solids & liquids per 100g')
elif os.name == 'posix':
    data = pd.read_excel(r'../data/nutrient-file-release2-jan22.xlsx', sheet_name='All solids & liquids per 100g')

# print first 5 results.
data.head()

Unnamed: 0,Public Food Key,Classification,Food Name,"Energy with dietary fibre, equated \n(kJ)","Energy, without dietary fibre, equated \n(kJ)",Moisture (water) \n(g),Protein \n(g),Nitrogen \n(g),"Fat, total \n(g)",Ash \n(g),...,Leucine \n(mg),Lysine \n(mg),Methionine \n(mg),Phenylalanine \n(mg),Proline \n(mg),Serine \n(mg),Threonine \n(mg),Tyrosine \n(mg),Tryptophan \n(mg),Valine \n(mg)
0,F002258,31302.0,"Cardamom seed, dried, ground",1236,1012,8.3,10.8,1.72,6.7,5.8,...,,,,,,,,,155,
1,F002893,31302.0,"Chilli (chili), dried, ground",1280,1002,10.8,13.4,2.14,14.3,11.8,...,,,,,,,,,69,
2,F002963,31302.0,"Cinnamon, dried, ground",1004,579,10.6,4.0,0.64,1.2,3.6,...,,,,,,,,,49,
3,F002970,31302.0,"Cloves, dried, ground",1389,1118,9.9,6.0,0.96,13.0,5.6,...,,,,,,,,,30,
4,F003190,31302.0,"Coriander seed, dried, ground",1344,1009,8.9,12.4,1.98,17.8,6.0,...,,,,,,,,,178,


In [32]:
# convert classification to food group defined as taking the slice of the string for first 3 characters
data['Classification'] = data['Classification'].astype(str).str.slice(0, 2)

# print the total number of classifications
print('Total number of classifications: ' + str(len(data['Classification'].unique())))

# count the number of each classification and take the top x
top_x = 5
top_x_classifications = data['Classification'].value_counts().head(top_x).index.tolist()
print('Top ' + str(top_x) + ' classifications: ' + str(top_x_classifications))

# filter the data to only include the top x classifications
data = data[data['Classification'].isin(top_x_classifications)]

Total number of classifications: 24
Top 5 classifications: ['18', '24', '12', '16', '15']


In [33]:
# deal with NaN values

# drop columns with x many NaN values
x = data.shape[0] * 0.1
data_without_nans = data.dropna(axis=1, thresh=x)

# print shapes of data
print(f"Original data: {data.shape}")
print(f"After dropping columns: {data_without_nans.shape}")

# # drop rows with NaN values
# data_without_nans = data_without_nans.dropna(axis=0)

# print shapes of data
print(f"After dropping rows: {data_without_nans.shape}")

# replace NaN values with 0
data_without_nans = data.fillna(0)

data_without_nans.head()

Original data: (1101, 293)
After dropping columns: (1101, 177)
After dropping rows: (1101, 177)


Unnamed: 0,Public Food Key,Classification,Food Name,"Energy with dietary fibre, equated \n(kJ)","Energy, without dietary fibre, equated \n(kJ)",Moisture (water) \n(g),Protein \n(g),Nitrogen \n(g),"Fat, total \n(g)",Ash \n(g),...,Leucine \n(mg),Lysine \n(mg),Methionine \n(mg),Phenylalanine \n(mg),Proline \n(mg),Serine \n(mg),Threonine \n(mg),Tyrosine \n(mg),Tryptophan \n(mg),Valine \n(mg)
27,F004220,12,"Gluten, from wheat (vital wheat gluten)",1507,1502,8.2,75.2,12.89,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,795,0.0
28,F008831,24,"Starch, potato",1332,1285,19.1,0.0,0.0,0.5,3.1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0
141,F001683,12,"Breadcrumbs, white",1598,1550,7.3,14.0,2.45,4.1,2.6,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,135,0.0
142,F001353,12,"Bread roll, from white flour",1044,1020,34.4,9.3,1.63,2.9,1.8,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,145,0.0
143,F001415,12,"Bread, wrap, white, commercial",1266,1234,31.0,7.8,1.36,8.4,2.2,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,79,0.0


In [34]:
# energy with dietary fibre is the target variable of mutual information calculation
from sklearn.feature_selection import mutual_info_regression

nutritional_values = data_without_nans.drop(['Public Food Key', 'Classification', 'Food Name'], axis=1)

# calculate mutual information
mi = mutual_info_regression(nutritional_values, nutritional_values['Energy with dietary fibre, equated \n(kJ)'])

In [35]:
# # classification is the target variable of mutual information calculation
# from sklearn.feature_selection import mutual_info_classif

# nutritional_values = data_without_nans.drop(['Public Food Key', 'Food Name'], axis=1)

# # calculate mutual information
# mi = mutual_info_classif(nutritional_values, nutritional_values['Classification'])

In [36]:
# show all mutual information values above threshold
THRESHOLD = 0.1
selected_features = {}
for i in range(len(mi)):
    if mi[i] >= THRESHOLD:
        selected_features[nutritional_values.columns[i]] = mi[i]
print("Number of features selected:", len(selected_features))

# use the selected features to create a new dataframe of only the selected features
data_with_selected_features = data[['Public Food Key', 'Classification', 'Food Name'] + list(selected_features.keys())[1:]]

# convert dictionary to dataframe
selected_features = pd.DataFrame.from_dict(selected_features, orient='index', columns=['Mutual Information'])
selected_features

Number of features selected: 123


Unnamed: 0,Mutual Information
"Energy with dietary fibre, equated \n(kJ)",5.592534
"Energy, without dietary fibre, equated \n(kJ)",4.031156
Moisture (water) \n(g),1.812182
Protein \n(g),0.827462
Nitrogen \n(g),0.823987
...,...
Serine \n(mg),0.141332
Threonine \n(mg),0.150554
Tyrosine \n(mg),0.167330
Tryptophan \n(mg),0.723824


In [37]:
# save selected features to csv
selected_features.to_csv('../data/selected-features.csv', sep=',', header=True, index=False)

# save data with selected features to csv
data_with_selected_features.to_csv('../data/data-with-selected-features.csv', sep=',', header=True, index=False)

---

Since the dataset is largely continuous, we need to turn features into discrete ones.

## Variable discretisation

There are different methods to discretise continuous variables:

- **Equal-width binning**: divides the scope of possible values into N bins of the same width.
- **Equal-frequency binning**: divides the scope of possible values into N bins, each of them containing approximately the same number of samples.
- **Domain knowledge binning**: divides the scope of possible values into bins according to the domain knowledge.
<!-- There are other methods too listed below.

- ChiMerge: merges the bins using the Chi2 test to evaluate the statistical dependence of the classes and the feature.
- Entropy-based binning: merges the bins using the entropy of the classes and the feature.
- K-means binning: merges the bins using the K-means algorithm.
- Gaussian mixture binning: merges the bins using a Gaussian Mixture Model.
- Quantile binning: merges the bins so that each bin contains the same number of samples.
- Uniform binning: merges the bins so that each bin contains the same width.
- Recursive partitioning: merges the bins using a decision tree.
- Discretisation using decision trees: merges the bins using a decision tree.
- Discretisation using clustering: merges the bins using a clustering algorithm.
- Discretisation using support vector machines: merges the bins using a support vector machine.
- Discretisation using linear models: merges the bins using a linear model.
- Discretisation using nearest neighbours: merges the bins using a nearest neighbours algorithm.
- Discretisation using kernel density estimation: merges the bins using a kernel density estimation.
- Discretisation using fuzzy logic: merges the bins using a fuzzy logic algorithm.
- Discretisation using genetic algorithms: merges the bins using a genetic algorithm.
- Discretisation using simulated annealing: merges the bins using a simulated annealing algorithm.
- Discretisation using a neural network: merges the bins using a neural network.
- Discretisation using a random forest: merges the bins using a random forest.
- Discretisation using a linear discriminant analysis: merges the bins using a linear discriminant analysis.
- Discretisation using a quadratic discriminant analysis: merges the bins using a quadratic discriminant analysis.
- Discretisation using a principal component analysis: merges the bins using a principal component analysis.
- Discretisation using a factor analysis: merges the bins using a factor analysis.
- Discretisation using a canonical correlation analysis: merges the bins using a canonical correlation analysis.
- Discretisation using a partial least squares regression: merges the bins using a partial least squares regression.
- Discretisation using a ridge regression: merges the bins using a ridge regression. -->

WARNING: The choice of bins will influence the results of the mutual information filter.

In [38]:
# # variable discretisation using pandas.qcut

# # add new column with discretised values
# data['Discretised Energy with dietary fibre, equated \n(kJ)'] = pd.cut(data['Energy with dietary fibre, equated \n(kJ)'], 20, labels=False)

# # print the first few rows of the data for the two columns
# data[['Energy with dietary fibre, equated \n(kJ)', 'Discretised Energy with dietary fibre, equated \n(kJ)']].head(10)

# # print the first few rows of the sorted data for the two columns
# data[['Energy with dietary fibre, equated \n(kJ)', 'Discretised Energy with dietary fibre, equated \n(kJ)']].sort_values(by='Energy with dietary fibre, equated \n(kJ)', ascending=False).head(10)

---

In [39]:
# # Discretise all columns of data



# ignored_columns = ['Public Food Key', 'Classification', 'Food Name']
# label = 'Classification' # label to test
# test_col = [] # names of columns
# data = data.fillna(0) # having values of NaN prevents calculation of MI scores.

# for nutrient in data.columns:
#     if nutrient in ignored_columns: 
#         continue # disregard first 3 columns ['Public Food Key', 'Classification', 'Food Name']
#     else:
#         test_col.append(nutrient) # for features below
#         data[nutrient] = pd.cut(data[nutrient], 20, labels=False) # issues with pd.qcut relating to size of bins, proceeded with pd.cut
#         # discretise each column so data is discrete and not continuous

# # as follows in Week 9 Workshop - Feature filtering based on Mutual Information for classification
# features = data[test_col]
# features = features.fillna(0)
# class_label = data[label]

# data.head()

In [40]:


# # Following in Week 9 Workshop - Feature filtering based on Mutual Information for classification
# filtered_features = []
# THRESHOLD = 0.2 # threshold value not fixed

# mi_arr = mutual_info_classif(X=features, y=class_label, discrete_features=False)


# for feature, mi in zip(features.columns, mi_arr):
#     print(f'MI value for feature "{feature}": {mi:.4f}')

#     if (mi >= THRESHOLD):
#         filtered_features.append(feature)

# print('\nFeature set after filtering with MI:', filtered_features)