# Import data
Import the full dataset.

In [None]:
import pandas as pd
import os

# build the path with os.path.join so the same code works on every OS;
# the previous nt/posix branches left `data` undefined on any other platform
data = pd.read_excel(
    os.path.join('..', 'data', 'nutrient-file-release2-jan22.xlsx'),
    sheet_name='All solids & liquids per 100g',
)

data.shape

---
# Preprocessing
1. Convert the "Classification" column to its root food group (the first two characters of the classification code).
2. Replace the less common food groups with the "Miscellaneous" food group (code "31").

In [None]:
TOP_X = 6
MISCELLANEOUS = '31'

# rows without a classification code are assigned to the miscellaneous food group;
# only this column receives the group code, so the other columns are never
# filled with the string '31'
has_classification = data['Classification'].notna()

# convert classification to food group, defined as the first 2 characters of the code
root_food_group = data['Classification'].astype(str).str.slice(0, 2)
data['Classification'] = root_food_group.where(has_classification, MISCELLANEOUS)

# convert the remaining na values to 0
data = data.fillna(0)

# value_counts() sorts by descending frequency, so everything from position
# TOP_X - 1 onwards is a less common food group; .iloc makes the positional
# slice explicit
group_counts = data['Classification'].value_counts()
food_groups_to_replace_as_misc = group_counts.iloc[TOP_X - 1:].index.tolist()
print('Food groups to replace as misc: ' + str(food_groups_to_replace_as_misc))

# replace row values in the classification column with the misc food group "31"
data['Classification'] = data['Classification'].replace(food_groups_to_replace_as_misc, MISCELLANEOUS)

# print the total number of classifications
number_of_classifications = len(data['Classification'].unique())
print('Total number of classifications: ' + str(number_of_classifications))

# count the number of each classification and take the top x
top_x_classifications = data['Classification'].value_counts().head(TOP_X).index.tolist()
print('Top ' + str(min(TOP_X, number_of_classifications)) + ' classifications: ' + str(sorted(top_x_classifications)))

In [None]:
# build a copy of the dataset in which every missing value is 0
data_without_nans = data.fillna(value=0)

---
# Feature selection by mutual information
Select the features whose mutual information with the target variable is at or above a given threshold.

### For regression
Used with `linear-regression.ipynb`

In [None]:
# energy with dietary fibre is the target variable of mutual information calculation
from sklearn.feature_selection import mutual_info_regression

nutritional_values_regression = data_without_nans.drop(columns=['Public Food Key', 'Classification', 'Food Name'])

# calculate mutual information; the estimator draws random numbers internally,
# so a fixed random_state keeps the scores (and the features selected from
# them) identical between runs
# NOTE(review): the target column is also part of the feature matrix, so one
# score is the target's mutual information with itself - confirm this is intended
mi_regression = mutual_info_regression(
    nutritional_values_regression,
    nutritional_values_regression['Energy with dietary fibre, equated \n(kJ)'],
    random_state=0,
)

### For classification
Used with `knn.ipynb`

In [None]:
# classification is the target variable of mutual information calculation
from sklearn.feature_selection import mutual_info_classif

nutritional_values_classif = data_without_nans.drop(columns=['Public Food Key', 'Food Name'])

# calculate mutual information; the estimator draws random numbers internally,
# so a fixed random_state keeps the scores (and the features selected from
# them) identical between runs
# NOTE(review): the target column is also part of the feature matrix, so one
# score is the target's mutual information with itself - confirm this is intended
mi_classif = mutual_info_classif(
    nutritional_values_classif,
    nutritional_values_classif['Classification'],
    random_state=0,
)

## Save dataset with selected features

In [None]:
THRESHOLD = 0.2


def save_mutual_information_to_csv(mi, nutritional_values, filename, threshold=THRESHOLD):
    """Save the dataset restricted to the features selected by mutual information.

    A feature is selected when its mutual information score is at or above
    ``threshold``. The csv file always contains the 'Public Food Key',
    'Classification' and 'Food Name' columns, followed by the selected
    features except the first one (see the review note in the body). Rows are
    read from the notebook-level ``data`` dataframe.

    Args:
        mi: mutual information scores, one per column of ``nutritional_values``
            and in the same order.
        nutritional_values: dataframe whose columns the scores in ``mi`` refer to.
        filename: name of the csv file written inside ``../data/generated/``.
        threshold: minimum score (inclusive) a feature needs to be selected.
    """
    # keep every feature whose score reaches the threshold passed by the caller
    # (previously the global THRESHOLD was compared, so the argument was ignored)
    selected_features = {
        column: score
        for column, score in zip(nutritional_values.columns, mi)
        if score >= threshold
    }
    print("Number of features selected:", len(selected_features))

    # NOTE(review): the first selected feature is skipped. Both callers in this
    # notebook pass the target column as part of nutritional_values, so this
    # presumably drops the target - it only does so when the target is the
    # first column AND scores above the threshold. Confirm the intent.
    feature_columns = list(selected_features)[1:]

    # use the selected features to create a new dataframe of only the selected features
    data_with_selected_features = data[['Public Food Key', 'Classification', 'Food Name'] + feature_columns]

    # save data with selected features to csv
    data_with_selected_features.to_csv(f'../data/generated/{filename}', sep=',', header=True, index=False)


# write one csv per task: first the regression dataset, then the classification dataset
for mi_scores, feature_frame, output_name in (
    (mi_regression, nutritional_values_regression, 'preprocessed-data-regression.csv'),
    (mi_classif, nutritional_values_classif, 'preprocessed-data-classification.csv'),
):
    save_mutual_information_to_csv(mi_scores, feature_frame, output_name)