# Import data
Import the full dataset.

In [86]:
import pandas as pd
import os

# depending on the OS the path to the data file is different
if os.name == 'nt':
    data = pd.read_excel(r'..\data\nutrient-file-release2-jan22.xlsx', sheet_name='All solids & liquids per 100g')
elif os.name == 'posix':
    data = pd.read_excel(r'../data/nutrient-file-release2-jan22.xlsx', sheet_name='All solids & liquids per 100g')

data.shape

(1616, 293)

---
# Preprocessing
1. Convert "Classification" column to its root food group.
2. Replace less common food groups with the "Miscellaneous" food group.

In [87]:
FIRST_TOP_X = 6
SECOND_TOP_X = 14
THIRD_TOP_X = 23
MISCELLANEOUS = '31'

# convert na to 0
data = data.fillna(MISCELLANEOUS)

# convert classification to food group defined as taking the slice of the string for first 3 characters
data['Classification'] = data['Classification'].astype(str).str.slice(0, 2)

def make_misc(TOP_X, MISCELLANEOUS):
    # copy data, to avoid changing the original df
    data_new = data.copy()

    # food_groups_to_replace_as_misc is defined as the value counts of the
    # classification column that are less than the last value count of the
    # top X classifications
    food_groups_to_replace_as_misc = data_new['Classification'].value_counts()[TOP_X:].index.tolist()
    print('\tFood groups to replace as misc: ' + str(food_groups_to_replace_as_misc))

    # replace row values in the classification column with the misc food group "31"
    data_new['Classification'] = data_new['Classification'].replace(food_groups_to_replace_as_misc, MISCELLANEOUS)

    # print the total number of classifications
    print('\tTotal number of classifications: ' + str(len(data_new['Classification'].unique())))

    # count the number of each classification and take the top x
    top_x_classifications = data_new['Classification'].value_counts().head(TOP_X).index.tolist()
    print('\tTop ' + str(min(TOP_X, len(data_new['Classification'].unique()))) + ' classifications: ' + str(sorted(top_x_classifications)) + '\n')

    # replace NaN values with 0
    data_without_nans = data_new.fillna(0)

    return data_without_nans, data_new


# get df for each version of food groups
print(f'For first version of food group:')
first_groups, first_data = make_misc(FIRST_TOP_X, MISCELLANEOUS)

print(f'\nFor second version of food group:')
second_groups, second_data = make_misc(SECOND_TOP_X, MISCELLANEOUS)

print(f'\nFor third version of food group:')
third_groups, third_data = make_misc(THIRD_TOP_X, MISCELLANEOUS)

For first version of food group:
	Food groups to replace as misc: ['19', '11', '22', '23', '14', '31', '28', '20', '29', '25', '27', '17', '26', '21', '34', '30', '32']
	Total number of classifications: 7
	Top 6 classifications: ['12', '15', '16', '18', '24', '31']


For second version of food group:
	Food groups to replace as misc: ['29', '25', '27', '17', '26', '21', '34', '30', '32']
	Total number of classifications: 14
	Top 14 classifications: ['11', '12', '13', '14', '15', '16', '18', '19', '20', '22', '23', '24', '28', '31']


For third version of food group:
	Food groups to replace as misc: []
	Total number of classifications: 23
	Top 23 classifications: ['11', '12', '13', '14', '15', '16', '17', '18', '19', '20', '21', '22', '23', '24', '25', '26', '27', '28', '29', '30', '31', '32', '34']



---
# Feature selection by mutual information
Select the features above a certain threshold of mutual information with the target variable.

### For regression
Used with `linear-regression.ipynb`

In [88]:
# energy with dietary fibre is the target variable of mutual information calculation
from sklearn.feature_selection import mutual_info_regression
def m_i_regress(data_without_nans):
    nutritional_values_regression = data_without_nans.drop(['Public Food Key', 'Classification', 'Food Name'], axis=1)

    # calculate mutual information
    mi_regression = mutual_info_regression(nutritional_values_regression, nutritional_values_regression['Energy with dietary fibre, equated \n(kJ)'])
    return mi_regression

# generate regression mutual information for each version of food groups
first_groups_reg = m_i_regress(first_groups)
second_groups_reg = m_i_regress(second_groups)
third_groups_reg = m_i_regress(third_groups)


### For classification
Used with `knn.ipynb`

In [89]:
# classification is the target variable of mutual information calculation
from sklearn.feature_selection import mutual_info_classif

def m_i_classif(data_without_nans):
    nutritional_values_classif = data_without_nans.drop(['Public Food Key', 'Food Name'], axis=1)

    # calculate mutual information
    mi_classif = mutual_info_classif(nutritional_values_classif, nutritional_values_classif['Classification'])
    return mi_classif, nutritional_values_classif

# generate classificaion mutual information for each version of food groups
first_groups_classif, first_nutr_vals = m_i_classif(first_groups)
second_groups_classif, second_nutr_vals = m_i_classif(second_groups)
third_groups_classif, third_nutr_vals = m_i_classif(third_groups)

## Save dataset with selected features

In [90]:
print(first_data[['Public Food Key', 'Classification', 'Food Name']])

     Public Food Key Classification   
0            F002258             31  \
1            F002893             31   
2            F002963             31   
3            F002970             31   
4            F003190             31   
...              ...            ...   
1611         F009774             24   
1612         F009773             24   
1613         F009766             24   
1614         F009765             24   
1615         F009764             24   

                                              Food Name  
0                          Cardamom seed, dried, ground  
1                         Chilli (chili), dried, ground  
2                               Cinnamon, dried, ground  
3                                 Cloves, dried, ground  
4                         Coriander seed, dried, ground  
...                                                 ...  
1611  Zucchini, green skin, fresh, unpeeled, fried, ...  
1612  Zucchini, green skin, fresh, unpeeled, boiled,...  
1613     

In [92]:
THRESHOLD = 0.2


def save_mutual_information_to_csv(mi, nutritional_values, new_classif_data, filename, threshold=THRESHOLD):
    # show all mutual information values above threshold
    selected_features = {}
    for i in range(len(mi)):
        if mi[i] >= THRESHOLD:
            selected_features[nutritional_values.columns[i]] = mi[i]
    print("\tNumber of features selected:", len(selected_features))

    # use the selected features to create a new dataframe of only the selected features
    data_with_selected_features = data[list(selected_features.keys())[1:]]

    classifs_for_variation = new_classif_data[['Public Food Key', 'Classification', 'Food Name']]

    concat_data = pd.concat([classifs_for_variation, data_with_selected_features], axis=1)

    # convert dictionary to dataframe
    selected_features = pd.DataFrame.from_dict(selected_features, orient='index', columns=['Mutual Information'])

    # save data with selected features to csv
    concat_data.to_csv(f'../data/generated/{filename}', sep=',', header=True, index=False)


# save information to be used for supervised learning model to csv
print(f'For first version of food groups, {FIRST_TOP_X} groups:')
save_mutual_information_to_csv(first_groups_classif, first_nutr_vals, first_data, 'preprocessed-data-classification-first.csv')

print(f'\nFor second version of food groups, {SECOND_TOP_X} groups:')
save_mutual_information_to_csv(second_groups_classif, second_nutr_vals, second_data, 'preprocessed-data-classification-second.csv')

print(f'\nFor third version of food groups, {THIRD_TOP_X} groups:')
save_mutual_information_to_csv(third_groups_classif, third_nutr_vals, third_data, 'preprocessed-data-classification-third.csv')

For first version of food groups, 6 groups:
	Number of features selected: 109

For second version of food groups, 14 groups:
	Number of features selected: 131

For third version of food groups, 23 groups:
	Number of features selected: 146
