# Impute missing values

This notebook analyzes the `swiss_food_composition_database_proc` dataset to check for outliers. A per-category mean imputation is defined and tested.

## Load and preprocess the data

In [101]:
import pandas as pd
import numpy as np
import math
import seaborn as sns
import matplotlib.pyplot as plt

In [102]:
# Load the preprocessed food composition table; the path is relative to the
# directory the notebook is run from.
dataset = pd.read_csv('../data/swiss_food_composition_database_proc_without_original_category.csv')

In [103]:
# (rows, columns) of the table as loaded, before any filtering.
dataset.shape

(1092, 42)

In [104]:
# Preview the first rows of the loaded table.
dataset.head()

Unnamed: 0,ID,name,category,energy_kcal,fat_g,fatty_acids_sat_g,fatty_acids_monounsat_g,fatty_acids_polyunsat_g,cholesterol_mg,carbohydrates_g,...,potassium_mg,sodium_mg,chloride_mg,calcium_mg,magnesium_mg,phosphorus_mg,iron_mg,iodide_µg,zinc_mg,selenium_µg
0,0,Agar Agar,other,160.0,0.2,,,,,0.0,...,52.0,130.0,,660.0,100.0,34.0,4.5,,1.5,
1,1,Agave syrup,sweets,293.0,0.0,0.0,,,,73.1,...,,4.0,,,,,,,,
2,2,Almond,fruits,624.0,52.1,4.1,31.4,11.4,0.0,7.8,...,740.0,1.1,40.0,270.0,240.0,510.0,3.3,0.2,3.3,2.2
3,3,"Almond, dry roasted, salted",nuts,637.0,52.5,4.1,33.1,13.0,0.0,10.1,...,710.0,230.0,1190.0,270.0,280.0,470.0,3.7,2.4,3.3,2.0
4,4,"Almond, roasted, salted",nuts,649.0,55.2,4.2,34.8,13.5,0.0,7.2,...,670.0,330.0,1190.0,240.0,270.0,470.0,3.3,2.4,3.1,2.0


Remove columns and samples with more than 20% of values missing.

### Remove columns with more than 20% missing values

In [105]:
# Fraction of entries that must be present (non-NaN) for a column or row to be kept.
percentage_present = 0.8

# Minimum number of non-NaN values a column needs in order to survive `dropna`,
# i.e. at most 20% of the rows may be missing for that column.
feature_threshold = math.ceil(percentage_present * len(dataset))
feature_threshold

874

In [106]:
# Snapshot the column index before filtering.
columns_before = dataset.columns

# Keep only the columns that have at least `feature_threshold` non-NaN values.
dataset = dataset.dropna(axis="columns", thresh=feature_threshold)

# Column index after filtering; displayed as the cell output.
columns_after = dataset.columns
columns_after

Index(['ID', 'name', 'category', 'energy_kcal', 'fat_g', 'fatty_acids_sat_g',
       'fatty_acids_monounsat_g', 'fatty_acids_polyunsat_g', 'cholesterol_mg',
       'carbohydrates_g', 'sugars_g', 'starch_g', 'fibres_g', 'protein_g',
       'salt_g', 'alcohol_g', 'water_g', 'vit_A_activity_re_µg',
       'vit_A_activity_rae_µg', 'retinol_µg', 'beta_carotene_activity_µg',
       'beta_carotene_µg', 'vit_B1_mg', 'vit_B2_mg', 'vit_B6_mg', 'vit_B12_µg',
       'niacin_mg', 'folate_µg', 'panthotenic_acid_mg', 'vit_c_mg', 'vit_d_µg',
       'vit_e_activity_mg', 'potassium_mg', 'sodium_mg', 'chloride_mg',
       'calcium_mg', 'magnesium_mg', 'phosphorus_mg', 'iron_mg', 'iodide_µg',
       'zinc_mg'],
      dtype='object')

### Remove rows with more than 20% of values missing

In [107]:
# Minimum number of non-NaN values a row needs in order to survive `dropna`,
# i.e. at most 20% of the remaining columns may be missing for that row.
row_threshold = math.ceil(percentage_present * len(dataset.columns))
row_threshold

33

In [108]:
# Snapshot the row index before filtering.
rows_before = dataset.index

# Keep only the rows that have at least `row_threshold` non-NaN values.
dataset = dataset.dropna(axis="index", thresh=row_threshold)

# Row index after filtering; the resulting shape is displayed as the cell output.
rows_after = dataset.index
dataset.shape

(1084, 41)

## Inspect missing values

In [109]:
# create a boolean mask for NaN values
# Row-wise flag: True for every sample that has a NaN in at least one column.
nan_mask = dataset.isnull().any(axis="columns")

# Subset holding only the samples flagged above.
df_with_nan_only = dataset.loc[nan_mask]

In [110]:
# Number of samples that still contain at least one NaN (rows, columns).
df_with_nan_only.shape

(95, 41)

In [111]:
# Rows that still contain NaNs are deliberately NOT dropped here (a former
# `dataset.dropna(axis=0)` step is disabled); they are imputed further below.
dataset.shape

(1084, 41)

In [112]:
# List the distinct values of the `category` column, which is the grouping key
# used for the per-category imputation.
dataset['category'].unique() # find the unique categories

array(['fruits', 'nuts', 'cereals', 'sweets', 'other', 'dairy',
       'non_alcoholic_beverages', 'vegetables', 'meat', 'herbs', 'sauce',
       'alcoholic_beverages'], dtype=object)

In [113]:
# Number of samples per category; each category mean used for imputation is
# computed from a group of this size.
dataset['category'].value_counts()

other                      228
meat                       187
cereals                    146
vegetables                 127
sweets                     118
dairy                      103
fruits                      99
non_alcoholic_beverages     28
alcoholic_beverages         22
sauce                       16
herbs                        7
nuts                         3
Name: category, dtype: int64

In [114]:
# Preview the dataset after the column/row filtering (remaining NaNs not yet imputed).
dataset.head()

Unnamed: 0,ID,name,category,energy_kcal,fat_g,fatty_acids_sat_g,fatty_acids_monounsat_g,fatty_acids_polyunsat_g,cholesterol_mg,carbohydrates_g,...,vit_e_activity_mg,potassium_mg,sodium_mg,chloride_mg,calcium_mg,magnesium_mg,phosphorus_mg,iron_mg,iodide_µg,zinc_mg
2,2,Almond,fruits,624.0,52.1,4.1,31.4,11.4,0.0,7.8,...,27.1,740.0,1.1,40.0,270.0,240.0,510.0,3.3,0.2,3.3
3,3,"Almond, dry roasted, salted",nuts,637.0,52.5,4.1,33.1,13.0,0.0,10.1,...,23.9,710.0,230.0,1190.0,270.0,280.0,470.0,3.7,2.4,3.3
4,4,"Almond, roasted, salted",nuts,649.0,55.2,4.2,34.8,13.5,0.0,7.2,...,25.97,670.0,330.0,1190.0,240.0,270.0,470.0,3.3,2.4,3.1
5,5,"Amaranth, seed, cooked (without addition of fa...",cereals,119.0,2.1,0.4,0.5,0.8,0.0,18.1,...,0.4,150.0,1.9,36.0,58.0,84.0,180.0,3.0,0.8,1.2
6,6,"Amaranth, seed, raw",cereals,376.0,7.0,1.5,1.7,2.8,0.0,56.8,...,1.19,480.0,4.0,110.0,160.0,250.0,560.0,9.0,2.5,3.7


In [115]:
# Preview the samples that still contain at least one NaN.
df_with_nan_only.head()

Unnamed: 0,ID,name,category,energy_kcal,fat_g,fatty_acids_sat_g,fatty_acids_monounsat_g,fatty_acids_polyunsat_g,cholesterol_mg,carbohydrates_g,...,vit_e_activity_mg,potassium_mg,sodium_mg,chloride_mg,calcium_mg,magnesium_mg,phosphorus_mg,iron_mg,iodide_µg,zinc_mg
17,17,"Applesauce, sweetened, canned",fruits,89.0,0.3,0.1,0.0,0.1,0.0,20.5,...,0.34,97.0,3.6,2.1,4.4,3.6,8.1,0.2,0.7,0.1
18,18,"Applesauce, unsweetened, canned",fruits,55.0,0.3,0.1,0.0,0.1,0.0,11.7,...,0.43,110.0,4.0,2.2,4.8,4.0,9.0,0.2,0.8,0.1
46,46,"Beef ragout with sauce, prepared",other,85.0,2.8,0.4,1.9,0.4,31.0,3.0,...,0.7,130.0,300.0,470.0,9.9,8.9,91.0,0.8,2.2,2.2
47,47,"Beef, boiling meat lean, cooked (without addit...",meat,190.0,7.1,3.3,2.8,0.6,98.0,0.0,...,0.33,280.0,48.0,90.0,5.9,19.0,200.0,2.8,5.0,7.6
48,48,"Beef, boiling meat with fat, cooked (without a...",meat,276.0,18.9,7.9,8.7,1.2,80.0,0.0,...,0.3,230.0,41.0,82.0,5.0,16.0,160.0,1.8,4.5,5.3


## Impute missing values

In [116]:
def impute_missing_values(
    dataset: pd.DataFrame,
    category_col: str,
    feature_col_names: list,
    fallback_to_global_mean: bool = False,
) -> pd.DataFrame:
    """Fill missing feature values with the mean of the row's category.

    For every column in ``feature_col_names`` each NaN is replaced by the mean
    of that column computed over the rows sharing the same ``category_col``
    value. The input frame is not modified; a filled copy is returned.

    Values that are already present are never changed. In particular, rows
    whose ``category_col`` is itself missing keep their existing feature
    values (there is no category mean to fill their NaNs with).

    Parameters
    ----------
    dataset : pd.DataFrame
        Frame containing the category column and the feature columns.
    category_col : str
        Name of the column used to group the rows.
    feature_col_names : list
        Names of the (numeric) columns to impute. Any iterable of column
        labels, e.g. a ``pd.Index``, is accepted.
    fallback_to_global_mean : bool, default False
        If True, NaNs that cannot be filled with a category mean (the category
        has no observed value for that feature, or the category is missing)
        are filled with the mean of the whole column instead. With the default
        False such values stay NaN.

    Returns
    -------
    pd.DataFrame
        Copy of ``dataset`` with the feature columns imputed.
    """
    modified_dataset = dataset.copy()
    features = list(feature_col_names)
    if not features:
        # Nothing to impute; still hand back an independent copy.
        return modified_dataset

    # One vectorised pass: per-row category mean for every feature column,
    # aligned to the original index.
    group_means = modified_dataset.groupby(category_col)[features].transform("mean")

    for feature in features:
        fill_values = group_means[feature]
        if fallback_to_global_mean:
            # Column mean is taken before this column is imputed.
            fill_values = fill_values.fillna(modified_dataset[feature].mean())
        # `fillna` only touches NaN cells, so existing values are preserved even
        # where no category mean is available.
        modified_dataset[feature] = modified_dataset[feature].fillna(fill_values)
    return modified_dataset

In [117]:
# Feature columns = every column except the identifier/label columns.
# Selecting by name instead of the positional slice `[3:]` keeps the selection
# correct if the column order changes and raises a KeyError if one of the
# expected non-feature columns is absent.
cols = dataset.columns.drop(['ID', 'name', 'category'])

In [118]:
# Impute every feature column with its per-category mean; `dataset` itself is
# left untouched because the function works on a copy.
dataset_2 = impute_missing_values(dataset, category_col='category', feature_col_names=cols)

In [119]:
# Create a boolean mask for NaN values
nan_mask_2 = dataset_2.isna().any(axis=1)

# Create a new DataFrame with samples containing NaN values
df_with_nan_only_2 = dataset_2[nan_mask_2]
df_with_nan_only_2.shape

(0, 41)

In [121]:
# Spot-check a single sample (index label 540) in the imputed frame.
dataset_2.loc[540]

ID                                  540
name                         Minestrone
category                          other
energy_kcal                        53.0
fat_g                               1.8
fatty_acids_sat_g                   0.7
fatty_acids_monounsat_g             0.7
fatty_acids_polyunsat_g             0.2
cholesterol_mg                      1.0
carbohydrates_g                     7.0
sugars_g                            2.6
starch_g                            4.3
fibres_g                            0.4
protein_g                           2.0
salt_g                              1.0
alcohol_g                           0.0
water_g                            87.6
vit_A_activity_re_µg              143.0
vit_A_activity_rae_µg              72.0
retinol_µg                          1.0
beta_carotene_activity_µg         850.0
beta_carotene_µg                  850.0
vit_B1_mg                          0.02
vit_B2_mg                          0.02
vit_B6_mg                          0.04


There are no missing values afterwards.