# Impute missing values

This notebook analyzes the `swiss_food_composition_database_proc` dataset to look for outliers and missing values. A per-category mean imputation is then defined and tested.

## Load and preprocess the data

In [235]:
import pandas as pd
import numpy as np
import math
import seaborn as sns
import matplotlib.pyplot as plt

In [236]:
# Path of the input CSV, relative to the notebook's working directory.
csv_path = '../data/swiss_food_composition_database_proc.csv'
dataset = pd.read_csv(csv_path)

In [237]:
# Size of the freshly loaded table as (rows, columns).
dataset.shape

(1092, 43)

In [238]:
# Preview the first rows to check the columns and spot missing entries.
dataset.head()

Unnamed: 0,ID,name,category,energy_kcal,fat_g,fatty_acids_sat_g,fatty_acids_monounsat_g,fatty_acids_polyunsat_g,cholesterol_mg,carbohydrates_g,sugars_g,starch_g,fibres_g,protein_g,salt_g,alcohol_g,water_g,vit_A_activity_re_µg,vit_A_activity_rae_µg,retinol_µg,beta_carotene_activity_µg,beta_carotene_µg,vit_B1_mg,vit_B2_mg,vit_B6_mg,vit_B12_µg,niacin_mg,folate_µg,panthotenic_acid_mg,vit_c_mg,vit_d_µg,vit_e_activity_mg,potassium_mg,sodium_mg,chloride_mg,calcium_mg,magnesium_mg,phosphorus_mg,iron_mg,iodide_µg,zinc_mg,selenium_µg,category_new
0,0,Agar Agar,Various/Gelling and binding agents,160.0,0.2,,,,,0.0,,,74.1,2.4,0.3,,20.5,,,,,,,,,,,,,,,,52.0,130.0,,660.0,100.0,34.0,4.5,,1.5,,other
1,1,Agave syrup,Sweets/Sugar and sweeteners,293.0,0.0,0.0,,,,73.1,,,0.0,0.2,0.0,,,,,,,,,,,,,,,,,,,4.0,,,,,,,,,sweets
2,2,Almond,"Nuts, seeds and oleaginous fruit",624.0,52.1,4.1,31.4,11.4,0.0,7.8,6.6,0.6,10.6,25.6,0.0,0.0,3.3,0.0,0.0,0.0,0.0,0.0,0.15,0.94,0.13,0.0,1.88,55.9,0.48,0.5,0.0,27.1,740.0,1.1,40.0,270.0,240.0,510.0,3.3,0.2,3.3,2.2,fruits
3,3,"Almond, dry roasted, salted","Savoury snacks/Salted nuts, seeds and kernels",637.0,52.5,4.1,33.1,13.0,0.0,10.1,4.9,0.7,10.9,25.4,0.6,0.0,2.4,0.0,0.0,0.0,1.0,1.0,0.08,1.2,0.14,0.0,3.64,55.0,0.32,0.0,0.0,23.9,710.0,230.0,1190.0,270.0,280.0,470.0,3.7,2.4,3.3,2.0,nuts
4,4,"Almond, roasted, salted","Savoury snacks/Salted nuts, seeds and kernels",649.0,55.2,4.2,34.8,13.5,0.0,7.2,4.6,0.6,10.5,25.6,0.8,0.0,2.8,0.0,0.0,0.0,1.0,1.0,0.09,0.78,0.12,0.0,3.67,27.0,0.23,0.0,0.0,25.97,670.0,330.0,1190.0,240.0,270.0,470.0,3.3,2.4,3.1,2.0,nuts


Remove columns and samples with more than 20% of values missing.

### Remove columns with more than 20% missing values

In [239]:
# Minimum share of non-missing values that a column or row must have to be kept.
percentage_present = 0.8
# `dropna(thresh=...)` takes an absolute count of non-NaN values, so the share is
# converted into "at least this many present entries per column" (at most 20% missing).
feature_threshold = math.ceil(percentage_present * len(dataset))
feature_threshold

874

In [240]:
# Remember the column set before filtering so that dropped columns can be identified.
columns_before = dataset.columns
# Keep only the columns that have at least `feature_threshold` non-missing values.
dataset = dataset.dropna(axis='columns', thresh=feature_threshold)
columns_after = dataset.columns
columns_after

Index(['ID', 'name', 'category', 'energy_kcal', 'fat_g', 'fatty_acids_sat_g',
       'fatty_acids_monounsat_g', 'fatty_acids_polyunsat_g', 'cholesterol_mg',
       'carbohydrates_g', 'sugars_g', 'starch_g', 'fibres_g', 'protein_g',
       'salt_g', 'alcohol_g', 'water_g', 'vit_A_activity_re_µg',
       'vit_A_activity_rae_µg', 'retinol_µg', 'beta_carotene_activity_µg',
       'beta_carotene_µg', 'vit_B1_mg', 'vit_B2_mg', 'vit_B6_mg', 'vit_B12_µg',
       'niacin_mg', 'folate_µg', 'panthotenic_acid_mg', 'vit_c_mg', 'vit_d_µg',
       'vit_e_activity_mg', 'potassium_mg', 'sodium_mg', 'chloride_mg',
       'calcium_mg', 'magnesium_mg', 'phosphorus_mg', 'iron_mg', 'iodide_µg',
       'zinc_mg', 'category_new'],
      dtype='object')

### Remove rows with more than 20% of values missing

In [241]:
# Minimum number of non-missing values a row must have to be kept
# (the same 80% share, applied to the number of remaining columns).
row_threshold = math.ceil(percentage_present * len(dataset.columns))
row_threshold

34

In [242]:
# Remember the row labels before filtering so that dropped rows can be identified.
rows_before = dataset.index
# Keep only the rows that have at least `row_threshold` non-missing values.
dataset = dataset.dropna(axis='index', thresh=row_threshold)
rows_after = dataset.index
dataset.shape

(1084, 42)

## Inspect remaining missing values

In [243]:
# create a boolean mask for NaN values
# Boolean mask: True for every row that still has at least one NaN.
nan_mask = dataset.isna().any(axis='columns')

# Subset holding only the rows that contain missing values.
df_with_nan_only = dataset.loc[nan_mask]

In [244]:
# Number of rows that still contain at least one missing value, as (rows, columns).
df_with_nan_only.shape

(95, 42)

In [245]:
# Rows with missing values are intentionally kept (they are imputed further below),
# so the following drop stays disabled:
# dataset = dataset.dropna(axis=0)
dataset.shape

(1084, 42)

In [246]:
# List the distinct values of `category_new`, the column later used to group rows for imputation.
dataset['category_new'].unique() # find the unique categories

array(['fruits', 'nuts', 'cereals', 'sweets', 'other', 'dairy',
       'non_alcoholic_beverages', 'vegetables', 'meat', 'herbs', 'sauce',
       'alcoholic_beverages'], dtype=object)

In [247]:
# Rows per category; small categories give less reliable per-category means.
dataset['category_new'].value_counts()

other                      228
meat                       187
cereals                    146
vegetables                 127
sweets                     118
dairy                      103
fruits                      99
non_alcoholic_beverages     28
alcoholic_beverages         22
sauce                       16
herbs                        7
nuts                         3
Name: category_new, dtype: int64

In [250]:
# Preview the table after sparse columns and rows were removed.
dataset.head()

Unnamed: 0,ID,name,category,energy_kcal,fat_g,fatty_acids_sat_g,fatty_acids_monounsat_g,fatty_acids_polyunsat_g,cholesterol_mg,carbohydrates_g,sugars_g,starch_g,fibres_g,protein_g,salt_g,alcohol_g,water_g,vit_A_activity_re_µg,vit_A_activity_rae_µg,retinol_µg,beta_carotene_activity_µg,beta_carotene_µg,vit_B1_mg,vit_B2_mg,vit_B6_mg,vit_B12_µg,niacin_mg,folate_µg,panthotenic_acid_mg,vit_c_mg,vit_d_µg,vit_e_activity_mg,potassium_mg,sodium_mg,chloride_mg,calcium_mg,magnesium_mg,phosphorus_mg,iron_mg,iodide_µg,zinc_mg,category_new
2,2,Almond,"Nuts, seeds and oleaginous fruit",624.0,52.1,4.1,31.4,11.4,0.0,7.8,6.6,0.6,10.6,25.6,0.0,0.0,3.3,0.0,0.0,0.0,0.0,0.0,0.15,0.94,0.13,0.0,1.88,55.9,0.48,0.5,0.0,27.1,740.0,1.1,40.0,270.0,240.0,510.0,3.3,0.2,3.3,fruits
3,3,"Almond, dry roasted, salted","Savoury snacks/Salted nuts, seeds and kernels",637.0,52.5,4.1,33.1,13.0,0.0,10.1,4.9,0.7,10.9,25.4,0.6,0.0,2.4,0.0,0.0,0.0,1.0,1.0,0.08,1.2,0.14,0.0,3.64,55.0,0.32,0.0,0.0,23.9,710.0,230.0,1190.0,270.0,280.0,470.0,3.7,2.4,3.3,nuts
4,4,"Almond, roasted, salted","Savoury snacks/Salted nuts, seeds and kernels",649.0,55.2,4.2,34.8,13.5,0.0,7.2,4.6,0.6,10.5,25.6,0.8,0.0,2.8,0.0,0.0,0.0,1.0,1.0,0.09,0.78,0.12,0.0,3.67,27.0,0.23,0.0,0.0,25.97,670.0,330.0,1190.0,240.0,270.0,470.0,3.3,2.4,3.1,nuts
5,5,"Amaranth, seed, cooked (without addition of fa...","Cereal products, pulses and potatoes/Other cer...",119.0,2.1,0.4,0.5,0.8,0.0,18.1,0.5,17.6,3.5,5.0,0.0,0.0,71.2,0.0,0.0,0.0,0.0,0.0,0.02,0.06,0.11,0.0,0.34,16.2,0.27,1.0,0.0,0.4,150.0,1.9,36.0,58.0,84.0,180.0,3.0,0.8,1.2,cereals
6,6,"Amaranth, seed, raw","Cereal products, pulses and potatoes/Other cer...",376.0,7.0,1.5,1.7,2.8,0.0,56.8,1.7,55.1,11.0,15.8,0.0,0.0,11.1,0.0,0.0,0.0,0.0,0.0,0.08,0.19,0.53,0.0,1.2,64.3,0.95,4.2,0.0,1.19,480.0,4.0,110.0,160.0,250.0,560.0,9.0,2.5,3.7,cereals


In [251]:
# Preview some of the rows that still contain missing values.
df_with_nan_only.head()

Unnamed: 0,ID,name,category,energy_kcal,fat_g,fatty_acids_sat_g,fatty_acids_monounsat_g,fatty_acids_polyunsat_g,cholesterol_mg,carbohydrates_g,sugars_g,starch_g,fibres_g,protein_g,salt_g,alcohol_g,water_g,vit_A_activity_re_µg,vit_A_activity_rae_µg,retinol_µg,beta_carotene_activity_µg,beta_carotene_µg,vit_B1_mg,vit_B2_mg,vit_B6_mg,vit_B12_µg,niacin_mg,folate_µg,panthotenic_acid_mg,vit_c_mg,vit_d_µg,vit_e_activity_mg,potassium_mg,sodium_mg,chloride_mg,calcium_mg,magnesium_mg,phosphorus_mg,iron_mg,iodide_µg,zinc_mg,category_new
17,17,"Applesauce, sweetened, canned",Fruit/Cooked fruit (incl. cans),89.0,0.3,0.1,0.0,0.1,0.0,20.5,20.4,0.1,1.9,0.3,0.0,0.0,76.5,,,,,,0.02,0.02,0.04,0.0,0.08,5.9,0.08,2.3,0.0,0.34,97.0,3.6,2.1,4.4,3.6,8.1,0.2,0.7,0.1,fruits
18,18,"Applesauce, unsweetened, canned",Fruit/Cooked fruit (incl. cans),55.0,0.3,0.1,0.0,0.1,0.0,11.7,11.6,0.1,2.1,0.3,0.0,0.0,85.0,,,,,,0.02,0.02,0.05,0.0,0.09,6.5,0.09,2.5,0.0,0.43,110.0,4.0,2.2,4.8,4.0,9.0,0.2,0.8,0.1,fruits
46,46,"Beef ragout with sauce, prepared",Prepared dishes/Other savoury dishes,85.0,2.8,0.4,1.9,0.4,31.0,3.0,1.5,,0.6,11.6,0.8,0.0,82.1,123.0,63.0,2.0,727.0,727.0,0.01,0.07,0.13,0.48,,5.0,0.35,0.9,0.2,0.7,130.0,300.0,470.0,9.9,8.9,91.0,0.8,2.2,2.2,other
47,47,"Beef, boiling meat lean, cooked (without addit...",Meat and offal/Beef,190.0,7.1,3.3,2.8,0.6,98.0,0.0,0.0,,0.0,31.7,0.1,0.0,61.2,9.0,7.0,6.0,13.0,13.0,0.02,0.15,0.39,1.4,,6.5,0.62,0.0,0.6,0.33,280.0,48.0,90.0,5.9,19.0,200.0,2.8,5.0,7.6,meat
48,48,"Beef, boiling meat with fat, cooked (without a...",Meat and offal/Beef,276.0,18.9,7.9,8.7,1.2,80.0,0.0,0.0,,0.0,26.6,0.1,0.0,54.5,8.0,7.0,6.0,11.0,11.0,0.02,0.16,0.32,1.27,5.3,5.9,0.67,0.0,0.5,0.3,230.0,41.0,82.0,5.0,16.0,160.0,1.8,4.5,5.3,meat


## Impute missing values

In [258]:
def impute_missing_values(dataset: pd.DataFrame, category_col: str, feature_col_names: list) -> pd.DataFrame:
    """Fill missing feature values with the mean of the row's category.

    For every column in ``feature_col_names``, each NaN is replaced by the mean
    of the non-missing values of that column among the rows sharing the same
    value in ``category_col``. Values that are already present are never changed.

    Parameters
    ----------
    dataset : pd.DataFrame
        Input table. It is not modified; a copy is returned.
    category_col : str
        Name of the column whose values define the groups.
    feature_col_names : list
        Names of the numeric columns to impute. Any iterable of column labels
        (for example a ``pd.Index``) is accepted.

    Returns
    -------
    pd.DataFrame
        Copy of ``dataset`` with the selected columns imputed.

    Notes
    -----
    * A value stays NaN when its category has no non-missing value for that
      feature, because no mean can be computed.
    * Rows whose ``category_col`` is missing belong to no group. Their existing
      values are kept as they are and their NaNs are left untouched.
    """
    modified_dataset = dataset.copy()
    features = list(feature_col_names)
    if not features:
        return modified_dataset

    grouped = modified_dataset.groupby(category_col)
    for feature in features:
        # Per-row mean of the row's own category. Rows without a category get NaN here.
        group_means = grouped[feature].transform('mean')
        # Fill only the gaps of the original column. Assigning the transform result
        # directly would overwrite existing values of rows without a category.
        modified_dataset[feature] = modified_dataset[feature].fillna(group_means)
    return modified_dataset

In [259]:
# Feature columns are all columns except the identifier and label columns.
# Selecting by name instead of by position keeps the selection correct if the
# column order changes, and raises a KeyError if an expected column is absent.
non_feature_cols = ['ID', 'name', 'category', 'category_new']
cols = dataset.columns.drop(non_feature_cols)
# Display the number of feature columns that will be imputed.
len(cols)

38

In [284]:
# Impute every feature column with its per-category mean, using `category_new` as the grouping.
dataset_2 = impute_missing_values(dataset, category_col='category_new', feature_col_names=cols)

In [285]:
# Create a boolean mask for NaN values
nan_mask_2 = dataset_2.isna().any(axis=1)

# Create a new DataFrame with samples containing NaN values
df_with_nan_only_2 = dataset_2[nan_mask_2]
df_with_nan_only_2.shape

(0, 42)

There are no missing values afterwards.