# 02 Data Imputation

This notebook performs data imputation on nutritional composition data using a variety of methods. Each section includes code and explanations to guide the analysis and benchmarking process.

## 1. Imports

Import necessary libraries for data manipulation, JSON handling, timing, imputation algorithms, parallel processing, and evaluation metrics.

In [8]:
# Import standard libraries and tools for data handling, timing, parallel processing, and modeling
import json  # JSON I/O for configuration
import os
import time  # timing utilities
import warnings  # suppress warnings when needed

# Concurrent processing utilities
from concurrent.futures import ProcessPoolExecutor, as_completed
from datetime import datetime  # timestamp logging
from functools import partial  # function argument binding
from multiprocessing import cpu_count  # detect CPU cores

import numpy as np  # numerical operations
import pandas as pd  # data manipulation

# Imputation methods from fancyimpute and scikit-learn
from fancyimpute import KNN, BiScaler, IterativeSVD, MatrixFactorization, NuclearNormMinimization, SoftImpute
from sklearn.ensemble import RandomForestRegressor  # ensemble model for imputation
from sklearn.experimental import enable_iterative_imputer  # enable experimental feature
from sklearn.impute import IterativeImputer as IterativeImputer_sklearn
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.linear_model import BayesianRidge, LinearRegression  # regression estimators

# Evaluation metrics
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error, mean_squared_error, r2_score
from sklearn.neural_network import MLPRegressor  # MLP for iterative imputer
from tqdm import tqdm  # progress bars

from helpers import symmetric_mape  # custom SMAPE metric

## 2. Settings

Configure pandas display options to show more rows and columns during exploratory analysis and debugging.

In [9]:
# Raise pandas' display limits so wide and long frames are shown in full while exploring
pd.options.display.max_columns = 500  # up to 500 columns
pd.options.display.max_rows = 500  # up to 500 rows


In [10]:
# Benchmark configuration: input files and the imputer names to evaluate
CONFIG = {
    "parquet_path": "data/01_cleaned.parquet",  # cleaned dataset
    "json_path": "data/01_order_to_impute.json",  # imputation order (its keys are the columns to impute)
    # Imputers run by the parallel benchmark; list order is the task submission order
    "list_imputers_names_multi": [
        *(f"KNN_{k}" for k in range(3, 9)),  # KNN_3 ... KNN_8
        "IterativeSVD",
        *(f"KNN_{k}" for k in range(9, 16)),  # KNN_9 ... KNN_15
        "IterativeImputer_sklearn",
        "IterativeImputer_sklearn_LinearRegression",
        "SoftImpute",
        "BiScaler+SoftImpute",
        "MatrixFactorization",
    ],
    # IterativeImputer variants with other estimators, kept apart from the main list
    # NOTE(review): the key name suggests "run if there is time" - confirm intent
    "list_imputers_name_ta_com_tempo": [
        "IterativeImputer_sklearn_MLP",
        "IterativeImputer_sklearn_RandomForest",
        "IterativeImputer_sklearn_BayesianRidge",
    ],
}

## 3. Configuration

Define file paths for the cleaned data and imputation order, and list all imputation methods to evaluate in this notebook.

In [11]:
# Load dataset and imputation order


def load_data(parquet_path, json_path):
    """Load the cleaned dataset and the imputation order.

    Args:
        parquet_path: Path of the parquet file holding the cleaned data.
        json_path: Path of the JSON file holding the imputation order.

    Returns:
        tuple: ``(df, order_to_impute)`` - the DataFrame read from the parquet file
        and the object decoded from the JSON file.
    """
    df = pd.read_parquet(parquet_path)
    # Decode as UTF-8 explicitly: the column names are non-ASCII (e.g. "Lipídios"),
    # and the platform default encoding is not UTF-8 everywhere.
    with open(json_path, encoding="utf-8") as f:
        order_to_impute = json.load(f)
    return df, order_to_impute


# One-hot encode categorical variables


def prepare_data(df):
    """Build the numeric matrix used for imputation.

    The categorical columns 'Gênero' and 'Espécie' are expanded into one-hot
    indicator columns, the sample identifier 'Código' becomes the index, and
    every remaining column is cast to float. The input frame is not modified.
    """
    encoded = pd.get_dummies(df, columns=["Gênero", "Espécie"])
    # Chain instead of mutating in place; the cast turns the indicator columns into 0.0/1.0
    return encoded.set_index("Código").astype(float)


# Identify rows with non-null values in a column


def get_idx_non_null_rows(dataframe, column):
    """Return the index labels of the rows whose value in `column` is not null."""
    observed_mask = dataframe[column].notnull().to_numpy()
    return dataframe.index[observed_mask]


## 4. Data Loading and Preparation Functions

- **load_data**: Reads the cleaned dataset from a parquet file and the imputation order from a JSON file.
- **prepare_data**: One-hot encodes categorical features and sets the sample identifier as the index.
- **get_idx_non_null_rows**: Helper to find rows where a column is not null for cross-validation.

In [12]:
def impute_cv(df_input, col_name, imputer_name):
    """Leave-one-out evaluation of one imputer on one column.

    Each observed (non-null) value of `col_name` is masked in turn, the whole
    matrix is re-imputed, and the prediction for the masked cell is recorded
    next to the true value.

    Args:
        df_input: Numeric DataFrame to impute. It is never modified.
        col_name: Column whose observed values are masked one at a time.
        imputer_name: Name understood by `get_imputer`.

    Returns:
        list: One tuple ``(imputer_name, col_name, true_value, predicted_value,
        elapsed_seconds)`` per observed row, or ``[None]`` if any step raised
        (the error is printed, not re-raised).
    """
    try:
        col_idx = df_input.columns.get_loc(col_name)
        # Private, writable copy of the data. `.values` may hand back a view
        # (read-only under pandas Copy-on-Write), so the copy is requested explicitly.
        matrix = df_input.to_numpy(copy=True)
        # Positional row numbers, not index labels: the matrix is addressed by position,
        # so this stays correct even when the frame's index is not 0..n-1.
        observed_rows = np.flatnonzero(df_input[col_name].notna().to_numpy())
        imputer = get_imputer(imputer_name)

        results = []
        for row_idx in tqdm(observed_rows, desc=f"Imputing {col_name} using {imputer_name}"):
            start = time.perf_counter()  # monotonic clock for interval timing
            true_val = matrix[row_idx, col_idx]
            matrix[row_idx, col_idx] = np.nan  # mask the cell to be predicted
            imputed = fit_transform_imputer(imputer, imputer_name, matrix)
            pred_val = imputed[row_idx, col_idx]
            elapsed = time.perf_counter() - start
            results.append((imputer_name, col_name, true_val, pred_val, elapsed))
            matrix[row_idx, col_idx] = true_val  # restore before masking the next row

        return results
    except Exception as e:
        # Best effort: one failing (column, imputer) pair must not abort the whole benchmark
        print("Error during impute_cv:", e)
        return [None]


def get_imputer(imputer_name):
    """Build the imputer instance that corresponds to `imputer_name`.

    Args:
        imputer_name: ``"KNN_<k>"`` (a KNNImputer with ``k`` neighbours) or one of
            the fixed names listed in the table below.

    Returns:
        A freshly configured imputer, or ``None`` when the name is not recognised.
    """
    if imputer_name.startswith("KNN"):
        k = int(imputer_name.split("_")[1])
        return KNNImputer(n_neighbors=k)

    # Settings shared by every IterativeImputer variant
    iterative_kwargs = {"verbose": 0, "min_value": 0, "random_state": 271828}

    # Lazy factories: only the requested imputer is ever constructed
    factories = {
        "MatrixFactorization": lambda: MatrixFactorization(verbose=False),
        "NuclearNormMinimization": lambda: NuclearNormMinimization(verbose=False),
        "SoftImpute": lambda: SoftImpute(verbose=False),
        # BiScaler itself is applied in fit_transform_imputer
        "BiScaler+SoftImpute": lambda: SoftImpute(verbose=False),
        "IterativeSVD": lambda: IterativeSVD(verbose=False),
        "IterativeImputer_sklearn": lambda: IterativeImputer_sklearn(**iterative_kwargs),
        "IterativeImputer_sklearn_LinearRegression": lambda: IterativeImputer_sklearn(estimator=LinearRegression(), **iterative_kwargs),
        "IterativeImputer_sklearn_MLP": lambda: IterativeImputer_sklearn(estimator=MLPRegressor(), **iterative_kwargs),
        "IterativeImputer_sklearn_RandomForest": lambda: IterativeImputer_sklearn(estimator=RandomForestRegressor(n_jobs=-1), **iterative_kwargs),
        "IterativeImputer_sklearn_BayesianRidge": lambda: IterativeImputer_sklearn(estimator=BayesianRidge(), **iterative_kwargs),
        # `verbose` is not passed: SimpleImputer no longer accepts it in recent
        # scikit-learn releases, and omitting it keeps the old default behaviour.
        "SimpleImputerMean": lambda: SimpleImputer(strategy="mean"),
        "SimpleImputerMedian": lambda: SimpleImputer(strategy="median"),
    }
    factory = factories.get(imputer_name)
    return factory() if factory is not None else None


def fit_transform_imputer(imputer, imputer_name, input_df):
    """Run `imputer` on `input_df` and return the completed matrix in the original units.

    Args:
        imputer: Object exposing ``fit_transform``.
        imputer_name: Benchmark name of the imputer; ``"BiScaler+SoftImpute"`` selects
            the normalize -> impute -> de-normalize path.
        input_df: 2-D array with ``np.nan`` marking the missing entries.

    Returns:
        The matrix returned by the imputer, on the same scale as `input_df`.
    """
    if imputer_name == "BiScaler+SoftImpute":
        scaler = BiScaler(verbose=False)
        scaled = scaler.fit_transform(input_df)
        completed_scaled = imputer.fit_transform(scaled)
        # Undo the row/column normalization; otherwise predictions would be
        # compared against true values on a different scale.
        return scaler.inverse_transform(completed_scaled)
    return imputer.fit_transform(input_df)


def metrics_impute_cv(results):
    """Aggregate the per-row results of `impute_cv` into a one-row metrics table.

    Args:
        results: Iterable of ``(imputer_name, col_name, real_value, predicted_value,
            time_elapsed)`` tuples; ``None`` entries (failed runs) are ignored.

    Returns:
        pandas.DataFrame: A single row with the imputer name, column name, mean time,
        R², RMSE, MAE, MAPE, SMAPE, MAE/std ratio and the raw true/predicted arrays,
        or an empty DataFrame when there is nothing to aggregate.
    """
    rows = [r for r in results if r is not None]
    df_res = pd.DataFrame(rows, columns=["imputer_name", "col_name", "real_value", "predicted_value", "time_elapsed"])
    if df_res.empty:
        return pd.DataFrame()

    y_true = df_res["real_value"]
    y_pred = df_res["predicted_value"]
    mae = mean_absolute_error(y_true, y_pred)
    std_true = y_true.std()

    return pd.DataFrame(
        [
            {
                "imputer_name": df_res["imputer_name"].iloc[0],
                "col_name": df_res["col_name"].iloc[0],
                "mean_time": df_res["time_elapsed"].mean(),
                "r2": r2_score(y_true, y_pred),
                # sqrt(MSE) rather than `squared=False`, which newer scikit-learn releases dropped
                "rmse": np.sqrt(mean_squared_error(y_true, y_pred)),
                "mae": mae,
                "mape": mean_absolute_percentage_error(y_true, y_pred),
                "smape": symmetric_mape(y_true, y_pred),
                # Short-circuits to 0.0 when the true values have zero spread (avoids 0-division)
                "mae_std_ratio": std_true and (mae / std_true),
                # Wrapped in a list so each array is stored as a single cell
                "real_values": [y_true.values],
                "predicted_values": [y_pred.values],
            }
        ]
    )


## 5. Imputation Utility Functions

- **impute_cv**: Performs a leave-one-out evaluation for a single column and imputer: each observed value is masked in turn, re-imputed, and recorded next to its true value together with the elapsed time.
- **get_imputer**: Returns an instantiated imputer object based on its name.
- **fit_transform_imputer**: Applies the imputer to the input matrix; for `BiScaler+SoftImpute` the matrix is first normalized with `BiScaler`.
- **metrics_impute_cv**: Computes evaluation metrics (RMSE, MAE, MAPE, SMAPE, R²) from cross-validation results.

In [13]:
def impute_col(col, imputer_name, df_input):
    """Evaluate one (column, imputer) pair and return its metrics table.

    Runs `impute_cv`, logs a completion timestamp, then summarises the results
    with `metrics_impute_cv`. Any exception is printed and turned into ``None``.
    """
    try:
        cv_rows = impute_cv(df_input, col, imputer_name)
        print(f"Completed {col} with {imputer_name} at", datetime.now())
        metrics = metrics_impute_cv(cv_rows)
    except Exception as err:
        print(f"Error in impute_col ({col}, {imputer_name}):", err)
        metrics = None
    return metrics

## 6. Initialization

- Suppress warnings for cleaner output.
- Load dataset and imputation order.
- Prepare one-hot encoded DataFrame and keep a copy.
- Define columns to impute and a partial function for ease of use.

In [14]:
# Silence library warnings so the benchmark output stays readable
warnings.filterwarnings("ignore")

# Read the cleaned table and the imputation order, then one-hot encode the categoricals
df, order_to_impute = load_data(CONFIG["parquet_path"], CONFIG["json_path"])
data_onehot_copy = prepare_data(df)  # encoded data, still indexed by 'Código'

# Working frame with a positional 0..n-1 index for the routines that address rows by number
data_onehot = data_onehot_copy.reset_index(drop=True)

# The keys of the imputation-order mapping are the columns to estimate
cols_to_impute = [*order_to_impute.keys()]
# Pre-bind the DataFrame so parallel workers only receive (column, imputer name)
impute_col_onehot = partial(impute_col, df_input=data_onehot)


In [15]:
# Report how many CPU cores this machine exposes; the parallel benchmark cell sizes
# its worker pool from the same cpu_count() value (cpu_count() - 2, at least 1).
print(f"Available CPU cores: {cpu_count()}")

Available CPU cores: 48


## 7. Parallel Execution

Use `ProcessPoolExecutor` to run imputation tasks in parallel across multiple CPU cores. Collect only non-empty results for further analysis.

In [None]:
# Benchmark every (column, imputer) pair in a pool of worker processes
results_onehot = []
# Leave two cores free for the rest of the system, but always use at least one worker
with ProcessPoolExecutor(max_workers=max(cpu_count() - 2, 1)) as executor:
    # One task per (column, imputer) pair, submitted column by column
    futures = [
        executor.submit(impute_col_onehot, col, imp)
        for col in cols_to_impute
        for imp in CONFIG["list_imputers_names_multi"]
    ]
    # Gather results in completion order, skipping failed (None) or empty metric tables
    for fut in as_completed(futures):
        df_metrics = fut.result()
        if df_metrics is None or df_metrics.empty:
            continue
        results_onehot.append(df_metrics)

# NOTE: the full run can take a long time (e.g., ~800 minutes on large datasets)


In [None]:
# Stack the per-task metric tables into one frame and persist it as CSV
if not results_onehot:
    print("No results to aggregate")
else:
    df_results_onehot = pd.concat(results_onehot, ignore_index=True)
    df_results_onehot.to_csv("data/metrics_results.csv", index=False)
    print("Results saved to data/metrics_results.csv")

## 8. Aggregating and Saving Results

Combine individual result DataFrames into a single DataFrame and save to CSV for downstream analysis and reporting.

In [None]:
# Reload the metrics from the CSV written by the export cell, so the analysis below
# works from the saved file.
df_results_onehot = pd.read_csv("data/metrics_results.csv")

In [None]:
# Two best imputers per column: lowest RMSE first, ties broken by SMAPE.
# Sorting once and taking the head of each group avoids `groupby.apply` on a frame that
# still contains the grouping column, and always keeps `col_name` in the result.
grouped_results = (
    df_results_onehot.sort_values(["col_name", "rmse", "smape"], ascending=True)
    .groupby("col_name", sort=False)
    .head(2)
    .reset_index(drop=True)
)
grouped_results

Unnamed: 0,imputer_name,col_name,mean_time,r2,rmse,mae,mape,smape,mae_std_ratio,real_values,predicted_values
0,KNN_6,Ag 107 (ng/g),0.162667,-0.031151,2.036925,0.924706,25.087611,101.670577,0.457586,"[array([0.56, 0.44, 0.01, 0.01, 0.01, 0.01, 0....","[array([0.355 , 2.54166667, 0.01 , 2...."
1,KNN_12,Ag 107 (ng/g),0.211125,-0.045544,2.051091,1.05049,39.965716,141.484101,0.51983,"[array([0.56, 0.44, 0.01, 0.01, 0.01, 0.01, 0....","[array([1.08416667, 1.27583333, 0.19 , 1...."
2,IterativeImputer_sklearn,Al 27 (ug/g),648.033202,-0.030978,2651.086209,675.045243,12272.965592,174.778802,0.256635,"[array([1.000000e-02, 7.267900e+02, 1.000000e-...","[array([5.23215790e+02, 5.74163438e+02, 5.7676..."
3,SoftImpute,Al 27 (ug/g),266.343393,-0.04052,2663.326489,638.4549,7643.532553,191.549185,0.242725,"[array([1.000000e-02, 7.267900e+02, 1.000000e-...","[array([ 6.55837103e+01, 1.96106774e+02, 9.3..."
4,KNN_8,Cinzas,0.087251,-2.841574,1.59657,0.665969,0.154182,12.938982,0.812622,"[array([4.41, 4.56, 3.7 , 4.51, 4.52, 4.75, 4....","[array([ 5.115 , 4.73 , 4.605 ,..."
5,KNN_7,Cinzas,0.081028,-2.850707,1.598467,0.675014,0.156249,13.176208,0.823659,"[array([4.41, 4.56, 3.7 , 4.51, 4.52, 4.75, 4....","[array([ 5.19 , 4.75428571, 4.63 ,..."
6,KNN_10,Lipídios,0.123489,-0.711866,12.574523,8.119307,0.491226,34.81748,0.83952,"[array([18.55, 17.68, 22.44, 24.58, 20.24, 5....","[array([27.679 , 19.805 , 24.957 ,..."
7,KNN_9,Lipídios,0.14132,-0.720629,12.606665,8.143393,0.493316,34.840279,0.84201,"[array([18.55, 17.68, 22.44, 24.58, 20.24, 5....","[array([27.59333333, 16.17888889, 23.61444444,..."
8,KNN_5,Mn 55 (ng/g),0.083333,0.003606,278.345011,123.920835,24.108964,84.088791,0.44187,"[array([2.10250e+02, 2.17620e+02, 4.79200e+01,...","[array([6.71498e+02, 1.63288e+02, 6.28220e+01,..."
9,KNN_6,Mn 55 (ng/g),0.079554,0.003428,278.36993,125.582449,20.52643,85.949301,0.447795,"[array([2.10250e+02, 2.17620e+02, 4.79200e+01,...","[array([589.02666667, 146.47833333, 56.211666..."


In [11]:
# Display the cleaned table returned by load_data (before one-hot encoding)
df

Unnamed: 0,Código,Umidade,Cinzas,Lipídios,Proteínas,Mg 24 (ug/g),Fe 57 (ug/g),Ag 107 (ng/g),Al 27 (ug/g),Ba 138 (ug/g),Co 59 (ng/g),Cu 63 (ug/g),Zn 66 (ug/g),Se 82 (ng/g),Ti 205 (ng/g),Li 7 (ng/g),Rb 85 (ug/g),Sr 88 (ug/g),Cs 133 (ng/g),Mn 55 (ng/g),Ni 60 (ng/g),U 238 (ng/g),Sb 121 (ng/g),Sn 118 (ng/g),Te 130 (ng/g),Hg 202 (ng/g),As 75 (ng/g),Cd 111 (ng/g),Pb 208 (ng/g),Gênero,Espécie
0,006C,76.57,4.41,18.55,92.85,79.6,2.72,0.56,0.01,0.01,3.6,0.58,18.47,500.69,5.24,5.3,79.09,0.01,221.13,210.25,0.01,0.14,1.77,9.89,0.65,3435.75,63.42,3.87,0.01,melanosuchus,niger
1,029C,71.17,4.56,17.68,86.08,615.04,46.64,0.44,726.79,34.31,6.95,2.09,24.22,737.03,1.74,2.73,87.23,274.5,1622.04,217.62,77.49,0.2,6.13,21.54,0.01,0.01,5.5,33.63,182.05,cuniculus,paca
2,038C,72.26,3.7,22.44,85.21,297.82,23.71,0.01,0.01,0.01,4.84,1.82,48.86,437.4,2.32,10.06,101.83,241.04,473.04,47.92,0.01,0.01,0.0,6.91,0.02,6.18,8.7,9.41,0.01,cuniculus,paca
3,041C,73.06,4.51,24.58,90.53,313.0,21.01,0.01,12056.64,0.01,4.49,3.52,25.64,339.99,1.95,3.53,39.43,46.55,639.45,97.52,32.5,0.18,3.13,23.2,0.1,3.31,6.54,0.9,0.01,mazama,americana
4,043C,75.58,4.52,20.24,87.68,194.56,25.26,0.01,0.01,0.01,12.52,3.33,58.91,552.87,2.15,38.13,68.99,59.23,430.14,132.09,6.71,0.01,0.46,6.13,0.01,0.01,6.5,2.69,0.01,cuniculus,paca
5,044C,76.52,4.75,5.22,87.16,837.22,53.36,0.01,0.01,147.06,8.15,2.4,84.28,850.46,1.24,9.95,82.1,317.03,1320.23,165.03,38.23,0.02,0.17,13.19,0.08,5.32,10.1,4.15,22.84,cuniculus,paca
6,047C,73.73,4.51,32.34,82.85,609.8,71.34,0.03,0.01,160.85,18.38,7.07,62.37,976.12,7.13,1.56,55.54,142.91,177.42,319.35,4.47,0.1,0.77,32.46,0.04,696.64,10.61,7.47,24.95,﻿cairina,moschata
7,049C,71.61,4.75,20.23,80.96,94.07,8.06,0.01,0.01,0.01,15.91,3.43,15.52,685.34,7.13,11.01,64.27,30.61,786.37,108.99,0.01,0.03,0.01,42.27,0.68,56.32,13.74,7.97,0.01,penelope,jacquaca
8,050C,69.58,5.01,18.7,83.72,69.23,11.82,0.01,0.01,0.01,4.31,3.43,15.89,806.16,1.62,16.14,60.41,0.01,754.55,106.62,0.01,0.01,0.01,20.54,0.34,66.41,9.57,13.11,0.01,penelope,jacquaca
9,053C,73.36,3.54,28.58,51.27,67.58,2.65,0.01,0.01,0.01,7.96,1.24,32.92,1262.95,8.22,10.9,10.13,0.01,391.48,15.6,10.18,0.07,3.39,16.72,2.24,11189.22,43.81,3.86,0.01,euphractus,sexcinctus


In [12]:
# Display the one-hot encoded frame (index reset to 0..n-1) that is passed to the imputers
data_onehot

Unnamed: 0,Umidade,Cinzas,Lipídios,Proteínas,Mg 24 (ug/g),Fe 57 (ug/g),Ag 107 (ng/g),Al 27 (ug/g),Ba 138 (ug/g),Co 59 (ng/g),Cu 63 (ug/g),Zn 66 (ug/g),Se 82 (ng/g),Ti 205 (ng/g),Li 7 (ng/g),Rb 85 (ug/g),Sr 88 (ug/g),Cs 133 (ng/g),Mn 55 (ng/g),Ni 60 (ng/g),U 238 (ng/g),Sb 121 (ng/g),Sn 118 (ng/g),Te 130 (ng/g),Hg 202 (ng/g),As 75 (ng/g),Cd 111 (ng/g),Pb 208 (ng/g),Gênero_alouatta,Gênero_boa,Gênero_busarellus,Gênero_caiman,Gênero_cebus,Gênero_crax,Gênero_cuniculus,Gênero_dasyprocta,Gênero_eunectes,Gênero_euphractus,Gênero_mazama,Gênero_melanosuchus,Gênero_mergus,Gênero_mesoclemmys,Gênero_mitu,Gênero_paleosuchus,Gênero_penelope,Gênero_phalacrocorax,Gênero_pipile,Gênero_podocnemis,Gênero_porphyrio,Gênero_puma,Gênero_tapirus,Gênero_tayassu,Gênero_﻿cairina,Gênero_﻿sotalia,Espécie_americana,Espécie_apella,Espécie_brasilianus,Espécie_concolor,Espécie_constrictor,Espécie_crocodilus,Espécie_cumanensis,Espécie_expansa,Espécie_fluviatilis,Espécie_fuliginosa,Espécie_globulosa,Espécie_jacquaca,Espécie_martinica,Espécie_moschata,Espécie_murinus,Espécie_niger,Espécie_nigricollis,Espécie_octosetaceus,Espécie_paca,Espécie_palpebrosus,Espécie_raniceps,Espécie_seniculus,Espécie_sexcinctus,Espécie_sextuberculata,Espécie_tajacu,Espécie_terrestris,Espécie_tuberosum,Espécie_unifilis
0,76.57,4.41,18.55,92.85,79.6,2.72,0.56,0.01,0.01,3.6,0.58,18.47,500.69,5.24,5.3,79.09,0.01,221.13,210.25,0.01,0.14,1.77,9.89,0.65,3435.75,63.42,3.87,0.01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,71.17,4.56,17.68,86.08,615.04,46.64,0.44,726.79,34.31,6.95,2.09,24.22,737.03,1.74,2.73,87.23,274.5,1622.04,217.62,77.49,0.2,6.13,21.54,0.01,0.01,5.5,33.63,182.05,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,72.26,3.7,22.44,85.21,297.82,23.71,0.01,0.01,0.01,4.84,1.82,48.86,437.4,2.32,10.06,101.83,241.04,473.04,47.92,0.01,0.01,0.0,6.91,0.02,6.18,8.7,9.41,0.01,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,73.06,4.51,24.58,90.53,313.0,21.01,0.01,12056.64,0.01,4.49,3.52,25.64,339.99,1.95,3.53,39.43,46.55,639.45,97.52,32.5,0.18,3.13,23.2,0.1,3.31,6.54,0.9,0.01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,75.58,4.52,20.24,87.68,194.56,25.26,0.01,0.01,0.01,12.52,3.33,58.91,552.87,2.15,38.13,68.99,59.23,430.14,132.09,6.71,0.01,0.46,6.13,0.01,0.01,6.5,2.69,0.01,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,76.52,4.75,5.22,87.16,837.22,53.36,0.01,0.01,147.06,8.15,2.4,84.28,850.46,1.24,9.95,82.1,317.03,1320.23,165.03,38.23,0.02,0.17,13.19,0.08,5.32,10.1,4.15,22.84,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,73.73,4.51,32.34,82.85,609.8,71.34,0.03,0.01,160.85,18.38,7.07,62.37,976.12,7.13,1.56,55.54,142.91,177.42,319.35,4.47,0.1,0.77,32.46,0.04,696.64,10.61,7.47,24.95,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,71.61,4.75,20.23,80.96,94.07,8.06,0.01,0.01,0.01,15.91,3.43,15.52,685.34,7.13,11.01,64.27,30.61,786.37,108.99,0.01,0.03,0.01,42.27,0.68,56.32,13.74,7.97,0.01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,69.58,5.01,18.7,83.72,69.23,11.82,0.01,0.01,0.01,4.31,3.43,15.89,806.16,1.62,16.14,60.41,0.01,754.55,106.62,0.01,0.01,0.01,20.54,0.34,66.41,9.57,13.11,0.01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,73.36,3.54,28.58,51.27,67.58,2.65,0.01,0.01,0.01,7.96,1.24,32.92,1262.95,8.22,10.9,10.13,0.01,391.48,15.6,10.18,0.07,3.39,16.72,2.24,11189.22,43.81,3.86,0.01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


In [13]:
# Fill every missing value of the encoded matrix with an 8-neighbour KNN imputer
imputer = KNNImputer(n_neighbors=8)
input_df = data_onehot.to_numpy()
imputed_matrix = imputer.fit_transform(input_df)
# Re-attach the column labels to the completed matrix
imputed_df = pd.DataFrame(data=imputed_matrix, columns=data_onehot.columns)
imputed_df


Unnamed: 0,Umidade,Cinzas,Lipídios,Proteínas,Mg 24 (ug/g),Fe 57 (ug/g),Ag 107 (ng/g),Al 27 (ug/g),Ba 138 (ug/g),Co 59 (ng/g),Cu 63 (ug/g),Zn 66 (ug/g),Se 82 (ng/g),Ti 205 (ng/g),Li 7 (ng/g),Rb 85 (ug/g),Sr 88 (ug/g),Cs 133 (ng/g),Mn 55 (ng/g),Ni 60 (ng/g),U 238 (ng/g),Sb 121 (ng/g),Sn 118 (ng/g),Te 130 (ng/g),Hg 202 (ng/g),As 75 (ng/g),Cd 111 (ng/g),Pb 208 (ng/g),Gênero_alouatta,Gênero_boa,Gênero_busarellus,Gênero_caiman,Gênero_cebus,Gênero_crax,Gênero_cuniculus,Gênero_dasyprocta,Gênero_eunectes,Gênero_euphractus,Gênero_mazama,Gênero_melanosuchus,Gênero_mergus,Gênero_mesoclemmys,Gênero_mitu,Gênero_paleosuchus,Gênero_penelope,Gênero_phalacrocorax,Gênero_pipile,Gênero_podocnemis,Gênero_porphyrio,Gênero_puma,Gênero_tapirus,Gênero_tayassu,Gênero_﻿cairina,Gênero_﻿sotalia,Espécie_americana,Espécie_apella,Espécie_brasilianus,Espécie_concolor,Espécie_constrictor,Espécie_crocodilus,Espécie_cumanensis,Espécie_expansa,Espécie_fluviatilis,Espécie_fuliginosa,Espécie_globulosa,Espécie_jacquaca,Espécie_martinica,Espécie_moschata,Espécie_murinus,Espécie_niger,Espécie_nigricollis,Espécie_octosetaceus,Espécie_paca,Espécie_palpebrosus,Espécie_raniceps,Espécie_seniculus,Espécie_sexcinctus,Espécie_sextuberculata,Espécie_tajacu,Espécie_terrestris,Espécie_tuberosum,Espécie_unifilis
0,76.57,4.41,18.55,92.85,79.6,2.72,0.56,0.01,0.01,3.6,0.58,18.47,500.69,5.24,5.3,79.09,0.01,221.13,210.25,0.01,0.14,1.77,9.89,0.65,3435.75,63.42,3.87,0.01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,71.17,4.56,17.68,86.08,615.04,46.64,0.44,726.79,34.31,6.95,2.09,24.22,737.03,1.74,2.73,87.23,274.5,1622.04,217.62,77.49,0.2,6.13,21.54,0.01,0.01,5.5,33.63,182.05,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,72.26,3.7,22.44,85.21,297.82,23.71,0.01,0.01,0.01,4.84,1.82,48.86,437.4,2.32,10.06,101.83,241.04,473.04,47.92,0.01,0.01,0.0,6.91,0.02,6.18,8.7,9.41,0.01,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,73.06,4.51,24.58,90.53,313.0,21.01,0.01,12056.64,0.01,4.49,3.52,25.64,339.99,1.95,3.53,39.43,46.55,639.45,97.52,32.5,0.18,3.13,23.2,0.1,3.31,6.54,0.9,0.01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,75.58,4.52,20.24,87.68,194.56,25.26,0.01,0.01,0.01,12.52,3.33,58.91,552.87,2.15,38.13,68.99,59.23,430.14,132.09,6.71,0.01,0.46,6.13,0.01,0.01,6.5,2.69,0.01,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,76.52,4.75,5.22,87.16,837.22,53.36,0.01,0.01,147.06,8.15,2.4,84.28,850.46,1.24,9.95,82.1,317.03,1320.23,165.03,38.23,0.02,0.17,13.19,0.08,5.32,10.1,4.15,22.84,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,73.73,4.51,32.34,82.85,609.8,71.34,0.03,0.01,160.85,18.38,7.07,62.37,976.12,7.13,1.56,55.54,142.91,177.42,319.35,4.47,0.1,0.77,32.46,0.04,696.64,10.61,7.47,24.95,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,71.61,4.75,20.23,80.96,94.07,8.06,0.01,0.01,0.01,15.91,3.43,15.52,685.34,7.13,11.01,64.27,30.61,786.37,108.99,0.01,0.03,0.01,42.27,0.68,56.32,13.74,7.97,0.01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,69.58,5.01,18.7,83.72,69.23,11.82,0.01,0.01,0.01,4.31,3.43,15.89,806.16,1.62,16.14,60.41,0.01,754.55,106.62,0.01,0.01,0.01,20.54,0.34,66.41,9.57,13.11,0.01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,73.36,3.54,28.58,51.27,67.58,2.65,0.01,0.01,0.01,7.96,1.24,32.92,1262.95,8.22,10.9,10.13,0.01,391.48,15.6,10.18,0.07,3.39,16.72,2.24,11189.22,43.81,3.86,0.01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


In [14]:
# Raw table again; the next cell copies its 'Código' column onto imputed_df
df

Unnamed: 0,Código,Umidade,Cinzas,Lipídios,Proteínas,Mg 24 (ug/g),Fe 57 (ug/g),Ag 107 (ng/g),Al 27 (ug/g),Ba 138 (ug/g),Co 59 (ng/g),Cu 63 (ug/g),Zn 66 (ug/g),Se 82 (ng/g),Ti 205 (ng/g),Li 7 (ng/g),Rb 85 (ug/g),Sr 88 (ug/g),Cs 133 (ng/g),Mn 55 (ng/g),Ni 60 (ng/g),U 238 (ng/g),Sb 121 (ng/g),Sn 118 (ng/g),Te 130 (ng/g),Hg 202 (ng/g),As 75 (ng/g),Cd 111 (ng/g),Pb 208 (ng/g),Gênero,Espécie
0,006C,76.57,4.41,18.55,92.85,79.6,2.72,0.56,0.01,0.01,3.6,0.58,18.47,500.69,5.24,5.3,79.09,0.01,221.13,210.25,0.01,0.14,1.77,9.89,0.65,3435.75,63.42,3.87,0.01,melanosuchus,niger
1,029C,71.17,4.56,17.68,86.08,615.04,46.64,0.44,726.79,34.31,6.95,2.09,24.22,737.03,1.74,2.73,87.23,274.5,1622.04,217.62,77.49,0.2,6.13,21.54,0.01,0.01,5.5,33.63,182.05,cuniculus,paca
2,038C,72.26,3.7,22.44,85.21,297.82,23.71,0.01,0.01,0.01,4.84,1.82,48.86,437.4,2.32,10.06,101.83,241.04,473.04,47.92,0.01,0.01,0.0,6.91,0.02,6.18,8.7,9.41,0.01,cuniculus,paca
3,041C,73.06,4.51,24.58,90.53,313.0,21.01,0.01,12056.64,0.01,4.49,3.52,25.64,339.99,1.95,3.53,39.43,46.55,639.45,97.52,32.5,0.18,3.13,23.2,0.1,3.31,6.54,0.9,0.01,mazama,americana
4,043C,75.58,4.52,20.24,87.68,194.56,25.26,0.01,0.01,0.01,12.52,3.33,58.91,552.87,2.15,38.13,68.99,59.23,430.14,132.09,6.71,0.01,0.46,6.13,0.01,0.01,6.5,2.69,0.01,cuniculus,paca
5,044C,76.52,4.75,5.22,87.16,837.22,53.36,0.01,0.01,147.06,8.15,2.4,84.28,850.46,1.24,9.95,82.1,317.03,1320.23,165.03,38.23,0.02,0.17,13.19,0.08,5.32,10.1,4.15,22.84,cuniculus,paca
6,047C,73.73,4.51,32.34,82.85,609.8,71.34,0.03,0.01,160.85,18.38,7.07,62.37,976.12,7.13,1.56,55.54,142.91,177.42,319.35,4.47,0.1,0.77,32.46,0.04,696.64,10.61,7.47,24.95,﻿cairina,moschata
7,049C,71.61,4.75,20.23,80.96,94.07,8.06,0.01,0.01,0.01,15.91,3.43,15.52,685.34,7.13,11.01,64.27,30.61,786.37,108.99,0.01,0.03,0.01,42.27,0.68,56.32,13.74,7.97,0.01,penelope,jacquaca
8,050C,69.58,5.01,18.7,83.72,69.23,11.82,0.01,0.01,0.01,4.31,3.43,15.89,806.16,1.62,16.14,60.41,0.01,754.55,106.62,0.01,0.01,0.01,20.54,0.34,66.41,9.57,13.11,0.01,penelope,jacquaca
9,053C,73.36,3.54,28.58,51.27,67.58,2.65,0.01,0.01,0.01,7.96,1.24,32.92,1262.95,8.22,10.9,10.13,0.01,391.48,15.6,10.18,0.07,3.39,16.72,2.24,11189.22,43.81,3.86,0.01,euphractus,sexcinctus


In [None]:
# Attach the sample codes by position. imputed_df carries a fresh 0..n-1 index, so a
# label-aligned Series assignment would only be correct if df had that exact index;
# assigning the raw values keeps row i of df paired with row i of imputed_df and
# raises if the lengths differ.
imputed_df["Código"] = df["Código"].to_numpy()

imputed_df

Unnamed: 0,Umidade,Cinzas,Lipídios,Proteínas,Mg 24 (ug/g),Fe 57 (ug/g),Ag 107 (ng/g),Al 27 (ug/g),Ba 138 (ug/g),Co 59 (ng/g),Cu 63 (ug/g),Zn 66 (ug/g),Se 82 (ng/g),Ti 205 (ng/g),Li 7 (ng/g),Rb 85 (ug/g),Sr 88 (ug/g),Cs 133 (ng/g),Mn 55 (ng/g),Ni 60 (ng/g),U 238 (ng/g),Sb 121 (ng/g),Sn 118 (ng/g),Te 130 (ng/g),Hg 202 (ng/g),As 75 (ng/g),Cd 111 (ng/g),Pb 208 (ng/g),Gênero_alouatta,Gênero_boa,Gênero_busarellus,Gênero_caiman,Gênero_cebus,Gênero_crax,Gênero_cuniculus,Gênero_dasyprocta,Gênero_eunectes,Gênero_euphractus,Gênero_mazama,Gênero_melanosuchus,Gênero_mergus,Gênero_mesoclemmys,Gênero_mitu,Gênero_paleosuchus,Gênero_penelope,Gênero_phalacrocorax,Gênero_pipile,Gênero_podocnemis,Gênero_porphyrio,Gênero_puma,Gênero_tapirus,Gênero_tayassu,Gênero_﻿cairina,Gênero_﻿sotalia,Espécie_americana,Espécie_apella,Espécie_brasilianus,Espécie_concolor,Espécie_constrictor,Espécie_crocodilus,Espécie_cumanensis,Espécie_expansa,Espécie_fluviatilis,Espécie_fuliginosa,Espécie_globulosa,Espécie_jacquaca,Espécie_martinica,Espécie_moschata,Espécie_murinus,Espécie_niger,Espécie_nigricollis,Espécie_octosetaceus,Espécie_paca,Espécie_palpebrosus,Espécie_raniceps,Espécie_seniculus,Espécie_sexcinctus,Espécie_sextuberculata,Espécie_tajacu,Espécie_terrestris,Espécie_tuberosum,Espécie_unifilis,Código
0,76.57,4.41,18.55,92.85,79.6,2.72,0.56,0.01,0.01,3.6,0.58,18.47,500.69,5.24,5.3,79.09,0.01,221.13,210.25,0.01,0.14,1.77,9.89,0.65,3435.75,63.42,3.87,0.01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,006C
1,71.17,4.56,17.68,86.08,615.04,46.64,0.44,726.79,34.31,6.95,2.09,24.22,737.03,1.74,2.73,87.23,274.5,1622.04,217.62,77.49,0.2,6.13,21.54,0.01,0.01,5.5,33.63,182.05,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,029C
2,72.26,3.7,22.44,85.21,297.82,23.71,0.01,0.01,0.01,4.84,1.82,48.86,437.4,2.32,10.06,101.83,241.04,473.04,47.92,0.01,0.01,0.0,6.91,0.02,6.18,8.7,9.41,0.01,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,038C
3,73.06,4.51,24.58,90.53,313.0,21.01,0.01,12056.64,0.01,4.49,3.52,25.64,339.99,1.95,3.53,39.43,46.55,639.45,97.52,32.5,0.18,3.13,23.2,0.1,3.31,6.54,0.9,0.01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,041C
4,75.58,4.52,20.24,87.68,194.56,25.26,0.01,0.01,0.01,12.52,3.33,58.91,552.87,2.15,38.13,68.99,59.23,430.14,132.09,6.71,0.01,0.46,6.13,0.01,0.01,6.5,2.69,0.01,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,043C
5,76.52,4.75,5.22,87.16,837.22,53.36,0.01,0.01,147.06,8.15,2.4,84.28,850.46,1.24,9.95,82.1,317.03,1320.23,165.03,38.23,0.02,0.17,13.19,0.08,5.32,10.1,4.15,22.84,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,044C
6,73.73,4.51,32.34,82.85,609.8,71.34,0.03,0.01,160.85,18.38,7.07,62.37,976.12,7.13,1.56,55.54,142.91,177.42,319.35,4.47,0.1,0.77,32.46,0.04,696.64,10.61,7.47,24.95,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,047C
7,71.61,4.75,20.23,80.96,94.07,8.06,0.01,0.01,0.01,15.91,3.43,15.52,685.34,7.13,11.01,64.27,30.61,786.37,108.99,0.01,0.03,0.01,42.27,0.68,56.32,13.74,7.97,0.01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,049C
8,69.58,5.01,18.7,83.72,69.23,11.82,0.01,0.01,0.01,4.31,3.43,15.89,806.16,1.62,16.14,60.41,0.01,754.55,106.62,0.01,0.01,0.01,20.54,0.34,66.41,9.57,13.11,0.01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,050C
9,73.36,3.54,28.58,51.27,67.58,2.65,0.01,0.01,0.01,7.96,1.24,32.92,1262.95,8.22,10.9,10.13,0.01,391.48,15.6,10.18,0.07,3.39,16.72,2.24,11189.22,43.81,3.86,0.01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,053C


In [None]:
# Keep only the columns whose label contains no underscore. In this frame that
# discards the "Gênero_*" / "Espécie_*" 0/1 indicator columns and leaves the
# measurement columns plus "Código".
imputed_df = imputed_df.loc[
    :,
    [label for label in imputed_df.columns if "_" not in label],
]
imputed_df

Unnamed: 0,Umidade,Cinzas,Lipídios,Proteínas,Mg 24 (ug/g),Fe 57 (ug/g),Ag 107 (ng/g),Al 27 (ug/g),Ba 138 (ug/g),Co 59 (ng/g),Cu 63 (ug/g),Zn 66 (ug/g),Se 82 (ng/g),Ti 205 (ng/g),Li 7 (ng/g),Rb 85 (ug/g),Sr 88 (ug/g),Cs 133 (ng/g),Mn 55 (ng/g),Ni 60 (ng/g),U 238 (ng/g),Sb 121 (ng/g),Sn 118 (ng/g),Te 130 (ng/g),Hg 202 (ng/g),As 75 (ng/g),Cd 111 (ng/g),Pb 208 (ng/g),Código
0,76.57,4.41,18.55,92.85,79.6,2.72,0.56,0.01,0.01,3.6,0.58,18.47,500.69,5.24,5.3,79.09,0.01,221.13,210.25,0.01,0.14,1.77,9.89,0.65,3435.75,63.42,3.87,0.01,006C
1,71.17,4.56,17.68,86.08,615.04,46.64,0.44,726.79,34.31,6.95,2.09,24.22,737.03,1.74,2.73,87.23,274.5,1622.04,217.62,77.49,0.2,6.13,21.54,0.01,0.01,5.5,33.63,182.05,029C
2,72.26,3.7,22.44,85.21,297.82,23.71,0.01,0.01,0.01,4.84,1.82,48.86,437.4,2.32,10.06,101.83,241.04,473.04,47.92,0.01,0.01,0.0,6.91,0.02,6.18,8.7,9.41,0.01,038C
3,73.06,4.51,24.58,90.53,313.0,21.01,0.01,12056.64,0.01,4.49,3.52,25.64,339.99,1.95,3.53,39.43,46.55,639.45,97.52,32.5,0.18,3.13,23.2,0.1,3.31,6.54,0.9,0.01,041C
4,75.58,4.52,20.24,87.68,194.56,25.26,0.01,0.01,0.01,12.52,3.33,58.91,552.87,2.15,38.13,68.99,59.23,430.14,132.09,6.71,0.01,0.46,6.13,0.01,0.01,6.5,2.69,0.01,043C
5,76.52,4.75,5.22,87.16,837.22,53.36,0.01,0.01,147.06,8.15,2.4,84.28,850.46,1.24,9.95,82.1,317.03,1320.23,165.03,38.23,0.02,0.17,13.19,0.08,5.32,10.1,4.15,22.84,044C
6,73.73,4.51,32.34,82.85,609.8,71.34,0.03,0.01,160.85,18.38,7.07,62.37,976.12,7.13,1.56,55.54,142.91,177.42,319.35,4.47,0.1,0.77,32.46,0.04,696.64,10.61,7.47,24.95,047C
7,71.61,4.75,20.23,80.96,94.07,8.06,0.01,0.01,0.01,15.91,3.43,15.52,685.34,7.13,11.01,64.27,30.61,786.37,108.99,0.01,0.03,0.01,42.27,0.68,56.32,13.74,7.97,0.01,049C
8,69.58,5.01,18.7,83.72,69.23,11.82,0.01,0.01,0.01,4.31,3.43,15.89,806.16,1.62,16.14,60.41,0.01,754.55,106.62,0.01,0.01,0.01,20.54,0.34,66.41,9.57,13.11,0.01,050C
9,73.36,3.54,28.58,51.27,67.58,2.65,0.01,0.01,0.01,7.96,1.24,32.92,1262.95,8.22,10.9,10.13,0.01,391.48,15.6,10.18,0.07,3.39,16.72,2.24,11189.22,43.81,3.86,0.01,053C


In [17]:
# Inspect the column labels that remain after the underscore-named columns were
# dropped. Note that the 'Lipídios ' label carries a trailing space.
imputed_df.columns

Index(['Umidade', 'Cinzas', 'Lipídios ', 'Proteínas', 'Mg 24 (ug/g)',
       'Fe 57 (ug/g)', 'Ag 107 (ng/g)', 'Al 27 (ug/g)', 'Ba 138 (ug/g)',
       'Co 59 (ng/g)', 'Cu 63 (ug/g)', 'Zn 66 (ug/g)', 'Se 82 (ng/g)',
       'Ti 205 (ng/g)', 'Li 7 (ng/g)', 'Rb 85 (ug/g)', 'Sr 88 (ug/g)',
       'Cs 133 (ng/g)', 'Mn 55 (ng/g)', 'Ni 60 (ng/g)', 'U 238 (ng/g)',
       'Sb 121 (ng/g)', 'Sn 118 (ng/g)', 'Te 130 (ng/g)', 'Hg 202 (ng/g)',
       'As 75 (ng/g)', 'Cd 111 (ng/g)', 'Pb 208 (ng/g)', 'Código'],
      dtype='object')

In [None]:
# Move the sample identifier "Código" to the first position and keep every
# other column in its current order. The order is derived from the frame itself
# rather than from a hand-copied list of labels, so labels with easy-to-miss
# whitespace (e.g. "Lipídios " ends with a space) cannot be mistyped here and
# the cell does not need editing when the set of measurement columns changes.
# A missing "Código" column still raises KeyError.
imputed_df = imputed_df[
    ["Código"] + [col for col in imputed_df.columns if col != "Código"]
]

In [19]:
# Display the frame to confirm that "Código" is now the first column.
imputed_df

Unnamed: 0,Código,Umidade,Cinzas,Lipídios,Proteínas,Mg 24 (ug/g),Fe 57 (ug/g),Ag 107 (ng/g),Al 27 (ug/g),Ba 138 (ug/g),Co 59 (ng/g),Cu 63 (ug/g),Zn 66 (ug/g),Se 82 (ng/g),Ti 205 (ng/g),Li 7 (ng/g),Rb 85 (ug/g),Sr 88 (ug/g),Cs 133 (ng/g),Mn 55 (ng/g),Ni 60 (ng/g),U 238 (ng/g),Sb 121 (ng/g),Sn 118 (ng/g),Te 130 (ng/g),Hg 202 (ng/g),As 75 (ng/g),Cd 111 (ng/g),Pb 208 (ng/g)
0,006C,76.57,4.41,18.55,92.85,79.6,2.72,0.56,0.01,0.01,3.6,0.58,18.47,500.69,5.24,5.3,79.09,0.01,221.13,210.25,0.01,0.14,1.77,9.89,0.65,3435.75,63.42,3.87,0.01
1,029C,71.17,4.56,17.68,86.08,615.04,46.64,0.44,726.79,34.31,6.95,2.09,24.22,737.03,1.74,2.73,87.23,274.5,1622.04,217.62,77.49,0.2,6.13,21.54,0.01,0.01,5.5,33.63,182.05
2,038C,72.26,3.7,22.44,85.21,297.82,23.71,0.01,0.01,0.01,4.84,1.82,48.86,437.4,2.32,10.06,101.83,241.04,473.04,47.92,0.01,0.01,0.0,6.91,0.02,6.18,8.7,9.41,0.01
3,041C,73.06,4.51,24.58,90.53,313.0,21.01,0.01,12056.64,0.01,4.49,3.52,25.64,339.99,1.95,3.53,39.43,46.55,639.45,97.52,32.5,0.18,3.13,23.2,0.1,3.31,6.54,0.9,0.01
4,043C,75.58,4.52,20.24,87.68,194.56,25.26,0.01,0.01,0.01,12.52,3.33,58.91,552.87,2.15,38.13,68.99,59.23,430.14,132.09,6.71,0.01,0.46,6.13,0.01,0.01,6.5,2.69,0.01
5,044C,76.52,4.75,5.22,87.16,837.22,53.36,0.01,0.01,147.06,8.15,2.4,84.28,850.46,1.24,9.95,82.1,317.03,1320.23,165.03,38.23,0.02,0.17,13.19,0.08,5.32,10.1,4.15,22.84
6,047C,73.73,4.51,32.34,82.85,609.8,71.34,0.03,0.01,160.85,18.38,7.07,62.37,976.12,7.13,1.56,55.54,142.91,177.42,319.35,4.47,0.1,0.77,32.46,0.04,696.64,10.61,7.47,24.95
7,049C,71.61,4.75,20.23,80.96,94.07,8.06,0.01,0.01,0.01,15.91,3.43,15.52,685.34,7.13,11.01,64.27,30.61,786.37,108.99,0.01,0.03,0.01,42.27,0.68,56.32,13.74,7.97,0.01
8,050C,69.58,5.01,18.7,83.72,69.23,11.82,0.01,0.01,0.01,4.31,3.43,15.89,806.16,1.62,16.14,60.41,0.01,754.55,106.62,0.01,0.01,0.01,20.54,0.34,66.41,9.57,13.11,0.01
9,053C,73.36,3.54,28.58,51.27,67.58,2.65,0.01,0.01,0.01,7.96,1.24,32.92,1262.95,8.22,10.9,10.13,0.01,391.48,15.6,10.18,0.07,3.39,16.72,2.24,11189.22,43.81,3.86,0.01


In [None]:
# Persist the final table without the positional index. The output directory is
# created first so the export does not fail with OSError when "data/" is absent
# (e.g. on a fresh checkout); exist_ok=True makes this a no-op otherwise.
os.makedirs("data", exist_ok=True)
imputed_df.to_csv("data/imputed_values.csv", index=False)