# Data Preparation for Nutritional Composition Wildmeat

This notebook organizes and cleans the raw dataset of wild meat nutritional composition, laying the groundwork for reliable analysis and imputation.

## Overview
This notebook prepares the raw wild meat nutritional composition dataset by cleaning, selecting relevant columns, handling missing data, and extracting taxonomic information to enable downstream analysis.


In [2]:
import pandas as pd

pd.set_option("display.max_columns", 500)


## Setup

Import required libraries and configure pandas display options.

In [3]:
df = pd.read_csv("data/Thales_Planilha_Geral_MJ_JM_15_07 (2) (1).csv", sep=";")


## Data Loading

Load the raw dataset from CSV and specify the delimiter for correct parsing.

In [4]:
df

Unnamed: 0,OBS,Código,Amostra,Reserva,Nome científico,Espécie,Classe,Umidade,Cinzas,Lipídios,Proteínas,Mg 24 (ug/g),Fe 57 (ug/g),Ag 107 (ng/g),Al 27 (ug/g),Ba 138 (ug/g),Co 59 (ng/g),Cu 63 (ug/g),Zn 66 (ug/g),Se 82 (ng/g),Ti 205 (ng/g),Li 7 (ng/g),Rb 85 (ug/g),Sr 88 (ug/g),Cs 133 (ng/g),Mn 55 (ng/g),Ni 60 (ng/g),U 238 (ng/g),Sb 121 (ng/g),Sn 118 (ng/g),Te 130 (ng/g),Hg 202 (ng/g),As 75 (ng/g),Cd 111 (ng/g),Pb 208 (ng/g)
0,,006C,A1,RDSM,Melanosuchus niger,Jacaré-açu,Répteis,76.57,4.41,18.55,92.85,79.60,2.72,0.56,0.01,0.01,3.60,0.58,18.47,500.69,5.24,5.30,79.09,0.01,221.13,210.25,0.01,0.14,1.77,9.89,0.65,3435.75,63.42,3.87,0.01
1,,006C,A2,RDSM,Melanosuchus niger,Jacaré-açu,Répteis,76.57,4.86,17.03,90.83,,,,,,,,,,,,,,,,,,,,,,,,
2,,006C,A3,RDSM,Melanosuchus niger,Jacaré-açu,Répteis,76.57,4.79,18.57,92.02,,,,,,,,,,,,,,,,,,,,,,,,
3,,029C,A1,RDSA,Cuniculus paca,Paca,Mamíferos,71.17,4.56,17.68,86.08,615.04,46.64,0.44,726.79,34.31,6.95,2.09,24.22,737.03,1.74,2.73,87.23,274.50,1622.04,217.62,77.49,0.20,6.13,21.54,0.01,0.01,5.50,33.63,182.05
4,,029C,A2,RDSA,Cuniculus paca,Paca,Mamíferos,71.17,4.61,18.19,86.77,,,,,,,,,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
994,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
995,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
996,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
997,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [5]:
# Preview the first rows of the dataset to verify it loaded correctly
df.head()


Unnamed: 0,OBS,Código,Amostra,Reserva,Nome científico,Espécie,Classe,Umidade,Cinzas,Lipídios,Proteínas,Mg 24 (ug/g),Fe 57 (ug/g),Ag 107 (ng/g),Al 27 (ug/g),Ba 138 (ug/g),Co 59 (ng/g),Cu 63 (ug/g),Zn 66 (ug/g),Se 82 (ng/g),Ti 205 (ng/g),Li 7 (ng/g),Rb 85 (ug/g),Sr 88 (ug/g),Cs 133 (ng/g),Mn 55 (ng/g),Ni 60 (ng/g),U 238 (ng/g),Sb 121 (ng/g),Sn 118 (ng/g),Te 130 (ng/g),Hg 202 (ng/g),As 75 (ng/g),Cd 111 (ng/g),Pb 208 (ng/g)
0,,006C,A1,RDSM,Melanosuchus niger,Jacaré-açu,Répteis,76.57,4.41,18.55,92.85,79.6,2.72,0.56,0.01,0.01,3.6,0.58,18.47,500.69,5.24,5.3,79.09,0.01,221.13,210.25,0.01,0.14,1.77,9.89,0.65,3435.75,63.42,3.87,0.01
1,,006C,A2,RDSM,Melanosuchus niger,Jacaré-açu,Répteis,76.57,4.86,17.03,90.83,,,,,,,,,,,,,,,,,,,,,,,,
2,,006C,A3,RDSM,Melanosuchus niger,Jacaré-açu,Répteis,76.57,4.79,18.57,92.02,,,,,,,,,,,,,,,,,,,,,,,,
3,,029C,A1,RDSA,Cuniculus paca,Paca,Mamíferos,71.17,4.56,17.68,86.08,615.04,46.64,0.44,726.79,34.31,6.95,2.09,24.22,737.03,1.74,2.73,87.23,274.5,1622.04,217.62,77.49,0.2,6.13,21.54,0.01,0.01,5.5,33.63,182.05
4,,029C,A2,RDSA,Cuniculus paca,Paca,Mamíferos,71.17,4.61,18.19,86.77,,,,,,,,,,,,,,,,,,,,,,,,


## Preview Loaded Data

Display the first few rows of the dataframe to verify successful loading.

In [6]:
df.dropna(subset=["Mg 24 (ug/g)"], inplace=True)
df

Unnamed: 0,OBS,Código,Amostra,Reserva,Nome científico,Espécie,Classe,Umidade,Cinzas,Lipídios,Proteínas,Mg 24 (ug/g),Fe 57 (ug/g),Ag 107 (ng/g),Al 27 (ug/g),Ba 138 (ug/g),Co 59 (ng/g),Cu 63 (ug/g),Zn 66 (ug/g),Se 82 (ng/g),Ti 205 (ng/g),Li 7 (ng/g),Rb 85 (ug/g),Sr 88 (ug/g),Cs 133 (ng/g),Mn 55 (ng/g),Ni 60 (ng/g),U 238 (ng/g),Sb 121 (ng/g),Sn 118 (ng/g),Te 130 (ng/g),Hg 202 (ng/g),As 75 (ng/g),Cd 111 (ng/g),Pb 208 (ng/g)
0,,006C,A1,RDSM,Melanosuchus niger,Jacaré-açu,Répteis,76.57,4.41,18.55,92.85,79.60,2.72,0.56,0.01,0.01,3.60,0.58,18.47,500.69,5.24,5.30,79.09,0.01,221.13,210.25,0.01,0.14,1.77,9.89,0.65,3435.75,63.42,3.87,0.01
3,,029C,A1,RDSA,Cuniculus paca,Paca,Mamíferos,71.17,4.56,17.68,86.08,615.04,46.64,0.44,726.79,34.31,6.95,2.09,24.22,737.03,1.74,2.73,87.23,274.50,1622.04,217.62,77.49,0.20,6.13,21.54,0.01,0.01,5.50,33.63,182.05
6,,038C,A1,RDSA,Cuniculus paca,Paca,Mamíferos,72.26,3.70,22.44,85.21,297.82,23.71,0.01,0.01,0.01,4.84,1.82,48.86,437.40,2.32,10.06,101.83,241.04,473.04,47.92,0.01,0.01,0.00,6.91,0.02,6.18,8.70,9.41,0.01
9,,041C,A1,RDSA,Mazama americana,Veado capoeira,Mamíferos,73.06,4.51,24.58,90.53,313.00,21.01,0.01,12056.64,0.01,4.49,3.52,25.64,339.99,1.95,3.53,39.43,46.55,639.45,97.52,32.50,0.18,3.13,23.20,0.10,3.31,6.54,0.90,0.01
12,,043C,A1,RDSA,Cuniculus paca,Paca,Mamíferos,75.58,4.52,20.24,87.68,194.56,25.26,0.01,0.01,0.01,12.52,3.33,58.91,552.87,2.15,38.13,68.99,59.23,430.14,132.09,6.71,0.01,0.46,6.13,0.01,0.01,6.50,2.69,0.01
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
252,só micros,LOR007C,,RDSM,Alouatta seniculus,Macaco Guariba,Mamíferos,77.17,,,,428.72,31.32,,,74.22,57.27,7.79,228.08,3.27,5.99,144.13,112.10,0.28,0.19,0.44,153.77,0.25,3.07,12.22,,0.02,26.73,1841.17,167.35
253,só micros,LOR002C,,RDSM,Podocnemis unifilis,Tracajá,Répteis,81.46,,,,420.65,141.16,,,1041.39,25.48,17.38,245.98,0.40,20.31,19.09,68.85,2.32,0.06,2.15,58.54,0.45,2.47,28.54,,0.06,23.53,20.21,101.33
254,só micros,LOR003C,,RDSM,Podocnemis expansa,Tartaruga,Répteis,74.76,,,,934.00,36.80,,,644.29,34.93,4.77,88.92,4.63,43.33,10.93,259.22,1.26,0.28,1.43,72.27,0.31,3.84,39.19,,0.11,31.71,20.13,104.42
255,só micros,LOR004C,,RDSM,Podocnemis unifilis,Tracajá,Répteis,76.90,,,,596.56,46.72,,,398.33,29.47,4.36,76.40,0.51,13.59,26.42,139.76,1.02,0.18,1.32,134.92,0.38,4.25,59.38,,0.09,43.99,27.24,102.26


## Remove Missing Values for Magnesium

Drop rows with missing values in the magnesium column to ensure accurate Mg analysis.

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 89 entries, 0 to 256
Data columns (total 35 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   OBS              16 non-null     object 
 1   Código           89 non-null     object 
 2   Amostra          83 non-null     object 
 3   Reserva          84 non-null     object 
 4   Nome científico  89 non-null     object 
 5   Espécie          89 non-null     object 
 6   Classe           89 non-null     object 
 7   Umidade          42 non-null     float64
 8   Cinzas           83 non-null     float64
 9   Lipídios         80 non-null     float64
 10  Proteínas        83 non-null     float64
 11  Mg 24 (ug/g)     89 non-null     float64
 12  Fe 57 (ug/g)     89 non-null     float64
 13  Ag 107 (ng/g)    68 non-null     float64
 14  Al 27 (ug/g)     68 non-null     float64
 15  Ba 138 (ug/g)    89 non-null     float64
 16  Co 59 (ng/g)     89 non-null     float64
 17  Cu 63 (ug/g)     89 no

## Initial Data Overview

Inspect dataframe structure, data types, and non-null counts to understand dataset completeness.

In [8]:
df["Nome científico"].value_counts().sort_index()

Nome científico
Alouatta seniculus             4
Boa constrictor                1
Busarellus nigricollis         1
Caiman crocodilus              4
Cebus apella                   1
Crax globulosa                 3
Cuniculus paca                27
Dasyprocta fuliginosa          9
Eunectes murinus               1
Euphractus sexcinctus          4
Mazama americana               5
Melanosuchus niger             3
Mergus octosetaceus            1
Mesoclemmys raniceps           1
Mitu tuberosum                 2
Paleosuchus palpebrosus        1
Penelope jacquaca              5
Phalacrocorax brasilianus      1
Pipile cumanensis              1
Podocnemis expansa             1
Podocnemis unifilis            2
Porphyrio martinica            1
Puma concolor                  1
Tapirus terrestris             1
Tayassu tajacu                 1
 Podocnemis sextuberculata     1
﻿Cairina moschata              5
﻿Sotalia fluviatilis           1
Name: count, dtype: int64

In [9]:
df["Reserva"].value_counts()

Reserva
RDSA     53
RDSM     20
FLONA    11
Name: count, dtype: int64

In [10]:
df.columns

Index(['OBS', 'Código', 'Amostra', 'Reserva', 'Nome científico', 'Espécie',
       'Classe', 'Umidade', 'Cinzas', 'Lipídios ', 'Proteínas', 'Mg 24 (ug/g)',
       'Fe 57 (ug/g)', 'Ag 107 (ng/g)', 'Al 27 (ug/g)', 'Ba 138 (ug/g)',
       'Co 59 (ng/g)', 'Cu 63 (ug/g)', 'Zn 66 (ug/g)', 'Se 82 (ng/g)',
       'Ti 205 (ng/g)', 'Li 7 (ng/g)', 'Rb 85 (ug/g)', 'Sr 88 (ug/g)',
       'Cs 133 (ng/g)', 'Mn 55 (ng/g)', 'Ni 60 (ng/g)', 'U 238 (ng/g)',
       'Sb 121 (ng/g)', 'Sn 118 (ng/g)', 'Te 130 (ng/g)', 'Hg 202 (ng/g)',
       'As 75 (ng/g)', 'Cd 111 (ng/g)', 'Pb 208 (ng/g)'],
      dtype='object')

## Select Relevant Columns
We select only the identifier and the nutritional element columns needed for downstream analysis.


In [11]:
df = df[
    [
        "Código",
        "Nome científico",
        "Umidade",
        "Cinzas",
        "Lipídios ",
        "Proteínas",
        "Mg 24 (ug/g)",
        "Fe 57 (ug/g)",
        "Ag 107 (ng/g)",
        "Al 27 (ug/g)",
        "Ba 138 (ug/g)",
        "Co 59 (ng/g)",
        "Cu 63 (ug/g)",
        "Zn 66 (ug/g)",
        "Se 82 (ng/g)",
        "Ti 205 (ng/g)",
        "Li 7 (ng/g)",
        "Rb 85 (ug/g)",
        "Sr 88 (ug/g)",
        "Cs 133 (ng/g)",
        "Mn 55 (ng/g)",
        "Ni 60 (ng/g)",
        "U 238 (ng/g)",
        "Sb 121 (ng/g)",
        "Sn 118 (ng/g)",
        "Te 130 (ng/g)",
        "Hg 202 (ng/g)",
        "As 75 (ng/g)",
        "Cd 111 (ng/g)",
        "Pb 208 (ng/g)",
    ]
]

## Column Subsetting

Select only relevant nutritional element columns and identifiers for downstream analysis.

In [12]:
df

Unnamed: 0,Código,Nome científico,Umidade,Cinzas,Lipídios,Proteínas,Mg 24 (ug/g),Fe 57 (ug/g),Ag 107 (ng/g),Al 27 (ug/g),Ba 138 (ug/g),Co 59 (ng/g),Cu 63 (ug/g),Zn 66 (ug/g),Se 82 (ng/g),Ti 205 (ng/g),Li 7 (ng/g),Rb 85 (ug/g),Sr 88 (ug/g),Cs 133 (ng/g),Mn 55 (ng/g),Ni 60 (ng/g),U 238 (ng/g),Sb 121 (ng/g),Sn 118 (ng/g),Te 130 (ng/g),Hg 202 (ng/g),As 75 (ng/g),Cd 111 (ng/g),Pb 208 (ng/g)
0,006C,Melanosuchus niger,76.57,4.41,18.55,92.85,79.60,2.72,0.56,0.01,0.01,3.60,0.58,18.47,500.69,5.24,5.30,79.09,0.01,221.13,210.25,0.01,0.14,1.77,9.89,0.65,3435.75,63.42,3.87,0.01
3,029C,Cuniculus paca,71.17,4.56,17.68,86.08,615.04,46.64,0.44,726.79,34.31,6.95,2.09,24.22,737.03,1.74,2.73,87.23,274.50,1622.04,217.62,77.49,0.20,6.13,21.54,0.01,0.01,5.50,33.63,182.05
6,038C,Cuniculus paca,72.26,3.70,22.44,85.21,297.82,23.71,0.01,0.01,0.01,4.84,1.82,48.86,437.40,2.32,10.06,101.83,241.04,473.04,47.92,0.01,0.01,0.00,6.91,0.02,6.18,8.70,9.41,0.01
9,041C,Mazama americana,73.06,4.51,24.58,90.53,313.00,21.01,0.01,12056.64,0.01,4.49,3.52,25.64,339.99,1.95,3.53,39.43,46.55,639.45,97.52,32.50,0.18,3.13,23.20,0.10,3.31,6.54,0.90,0.01
12,043C,Cuniculus paca,75.58,4.52,20.24,87.68,194.56,25.26,0.01,0.01,0.01,12.52,3.33,58.91,552.87,2.15,38.13,68.99,59.23,430.14,132.09,6.71,0.01,0.46,6.13,0.01,0.01,6.50,2.69,0.01
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
252,LOR007C,Alouatta seniculus,77.17,,,,428.72,31.32,,,74.22,57.27,7.79,228.08,3.27,5.99,144.13,112.10,0.28,0.19,0.44,153.77,0.25,3.07,12.22,,0.02,26.73,1841.17,167.35
253,LOR002C,Podocnemis unifilis,81.46,,,,420.65,141.16,,,1041.39,25.48,17.38,245.98,0.40,20.31,19.09,68.85,2.32,0.06,2.15,58.54,0.45,2.47,28.54,,0.06,23.53,20.21,101.33
254,LOR003C,Podocnemis expansa,74.76,,,,934.00,36.80,,,644.29,34.93,4.77,88.92,4.63,43.33,10.93,259.22,1.26,0.28,1.43,72.27,0.31,3.84,39.19,,0.11,31.71,20.13,104.42
255,LOR004C,Podocnemis unifilis,76.90,,,,596.56,46.72,,,398.33,29.47,4.36,76.40,0.51,13.59,26.42,139.76,1.02,0.18,1.32,134.92,0.38,4.25,59.38,,0.09,43.99,27.24,102.26


In [13]:
df["Nome científico"] = df["Nome científico"].str.replace("\xa0", " ")
df["Nome científico"] = df["Nome científico"].str.strip()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["Nome científico"] = df["Nome científico"].str.replace("\xa0", " ")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["Nome científico"] = df["Nome científico"].str.strip()


## Scientific Name Cleaning

Replace non-breaking spaces and trim entries to standardize the `Nome científico` column.

In [14]:
df["Gênero"] = df["Nome científico"].str.split(" ").str[0].str.strip().str.lower()
df["Espécie"] = df["Nome científico"].str.split(" ").str[1].str.strip().str.lower()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["Gênero"] = df["Nome científico"].str.split(" ").str[0].str.strip().str.lower()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["Espécie"] = df["Nome científico"].str.split(" ").str[1].str.strip().str.lower()


## Extract Genus and Species

Split standardized scientific names into lowercase `Gênero` and `Espécie` columns for taxonomic analysis.

In [15]:
df

Unnamed: 0,Código,Nome científico,Umidade,Cinzas,Lipídios,Proteínas,Mg 24 (ug/g),Fe 57 (ug/g),Ag 107 (ng/g),Al 27 (ug/g),Ba 138 (ug/g),Co 59 (ng/g),Cu 63 (ug/g),Zn 66 (ug/g),Se 82 (ng/g),Ti 205 (ng/g),Li 7 (ng/g),Rb 85 (ug/g),Sr 88 (ug/g),Cs 133 (ng/g),Mn 55 (ng/g),Ni 60 (ng/g),U 238 (ng/g),Sb 121 (ng/g),Sn 118 (ng/g),Te 130 (ng/g),Hg 202 (ng/g),As 75 (ng/g),Cd 111 (ng/g),Pb 208 (ng/g),Gênero,Espécie
0,006C,Melanosuchus niger,76.57,4.41,18.55,92.85,79.60,2.72,0.56,0.01,0.01,3.60,0.58,18.47,500.69,5.24,5.30,79.09,0.01,221.13,210.25,0.01,0.14,1.77,9.89,0.65,3435.75,63.42,3.87,0.01,melanosuchus,niger
3,029C,Cuniculus paca,71.17,4.56,17.68,86.08,615.04,46.64,0.44,726.79,34.31,6.95,2.09,24.22,737.03,1.74,2.73,87.23,274.50,1622.04,217.62,77.49,0.20,6.13,21.54,0.01,0.01,5.50,33.63,182.05,cuniculus,paca
6,038C,Cuniculus paca,72.26,3.70,22.44,85.21,297.82,23.71,0.01,0.01,0.01,4.84,1.82,48.86,437.40,2.32,10.06,101.83,241.04,473.04,47.92,0.01,0.01,0.00,6.91,0.02,6.18,8.70,9.41,0.01,cuniculus,paca
9,041C,Mazama americana,73.06,4.51,24.58,90.53,313.00,21.01,0.01,12056.64,0.01,4.49,3.52,25.64,339.99,1.95,3.53,39.43,46.55,639.45,97.52,32.50,0.18,3.13,23.20,0.10,3.31,6.54,0.90,0.01,mazama,americana
12,043C,Cuniculus paca,75.58,4.52,20.24,87.68,194.56,25.26,0.01,0.01,0.01,12.52,3.33,58.91,552.87,2.15,38.13,68.99,59.23,430.14,132.09,6.71,0.01,0.46,6.13,0.01,0.01,6.50,2.69,0.01,cuniculus,paca
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
252,LOR007C,Alouatta seniculus,77.17,,,,428.72,31.32,,,74.22,57.27,7.79,228.08,3.27,5.99,144.13,112.10,0.28,0.19,0.44,153.77,0.25,3.07,12.22,,0.02,26.73,1841.17,167.35,alouatta,seniculus
253,LOR002C,Podocnemis unifilis,81.46,,,,420.65,141.16,,,1041.39,25.48,17.38,245.98,0.40,20.31,19.09,68.85,2.32,0.06,2.15,58.54,0.45,2.47,28.54,,0.06,23.53,20.21,101.33,podocnemis,unifilis
254,LOR003C,Podocnemis expansa,74.76,,,,934.00,36.80,,,644.29,34.93,4.77,88.92,4.63,43.33,10.93,259.22,1.26,0.28,1.43,72.27,0.31,3.84,39.19,,0.11,31.71,20.13,104.42,podocnemis,expansa
255,LOR004C,Podocnemis unifilis,76.90,,,,596.56,46.72,,,398.33,29.47,4.36,76.40,0.51,13.59,26.42,139.76,1.02,0.18,1.32,134.92,0.38,4.25,59.38,,0.09,43.99,27.24,102.26,podocnemis,unifilis


In [16]:
df.drop(columns=["Nome científico"], inplace=True)
df

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.drop(columns=["Nome científico"], inplace=True)


Unnamed: 0,Código,Umidade,Cinzas,Lipídios,Proteínas,Mg 24 (ug/g),Fe 57 (ug/g),Ag 107 (ng/g),Al 27 (ug/g),Ba 138 (ug/g),Co 59 (ng/g),Cu 63 (ug/g),Zn 66 (ug/g),Se 82 (ng/g),Ti 205 (ng/g),Li 7 (ng/g),Rb 85 (ug/g),Sr 88 (ug/g),Cs 133 (ng/g),Mn 55 (ng/g),Ni 60 (ng/g),U 238 (ng/g),Sb 121 (ng/g),Sn 118 (ng/g),Te 130 (ng/g),Hg 202 (ng/g),As 75 (ng/g),Cd 111 (ng/g),Pb 208 (ng/g),Gênero,Espécie
0,006C,76.57,4.41,18.55,92.85,79.60,2.72,0.56,0.01,0.01,3.60,0.58,18.47,500.69,5.24,5.30,79.09,0.01,221.13,210.25,0.01,0.14,1.77,9.89,0.65,3435.75,63.42,3.87,0.01,melanosuchus,niger
3,029C,71.17,4.56,17.68,86.08,615.04,46.64,0.44,726.79,34.31,6.95,2.09,24.22,737.03,1.74,2.73,87.23,274.50,1622.04,217.62,77.49,0.20,6.13,21.54,0.01,0.01,5.50,33.63,182.05,cuniculus,paca
6,038C,72.26,3.70,22.44,85.21,297.82,23.71,0.01,0.01,0.01,4.84,1.82,48.86,437.40,2.32,10.06,101.83,241.04,473.04,47.92,0.01,0.01,0.00,6.91,0.02,6.18,8.70,9.41,0.01,cuniculus,paca
9,041C,73.06,4.51,24.58,90.53,313.00,21.01,0.01,12056.64,0.01,4.49,3.52,25.64,339.99,1.95,3.53,39.43,46.55,639.45,97.52,32.50,0.18,3.13,23.20,0.10,3.31,6.54,0.90,0.01,mazama,americana
12,043C,75.58,4.52,20.24,87.68,194.56,25.26,0.01,0.01,0.01,12.52,3.33,58.91,552.87,2.15,38.13,68.99,59.23,430.14,132.09,6.71,0.01,0.46,6.13,0.01,0.01,6.50,2.69,0.01,cuniculus,paca
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
252,LOR007C,77.17,,,,428.72,31.32,,,74.22,57.27,7.79,228.08,3.27,5.99,144.13,112.10,0.28,0.19,0.44,153.77,0.25,3.07,12.22,,0.02,26.73,1841.17,167.35,alouatta,seniculus
253,LOR002C,81.46,,,,420.65,141.16,,,1041.39,25.48,17.38,245.98,0.40,20.31,19.09,68.85,2.32,0.06,2.15,58.54,0.45,2.47,28.54,,0.06,23.53,20.21,101.33,podocnemis,unifilis
254,LOR003C,74.76,,,,934.00,36.80,,,644.29,34.93,4.77,88.92,4.63,43.33,10.93,259.22,1.26,0.28,1.43,72.27,0.31,3.84,39.19,,0.11,31.71,20.13,104.42,podocnemis,expansa
255,LOR004C,76.90,,,,596.56,46.72,,,398.33,29.47,4.36,76.40,0.51,13.59,26.42,139.76,1.02,0.18,1.32,134.92,0.38,4.25,59.38,,0.09,43.99,27.24,102.26,podocnemis,unifilis


In [17]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 89 entries, 0 to 256
Data columns (total 31 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Código         89 non-null     object 
 1   Umidade        42 non-null     float64
 2   Cinzas         83 non-null     float64
 3   Lipídios       80 non-null     float64
 4   Proteínas      83 non-null     float64
 5   Mg 24 (ug/g)   89 non-null     float64
 6   Fe 57 (ug/g)   89 non-null     float64
 7   Ag 107 (ng/g)  68 non-null     float64
 8   Al 27 (ug/g)   68 non-null     float64
 9   Ba 138 (ug/g)  89 non-null     float64
 10  Co 59 (ng/g)   89 non-null     float64
 11  Cu 63 (ug/g)   89 non-null     float64
 12  Zn 66 (ug/g)   89 non-null     float64
 13  Se 82 (ng/g)   89 non-null     float64
 14  Ti 205 (ng/g)  89 non-null     float64
 15  Li 7 (ng/g)    89 non-null     float64
 16  Rb 85 (ug/g)   89 non-null     float64
 17  Sr 88 (ug/g)   89 non-null     float64
 18  Cs 133 (ng/g)  8

In [18]:
cols_to_impute = [col for col in df.columns if df[col].isnull().sum() > 0]
cols_to_impute

['Umidade',
 'Cinzas',
 'Lipídios ',
 'Proteínas',
 'Ag 107 (ng/g)',
 'Al 27 (ug/g)',
 'Mn 55 (ng/g)',
 'Ni 60 (ng/g)',
 'Te 130 (ng/g)']

## Identify Missing Data for Imputation
Determine which columns have missing values and count them to prioritize the imputation order.


In [19]:
order_to_impute = {}

for col in cols_to_impute:
    order_to_impute[col] = df[col].isnull().sum()

order_to_impute = {k: v for k, v in sorted(order_to_impute.items(), key=lambda item: item[1])}

order_to_impute

{'Mn 55 (ng/g)': np.int64(1),
 'Ni 60 (ng/g)': np.int64(1),
 'Cinzas': np.int64(6),
 'Proteínas': np.int64(6),
 'Lipídios ': np.int64(9),
 'Ag 107 (ng/g)': np.int64(21),
 'Al 27 (ug/g)': np.int64(21),
 'Te 130 (ng/g)': np.int64(21),
 'Umidade': np.int64(47)}

## Export Cleaned Data


In [20]:
df.to_parquet("data/01_cleaned.parquet", index=False)

## Save Imputation Order

Export the computed order of columns to impute based on missing value frequencies.

In [21]:
order_to_impute = {k: int(v) for k, v in order_to_impute.items()}

In [22]:
import json

with open("data/01_order_to_impute.json", "w") as f:
    json.dump(order_to_impute, f)