# Regression Tests for Bionutrient Analysis

This notebook performs regression analysis on nutrient data, focusing on the prioritization of food composition and consumption research. It includes data preprocessing, model training, and visualization of results using various machine learning techniques.

In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
from glob import glob
import re
import time
from multiprocessing import cpu_count

# Machine learning libraries
from sklearn.model_selection import StratifiedKFold, cross_val_predict, train_test_split, KFold
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import LabelEncoder

# Visualization libraries
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go

# Machine learning models
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from sklearn.linear_model import LinearRegression, SGDRegressor
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import LinearSVR
from sklearn.neural_network import MLPRegressor

# Set pandas display options
pd.options.display.float_format = '{:.5f}'.format

## Data Loading and Preprocessing

In [2]:
# Load Excel files from specified directories
files_bio = glob('input/BIOLOGOS/*.xlsx')
files_food = glob('input/ALIMENTOS/*.xlsx')

print(f"Number of biologist files: {len(files_bio)}, Number of food scientist files: {len(files_food)}")

Number of biologist files: 5, Number of food scientist files: 13


In [3]:
def process_files(file_list, labeler_type):
    """Process Excel files and return a dictionary of DataFrames."""
    dfs = {}
    for file in file_list:
        df = pd.read_excel(file)
        labeler = file.split('/')[-1].split('.')[0]
        match = re.findall(r'BIONUT - (\w+)', labeler)
        if match:
            labeler = match[0]
        df['person'] = labeler
        df['labeler_from_food_science'] = 1 if labeler_type == 'FOOD' else 0
        print(f"Processed file for: {labeler}")
        dfs[labeler] = df
    return dfs

dfs_bio = process_files(files_bio, 'BIO')
dfs_food = process_files(files_food, 'FOOD')

Processed file for: ALVES
Processed file for: BIZRI
Processed file for: MORCATTY
Processed file for: OLIVEIRA
Processed file for: ZENOBIA
Processed file for: AQUINO
Processed file for: BATISTA
Processed file for: BEZERRA
Processed file for: CARIOCA
Processed file for: CAZARIN
Processed file for: GIUTINI
Processed file for: LIMA
Processed file for: MORAIS
Processed file for: NORDE
Processed file for: OLIVEIRA
Processed file for: SANTOS
Processed file for: STELUTI
Processed file for: TEIXEIRA


In [4]:
# Define Likert scale mapping
likert_scale = {
    'Muito Baixa': 1,
    'Baixa': 2,
    'Neutra': 3,
    'Alta': 4,
    'Muito Alta': 5
}

def process_likert_scores(dfs):
    """Process Likert scale scores in the DataFrames."""
    for _, df in dfs.items():
        df['priority_composition'] = df['Considerando a escala seguinte, qual a prioridade para que esse alimento seja analisado em estudos de composição nutricional? '].replace(likert_scale).fillna(0)
        df['priority_consumption'] = df['Considerando a escala seguinte, qual a prioridade para que esse alimento seja incluído em pesquisas de consumo alimentar?'].replace(likert_scale).fillna(0)
        df['confidence_composition'] = df['Qual o seu nível de confiança nessa resposta relacionada à priorização de composição?'].replace(likert_scale).fillna(0)
        df['confidence_consumption'] = df['Qual o seu nível de confiança na sua resposta relacionada à priorização de consumo?'].replace(likert_scale).fillna(0)

process_likert_scores(dfs_bio)
process_likert_scores(dfs_food)

## Data Processing and Feature Engineering

In [5]:
import numpy as np
import pandas as pd
from typing import Dict

def create_confidence_array(dataframes: Dict[str, pd.DataFrame], score_type: str) -> np.ndarray:
    """Create a numpy array of confidence scores from a dictionary of DataFrames.
    
    Args:
        dataframes (Dict[str, pd.DataFrame]): Dictionary of DataFrames containing confidence scores.
        score_type (str): Type of score to extract (e.g., 'composition', 'consumption').

    Returns:
        np.ndarray: Numpy array of confidence scores.
    """
    # Get the first key to determine the shape of the array
    first_key = list(dataframes.keys())[0]
    array_shape = (len(dataframes[first_key]), len(dataframes))
    
    # Initialize the confidence array with zeros
    confidence_array = np.zeros(array_shape, dtype=int)
    
    # Populate the confidence array with values from the DataFrames
    for index, (_, dataframe) in enumerate(dataframes.items()):
        confidence_array[:, index] = dataframe[f'confidence_{score_type}'].values
    
    # Replace zeros with the most common value in the second column
    most_common_value = np.argmax(np.bincount(confidence_array[:, 1]))
    confidence_array[confidence_array == 0] = most_common_value
    
    return confidence_array

# Create confidence arrays for different score types and datasets
confidence_composition_food = create_confidence_array(dfs_food, 'composition')
confidence_consumption_food = create_confidence_array(dfs_food, 'consumption')
confidence_composition_bio = create_confidence_array(dfs_bio, 'composition')
confidence_consumption_bio = create_confidence_array(dfs_bio, 'consumption')

print("Confidence arrays created for composition and consumption scores.")

Confidence arrays created for composition and consumption scores.


In [6]:
confidence_consumption_food.shape, confidence_composition_food.shape

((369, 13), (369, 13))

In [7]:
confidence_consumption_bio.shape, confidence_composition_bio.shape

((369, 5), (369, 5))

In [8]:
import numpy as np
import pandas as pd
from typing import Dict

def create_priority_array(dataframes: Dict[str, pd.DataFrame], score_type: str) -> np.ndarray:
    """Create a numpy array of priority scores from a dictionary of DataFrames.
    
    Args:
        dataframes (Dict[str, pd.DataFrame]): Dictionary of DataFrames containing priority scores.
        score_type (str): Type of score to extract (e.g., 'composition', 'consumption').

    Returns:
        np.ndarray: Numpy array of priority scores.
    """
    # Get the first key to determine the shape of the array
    first_key = list(dataframes.keys())[0]
    array_shape = (len(dataframes[first_key]), len(dataframes))
    
    # Initialize the priority array with zeros
    priority_array = np.zeros(array_shape, dtype=int)
    
    # Populate the priority array with values from the DataFrames
    for index, (_, dataframe) in enumerate(dataframes.items()):
        priority_array[:, index] = dataframe[f'priority_{score_type}'].values
    
    # Replace zeros with the most common value in the second column
    most_common_value = np.argmax(np.bincount(priority_array[:, 1]))
    priority_array[priority_array == 0] = most_common_value
    
    return priority_array

# Create priority arrays for different score types and datasets
priority_composition_food = create_priority_array(dfs_food, 'composition')
priority_consumption_food = create_priority_array(dfs_food, 'consumption')
priority_composition_bio = create_priority_array(dfs_bio, 'composition')
priority_consumption_bio = create_priority_array(dfs_bio, 'consumption')

print("Priority arrays created for composition and consumption scores.")

Priority arrays created for composition and consumption scores.


In [9]:
priority_composition_food.shape, priority_consumption_food.shape

((369, 13), (369, 13))

In [10]:
priority_composition_bio.shape, priority_consumption_bio.shape

((369, 5), (369, 5))

In [11]:

# Define feature columns
feature_columns = [
    'grupo de alimentos ', 'distribuição por estado ',
    'risco de extinção', 'origem', 'cultivada comercialmente',
    'Existe informação nutricional desta espécie disponível em tabela de composição de alimentos?',
    'Quantidade de receitas informadas com esse ingrediente na Pesquisa de Orçamentos Familiares (POF/IBGE).',
    'Quantidade de receitas informadas com esse ingrediente em sites e livros populares.',
    'labeler_from_food_science'
]

biologist_example = list(dfs_bio.keys())[0]
food_scientist_example = list(dfs_food.keys())[0]

# Combine data from biologists and food scientists
combined_features =  pd.concat([dfs_bio[biologist_example][feature_columns], dfs_food[food_scientist_example][feature_columns]])

# Calculate weighted averages for composition and consumption priorities
weighted_composition_priority_bio = np.average(priority_composition_bio, weights=confidence_composition_bio, axis=1)
weighted_consumption_priority_bio = np.average(priority_consumption_bio, weights=confidence_consumption_bio, axis=1)
weighted_composition_priority_food = np.average(priority_composition_food, weights=confidence_composition_food, axis=1)
weighted_consumption_priority_food = np.average(priority_consumption_food, weights=confidence_consumption_food, axis=1)

# Combine weighted averages
combined_composition_priority = np.concatenate([weighted_composition_priority_bio, weighted_composition_priority_food])
combined_consumption_priority = np.concatenate([weighted_consumption_priority_bio, weighted_consumption_priority_food])

# Print shapes of the resulting matrices and vectors
print(f"Shape of feature matrix combined_features: {combined_features.shape}")
print(f"Shape of composition priority vector: {combined_composition_priority.shape}")
print(f"Shape of consumption priority vector: {combined_consumption_priority.shape}")

Shape of feature matrix combined_features: (738, 9)
Shape of composition priority vector: (738,)
Shape of consumption priority vector: (738,)


In [12]:
combined_features

Unnamed: 0,grupo de alimentos,distribuição por estado,risco de extinção,origem,cultivada comercialmente,Existe informação nutricional desta espécie disponível em tabela de composição de alimentos?,Quantidade de receitas informadas com esse ingrediente na Pesquisa de Orçamentos Familiares (POF/IBGE).,Quantidade de receitas informadas com esse ingrediente em sites e livros populares.,labeler_from_food_science
0,alga,BA,Não avaliada,Nativa,Não,Não existe,0,0.00000,0
1,alga,"BA, CE, PB, PE, PI, RN, SE, ES, RJ",Não avaliada,Nativa,Não,Não existe,0,0.00000,0
2,alga,"BA, CE, PE, SE, RJ, SP, PR, SC",Não avaliada,Nativa,Não,Não existe,0,0.00000,0
3,alga,"AL, BA, CE, PB, PE, PI, RN, ES, RJ, SP",Não avaliada,Nativa,Não,Não existe,0,0.00000,0
4,alga,"AL, BA, CE, MA, PB, PE, RN, ES",Não avaliada,Nativa,Não,Não existe,0,0.00000,0
...,...,...,...,...,...,...,...,...,...
364,planta,"AL, BA, PE, SE",Em Perigo,Nativa,Não,"Sim, em nível de espécie",0,1.00000,1
365,planta,"AC, AM, PA, RO, RR, TO, AL, BA, CE, MA, PE, PI...",Não avaliada,Nativa,Não,Não existe,0,0.00000,1
366,planta,"AC, AM, AP, PA, RO, RR, TO, AL, BA, CE, MA, PB...",Não avaliada,Nativa,Sim,Não existe,0,10.00000,1
367,planta,"GO, MS, MG, RJ, SP, PR, RS, SC",Não avaliada,Nativa,Não,"Sim, em nível de espécie",0,0.00000,1


In [13]:
# Feature engineering: Create 'number_states' feature
combined_features['number_states'] = combined_features['distribuição por estado '].astype(str).apply(lambda x: len(x.split(',')))

# Update feature columns list
new_feature_cols = [
    'grupo de alimentos ', 'number_states',
    'risco de extinção', 'origem', 'cultivada comercialmente',
    'Existe informação nutricional desta espécie disponível em tabela de composição de alimentos?',
    'Quantidade de receitas informadas com esse ingrediente na Pesquisa de Orçamentos Familiares (POF/IBGE).',
    'Quantidade de receitas informadas com esse ingrediente em sites e livros populares.',
    'labeler_from_food_science'
]

X = combined_features[new_feature_cols]

# Encode categorical variables
X['cultivada comercialmente'] = X['cultivada comercialmente'].replace({'Não': 0, 'Sim': 1})
X['Existe informação nutricional desta espécie disponível em tabela de composição de alimentos?'] = X['Existe informação nutricional desta espécie disponível em tabela de composição de alimentos?'].replace({
    'Não existe': 0,
    'Sim, pelo nome popular': 1,
    'Sim, em nível de gênero': 2,
    'Sim, em nível de espécie': 3
})

# One-hot encode remaining categorical variables
X = pd.get_dummies(X, columns=['grupo de alimentos ', 'risco de extinção', 'origem'])

# Fill NaN values with 0
X = X.fillna(0)

print("Feature engineering and encoding completed.")
print(f"Final shape of feature matrix X: {X.shape}")

Feature engineering and encoding completed.
Final shape of feature matrix X: (738, 23)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['cultivada comercialmente'] = X['cultivada comercialmente'].replace({'Não': 0, 'Sim': 1})
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['Existe informação nutricional desta espécie disponível em tabela de composição de alimentos?'] = X['Existe informação nutricional desta espécie disponível em tabela de composição de alimentos?'].replace({


In [14]:
X

Unnamed: 0,number_states,cultivada comercialmente,Existe informação nutricional desta espécie disponível em tabela de composição de alimentos?,Quantidade de receitas informadas com esse ingrediente na Pesquisa de Orçamentos Familiares (POF/IBGE).,Quantidade de receitas informadas com esse ingrediente em sites e livros populares.,labeler_from_food_science,grupo de alimentos _alga,grupo de alimentos _caça,grupo de alimentos _cogumelo,grupo de alimentos _inseto,...,risco de extinção_Extinta,risco de extinção_Não avaliada,risco de extinção_Quase Ameaçada,risco de extinção_Segura ou Pouco Preocupante,risco de extinção_Sem dados,risco de extinção_Vulnerável,origem_Exótica,origem_Nativa,origem_Ocorre naturalmente,origem_Residente
0,1,0,0,0,0.00000,0,True,False,False,False,...,False,True,False,False,False,False,False,True,False,False
1,9,0,0,0,0.00000,0,True,False,False,False,...,False,True,False,False,False,False,False,True,False,False
2,8,0,0,0,0.00000,0,True,False,False,False,...,False,True,False,False,False,False,False,True,False,False
3,10,0,0,0,0.00000,0,True,False,False,False,...,False,True,False,False,False,False,False,True,False,False
4,8,0,0,0,0.00000,0,True,False,False,False,...,False,True,False,False,False,False,False,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
364,4,0,3,0,1.00000,1,False,False,False,False,...,False,False,False,False,False,False,False,True,False,False
365,22,0,0,0,0.00000,1,False,False,False,False,...,False,True,False,False,False,False,False,True,False,False
366,27,1,0,0,10.00000,1,False,False,False,False,...,False,True,False,False,False,False,False,True,False,False
367,8,0,3,0,0.00000,1,False,False,False,False,...,False,True,False,False,False,False,False,True,False,False


## Model Training and Evaluation

In [15]:
def train_models(X, y):
    """Train and evaluate multiple regression models."""
    random_state = 314
    
    models = [
        ('LinearSVR', LinearSVR(random_state=random_state)),
        ('LinearRegression', LinearRegression(n_jobs=-1)),
        ('RandomForestRegressor', RandomForestRegressor(random_state=random_state)),
        ('LGBMRegressor', LGBMRegressor(random_state=random_state, verbose=-1)),
        ('XGBRegressor', XGBRegressor(random_state=random_state, verbosity=0)),
        ('MLPRegressor', MLPRegressor(random_state=random_state)),
        ('SGDRegressor', SGDRegressor(random_state=random_state)),
        ('KNeighborsRegressor', KNeighborsRegressor(n_jobs=-1)),
        ('DecisionTreeRegressor', DecisionTreeRegressor(random_state=random_state)),
        ('ExtraTreesRegressor', ExtraTreesRegressor(random_state=random_state))
    ]

    results = []
    cv = KFold(n_splits=10, shuffle=True, random_state=random_state)

    for name, model in models:
        start_time = time.time()

        try:
            pred = cross_val_predict(estimator=model, X=X, y=y, cv=cv, method="predict", n_jobs=2)
            rmse = np.sqrt(mean_squared_error(y, pred))
            mae = mean_absolute_error(y, pred)
            r2 = r2_score(y, pred)
            elapsed = time.time() - start_time
            results.append([name, elapsed, rmse, mae, r2])
            print(f'Name: {name} - Elapsed: {elapsed:.2f}s - RMSE: {rmse:.4f} - MAE: {mae:.4f} - R2: {r2:.4f}')
        except Exception as e:
            print(f'Error {name} - {e}')

    columns = ['Model', 'Time', 'RMSE', 'MAE', 'R2']
    df_results = pd.DataFrame(results, columns=columns)
    return df_results

print("Training models for composition priority...")
df_results_composition_priority = train_models(X, combined_composition_priority)

print("\nTraining models for consumption priority...")
df_results_consumption_priority = train_models(X, combined_consumption_priority)

print("\nResults for composition priority:")
print(df_results_composition_priority.sort_values(by='RMSE', ascending=True))

print("\nResults for consumption priority:")
print(df_results_consumption_priority.sort_values(by='RMSE', ascending=True))

Training models for composition priority...




Name: LinearSVR - Elapsed: 0.86s - RMSE: 5.1774 - MAE: 0.9295 - R2: -46.4303
Name: LinearRegression - Elapsed: 0.02s - RMSE: 0.6032 - MAE: 0.4810 - R2: 0.3562
Name: RandomForestRegressor - Elapsed: 0.96s - RMSE: 0.2288 - MAE: 0.1617 - R2: 0.9074
Name: LGBMRegressor - Elapsed: 0.52s - RMSE: 0.2210 - MAE: 0.1589 - R2: 0.9136
Name: XGBRegressor - Elapsed: 0.69s - RMSE: 0.2348 - MAE: 0.1650 - R2: 0.9025
Name: MLPRegressor - Elapsed: 1.41s - RMSE: 0.6665 - MAE: 0.3753 - R2: 0.2141
Name: SGDRegressor - Elapsed: 0.03s - RMSE: 1077502199761170.0000 - MAE: 140684982800482.0781 - R2: -2054318513873156574835040583680.0000
Name: KNeighborsRegressor - Elapsed: 0.06s - RMSE: 0.4168 - MAE: 0.3090 - R2: 0.6926
Name: DecisionTreeRegressor - Elapsed: 0.03s - RMSE: 0.2808 - MAE: 0.1897 - R2: 0.8605
Name: ExtraTreesRegressor - Elapsed: 0.81s - RMSE: 0.2820 - MAE: 0.1863 - R2: 0.8593

Training models for consumption priority...
Name: LinearSVR - Elapsed: 0.15s - RMSE: 5.1545 - MAE: 0.9350 - R2: -29.3127
Na



Name: RandomForestRegressor - Elapsed: 0.94s - RMSE: 0.2732 - MAE: 0.1857 - R2: 0.9148
Name: LGBMRegressor - Elapsed: 0.33s - RMSE: 0.2590 - MAE: 0.1832 - R2: 0.9234
Name: XGBRegressor - Elapsed: 0.76s - RMSE: 0.2751 - MAE: 0.1894 - R2: 0.9137
Name: MLPRegressor - Elapsed: 1.56s - RMSE: 0.6405 - MAE: 0.3868 - R2: 0.5320
Name: SGDRegressor - Elapsed: 0.03s - RMSE: 1077621050415473.0000 - MAE: 140782670376055.2500 - R2: -1324909786874205145077508997120.0000
Name: KNeighborsRegressor - Elapsed: 0.05s - RMSE: 0.5075 - MAE: 0.3890 - R2: 0.7061
Name: DecisionTreeRegressor - Elapsed: 0.03s - RMSE: 0.3448 - MAE: 0.2323 - R2: 0.8643
Name: ExtraTreesRegressor - Elapsed: 0.84s - RMSE: 0.2841 - MAE: 0.1953 - R2: 0.9079

Results for composition priority:
                   Model    Time                   RMSE                   MAE  \
3          LGBMRegressor 0.51719                0.22103               0.15892   
2  RandomForestRegressor 0.95877                0.22878               0.16170   
4    

In [16]:
df_results_consumption_priority.sort_values(by='RMSE', ascending=True)

Unnamed: 0,Model,Time,RMSE,MAE,R2
3,LGBMRegressor,0.33373,0.25905,0.18318,0.92344
2,RandomForestRegressor,0.9387,0.27323,0.18569,0.91482
4,XGBRegressor,0.75768,0.27505,0.18943,0.91369
9,ExtraTreesRegressor,0.83787,0.2841,0.19526,0.90791
8,DecisionTreeRegressor,0.0324,0.34484,0.23234,0.86433
7,KNeighborsRegressor,0.04567,0.50752,0.38902,0.70613
5,MLPRegressor,1.56165,0.64049,0.38677,0.53196
1,LinearRegression,0.02189,0.6423,0.49957,0.52932
0,LinearSVR,0.1532,5.15448,0.93496,-29.31269
6,SGDRegressor,0.03277,1077621050415473.0,140782670376055.25,-1.3249097868742051e+30


In [17]:
df_results_composition_priority.sort_values(by='RMSE', ascending=True)

Unnamed: 0,Model,Time,RMSE,MAE,R2
3,LGBMRegressor,0.51719,0.22103,0.15892,0.91356
2,RandomForestRegressor,0.95877,0.22878,0.1617,0.90739
4,XGBRegressor,0.68816,0.23479,0.16502,0.90246
8,DecisionTreeRegressor,0.03263,0.28081,0.18966,0.86048
9,ExtraTreesRegressor,0.80756,0.282,0.18625,0.85929
7,KNeighborsRegressor,0.06307,0.41681,0.30899,0.6926
1,LinearRegression,0.02285,0.6032,0.48096,0.3562
5,MLPRegressor,1.41105,0.66647,0.37535,0.21405
0,LinearSVR,0.85731,5.1774,0.92953,-46.43026
6,SGDRegressor,0.03338,1077502199761170.0,140684982800482.08,-2.0543185138731563e+30


In [18]:
X

Unnamed: 0,number_states,cultivada comercialmente,Existe informação nutricional desta espécie disponível em tabela de composição de alimentos?,Quantidade de receitas informadas com esse ingrediente na Pesquisa de Orçamentos Familiares (POF/IBGE).,Quantidade de receitas informadas com esse ingrediente em sites e livros populares.,labeler_from_food_science,grupo de alimentos _alga,grupo de alimentos _caça,grupo de alimentos _cogumelo,grupo de alimentos _inseto,...,risco de extinção_Extinta,risco de extinção_Não avaliada,risco de extinção_Quase Ameaçada,risco de extinção_Segura ou Pouco Preocupante,risco de extinção_Sem dados,risco de extinção_Vulnerável,origem_Exótica,origem_Nativa,origem_Ocorre naturalmente,origem_Residente
0,1,0,0,0,0.00000,0,True,False,False,False,...,False,True,False,False,False,False,False,True,False,False
1,9,0,0,0,0.00000,0,True,False,False,False,...,False,True,False,False,False,False,False,True,False,False
2,8,0,0,0,0.00000,0,True,False,False,False,...,False,True,False,False,False,False,False,True,False,False
3,10,0,0,0,0.00000,0,True,False,False,False,...,False,True,False,False,False,False,False,True,False,False
4,8,0,0,0,0.00000,0,True,False,False,False,...,False,True,False,False,False,False,False,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
364,4,0,3,0,1.00000,1,False,False,False,False,...,False,False,False,False,False,False,False,True,False,False
365,22,0,0,0,0.00000,1,False,False,False,False,...,False,True,False,False,False,False,False,True,False,False
366,27,1,0,0,10.00000,1,False,False,False,False,...,False,True,False,False,False,False,False,True,False,False
367,8,0,3,0,0.00000,1,False,False,False,False,...,False,True,False,False,False,False,False,True,False,False


In [19]:
remap_dct = {
    'Quantidade de receitas informadas com esse ingrediente em sites e livros populares.': 'Number of recipes from popular books and websites',
    'Existe informação nutricional desta espécie disponível em tabela de composição de alimentos?': 'Availability of nutritional composition data',
    'labeler_from_food_science': 'Labeler from nutrition science',
    'number_states': 'Occurrence in Brazilian states',
    'cultivada comercialmente': 'Commercially cultivated',
    'grupo de alimentos _planta': 'Food category - plants',
    'risco de extinção_Segura ou Pouco Preocupante ': 'Conservation status - least concern',
    'grupo de alimentos _caça': 'Food category - wild animals',
    'risco de extinção_Não avaliada ': 'Conservation status - not evaluated',
    'risco de extinção_Sem dados ': 'Conservation status - data deficient',
    'origem_Nativa': 'Origin - native',
    'grupo de alimentos _cogumelo': 'Food category - mushrooms',
    'origem_Exótica': 'Origin - exotic',
    'origem_Residente': 'Origin - resident',
    'origem_Ocorre naturalmente': 'Origin - naturally occurring',
    'Quantidade de receitas informadas com esse ingrediente na Pesquisa de Orçamentos Familiares (POF/IBGE).': 'Number of recipes in the Brazilian Household Budget Survey',
    'grupo de alimentos _pescado ou fruto do mar': 'Food category - fish and seafood',
    'risco de extinção_Vulnerável ': 'Conservation status - vulnerable',
    'grupo de alimentos _inseto': 'Food category - insects',
    'risco de extinção_Quase Ameaçada ': 'Conservation status - near threatened',
    'grupo de alimentos _alga': 'Food category - algae',
    'risco de extinção_Em Perigo ': 'Conservation status - endangered',
    'risco de extinção_Extinta ': 'Conservation status - extinct',
}

for col in X.columns:
    if col not in remap_dct:
        print(col)



In [20]:
X = X.rename(columns=remap_dct)
X

Unnamed: 0,Occurrence in Brazilian states,Commercially cultivated,Availability of nutritional composition data,Number of recipes in the Brazilian Household Budget Survey,Number of recipes from popular books and websites,Labeler from nutrition science,Food category - algae,Food category - wild animals,Food category - mushrooms,Food category - insects,...,Conservation status - extinct,Conservation status - not evaluated,Conservation status - near threatened,Conservation status - least concern,Conservation status - data deficient,Conservation status - vulnerable,Origin - exotic,Origin - native,Origin - naturally occurring,Origin - resident
0,1,0,0,0,0.00000,0,True,False,False,False,...,False,True,False,False,False,False,False,True,False,False
1,9,0,0,0,0.00000,0,True,False,False,False,...,False,True,False,False,False,False,False,True,False,False
2,8,0,0,0,0.00000,0,True,False,False,False,...,False,True,False,False,False,False,False,True,False,False
3,10,0,0,0,0.00000,0,True,False,False,False,...,False,True,False,False,False,False,False,True,False,False
4,8,0,0,0,0.00000,0,True,False,False,False,...,False,True,False,False,False,False,False,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
364,4,0,3,0,1.00000,1,False,False,False,False,...,False,False,False,False,False,False,False,True,False,False
365,22,0,0,0,0.00000,1,False,False,False,False,...,False,True,False,False,False,False,False,True,False,False
366,27,1,0,0,10.00000,1,False,False,False,False,...,False,True,False,False,False,False,False,True,False,False
367,8,0,3,0,0.00000,1,False,False,False,False,...,False,True,False,False,False,False,False,True,False,False


## Feature Importance Analysis

In [21]:
# Train LightGBM models for feature importance analysis
lgbm_composition_priority = LGBMRegressor(random_state=314, verbose=-1)
lgbm_composition_priority.fit(X, combined_composition_priority)

lgbm_consumption_priority = LGBMRegressor(random_state=314, verbose=-1)
lgbm_consumption_priority.fit(X, combined_consumption_priority)

print("LightGBM models trained for feature importance analysis.")

LightGBM models trained for feature importance analysis.


In [22]:
import shap
import matplotlib.pyplot as plt

def plot_shap_summary(model, X, y, title: str, output_file: str) -> None:
    """Plot SHAP summary and save the figure.
    
    Args:
        model: The model to explain.
        X: The input features.
        y: The target values.
        title (str): The title of the plot.
        output_file (str): The file path to save the plot.
    """
    # Create SHAP explainer and compute SHAP values
    explainer = shap.Explainer(model)
    shap_values = explainer(X)
    

    ax = shap.plots.beeswarm(shap_values, show=False, plot_size=(20, 10))
 
    # Center title
    plt.suptitle(title, x=0.5, y=1.02, ha='center', fontsize='x-large')

    plt.tight_layout()
    
    # Save the plot to the specified file
    plt.savefig(output_file, bbox_inches='tight', dpi=300)
    plt.close()
    
    # Print confirmation message
    print(f"SHAP summary plot saved to {output_file}")

plot_shap_summary(lgbm_composition_priority, X, combined_composition_priority,
                  'Feature Importance for Composition Priority', 
                  'output/shap_composition_priority.png')

plot_shap_summary(lgbm_consumption_priority, X, combined_consumption_priority,
                  'Feature Importance for Consumption Priority', 
                  'output/shap_consumption_priority.png')

SHAP summary plot saved to output/shap_composition_priority.png
SHAP summary plot saved to output/shap_consumption_priority.png


In [23]:
X.columns

Index(['Occurrence in Brazilian states', 'Commercially cultivated',
       'Availability of nutritional composition data',
       'Number of recipes in the Brazilian Household Budget Survey',
       'Number of recipes from popular books and websites',
       'Labeler from nutrition science', 'Food category - algae',
       'Food category - wild animals', 'Food category - mushrooms',
       'Food category - insects', 'Food category - fish and seafood',
       'Food category - plants', 'Conservation status - endangered',
       'Conservation status - extinct', 'Conservation status - not evaluated',
       'Conservation status - near threatened',
       'Conservation status - least concern',
       'Conservation status - data deficient',
       'Conservation status - vulnerable', 'Origin - exotic',
       'Origin - native', 'Origin - naturally occurring', 'Origin - resident'],
      dtype='object')

In [24]:
import shap
import matplotlib.pyplot as plt

def plot_shap_cohort(model, X, y, title: str, output_file: str) -> None:
    """Plot SHAP cohort and save the figure.
    
    Args:
        model: The model to explain.
        X: The input features.
        y: The target values.
        title (str): The title of the plot.
        output_file (str): The file path to save the plot.
    """
    # Create SHAP explainer and compute SHAP values
    explainer = shap.Explainer(model)
    shap_values = explainer(X)
    
    # Create a new axis for the plot with size 20x10
    fig, ax = plt.subplots(figsize=(20, 10))

    labeler = ["Nutrition Science" if shap_values[i, 'Labeler from nutrition science'].data == 1 else "Biology" for i in range(shap_values.shape[0])]

    new_cols = [i for i in X.columns if i != 'Labeler from nutrition science']

    # remove 'Labeler from nutrition science' from the plot
    shap_values = shap_values[:, new_cols]
    
    ax = shap.plots.bar(shap_values.cohorts(labeler).abs.mean(0), show=False, ax=ax)
        
    # Set plot title and layout
    plt.title(title)
    plt.tight_layout()
    
    # Save the plot to the specified file
    plt.savefig(output_file, bbox_inches='tight', dpi=300)
    plt.close()
    
    # Print confirmation message
    print(f"SHAP cohort plot saved to {output_file}")

plot_shap_cohort(lgbm_composition_priority, X, combined_composition_priority,
                  'Feature Importance for Composition Priority', 
                  'output/shap_cohort_composition_priority.png')

plot_shap_cohort(lgbm_consumption_priority, X, combined_consumption_priority,
                  'Feature Importance for Consumption Priority', 
                  'output/shap_cohort_consumption_priority.png')

SHAP cohort plot saved to output/shap_cohort_composition_priority.png
SHAP cohort plot saved to output/shap_cohort_consumption_priority.png


## Data Export

In [25]:
# Prepare data for export
X_reset = X.reset_index(drop=True)
combined_composition_priority_reset = pd.Series(combined_composition_priority, name='label_composition').reset_index(drop=True)
combined_consumption_priority_reset = pd.Series(combined_consumption_priority, name='label_consumption').reset_index(drop=True)

data_output = pd.concat([X_reset, combined_composition_priority_reset, combined_consumption_priority_reset], axis=1)
data_output['id'] = data_output.index + 1
data_output = data_output[['id'] + [col for col in data_output.columns if col != 'id']]

# Export to Excel
output_file = 'output/results.xlsx'
data_output.to_excel(output_file, index=False)
print(f"Results exported to {output_file}")

Results exported to output/results.xlsx
