# Regression Tests for Bionutrient Analysis

This notebook performs regression analysis on nutrient data, focusing on the prioritization of food composition and consumption research. It includes data preprocessing, model training, and visualization of results using various machine learning techniques.

In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
from glob import glob
import re
import time
from multiprocessing import cpu_count

# Machine learning libraries
from sklearn.model_selection import StratifiedKFold, cross_val_predict, train_test_split, KFold
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import LabelEncoder

# Visualization libraries
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go

# Machine learning models
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from sklearn.linear_model import LinearRegression, SGDRegressor
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import LinearSVR
from sklearn.neural_network import MLPRegressor

# Set pandas display options
pd.options.display.float_format = '{:.5f}'.format

## Data Loading and Preprocessing

In [2]:
# Load Excel files from specified directories
files_bio = glob('input/BIOLOGOS/*.xlsx')
files_food = glob('input/ALIMENTOS/*.xlsx')

print(f"Number of biologist files: {len(files_bio)}, Number of food scientist files: {len(files_food)}")

Number of biologist files: 5, Number of food scientist files: 13


In [3]:
def process_files(file_list, labeler_type):
    """Process Excel files and return a dictionary of DataFrames."""
    dfs = {}
    for file in file_list:
        df = pd.read_excel(file)
        labeler = file.split('/')[-1].split('.')[0]
        match = re.findall(r'BIONUT - (\w+)', labeler)
        if match:
            labeler = match[0]
        df['person'] = labeler
        df['labeler_from_food_science'] = 1 if labeler_type == 'FOOD' else 0
        print(f"Processed file for: {labeler}")
        dfs[labeler] = df
    return dfs

dfs_bio = process_files(files_bio, 'BIO')
dfs_food = process_files(files_food, 'FOOD')

Processed file for: ALVES
Processed file for: BIZRI
Processed file for: MORCATTY
Processed file for: OLIVEIRA
Processed file for: ZENOBIA
Processed file for: AQUINO
Processed file for: BATISTA
Processed file for: BEZERRA
Processed file for: CARIOCA
Processed file for: CAZARIN
Processed file for: GIUTINI
Processed file for: LIMA
Processed file for: MORAIS
Processed file for: NORDE
Processed file for: OLIVEIRA
Processed file for: SANTOS
Processed file for: STELUTI
Processed file for: TEIXEIRA


In [4]:
# Define Likert scale mapping
likert_scale = {
    'Muito Baixa': 1,
    'Baixa': 2,
    'Neutra': 3,
    'Alta': 4,
    'Muito Alta': 5
}

def process_likert_scores(dfs):
    """Process Likert scale scores in the DataFrames."""
    for _, df in dfs.items():
        df['priority_composition'] = df['Considerando a escala seguinte, qual a prioridade para que esse alimento seja analisado em estudos de composição nutricional? '].replace(likert_scale).fillna(0)
        df['priority_consumption'] = df['Considerando a escala seguinte, qual a prioridade para que esse alimento seja incluído em pesquisas de consumo alimentar?'].replace(likert_scale).fillna(0)
        df['confidence_composition'] = df['Qual o seu nível de confiança nessa resposta relacionada à priorização de composição?'].replace(likert_scale).fillna(0)
        df['confidence_consumption'] = df['Qual o seu nível de confiança na sua resposta relacionada à priorização de consumo?'].replace(likert_scale).fillna(0)

process_likert_scores(dfs_bio)
process_likert_scores(dfs_food)

  df['priority_composition'] = df['Considerando a escala seguinte, qual a prioridade para que esse alimento seja analisado em estudos de composição nutricional? '].replace(likert_scale).fillna(0)
  df['priority_consumption'] = df['Considerando a escala seguinte, qual a prioridade para que esse alimento seja incluído em pesquisas de consumo alimentar?'].replace(likert_scale).fillna(0)
  df['confidence_composition'] = df['Qual o seu nível de confiança nessa resposta relacionada à priorização de composição?'].replace(likert_scale).fillna(0)
  df['confidence_consumption'] = df['Qual o seu nível de confiança na sua resposta relacionada à priorização de consumo?'].replace(likert_scale).fillna(0)
  df['priority_composition'] = df['Considerando a escala seguinte, qual a prioridade para que esse alimento seja analisado em estudos de composição nutricional? '].replace(likert_scale).fillna(0)
  df['priority_consumption'] = df['Considerando a escala seguinte, qual a prioridade para que esse alimen

## Data Processing and Feature Engineering

In [5]:
import numpy as np
import pandas as pd
from typing import Dict

def create_confidence_array(dataframes: Dict[str, pd.DataFrame], score_type: str) -> np.ndarray:
    """Create a numpy array of confidence scores from a dictionary of DataFrames.
    
    Args:
        dataframes (Dict[str, pd.DataFrame]): Dictionary of DataFrames containing confidence scores.
        score_type (str): Type of score to extract (e.g., 'composition', 'consumption').

    Returns:
        np.ndarray: Numpy array of confidence scores.
    """
    # Get the first key to determine the shape of the array
    first_key = list(dataframes.keys())[0]
    array_shape = (len(dataframes[first_key]), len(dataframes))
    
    # Initialize the confidence array with zeros
    confidence_array = np.zeros(array_shape, dtype=int)
    
    # Populate the confidence array with values from the DataFrames
    for index, (_, dataframe) in enumerate(dataframes.items()):
        confidence_array[:, index] = dataframe[f'confidence_{score_type}'].values
    
    # Replace zeros with the most common value in the second column
    most_common_value = np.argmax(np.bincount(confidence_array[:, 1]))
    confidence_array[confidence_array == 0] = most_common_value
    
    return confidence_array

# Create confidence arrays for different score types and datasets
confidence_composition_food = create_confidence_array(dfs_food, 'composition')
confidence_consumption_food = create_confidence_array(dfs_food, 'consumption')
confidence_composition_bio = create_confidence_array(dfs_bio, 'composition')
confidence_consumption_bio = create_confidence_array(dfs_bio, 'consumption')

print("Confidence arrays created for composition and consumption scores.")

Confidence arrays created for composition and consumption scores.


In [6]:
confidence_consumption_food.shape, confidence_composition_food.shape

((369, 13), (369, 13))

In [7]:
confidence_consumption_bio.shape, confidence_composition_bio.shape

((369, 5), (369, 5))

In [8]:
import numpy as np
import pandas as pd
from typing import Dict

def create_priority_array(dataframes: Dict[str, pd.DataFrame], score_type: str) -> np.ndarray:
    """Create a numpy array of priority scores from a dictionary of DataFrames.
    
    Args:
        dataframes (Dict[str, pd.DataFrame]): Dictionary of DataFrames containing priority scores.
        score_type (str): Type of score to extract (e.g., 'composition', 'consumption').

    Returns:
        np.ndarray: Numpy array of priority scores.
    """
    # Get the first key to determine the shape of the array
    first_key = list(dataframes.keys())[0]
    array_shape = (len(dataframes[first_key]), len(dataframes))
    
    # Initialize the priority array with zeros
    priority_array = np.zeros(array_shape, dtype=int)
    
    # Populate the priority array with values from the DataFrames
    for index, (_, dataframe) in enumerate(dataframes.items()):
        priority_array[:, index] = dataframe[f'priority_{score_type}'].values
    
    # Replace zeros with the most common value in the second column
    most_common_value = np.argmax(np.bincount(priority_array[:, 1]))
    priority_array[priority_array == 0] = most_common_value
    
    return priority_array

# Create priority arrays for different score types and datasets
priority_composition_food = create_priority_array(dfs_food, 'composition')
priority_consumption_food = create_priority_array(dfs_food, 'consumption')
priority_composition_bio = create_priority_array(dfs_bio, 'composition')
priority_consumption_bio = create_priority_array(dfs_bio, 'consumption')

print("Priority arrays created for composition and consumption scores.")

Priority arrays created for composition and consumption scores.


In [9]:
priority_composition_food.shape, priority_consumption_food.shape

((369, 13), (369, 13))

In [10]:
priority_composition_bio.shape, priority_consumption_bio.shape

((369, 5), (369, 5))

In [11]:

# Define feature columns
feature_columns = [
    'food category', 'occurrence by region',
    'conservation status', 'origin', 'commercial cultivation',
    'availability of nutritional composition data',
    'number of recipes (POF/IBGE)',
    'number of recipes - other sources',
    'labeler_from_food_science'
]

biologist_example = list(dfs_bio.keys())[0]
food_scientist_example = list(dfs_food.keys())[0]

# Combine data from biologists and food scientists
combined_features =  pd.concat([dfs_bio[biologist_example][feature_columns], dfs_food[food_scientist_example][feature_columns]])

# Calculate weighted averages for composition and consumption priorities
weighted_composition_priority_bio = np.average(priority_composition_bio, weights=confidence_composition_bio, axis=1)
weighted_consumption_priority_bio = np.average(priority_consumption_bio, weights=confidence_consumption_bio, axis=1)
weighted_composition_priority_food = np.average(priority_composition_food, weights=confidence_composition_food, axis=1)
weighted_consumption_priority_food = np.average(priority_consumption_food, weights=confidence_consumption_food, axis=1)

# Combine weighted averages
combined_composition_priority = np.concatenate([weighted_composition_priority_bio, weighted_composition_priority_food])
combined_consumption_priority = np.concatenate([weighted_consumption_priority_bio, weighted_consumption_priority_food])

# Print shapes of the resulting matrices and vectors
print(f"Shape of feature matrix combined_features: {combined_features.shape}")
print(f"Shape of composition priority vector: {combined_composition_priority.shape}")
print(f"Shape of consumption priority vector: {combined_consumption_priority.shape}")

Shape of feature matrix combined_features: (738, 9)
Shape of composition priority vector: (738,)
Shape of consumption priority vector: (738,)


In [12]:
combined_features

Unnamed: 0,food category,occurrence by region,conservation status,origin,commercial cultivation,availability of nutritional composition data,number of recipes (POF/IBGE),number of recipes - other sources,labeler_from_food_science
0,algae,BA,Not Evaluated (NE),Native,No,Does not exist,0,0,0
1,algae,"BA, CE, PB, PE, PI, RN, SE, ES, RJ",Not Evaluated (NE),Native,No,Does not exist,0,0,0
2,algae,"BA, CE, PE, SE, RJ, SP, PR, SC",Not Evaluated (NE),Native,No,Does not exist,0,4,0
3,algae,"AL, BA, CE, PB, PE, PI, RN, ES, RJ, SP",Not Evaluated (NE),Native,No,Does not exist,0,0,0
4,algae,"AL, BA, CE, MA, PB, PE, RN, ES",Not Evaluated (NE),Native,No,Does not exist,0,0,0
...,...,...,...,...,...,...,...,...,...
364,plant,"AL, BA, PE, SE",Not Evaluated (NE),Native,No,"Yes, at the species level",0,2,1
365,plant,"AC, AM, PA, RO, RR, TO, AL, BA, CE, MA, PE, PI...",Least Concern (LC),Native,No,Does not exist,0,2,1
366,plant,"AC, AM, AP, PA, RO, RR, TO, AL, BA, CE, MA, PB...",Least Concern (LC),Native,Yes,Does not exist,0,3225,1
367,plant,"GO, MS, MG, RJ, SP, PR, RS, SC",Least Concern (LC),Native,No,"Yes, at the species level",0,1,1


In [13]:
# Feature engineering: Create 'number_states' feature
combined_features['number_states'] = combined_features['occurrence by region'].astype(str).apply(lambda x: len(x.split(',')))

# Update feature columns list
new_feature_cols = [
    'food category', 'number_states',
    'conservation status', 'origin', 'commercial cultivation',
    'availability of nutritional composition data',
    'number of recipes (POF/IBGE)',
    'number of recipes - other sources',
    'labeler_from_food_science'
]

X = combined_features[new_feature_cols]

# Encode categorical variables
X['commercial cultivation'] = X['commercial cultivation'].replace({'No': 0, 'Yes ': 1})
X['availability of nutritional composition data'] = X['availability of nutritional composition data'].replace({
    'Does not exist': 0,
    'Yes, by common name': 1,
    'Yes, at the genus level': 2,
    'Yes, at the species level': 3
})

# One-hot encode remaining categorical variables
X = pd.get_dummies(X, columns=['food category', 'conservation status', 'origin'])

# Fill NaN values with 0
X = X.fillna(0)

print("Feature engineering and encoding completed.")
print(f"Final shape of feature matrix X: {X.shape}")

Feature engineering and encoding completed.
Final shape of feature matrix X: (738, 23)


  X['commercial cultivation'] = X['commercial cultivation'].replace({'No': 0, 'Yes ': 1})
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['commercial cultivation'] = X['commercial cultivation'].replace({'No': 0, 'Yes ': 1})
  X['availability of nutritional composition data'] = X['availability of nutritional composition data'].replace({
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['availability of nutritional composition data'] = X['availability of nutritional composition data'].replace({


In [14]:
X

Unnamed: 0,number_states,commercial cultivation,availability of nutritional composition data,number of recipes (POF/IBGE),number of recipes - other sources,labeler_from_food_science,food category_algae,food category_fish and seadfood,food category_insect,food category_mushrooms,...,conservation status_Data Deficient (DD),conservation status_Endangered (EN),conservation status_Least Concern (LC),conservation status_Near Threatened (NT),conservation status_Not Evaluated (NE),conservation status_Vulnerable (VU),origin_Exotic,origin_Native,origin_Naturally occurring,origin_Resident
0,1,0,0,0,0,0,True,False,False,False,...,False,False,False,False,True,False,False,True,False,False
1,9,0,0,0,0,0,True,False,False,False,...,False,False,False,False,True,False,False,True,False,False
2,8,0,0,0,4,0,True,False,False,False,...,False,False,False,False,True,False,False,True,False,False
3,10,0,0,0,0,0,True,False,False,False,...,False,False,False,False,True,False,False,True,False,False
4,8,0,0,0,0,0,True,False,False,False,...,False,False,False,False,True,False,False,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
364,4,0,3,0,2,1,False,False,False,False,...,False,False,False,False,True,False,False,True,False,False
365,22,0,0,0,2,1,False,False,False,False,...,False,False,True,False,False,False,False,True,False,False
366,27,1,0,0,3225,1,False,False,False,False,...,False,False,True,False,False,False,False,True,False,False
367,8,0,3,0,1,1,False,False,False,False,...,False,False,True,False,False,False,False,True,False,False


## Model Training and Evaluation

In [15]:
def train_models(X, y):
    """Train and evaluate multiple regression models."""
    random_state = 271828
    
    models = [
        ('LinearSVR', LinearSVR(random_state=random_state)),
        ('LinearRegression', LinearRegression(n_jobs=-1)),
        ('RandomForestRegressor', RandomForestRegressor(random_state=random_state)),
        ('LGBMRegressor', LGBMRegressor(random_state=random_state, verbose=-1)),
        ('XGBRegressor', XGBRegressor(random_state=random_state, verbosity=0)),
        ('MLPRegressor', MLPRegressor(random_state=random_state)),
        ('SGDRegressor', SGDRegressor(random_state=random_state)),
        ('KNeighborsRegressor', KNeighborsRegressor(n_jobs=-1)),
        ('DecisionTreeRegressor', DecisionTreeRegressor(random_state=random_state)),
        ('ExtraTreesRegressor', ExtraTreesRegressor(random_state=random_state))
    ]

    results = []
    cv = KFold(n_splits=10, shuffle=True, random_state=random_state)

    for name, model in models:
        start_time = time.time()

        try:
            pred = cross_val_predict(estimator=model, X=X, y=y, cv=cv, method="predict", n_jobs=2)
            rmse = np.sqrt(mean_squared_error(y, pred))
            mae = mean_absolute_error(y, pred)
            r2 = r2_score(y, pred)
            elapsed = time.time() - start_time
            results.append([name, elapsed, rmse, mae, r2])
            print(f'Name: {name} - Elapsed: {elapsed:.2f}s - RMSE: {rmse:.4f} - MAE: {mae:.4f} - R2: {r2:.4f}')
        except Exception as e:
            print(f'Error {name} - {e}')

    columns = ['Model', 'Time', 'RMSE', 'MAE', 'R2']
    df_results = pd.DataFrame(results, columns=columns)
    return df_results

print("Training models for composition priority...")
df_results_composition_priority = train_models(X, combined_composition_priority)

print("\nTraining models for consumption priority...")
df_results_consumption_priority = train_models(X, combined_consumption_priority)

print("\nResults for composition priority:")
print(df_results_composition_priority.sort_values(by='RMSE', ascending=True))

print("\nResults for consumption priority:")
print(df_results_consumption_priority.sort_values(by='RMSE', ascending=True))

Training models for composition priority...




Name: LinearSVR - Elapsed: 0.93s - RMSE: 13.4295 - MAE: 1.8942 - R2: -315.3539
Name: LinearRegression - Elapsed: 0.02s - RMSE: 0.6182 - MAE: 0.4849 - R2: 0.3295
Name: RandomForestRegressor - Elapsed: 0.99s - RMSE: 0.4474 - MAE: 0.3057 - R2: 0.6488
Name: LGBMRegressor - Elapsed: 0.91s - RMSE: 0.4295 - MAE: 0.3045 - R2: 0.6765
Name: XGBRegressor - Elapsed: 0.79s - RMSE: 0.4237 - MAE: 0.2853 - R2: 0.6851
Name: MLPRegressor - Elapsed: 0.74s - RMSE: 0.7126 - MAE: 0.4487 - R2: 0.1092
Name: SGDRegressor - Elapsed: 0.03s - RMSE: 1093415686955079.6250 - MAE: 129692634149714.5312 - R2: -2097109940670760562070868983808.0000
Name: KNeighborsRegressor - Elapsed: 0.17s - RMSE: 0.5537 - MAE: 0.4116 - R2: 0.4621
Name: DecisionTreeRegressor - Elapsed: 0.03s - RMSE: 0.5225 - MAE: 0.3570 - R2: 0.5211
Name: ExtraTreesRegressor - Elapsed: 0.80s - RMSE: 0.5052 - MAE: 0.3405 - R2: 0.5523

Training models for consumption priority...
Name: LinearSVR - Elapsed: 0.14s - RMSE: 6.4399 - MAE: 1.1844 - R2: -46.1188




Name: RandomForestRegressor - Elapsed: 0.96s - RMSE: 0.4664 - MAE: 0.3222 - R2: 0.7528
Name: LGBMRegressor - Elapsed: 0.52s - RMSE: 0.4382 - MAE: 0.3120 - R2: 0.7819
Name: XGBRegressor - Elapsed: 0.43s - RMSE: 0.4507 - MAE: 0.3175 - R2: 0.7692
Name: MLPRegressor - Elapsed: 0.85s - RMSE: 0.6512 - MAE: 0.4439 - R2: 0.5182
Name: SGDRegressor - Elapsed: 0.02s - RMSE: 1006071321068825.2500 - MAE: 110634811678612.3438 - R2: -1150004542928355650793501622272.0000
Name: KNeighborsRegressor - Elapsed: 0.23s - RMSE: 0.5859 - MAE: 0.4514 - R2: 0.6100
Name: DecisionTreeRegressor - Elapsed: 0.03s - RMSE: 0.6144 - MAE: 0.4055 - R2: 0.5712
Name: ExtraTreesRegressor - Elapsed: 0.82s - RMSE: 0.4932 - MAE: 0.3340 - R2: 0.7236

Results for composition priority:
                   Model    Time                   RMSE                   MAE  \
4           XGBRegressor 0.78804                0.42373               0.28534   
3          LGBMRegressor 0.90833                0.42948               0.30445   
2  Ra

In [16]:
df_results_consumption_priority.sort_values(by='RMSE', ascending=True)

Unnamed: 0,Model,Time,RMSE,MAE,R2
3,LGBMRegressor,0.51903,0.43816,0.31199,0.78188
4,XGBRegressor,0.43485,0.45068,0.31746,0.76923
2,RandomForestRegressor,0.95986,0.4664,0.32225,0.75285
9,ExtraTreesRegressor,0.81904,0.49319,0.33395,0.72364
7,KNeighborsRegressor,0.23376,0.5859,0.4514,0.60998
8,DecisionTreeRegressor,0.03265,0.61435,0.40551,0.57118
1,LinearRegression,0.02235,0.64907,0.50271,0.52134
5,MLPRegressor,0.8485,0.65123,0.44392,0.51816
0,LinearSVR,0.14357,6.43986,1.18443,-46.1188
6,SGDRegressor,0.0227,1006071321068825.2,110634811678612.34,-1.1500045429283557e+30


In [17]:
df_results_composition_priority.sort_values(by='RMSE', ascending=True)

Unnamed: 0,Model,Time,RMSE,MAE,R2
4,XGBRegressor,0.78804,0.42373,0.28534,0.68506
3,LGBMRegressor,0.90833,0.42948,0.30445,0.67645
2,RandomForestRegressor,0.98975,0.44745,0.30571,0.64882
9,ExtraTreesRegressor,0.79814,0.5052,0.34051,0.5523
8,DecisionTreeRegressor,0.03282,0.52252,0.35704,0.52109
7,KNeighborsRegressor,0.1742,0.55374,0.41159,0.46214
1,LinearRegression,0.02307,0.61825,0.4849,0.32954
5,MLPRegressor,0.73783,0.71264,0.44865,0.10917
0,LinearSVR,0.92684,13.42955,1.89424,-315.35394
6,SGDRegressor,0.0329,1093415686955079.6,129692634149714.52,-2.0971099406707603e+30


In [18]:
X

Unnamed: 0,number_states,commercial cultivation,availability of nutritional composition data,number of recipes (POF/IBGE),number of recipes - other sources,labeler_from_food_science,food category_algae,food category_fish and seadfood,food category_insect,food category_mushrooms,...,conservation status_Data Deficient (DD),conservation status_Endangered (EN),conservation status_Least Concern (LC),conservation status_Near Threatened (NT),conservation status_Not Evaluated (NE),conservation status_Vulnerable (VU),origin_Exotic,origin_Native,origin_Naturally occurring,origin_Resident
0,1,0,0,0,0,0,True,False,False,False,...,False,False,False,False,True,False,False,True,False,False
1,9,0,0,0,0,0,True,False,False,False,...,False,False,False,False,True,False,False,True,False,False
2,8,0,0,0,4,0,True,False,False,False,...,False,False,False,False,True,False,False,True,False,False
3,10,0,0,0,0,0,True,False,False,False,...,False,False,False,False,True,False,False,True,False,False
4,8,0,0,0,0,0,True,False,False,False,...,False,False,False,False,True,False,False,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
364,4,0,3,0,2,1,False,False,False,False,...,False,False,False,False,True,False,False,True,False,False
365,22,0,0,0,2,1,False,False,False,False,...,False,False,True,False,False,False,False,True,False,False
366,27,1,0,0,3225,1,False,False,False,False,...,False,False,True,False,False,False,False,True,False,False
367,8,0,3,0,1,1,False,False,False,False,...,False,False,True,False,False,False,False,True,False,False


In [19]:
X.columns

Index(['number_states', 'commercial cultivation',
       'availability of nutritional composition data',
       'number of recipes (POF/IBGE)', 'number of recipes - other sources',
       'labeler_from_food_science', 'food category_algae',
       'food category_fish and seadfood', 'food category_insect',
       'food category_mushrooms', 'food category_plant',
       'food category_wild animals',
       'conservation status_Critically Endangered (CR)',
       'conservation status_Data Deficient (DD)',
       'conservation status_Endangered (EN)',
       'conservation status_Least Concern (LC)',
       'conservation status_Near Threatened (NT)',
       'conservation status_Not Evaluated (NE)',
       'conservation status_Vulnerable (VU)', 'origin_Exotic', 'origin_Native',
       'origin_Naturally occurring', 'origin_Resident'],
      dtype='object')

## Feature Importance Analysis

In [20]:
# Train LightGBM models for feature importance analysis
lgbm_composition_priority = LGBMRegressor(random_state=271828, verbose=-1)
lgbm_composition_priority.fit(X, combined_composition_priority)

lgbm_consumption_priority = LGBMRegressor(random_state=271828, verbose=-1)
lgbm_consumption_priority.fit(X, combined_consumption_priority)

print("LightGBM models trained for feature importance analysis.")

LightGBM models trained for feature importance analysis.


In [21]:
import shap
import matplotlib.pyplot as plt

def plot_shap_summary(model, X, y, title: str, output_file: str) -> None:
    """Plot SHAP summary and save the figure.
    
    Args:
        model: The model to explain.
        X: The input features.
        y: The target values.
        title (str): The title of the plot.
        output_file (str): The file path to save the plot.
    """
    # Create SHAP explainer and compute SHAP values
    explainer = shap.Explainer(model)
    shap_values = explainer(X)


    ax = shap.plots.beeswarm(shap_values, show=False, plot_size=(15, 7))
 
    # Center title
    plt.suptitle(title, x=0.5, y=1.02, ha='center', fontsize=20)

    plt.tight_layout()
    
    # Save the plot to the specified file
    plt.savefig(output_file, bbox_inches='tight', dpi=600)
    plt.close()
    
    # Print confirmation message
    print(f"SHAP summary plot saved to {output_file}")

plot_shap_summary(lgbm_composition_priority, X, combined_composition_priority,
                  'Feature Importance for Composition Priority', 
                  'output/shap_composition_priority.png')

plot_shap_summary(lgbm_consumption_priority, X, combined_consumption_priority,
                  'Feature Importance for Consumption Priority', 
                  'output/shap_consumption_priority.png')

SHAP summary plot saved to output/shap_composition_priority.png
SHAP summary plot saved to output/shap_consumption_priority.png


In [22]:
X.columns

Index(['number_states', 'commercial cultivation',
       'availability of nutritional composition data',
       'number of recipes (POF/IBGE)', 'number of recipes - other sources',
       'labeler_from_food_science', 'food category_algae',
       'food category_fish and seadfood', 'food category_insect',
       'food category_mushrooms', 'food category_plant',
       'food category_wild animals',
       'conservation status_Critically Endangered (CR)',
       'conservation status_Data Deficient (DD)',
       'conservation status_Endangered (EN)',
       'conservation status_Least Concern (LC)',
       'conservation status_Near Threatened (NT)',
       'conservation status_Not Evaluated (NE)',
       'conservation status_Vulnerable (VU)', 'origin_Exotic', 'origin_Native',
       'origin_Naturally occurring', 'origin_Resident'],
      dtype='object')

In [23]:
import shap
import matplotlib.pyplot as plt

def plot_shap_cohort(model, X, y, title: str, output_file: str) -> None:
    """Plot SHAP cohort and save the figure.
    
    Args:
        model: The model to explain.
        X: The input features.
        y: The target values.
        title (str): The title of the plot.
        output_file (str): The file path to save the plot.
    """
    # Create SHAP explainer and compute SHAP values
    explainer = shap.Explainer(model)
    shap_values = explainer(X)
    
    # Create a new axis for the plot with size 15x7
    fig, ax = plt.subplots(figsize=(15, 7))

    labeler = ["Nutrition Science" if shap_values[i, 'labeler_from_food_science'].data == 1 else "Biology" for i in range(shap_values.shape[0])]

    new_cols = [i for i in X.columns if i != 'labeler_from_food_science']

    # remove 'labeler_from_food_science' from the plot
    shap_values = shap_values[:, new_cols]
    
    ax = shap.plots.bar(shap_values.cohorts(labeler).abs.mean(0), show=False, ax=ax)
        
    # Set plot title and layout
    plt.suptitle(title, x=0.5, y=1.02, ha='center', fontsize=20)
    plt.tight_layout()
    
    # Save the plot to the specified file
    plt.savefig(output_file, bbox_inches='tight', dpi=600)
    plt.close()
    
    # Print confirmation message
    print(f"SHAP cohort plot saved to {output_file}")

plot_shap_cohort(lgbm_composition_priority, X, combined_composition_priority,
                  'Feature Importance for Composition Priority', 
                  'output/shap_cohort_composition_priority.png')

plot_shap_cohort(lgbm_consumption_priority, X, combined_consumption_priority,
                  'Feature Importance for Consumption Priority', 
                  'output/shap_cohort_consumption_priority.png')

SHAP cohort plot saved to output/shap_cohort_composition_priority.png
SHAP cohort plot saved to output/shap_cohort_consumption_priority.png


## Data Export

In [24]:
# Prepare data for export
X_reset = X.reset_index(drop=True)
combined_composition_priority_reset = pd.Series(combined_composition_priority, name='label_composition').reset_index(drop=True)
combined_consumption_priority_reset = pd.Series(combined_consumption_priority, name='label_consumption').reset_index(drop=True)

data_output = pd.concat([X_reset, combined_composition_priority_reset, combined_consumption_priority_reset], axis=1)
data_output['id'] = data_output.index + 1
data_output = data_output[['id'] + [col for col in data_output.columns if col != 'id']]

# Export to Excel
output_file = 'output/results.xlsx'
data_output.to_excel(output_file, index=False)
print(f"Results exported to {output_file}")

Results exported to output/results.xlsx
