# **Machine Learning (Random Forest)**

In [6]:
# Libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score
from sklearn.inspection import partial_dependence
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Data (cleaning and scaling)
df = pd.read_csv('AVONETplusClim.csv')

# Exclusive (lower, upper) bounds applied to each morphological trait; an
# upper bound of None means the trait only has a lower limit. Comparisons
# against NaN evaluate to False, so rows with a missing trait are dropped too.
trait_bounds = {
    'Mass': (0, 11500),
    'Tail.Length': (0.1, 535),
    'Tarsus.Length': (0, 350),
    'Wing.Length': (0.1, 650),
    'Hand-Wing.Index': (3, None),
}

# Build one combined row mask instead of re-filtering the frame five times.
keep_row = pd.Series(True, index=df.index)
for trait, (lower, upper) in trait_bounds.items():
    keep_row &= df[trait] > lower
    if upper is not None:
        keep_row &= df[trait] < upper

# Take an explicit copy of the retained rows: the column assignments below
# then write to an independent frame (no SettingWithCopyWarning) and the raw
# `df` is left untouched.
df_clean = df.loc[keep_row].copy()

# log10-transformed traits, added in this order as new columns.
# NOTE(review): 'Beak.Length_Culmen' is not range-filtered above, so any
# non-positive value would become -inf/NaN in 'Log_Beak' -- confirm the data
# contains none.
log_columns = {
    'Log_Mass': 'Mass',
    'Log_Tail': 'Tail.Length',
    'Log_Tarsus': 'Tarsus.Length',
    'Log_Beak': 'Beak.Length_Culmen',
    'Log_Wing': 'Wing.Length',
    'Log_HWI': 'Hand-Wing.Index',
}
for log_name, raw_name in log_columns.items():
    df_clean[log_name] = np.log10(df_clean[raw_name])

## **1. Body Size (Mass)**

In [7]:
# Response variable and the two families of predictors used by the model.
target = "Log_Mass"
eco_predictors = ['Trophic.Niche', 'Primary.Lifestyle', 'Habitat', 'Migration']
clim_predictors = [
    'Mean.Temperature',
    'Temp.Seasonality',
    'Max.Temperature',
    'Min.Temperature',
    'Mean.Precipitation',
]

# Design matrix: dummy-encoded ecological predictors (every level kept)
# followed by the climate predictors, taken as they are.
X_clim = df_clean[clim_predictors]
X_eco = pd.get_dummies(df_clean[eco_predictors], drop_first=False)
X = pd.concat([X_eco, X_clim], axis=1)
y = df_clean[target]

# 80/20 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Random forest hyper-parameters; the seed makes the fit repeatable.
rf_params = dict(
    n_estimators=100,
    max_depth=10,
    min_samples_leaf=5,
    random_state=42,
)
rf = RandomForestRegressor(**rf_params)
rf.fit(X_train, y_train)

# R² on the held-out test rows.
y_pred = rf.predict(X_test)
score = r2_score(y_test, y_pred)

# One importance value per column of the design matrix, smallest first.
feature_names = X_train.columns
importances = rf.feature_importances_
df_imp = pd.DataFrame(
    {'Feature': feature_names, 'Importance': importances}
).sort_values(by='Importance', ascending=True)

In [None]:
# Display names for raw column names, grouped by the kind of variable.
_eco_labels = {
    'Trophic.Niche': 'Trophic Niche',
    'Primary.Lifestyle': 'Primary Lifestyle',
    'Habitat': 'Habitat',
    'Migration': 'Migration',
    'Trophic.Level': 'Trophic Level',
}
_clim_labels = {
    'Mean.Temperature': 'Mean Temperature',
    'Temp.Seasonality': 'Temperature Seasonality',
    'Max.Temperature': 'Max Temperature',
    'Min.Temperature': 'Min Temperature',
    'Mean.Precipitation': 'Annual Precipitation',
}
_mass_labels = {
    'Log_Mass': 'Log10(Mass)',
    'Mass': 'Body Mass (g)',
}
label_dict = {**_eco_labels, **_clim_labels, **_mass_labels}


def get_label(col_name):
    """Return the display label for a column name.

    Names listed in ``label_dict`` use their curated label; any other name
    falls back to the column name with '.' and '_' replaced by spaces.
    """
    prettified = col_name.replace('.', ' ').replace('_', ' ')
    return label_dict.get(col_name, prettified)

# Collapse the per-column importances back onto the original predictors:
# all dummy columns derived from one ecological predictor are summed into a
# single value, while each climate predictor keeps its own value.
grouped_scores = {col: 0.0 for col in eco_predictors + clim_predictors}
raw_importances = pd.Series(rf.feature_importances_, index=X_train.columns)

# The loop variable is deliberately NOT called `score`, so it cannot overwrite
# a notebook-level `score` (e.g. the model's R²) defined in another cell.
for feature, importance in raw_importances.items():
    # A column belongs to an ecological predictor when it is that predictor
    # itself or one of its dummies ('<predictor>_<level>', the default
    # pd.get_dummies naming). Requiring the '_' separator avoids absorbing an
    # unrelated column that merely shares a name prefix.
    owner = next(
        (
            group for group in eco_predictors
            if feature == group or feature.startswith(group + '_')
        ),
        None,
    )
    if owner is None and feature in clim_predictors:
        owner = feature
    if owner is not None:
        grouped_scores[owner] += importance

# One row per predictor with a non-zero grouped importance, smallest first.
# Columns are declared explicitly so that sorting still works when no
# predictor has a positive importance.
df_grouped_imp = pd.DataFrame(
    [
        {'Feature': name, 'Importance': total, 'Label': get_label(name)}
        for name, total in grouped_scores.items() if total > 0
    ],
    columns=['Feature', 'Importance', 'Label'],
).sort_values(by='Importance', ascending=True)

# Horizontal bar chart of the grouped importances (one bar per predictor),
# each bar annotated with its value formatted as a percentage.
fig_imp = px.bar(
    df_grouped_imp,
    x='Importance',
    y='Label',
    orientation='h',
    title='<b>Relative Importance of Predictors for Body Mass</b>',
    text_auto='.1%'
)

# Labels sit outside the bars; cliponaxis=False keeps them from being cut off
# at the plot edge.
fig_imp.update_traces(
    marker_color='teal',
    textposition='outside',
    cliponaxis=False
)

fig_imp.update_layout(
    template="plotly_white",
    title_x=0.5,
    height=500,
    # The plotted values are random-forest feature importances (each feature's
    # normalized share of the total impurity reduction), not the proportion of
    # variance in the response that a predictor explains, so the axis is
    # labelled accordingly.
    xaxis_title="Relative Importance (share of total impurity reduction)",
    yaxis_title="",
    xaxis=dict(
        tickformat='.0%',
        # 15 % headroom past the longest bar leaves room for its text label.
        range=[0, df_grouped_imp['Importance'].max() * 1.15]
    ),
    margin=dict(r=50)
)

fig_imp.show()

In [20]:
# =============================================================================
# PART 2: CLIMATE PDP
# =============================================================================

# Climate predictors ordered from most to least important in the fitted forest.
all_climate_vars = [f for f in df_imp.sort_values(by='Importance', ascending=False)['Feature']
                    if f in clim_predictors]
axis_labels = {
    'Mean.Temperature': 'Mean Temperature (°C)',
    'Temp.Seasonality': 'Temperature Seasonality',
    'Max.Temperature': 'Max Temperature (°C)',
    'Min.Temperature': 'Min Temperature (°C)',
    'Mean.Precipitation': 'Annual Precipitation (mm)'
}

# Size the subplot grid from the number of variables instead of hard-coding
# 2 x 3, so that adding climate predictors cannot overflow the grid.
# With the current five predictors this still yields 2 rows x 3 columns.
n_cols = 3
n_rows = max(1, (len(all_climate_vars) + n_cols - 1) // n_cols)

fig_pdp = make_subplots(
    rows=n_rows, cols=n_cols,
    vertical_spacing=0.12,
    horizontal_spacing=0.08
)

for i, feature in enumerate(all_climate_vars):

    # Fill the grid row by row (plotly subplot indices are 1-based).
    grid_row = (i // n_cols) + 1
    grid_col = (i % n_cols) + 1

    # Average partial dependence of the prediction on this single feature.
    pdp_results = partial_dependence(
        rf, X_train, [feature], kind="average", grid_resolution=50
    )

    x_vals = pdp_results['grid_values'][0]
    # The forest predicts Log_Mass (log10 of mass), so 10**value converts the
    # averaged prediction back to the scale of Mass.
    y_vals_grams = np.power(10, pdp_results['average'][0])

    fig_pdp.add_trace(
        go.Scatter(x=x_vals, y=y_vals_grams, mode='lines', name=feature, line=dict(width=3)),
        row=grid_row, col=grid_col
    )

    fig_pdp.update_xaxes(
        title_text=axis_labels.get(feature, feature),
        row=grid_row, col=grid_col
    )

    # Only the left-most panel of each row carries the y-axis title.
    fig_pdp.update_yaxes(
        title_text="Predicted Mass (g)" if grid_col == 1 else "",
        type="log",
        tickformat=".3s",
        minor=dict(showgrid=True),
        row=grid_row, col=grid_col
    )

fig_pdp.update_layout(
    template="plotly_white",
    title_text="<b>Predicted Mass Based on Climatic Variables</b>",
    title_x=0.5,
    # 350 px per row of panels (700 px for the two-row layout).
    height=350 * n_rows,
    showlegend=False,
    margin=dict(t=80, b=60, l=70, r=40),
)

fig_pdp.show()

In [None]:
# One box plot of body mass per ecological predictor.
for eco_var in eco_predictors:

    # Predictors that are not columns of the cleaned table are skipped.
    if eco_var not in df_clean.columns:
        continue

    # Categories ordered from the largest to the smallest mean body mass.
    mean_mass = df_clean.groupby(eco_var)['Mass'].mean()
    order = mean_mass.sort_values(ascending=False).index

    # Grouped importance of this predictor, shown in the title when available.
    is_this_var = df_grouped_imp['Feature'] == eco_var
    var_importance = df_grouped_imp.loc[is_this_var, 'Importance']
    if var_importance.empty:
        imp_text = ""
    else:
        imp_text = f"(Importance: {var_importance.values[0]:.1%})"

    pretty_name = get_label(eco_var)

    fig_eco = px.box(
        df_clean,
        x=eco_var,
        y='Mass',
        color=eco_var,
        category_orders={eco_var: order},
        log_y=True,
        points=False,
        labels={eco_var: pretty_name, 'Mass': 'Body Mass (g) [Log Scale]'},
        title=f"<b>Distribution of Mass Based on {pretty_name}</b> {imp_text}",
    )

    # On the log axis dtick=1 places one tick per power of ten.
    fig_eco.update_layout(
        template="plotly_white",
        height=500,
        showlegend=False,
        title_x=0.5,
        yaxis=dict(dtick=1, tickformat=".0f"),
    )
    fig_eco.show()