# **Machine Learning of Ecological Variables**

In [None]:
# Libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Load the table used by every analysis below (path is relative to the
# current working directory).
df = pd.read_csv("AVONETplusClim.csv")

## **1. Random Forest Regressor**

### **1.1. Mass**

In [None]:
# Keep only rows whose Mass lies strictly between 0 and 11500; rows with a
# missing Mass fail both comparisons and are dropped as well.
mass_in_range = (df['Mass'] > 0) & (df['Mass'] < 11500)
df_clean = df.loc[mass_in_range].copy()
# NOTE(review): Log_Mass is stored on the frame but the model below is fitted
# on the raw "Mass" column - confirm which target is intended.
df_clean['Log_Mass'] = np.log10(df_clean['Mass'])

target = "Mass"
predictors = ['Trophic.Niche', 'Primary.Lifestyle', 'Habitat', 'Migration']

# One-hot encode the predictors (all levels kept) and split 80/20.
X = pd.get_dummies(df_clean[predictors], drop_first=False)
y = df_clean[target]
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit the forest on the training split; fit() returns the estimator itself.
rf = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)

# Hold-out R2 on the 20% test split.
y_pred = rf.predict(X_test)
score = r2_score(y_test, y_pred)

# Per-feature importances, indexed by the one-hot column names.
importances = pd.Series(rf.feature_importances_, index=X.columns)

banner = "=" * 20
print(banner)
print(f"R2 Score: {score:.3f}")
print(banner)

# Mean of the target per category of each predictor, largest first.
for col in predictors:
    category_means = df_clean.groupby(col)[target].mean()
    print(category_means.sort_values(ascending=False))
    print("-" * 20)


In [None]:
# 2x2 grid, one panel per ecological predictor.
panel_titles = ["Diet (Trophic)", "Lifestyle", "Habitat", "Migration"]
grid_options = {
    "rows": 2,
    "cols": 2,
    "subplot_titles": panel_titles,
    "horizontal_spacing": 0.2,  # wide gap so the y-axis labels fit
    "vertical_spacing": 0.15,
}
fig = make_subplots(**grid_options)

# Helper function
def add_panel(prefix, axis_title, row, col, bar_color):
    """Draw one horizontal bar panel of feature importances for a predictor.

    Reads the module-level ``importances`` Series and adds a trace to the
    module-level ``fig``; nothing is returned.

    Args:
        prefix: Name of the original predictor column. Features named
            ``"<prefix>_<level>"`` (the default ``pd.get_dummies`` naming)
            and a feature named exactly ``prefix`` are plotted.
        axis_title: Title for the panel's y-axis.
        row: 1-based subplot row.
        col: 1-based subplot column.
        bar_color: Colour applied to every bar in the panel.
    """
    dummy_prefix = f"{prefix}_"
    feature_names = [str(name) for name in importances.index]

    # Match literally. Predictor names contain "." (e.g. "Trophic.Niche"),
    # which a regex-based contains()/replace() would treat as a wildcard, and
    # a substring test would also pick up unrelated columns that merely
    # include the prefix text.
    keep = [
        name == prefix or name.startswith(dummy_prefix)
        for name in feature_names
    ]
    subset = importances.loc[keep].copy()

    # Clean labels: strip the "<prefix>_" part so only the level name remains.
    subset.index = [
        name[len(dummy_prefix):] if name.startswith(dummy_prefix) else name
        for name, selected in zip(feature_names, keep)
        if selected
    ]

    subset = subset.sort_values(ascending=True)

    fig.add_trace(
        go.Bar(
            x=subset.values,
            y=subset.index,
            orientation='h',
            marker=dict(color=bar_color),
            name=prefix
        ),
        row=row, col=col
    )

    # Axis titles for this panel only.
    fig.update_xaxes(title_text="Importance", row=row, col=col)
    fig.update_yaxes(title_text=axis_title, row=row, col=col)

# One entry per panel: (predictor prefix, y-axis title, row, col, bar colour).
panel_specs = [
    ("Trophic.Niche", "Diet Type", 1, 1, "#1f77b4"),
    ("Primary.Lifestyle", "Movement Style", 1, 2, "#2ca02c"),
    ("Habitat", "Habitat Type", 2, 1, "#ff7f0e"),
    ("Migration", "Migration Status", 2, 2, "#d62728"),
]
for panel_spec in panel_specs:
    add_panel(*panel_spec)

layout_options = {
    "title_text": "Ecological Drivers of Body Mass (Category Breakdown)",
    "height": 800,
    "width": 1100,
    "showlegend": False,
    "font": {"size": 12},
    "margin": {"l": 100},  # extra left margin so the first label isn't cut off
}
fig.update_layout(**layout_options)

fig.show()

In [None]:
eco_cols = ['Trophic.Niche', 'Primary.Lifestyle', 'Habitat', 'Migration']
morph_cols = ['Wing.Length', 'Beak.Length_Culmen', 'Tarsus.Length', 'Mass']

# One-hot encode the ecological predictors. With drop_first=True the first
# level of each encoded column is dropped, so it can never be reported as a
# top driver below.
X = pd.get_dummies(df[eco_cols], drop_first=True)
y = df[morph_cols]

# trait -> name of the single most important one-hot feature for that trait.
top_predictors_map = {}

fig = make_subplots(
    rows=2, cols=2,
    subplot_titles=morph_cols,
    horizontal_spacing=0.15,
    vertical_spacing=0.1
)

print(f"{'Trait':<20} | {'R2 Score':<10} | {'Top Specific Driver'}")
print("-" * 65)

# Fit one forest per morphological trait and plot its five strongest features.
for idx, trait in enumerate(morph_cols):

    # scikit-learn refuses NaN targets, so fit each trait only on the rows
    # where it was measured. When nothing is missing this selects every row
    # and the split is the same as without the mask.
    measured = y[trait].notna().to_numpy()
    X_train, X_test, y_train, y_test = train_test_split(
        X.loc[measured], y.loc[measured, trait], test_size=0.2, random_state=42
    )

    rf = RandomForestRegressor(n_estimators=100, random_state=42)
    rf.fit(X_train, y_train)

    score = r2_score(y_test, rf.predict(X_test))

    # Five largest importances in ascending order: the last entry is the top one.
    importances = pd.Series(rf.feature_importances_, index=X.columns).sort_values(ascending=True).tail(5)

    top_driver_specific = importances.index[-1]
    top_predictors_map[trait] = top_driver_specific

    # Pad the score to the same width as the 'R2 Score' header column.
    print(f"{trait:<20} | {score:<10.3f} | {top_driver_specific}")

    # Map the running index onto the 2x2 grid (row-major, 1-based).
    row = (idx // 2) + 1
    col = (idx % 2) + 1

    fig.add_trace(
        go.Bar(
            x=importances.values,
            y=importances.index,
            orientation='h',
            name=trait,
            marker=dict(color=importances.values, colorscale='Viridis')
        ),
        row=row, col=col
    )

fig.update_layout(
    title_text="Impact of Ecological Variables on Morphology (Feature Importance)",
    height=800,
    width=1000,
    showlegend=False
)
fig.show()

print("\n" + "="*35)
print("  AUTOMATED BIOLOGICAL CONCLUSIONS")
print("="*35)

# For each trait, map its top one-hot feature back to the ecological column
# it came from and print the trait's mean per category of that column.
for trait, driver_string in top_predictors_map.items():

    # Match on the get_dummies naming ("<column>_<level>", or the bare column
    # name for columns that were not encoded) instead of a substring test, and
    # prefer the longest matching column so that a column whose name extends
    # another one (e.g. "Habitat" vs "Habitat.Density") is not misattributed.
    matching_cols = [
        eco_col for eco_col in eco_cols
        if driver_string == eco_col or str(driver_string).startswith(f"{eco_col}_")
    ]
    original_eco_col = max(matching_cols, key=len, default=None)

    if original_eco_col is not None:
        print(f"\n>>> CONCLUSION FOR: {trait}")
        print(f"The model found that '{original_eco_col}' is the strongest predictor.")
        print(f"Here is how {trait} changes across {original_eco_col}:")

        # Category means of the trait over the full table, largest first.
        summary = df.groupby(original_eco_col)[trait].mean().sort_values(ascending=False)
        print(summary)
        print("-" * 50)