# **Exploratory Data Analysis of Ecologic Variables**

In [1]:
# Libraries
import pandas as pd
import numpy as np
from scipy.stats import skew, kurtosis, kruskal, f_oneway
from scikit_posthocs import posthoc_dunn
from statsmodels.stats.multicomp import pairwise_tukeyhsd
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Data (cleaning and scaling)
df = pd.read_csv('AVONETplusClim.csv')
df_clean = df.copy()
df_clean = df_clean[(df_clean['Mass'] > 0) & (df_clean['Mass'] < 11500)]
df_clean = df_clean[(df_clean['Tarsus.Length'] > 0.1) & (df_clean['Tarsus.Length'] < 535)]
df_clean = df_clean[(df_clean['Tail.Length'] > 0) & (df_clean['Tail.Length'] < 350)]
df_clean = df_clean[(df_clean['Wing.Length'] > 0.1) & (df_clean['Wing.Length'] < 650)]
df_clean = df_clean[df_clean['Hand-Wing.Index'] > 3]
df_clean['log_Mass'] = np.log10(df_clean['Mass'])
df_clean['log_Tarsus'] = np.log10(df_clean['Tarsus.Length'])
df_clean['log_Tail'] = np.log10(df_clean['Tail.Length'])
df_clean['log_Beak'] = np.log10(df_clean['Beak.Length_Culmen'])
df_clean['log_Wing'] = np.log10(df_clean['Wing.Length'])
df_clean['log_HWI'] = np.log10(df_clean['Hand-Wing.Index'])

## **1. Body Size (Mass)**

### **1.1. Mass and Habitat**

Before starting, distribution of mass within each habitat group is explored based on skewness and kurtosis. Since they are numerous, habitat types are classified into three categories (open, closed, aquatic). 

In [2]:
conditions = [
    # Aquatic/Marine
    df_clean["Habitat"].isin(['Coastal', 'Marine', 'Riverine', 'Wetland']),
    # Closed/Forest
    df_clean["Habitat"].isin(['Forest', 'Woodland', 'Shrubland']),
    # Open/Terrestrial
    df_clean["Habitat"].isin(['Grassland', 'Desert', 'Rock', 'Human Modified'])
]
choices = ['Aquatic', 'Closed', 'Open']
df_clean["Habitat_Group"] = np.select(conditions, choices, default=None)

########################################################################

fig = make_subplots(
    rows=1, cols=3, 
    subplot_titles=("Closed Habitat", "Open Habitat", "Aquatic Habitat")
)

plot_configs = [
    ('Closed', 'forestgreen', 1),
    ('Open', 'sandybrown', 2),
    ('Aquatic', 'deepskyblue', 3)
]

for group_name, color, col_idx in plot_configs:

    subset = df_clean[df_clean['Habitat_Group'] == group_name]['log_Mass']
    s, k = skew(subset), kurtosis(subset)
    fig.add_trace(
        go.Histogram(x=subset, nbinsx=50, name=group_name, marker_color=color),
        row=1, col=col_idx
    )
    
    fig.add_annotation(
        text=f"Skewness: {s:.2f}<br>Kurtosis: {k:.2f}",
        xref="x domain", yref="y domain",
        x=1, y=0.95, showarrow=False,
        bgcolor="rgba(255, 255, 255, 0.8)", bordercolor="black", borderwidth=1,
        row=1, col=col_idx
    )
    
    fig.update_xaxes(title_text="log(Mass) (log(g))", row=1, col=col_idx)

fig.update_layout(
    title_text="<b>Distribution of Mass by Habitat Group</b>",
    title_x=0.5,
    showlegend=False,
    bargap=0.1,
    template="plotly_white"
)
fig.update_yaxes(title_text="Count", row=1, col=1)

fig.show()

The closed habitat group could be assumed to show a normal distribution of mass. However, the open hapitat group has a high skew (skewness = 1) and the aquatic habitat group shows a flat distribution (kurtosis < -1). Therefore, using a non-parametric test is is the safer option, which is Kruskal-Wallis. 

* H0: The distribution of mass is the same among habitat groups.
* H1: The distribution of mass significantly differ among at least two habitat groups.

In [3]:
groups = sorted(df_clean['Habitat_Group'].unique())
data_groups = [df_clean[df_clean['Habitat_Group'] == g]['Mass'] for g in groups]

h_stat, p_val = kruskal(*data_groups)

print(f"H-statistic: {h_stat:.4f}")
print(f"P-value: {p_val:.4e}")

H-statistic: 612.5816
P-value: 9.5412e-134


Since p << 0.05, the null hypothesis (H0) is rejected. There is a statistically significant difference in the distribution of mass among at least two habitat groups. To see which habitat groups differ in terms of mass, Dunn's test is done, in which

* H0: The distribution of mass is the same for the two habitat groups tested.
* H1: The distribution of mass differs between the two habitat groups tested.

In [4]:
dunn_results = posthoc_dunn(a=df_clean, 
                                val_col='Mass', 
                                group_col='Habitat_Group', 
                                p_adjust='bonferroni')

print(dunn_results)

               Aquatic         Closed          Open
Aquatic   1.000000e+00  5.098032e-131  2.440586e-46
Closed   5.098032e-131   1.000000e+00  5.307519e-11
Open      2.440586e-46   5.307519e-11  1.000000e+00


Since p << 0.05, the null hypothesis (H0) is rejected; there are statistically significant differences in mass distribution between every single pair of habitat groups. Finally, to illustrate the distribution, a boxplot is created.

In [5]:
df_plot = df_clean[df_clean['Habitat_Group'].isin(['Closed','Open', 'Aquatic'])].copy()

fig = px.box(
    df_plot, 
    x='Habitat_Group', 
    y='log_Mass',
    title='<b>Distribution of Mass by Habitat Group</b>',
    color='Habitat_Group',
    color_discrete_sequence=['forestgreen', 'sandybrown', 'deepskyblue'],
    category_orders={"Habitat": ['Closed', 'Open', 'Aquatic']}
)

fig.update_layout(
    title_x=0.5,
    xaxis_title='Habitat Group',
    yaxis_title='log(Mass) (log(g))',
    template='plotly_white',
    showlegend=False
)

fig.show()

### **1.2. Mass and Migration**

Before starting, distribution of mass within each migration behavior is explored based on skewness and kurtosis.

In [6]:
fig = make_subplots(
    rows=1, cols=3, 
    subplot_titles=('Sedentary', 'Partial', 'Migratory')
)

configs = [
    ('Sedentary', 'saddlebrown', 1),
    ('Partial', 'goldenrod', 2),
    ('Migratory', 'skyblue', 3)
]

for label, color, col_idx in configs:
    d_subset = df_clean[df_clean['Migration'] == label]['log_Mass']
    s, k = skew(d_subset), kurtosis(d_subset)

    fig.add_trace(
        go.Histogram(x=d_subset, nbinsx=50, name=label, marker_color=color), 
        row=1, col=col_idx
    )

    fig.add_annotation(
        text=f"Skewness: {s:.2f}<br>Kurtosis: {k:.2f}",
        xref="x domain", yref="y domain",
        x=1, y=0.95, showarrow=False,
        bgcolor="rgba(255, 255, 255, 0.8)", bordercolor="black", borderwidth=1,
        row=1, col=col_idx
    )

    fig.update_xaxes(title_text="log(Mass) (log(g))", row=1, col=col_idx)

fig.update_layout(
    title_x=0.5,
    title_text="<b>Distribution of Mass by Migration</b>",
    showlegend=False,
    bargap=0.1,
    template="plotly_white"
)

fig.update_yaxes(title_text="Count", row=1, col=1)

fig.show()

The sedentary and partial classes could be assumed to show a normal distribution of mass. However, the migratory class has a high skew (skewness > 1). Therefore, using a non-parametric test is is the safer option, which is Kruskal-Wallis.

* H0: The distribution of mass is the same among migratory classes.
* H1: The distribution of mass significantly differ among at least two migratory classes.

In [7]:
groups = sorted(df_clean['Migration'].unique())
data_groups = [df_clean[df_clean['Migration'] == g]['Mass'] for g in groups]

h_stat, p_val = kruskal(*data_groups)

print(f"H-statistic: {h_stat:.4f}")
print(f"P-value: {p_val:.4e}")

H-statistic: 97.5648
P-value: 6.5175e-22


Since p << 0.05, the null hypothesis (H0) is rejected. There is a statistically significant difference of mass among at least two migratory classes. To see which migratory classess differ in terms of mass, Dunn's test is done, in which

* H0: The distribution of mass is the same for the two migratory classes tested.
* H1: The distribution of mass differs between the two migratory classes tested.

In [8]:
dunn_results = posthoc_dunn(a=df_clean, 
                                val_col='Mass', 
                                group_col='Migration', 
                                p_adjust='bonferroni')

print(dunn_results)

              Migratory       Partial     Sedentary
Migratory  1.000000e+00  7.753828e-07  4.400106e-02
Partial    7.753828e-07  1.000000e+00  4.494725e-22
Sedentary  4.400106e-02  4.494725e-22  1.000000e+00


Between sedentary and migratory classes, p > 0.05, thus the null hypothesis (H0) is failed to reject. However, for the pairs partial-sedentary and partial-migratory, p << 0.05, thus there are statistically significant differences in mass distribution between these migratory classes. Finally, to illustrate the distribution, a boxplot is created.

In [9]:
df_plot = df_clean[df_clean['Migration'].isin(['Sedentary', 'Partial', 'Migratory'])].copy()

fig = px.box(
    df_plot, 
    x='Migration', 
    y='log_Mass',
    title='<b>Distribution of Mass by Migration</b>',
    color='Migration',
    color_discrete_sequence=['saddlebrown', 'goldenrod', 'skyblue'],
    category_orders={"Migration": ['Sedentary', 'Partial', 'Migratory']}
)

fig.update_layout(
    title_x=0.5,
    xaxis_title='Migration Behavior',
    yaxis_title='log(Mass) (log(g))',
    template='plotly_white',
    showlegend=False
)

fig.show()

### **1.3. Mass and Nutrition (Trophic Level)**

Before starting, distribution of mass within each trophic level is explored based on skewness and kurtosis.

In [10]:
fig = make_subplots(
    rows=1, cols=3, 
    subplot_titles=('Carnivore', 'Herbivore', 'Omnivore')
)

configs = [
    ('Carnivore', 'firebrick', 1),
    ('Herbivore', 'forestgreen', 2),
    ('Omnivore', 'goldenrod', 3)
]

for label, color, col_idx in configs:
    d_subset = df_clean[df_clean['Trophic.Level'] == label]['log_Mass']
    s, k = skew(d_subset), kurtosis(d_subset)

    fig.add_trace(
        go.Histogram(x=d_subset, nbinsx=50, name=label, marker_color=color), 
        row=1, col=col_idx
    )

    fig.add_annotation(
        text=f"Skewness: {s:.2f}<br>Kurtosis: {k:.2f}",
        xref="x domain", yref="y domain",
        x=1, y=0.95, showarrow=False,
        bgcolor="rgba(255, 255, 255, 0.8)", bordercolor="black", borderwidth=1,
        row=1, col=col_idx
    )

    fig.update_xaxes(title_text="log(Mass) (log(g))", row=1, col=col_idx)

fig.update_layout(
    title_x=0.5,
    title_text="<b>Distribution of Mass by Trophic Level</b>",
    showlegend=False,
    bargap=0.1,
    template="plotly_white"
)

fig.update_yaxes(title_text="Count", row=1, col=1)

fig.show()

The herbivores could be assumed to show a normal distribution of mass. However, the carnivores and omnivores have a high skew (skewness > 1). Therefore, using a non-parametric test is is the safer option, which is Kruskal-Wallis.

* H0: The distribution of mass is the same among trophic levels.
* H1: The distribution of mass significantly differ among at least two trophic levels.

In [11]:
df_trophic = df_clean[df_clean['Trophic.Level'].isin(['Carnivore', 'Herbivore', 'Omnivore'])]
data_groups = [df_trophic[df_trophic['Trophic.Level'] == g]['Mass'] for g in ['Carnivore', 'Herbivore', 'Omnivore']]

h_stat, p_val = kruskal(*data_groups)

print(f"H-statistic: {h_stat:.4f}")
print(f"P-value: {p_val:.4e}")

H-statistic: 67.3658
P-value: 2.3534e-15


Since p << 0.05, the null hypothesis (H0) is rejected. There is a statistically significant difference in the distribution of mass among at least two trophic levels. To see which trophic levels differ in terms of mass, Dunn's test is done, in which

* H0: The distribution of mass is the same for the two trophic levels tested.
* H1: The distribution of mass differs between the two trophic levels tested.

In [12]:
dunn_results = posthoc_dunn(a=df_trophic, 
                                val_col='Mass', 
                                group_col='Trophic.Level', 
                                p_adjust='bonferroni')

print(dunn_results)

              Carnivore     Herbivore  Omnivore
Carnivore  1.000000e+00  6.844638e-15  0.000053
Herbivore  6.844638e-15  1.000000e+00  0.102522
Omnivore   5.302594e-05  1.025219e-01  1.000000


Between herbivores and omnivores, p > 0.05, thus the null hypothesis is failed to reject. However, the remaining pairs show statistically significant differences (p < 0.05) in mass distribution. Finally, to illustrate the distribution, a boxplot is created.

In [13]:
fig = px.box(
    df_trophic, 
    x='Trophic.Level', 
    y='log_Mass',
    title='<b>Distribution of Mass by Trophic Level</b>',
    color='Trophic.Level',
    color_discrete_sequence=['firebrick', 'forestgreen', 'goldenrod'],
    category_orders={"Trophic.Level": ['Carnivore', 'Herbivore', 'Omnivore']}
)

fig.update_layout(
    title_x=0.5,
    xaxis_title='Trophic Level',
    yaxis_title='log(Mass) (log(g))',
    template='plotly_white',
    showlegend=False
)

fig.show()

## **2. Leg Size (Tarsus Length)**

### **2.1. Tarsus Length and Habitat**

Before starting, distribution of tarsus length within each habitat group is explored based on skewness and kurtosis.

In [14]:
conditions = [
    # Aquatic/Marine
    df_clean["Habitat"].isin(['Coastal', 'Marine', 'Riverine', 'Wetland']),
    # Closed/Forest
    df_clean["Habitat"].isin(['Forest', 'Woodland', 'Shrubland']),
    # Open/Terrestrial
    df_clean["Habitat"].isin(['Grassland', 'Desert', 'Rock', 'Human Modified'])
]
choices = ['Aquatic', 'Closed', 'Open']
df_clean["Habitat_Group"] = np.select(conditions, choices, default=None)

########################################################################

fig = make_subplots(
    rows=1, cols=3, 
    subplot_titles=("Closed Habitat", "Open Habitat", "Aquatic Habitat")
)

plot_configs = [
    ('Closed', 'forestgreen', 1),
    ('Open', 'sandybrown', 2),
    ('Aquatic', 'deepskyblue', 3)
]

for group_name, color, col_idx in plot_configs:

    subset = df_clean[df_clean['Habitat_Group'] == group_name]['log_Tarsus']
    s, k = skew(subset), kurtosis(subset)
    fig.add_trace(
        go.Histogram(x=subset, nbinsx=50, name=group_name, marker_color=color),
        row=1, col=col_idx
    )
    
    fig.add_annotation(
        text=f"Skewness: {s:.2f}<br>Kurtosis: {k:.2f}",
        xref="x domain", yref="y domain",
        x=1, y=0.95, showarrow=False,
        bgcolor="rgba(255, 255, 255, 0.8)", bordercolor="black", borderwidth=1,
        row=1, col=col_idx
    )
    
    fig.update_xaxes(title_text="log(Length) (log(mm))", row=1, col=col_idx)

fig.update_layout(
    title_text="<b>Distribution of Tarsus Length by Habitat Group</b>",
    title_x=0.5,
    showlegend=False,
    bargap=0.1,
    template="plotly_white"
)
fig.update_yaxes(title_text="Count", row=1, col=1)

fig.show()

The aquatic habitat group could be assumed to show a normal distribution of tarsus length. However, the closed and open habitat groups have high peaks (kurtosis > 1). Therefore, using a non-parametric test is is the safer option, which is Kruskal-Wallis.

* H0: The distribution of tarsus length is the same among habitat groups.
* H1: The distribution of tarsus length significantly differ among at least two habitat groups.

In [15]:
groups = sorted(df_clean['Habitat_Group'].unique())
data_groups = [df_clean[df_clean['Habitat_Group'] == g]['Tarsus.Length'] for g in groups]

h_stat, p_val = kruskal(*data_groups)

print(f"H-statistic: {h_stat:.4f}")
print(f"P-value: {p_val:.4e}")

H-statistic: 574.2063
P-value: 2.0544e-125


Since p << 0.05, the null hypothesis (H0) is rejected. There is a statistically significant difference in the distribution of tail length among at least two habitat groups. To see which habitat groups differ in terms of tarsus length, Dunn's test is done, in which

* H0: The distribution of tarsus length is the same for the two habitat groups tested.
* H1: The distribution of tarsus length differs between the two habitat groups tested.

In [16]:
dunn_results = posthoc_dunn(a=df_clean, 
                                val_col='Tarsus.Length', 
                                group_col='Habitat_Group', 
                                p_adjust='bonferroni')

print(dunn_results)

               Aquatic         Closed          Open
Aquatic   1.000000e+00  2.158756e-121  1.350664e-39
Closed   2.158756e-121   1.000000e+00  1.012546e-12
Open      1.350664e-39   1.012546e-12  1.000000e+00


Since p << 0.05, the null hypothesis (H0) is rejected; there are statistically significant differences in tarsus length between every single pair of habitat groups. Finally, to illustrate the distribution, a boxplot is created.

In [17]:
df_plot = df_clean[df_clean['Habitat_Group'].isin(['Closed','Open', 'Aquatic'])].copy()

fig = px.box(
    df_plot, 
    x='Habitat_Group', 
    y='log_Tarsus',
    title='<b>Distribution of Tarsus Length by Habitat Group</b>',
    color='Habitat_Group',
    color_discrete_sequence=['forestgreen', 'sandybrown', 'deepskyblue'],
    category_orders={"Habitat": ['Closed', 'Open', 'Aquatic']}
)

fig.update_layout(
    title_x=0.5,
    xaxis_title='Habitat Group',
    yaxis_title='log(Length) (log(mm))',
    template='plotly_white',
    showlegend=False
)

fig.show()

### **2.2. Tarsus Length and Migration**

Before starting, distribution of tarsus length within each migration behavior is explored based on skewness and kurtosis.

In [18]:
fig = make_subplots(
    rows=1, cols=3, 
    subplot_titles=('Sedentary', 'Partial', 'Migratory')
)

configs = [
    ('Sedentary', 'saddlebrown', 1),
    ('Partial', 'goldenrod', 2),
    ('Migratory', 'skyblue', 3)
]

for label, color, col_idx in configs:
    d_subset = df_clean[df_clean['Migration'] == label]['log_Tarsus']
    s, k = skew(d_subset), kurtosis(d_subset)

    fig.add_trace(
        go.Histogram(x=d_subset, nbinsx=50, name=label, marker_color=color), 
        row=1, col=col_idx
    )

    fig.add_annotation(
        text=f"Skewness: {s:.2f}<br>Kurtosis: {k:.2f}",
        xref="x domain", yref="y domain",
        x=1, y=0.95, showarrow=False,
        bgcolor="rgba(255, 255, 255, 0.8)", bordercolor="black", borderwidth=1,
        row=1, col=col_idx
    )

    fig.update_xaxes(title_text="log(Length) (log(mm))", row=1, col=col_idx)

fig.update_layout(
    title_x=0.5,
    title_text="<b>Distribution of Tarsus Length by Migration</b>",
    showlegend=False,
    bargap=0.1,
    template="plotly_white"
)

fig.update_yaxes(title_text="Count", row=1, col=1)

fig.show()

All the migratory classes have high peaks (kurtosis > 1). Therefore, using a non-parametric test is is the safer option, which is Kruskal-Wallis.

* H0: The distribution of tarsus length is the same among migratory classes.
* H1: The distribution of tarsus length significantly differ among at least two migratory classes.

In [19]:
groups = sorted(df_clean['Migration'].unique())
data_groups = [df_clean[df_clean['Migration'] == g]['Tarsus.Length'] for g in groups]

h_stat, p_val = kruskal(*data_groups)

print(f"H-statistic: {h_stat:.4f}")
print(f"P-value: {p_val:.4e}")

H-statistic: 73.5431
P-value: 1.0723e-16


Since p << 0.05, the null hypothesis (H0) is rejected. There is a statistically significant difference in the distribution of tarsus length among at least two migratory classes. To see which migratory classes differ in terms of tarsus length, Dunn's test is done, in which

* H0: The distribution of tarsus length is the same for the two migratory classes tested.
* H1: The distribution of tarsus length differs between the two migratory classes tested.

In [20]:
dunn_results = posthoc_dunn(a=df_clean, 
                                val_col='Tarsus.Length', 
                                group_col='Migration', 
                                p_adjust='bonferroni')

print(dunn_results)

           Migratory       Partial     Sedentary
Migratory   1.000000  2.395446e-02  1.989164e-04
Partial     0.023954  1.000000e+00  5.137527e-15
Sedentary   0.000199  5.137527e-15  1.000000e+00


Since p << 0.05, the null hypothesis (H0) is rejected; there are statistically significant differences in tarsus length between every single pair of migratory classes. Finally, to illustrate the distribution, a boxplot is created.

In [21]:
df_plot = df_clean[df_clean['Migration'].isin(['Sedentary', 'Partial', 'Migratory'])].copy()

fig = px.box(
    df_plot, 
    x='Migration', 
    y='log_Tarsus',
    title='<b>Distribution of Tarsus Length by Migration</b>',
    color='Migration',
    color_discrete_sequence=['saddlebrown', 'goldenrod', 'skyblue'],
    category_orders={"Migration": ['Sedentary', 'Partial', 'Migratory']}
)

fig.update_layout(
    title_x=0.5,
    xaxis_title='Migration Behavior',
    yaxis_title='log(Length) (log(mm))',
    template='plotly_white',
    showlegend=False
)

fig.show()

### **2.3. Tarsus Length and Nutrition (Trophic Level)**

Before starting, distribution of tarsus length within each trophic level is explored based on skewness and kurtosis.

In [22]:
fig = make_subplots(
    rows=1, cols=3, 
    subplot_titles=('Carnivore', 'Herbivore', 'Omnivore')
)

configs = [
    ('Carnivore', 'firebrick', 1),
    ('Herbivore', 'forestgreen', 2),
    ('Omnivore', 'goldenrod', 3)
]

for label, color, col_idx in configs:
    d_subset = df_clean[df_clean['Trophic.Level'] == label]['log_Tarsus']
    s, k = skew(d_subset), kurtosis(d_subset)

    fig.add_trace(
        go.Histogram(x=d_subset, nbinsx=50, name=label, marker_color=color), 
        row=1, col=col_idx
    )

    fig.add_annotation(
        text=f"Skewness: {s:.2f}<br>Kurtosis: {k:.2f}",
        xref="x domain", yref="y domain",
        x=1, y=0.95, showarrow=False,
        bgcolor="rgba(255, 255, 255, 0.8)", bordercolor="black", borderwidth=1,
        row=1, col=col_idx
    )

    fig.update_xaxes(title_text="log(Length) (log(mm))", row=1, col=col_idx)

fig.update_layout(
    title_x=0.5,
    title_text="<b>Distribution of Tarsus Length by Trophic Level</b>",
    showlegend=False,
    bargap=0.1,
    template="plotly_white"
)

fig.update_yaxes(title_text="Count", row=1, col=1)

fig.show()

The herbivores could be assumed to show a normal distribution of tarsus length. However, the carnivores and omnivores have high peaks (kurtosis > 1). Therefore, using a non-parametric test is is the safer option, which is Kruskal-Wallis.

* H0: The distribution of tarsus length is the same among trophic levels.
* H1: The distribution of tarsus length significantly differ among at least two trophic levels.

In [23]:
df_trophic = df_clean[df_clean['Trophic.Level'].isin(['Carnivore', 'Herbivore', 'Omnivore'])]
data_groups = [df_trophic[df_trophic['Trophic.Level'] == g]['Tarsus.Length'] for g in ['Carnivore', 'Herbivore', 'Omnivore']]

h_stat, p_val = kruskal(*data_groups)

print(f"H-statistic: {h_stat:.4f}")
print(f"P-value: {p_val:.4e}")

H-statistic: 259.7519
P-value: 3.9408e-57


Since p << 0.05, the null hypothesis (H0) is rejected. There is a statistically significant difference in the distribution of tarsus length among at least two trophic levels. To see which trophic levels differ in terms of tarsus length, Dunn's test is done, in which

* H0: The distribution of tarsus length is the same for the two trophic levels tested.
* H1: The distribution of tarsus length differs between the two trophic levels tested.

In [24]:
dunn_results = posthoc_dunn(a=df_trophic, 
                                val_col='Tarsus.Length', 
                                group_col='Trophic.Level', 
                                p_adjust='bonferroni')

print(dunn_results)

              Carnivore     Herbivore      Omnivore
Carnivore  1.000000e+00  1.378995e-42  9.724796e-05
Herbivore  1.378995e-42  1.000000e+00  5.274888e-45
Omnivore   9.724796e-05  5.274888e-45  1.000000e+00


Since p << 0.05, the null hypothesis (H0) is rejected; there are statistically significant differences in tarsus length between every single pair of trophic levels. Finally, to illustrate the distribution, a boxplot is created.

In [25]:
fig = px.box(
    df_trophic, 
    x='Trophic.Level', 
    y='log_Tarsus',
    title='<b>Distribution of Tarsus Length by Trophic Level</b>',
    color='Trophic.Level',
    color_discrete_sequence=['firebrick', 'forestgreen', 'goldenrod'],
    category_orders={"Trophic.Level": ['Carnivore', 'Herbivore', 'Omnivore']}
)

fig.update_layout(
    title_x=0.5,
    xaxis_title='Trophic Level',
    yaxis_title='log(Length) (log(mm))',
    template='plotly_white',
    showlegend=False
)

fig.show()

## **3. Tail Size (Tail Length)**

### **3.1. Tail Length and Habitat**

Before starting, distribution of tail length within each habitat group is explored based on skewness and kurtosis.

In [26]:
conditions = [
    # Aquatic/Marine
    df_clean["Habitat"].isin(['Coastal', 'Marine', 'Riverine', 'Wetland']),
    # Closed/Forest
    df_clean["Habitat"].isin(['Forest', 'Woodland', 'Shrubland']),
    # Open/Terrestrial
    df_clean["Habitat"].isin(['Grassland', 'Desert', 'Rock', 'Human Modified'])
]
choices = ['Aquatic', 'Closed', 'Open']
df_clean["Habitat_Group"] = np.select(conditions, choices, default=None)

########################################################################

fig = make_subplots(
    rows=1, cols=3, 
    subplot_titles=("Closed Habitat", "Open Habitat", "Aquatic Habitat")
)

plot_configs = [
    ('Closed', 'forestgreen', 1),
    ('Open', 'sandybrown', 2),
    ('Aquatic', 'deepskyblue', 3)
]

for group_name, color, col_idx in plot_configs:

    subset = df_clean[df_clean['Habitat_Group'] == group_name]['log_Tail']
    s, k = skew(subset), kurtosis(subset)
    fig.add_trace(
        go.Histogram(x=subset, nbinsx=50, name=group_name, marker_color=color),
        row=1, col=col_idx
    )
    
    fig.add_annotation(
        text=f"Skewness: {s:.2f}<br>Kurtosis: {k:.2f}",
        xref="x domain", yref="y domain",
        x=1, y=0.95, showarrow=False,
        bgcolor="rgba(255, 255, 255, 0.8)", bordercolor="black", borderwidth=1,
        row=1, col=col_idx
    )
    
    fig.update_xaxes(title_text="log(Length) (log(mm))", row=1, col=col_idx)

fig.update_layout(
    title_text="<b>Distribution of Tail Length by Habitat Group</b>",
    title_x=0.5,
    showlegend=False,
    bargap=0.1,
    template="plotly_white"
)
fig.update_yaxes(title_text="Count", row=1, col=1)

fig.show()

All the habitat groups have moderate skew (-1 < skewness < 1) and moderate peak (-1 < kurtosis < 1) and therefore can be assumed to show a normal distribution of tail length. It is safe to use a parametric test, which is ANOVA.

* H0: The distribution of tail length is the same among habitat groups.
* H1: The distribution of tail length significantly differ among at least two habitat groups.

In [27]:
groups = sorted(df_clean['Habitat_Group'].unique())
data_groups = [df_clean[df_clean['Habitat_Group'] == g]['Tail.Length'] for g in groups]

f_stat, p_val = f_oneway(*data_groups)

print(f"F-statistic: {f_stat:.4f}")
print(f"P-value: {p_val:.4e}")

F-statistic: 8.2766
P-value: 2.5620e-04


Since p << 0.05, the null hypothesis (H0) is rejected. There is a statistically significant difference in the distribution of tail length among at least two habitat groups. To see which habitat groups differ in terms of tail length, Tukey's test is done, in which

* H0: The distribution of tail length is the same for the two habitat groups tested.
* H1: The distribution of tail length differs between the two habitat groups tested.

In [28]:
tukey = pairwise_tukeyhsd(
    endog=df_clean['Tail.Length'],
    groups=df_clean['Habitat_Group'],
    alpha=0.05
)

print(tukey)

 Multiple Comparison of Means - Tukey HSD, FWER=0.05  
 group1 group2 meandiff p-adj   lower    upper  reject
------------------------------------------------------
Aquatic Closed   -5.895 0.0045 -10.2656 -1.5243   True
Aquatic   Open   -1.033 0.9012  -6.6051  4.5392  False
 Closed   Open    4.862  0.011   0.9089  8.8152   True
------------------------------------------------------


Between aquatic and open habitats, p > 0.05, thus the null hypothesis (H0) is failed to reject. However, the remaining pairs show statistically significant differences (p < 0.05) in tail length distribution. Finally, to illustrate the distribution, a boxplot is created.

In [29]:
df_plot = df_clean[df_clean['Habitat_Group'].isin(['Closed','Open', 'Aquatic'])].copy()

fig = px.box(
    df_plot, 
    x='Habitat_Group', 
    y='log_Tail',
    title='<b>Distribution of Tail Length by Habitat Group</b>',
    color='Habitat_Group',
    color_discrete_sequence=['forestgreen', 'sandybrown', 'deepskyblue'],
    category_orders={"Habitat": ['Closed', 'Open', 'Aquatic']}
)

fig.update_layout(
    title_x=0.5,
    xaxis_title='Habitat Group',
    yaxis_title='log(Length) (log(mm))',
    template='plotly_white',
    showlegend=False
)

fig.show()

### **3.2. Tail Length and Migration**

Before starting, distribution of tail length within each migration behavior is explored based on skewness and kurtosis.

In [30]:
fig = make_subplots(
    rows=1, cols=3, 
    subplot_titles=('Sedentary', 'Partial', 'Migratory')
)

configs = [
    ('Sedentary', 'saddlebrown', 1),
    ('Partial', 'goldenrod', 2),
    ('Migratory', 'skyblue', 3)
]

for label, color, col_idx in configs:
    d_subset = df_clean[df_clean['Migration'] == label]['log_Tail']
    s, k = skew(d_subset), kurtosis(d_subset)

    fig.add_trace(
        go.Histogram(x=d_subset, nbinsx=50, name=label, marker_color=color), 
        row=1, col=col_idx
    )

    fig.add_annotation(
        text=f"Skewness: {s:.2f}<br>Kurtosis: {k:.2f}",
        xref="x domain", yref="y domain",
        x=1, y=0.95, showarrow=False,
        bgcolor="rgba(255, 255, 255, 0.8)", bordercolor="black", borderwidth=1,
        row=1, col=col_idx
    )

    fig.update_xaxes(title_text="log(Length) (log(mm))", row=1, col=col_idx)

fig.update_layout(
    title_x=0.5,
    title_text="<b>Distribution of Tail Length by Migration</b>",
    showlegend=False,
    bargap=0.1,
    template="plotly_white"
)

fig.update_yaxes(title_text="Count", row=1, col=1)

fig.show()

All the migratory classes have moderate skew (-1 < skewness < 1) and moderate peak (-1 < kurtosis < 1) and therefore can be assumed to show a normal distribution of tail length. It is safe to use a parametric test, which is ANOVA.

* H0: The distribution of tail length is the same among migratory classes.
* H1: The distribution of tail length significantly differ among at least two migratory classes.

In [31]:
groups = sorted(df_clean['Migration'].unique())
data_groups = [df_clean[df_clean['Migration'] == g]['Tail.Length'] for g in groups]

f_stat, p_val = f_oneway(*data_groups)

print(f"F-statistic: {f_stat:.4f}")
print(f"P-value: {p_val:.4e}")

F-statistic: 7.8175
P-value: 4.0516e-04


Since p << 0.05, the null hypothesis (H0) is rejected. There is a statistically significant difference in the distribution of tarsus length among at least two migratory classes. To see which migratory classess differ in terms of tarsus length, Tukey's test is done, in which

* H0: The distribution of tarsus length is the same for the two migratory classes tested.
* H1: The distribution of tarsus length differs between the two migratory classes tested.

In [32]:
tukey = pairwise_tukeyhsd(
    endog=df_clean['Tail.Length'],
    groups=df_clean['Migration'],
    alpha=0.05
)

print(tukey)

    Multiple Comparison of Means - Tukey HSD, FWER=0.05    
  group1    group2  meandiff p-adj   lower    upper  reject
-----------------------------------------------------------
Migratory   Partial   8.9279 0.0006   3.2699  14.586   True
Migratory Sedentary   2.9437 0.2542   -1.423  7.3104  False
  Partial Sedentary  -5.9842 0.0017 -10.0578 -1.9105   True
-----------------------------------------------------------


Between migratory and sedentary behaviors, p > 0.05, thus the null hypothesis (H0) is failed to reject. However, for the remaining pairs show statistically significant differences (p < 0.05) in tail length distribution. Finally, to illustrate the distribution, a boxplot is created.

In [33]:
df_plot = df_clean[df_clean['Migration'].isin(['Sedentary', 'Partial', 'Migratory'])].copy()

fig = px.box(
    df_plot, 
    x='Migration', 
    y='log_Tail',
    title='<b>Distribution of Tail Length by Migration</b>',
    color='Migration',
    color_discrete_sequence=['saddlebrown', 'goldenrod', 'skyblue'],
    category_orders={"Migration": ['Sedentary', 'Partial', 'Migratory']}
)

fig.update_layout(
    title_x=0.5,
    xaxis_title='Migration Behavior',
    yaxis_title='log(Length) (log(mm))',
    template='plotly_white',
    showlegend=False
)

fig.show()

### **3.3. Tail Length and Nutrition (Trophic Level)**

Before starting, distribution of tail length within each trophic level is explored based on skewness and kurtosis.

In [34]:
fig = make_subplots(
    rows=1, cols=3, 
    subplot_titles=('Carnivore', 'Herbivore', 'Omnivore')
)

configs = [
    ('Carnivore', 'firebrick', 1),
    ('Herbivore', 'forestgreen', 2),
    ('Omnivore', 'goldenrod', 3)
]

for label, color, col_idx in configs:
    d_subset = df_clean[df_clean['Trophic.Level'] == label]['log_Tail']
    s, k = skew(d_subset), kurtosis(d_subset)

    fig.add_trace(
        go.Histogram(x=d_subset, nbinsx=50, name=label, marker_color=color), 
        row=1, col=col_idx
    )

    fig.add_annotation(
        text=f"Skewness: {s:.2f}<br>Kurtosis: {k:.2f}",
        xref="x domain", yref="y domain",
        x=1, y=0.95, showarrow=False,
        bgcolor="rgba(255, 255, 255, 0.8)", bordercolor="black", borderwidth=1,
        row=1, col=col_idx
    )

    fig.update_xaxes(title_text="log(Length) (log(mm))", row=1, col=col_idx)

fig.update_layout(
    title_x=0.5,
    title_text="<b>Distribution of Tail Length by Trophic Level</b>",
    showlegend=False,
    bargap=0.1,
    template="plotly_white"
)

fig.update_yaxes(title_text="Count", row=1, col=1)

fig.show()

All the trophic levels have moderate skew (-1 < skewness < 1) and moderate peak (-1 < kurtosis < 1) and therefore can be assumed to show a normal distribution of tail length. It is safe to use a parametric test, which is ANOVA.

* H0: The distribution of tail length is the same among trophic levels.
* H1: The distribution of tail length significantly differ among at least two trophic levels.

In [35]:
df_trophic = df_clean[df_clean['Trophic.Level'].isin(['Carnivore', 'Herbivore', 'Omnivore'])]
data_groups = [df_trophic[df_trophic['Trophic.Level'] == g]['Tarsus.Length'] for g in ['Carnivore', 'Herbivore', 'Omnivore']]

f_stat, p_val = f_oneway(*data_groups)

print(f"F-statistic: {f_stat:.4f}")
print(f"P-value: {p_val:.4e}")

F-statistic: 33.5704
P-value: 2.9542e-15


Since p << 0.05, the null hypothesis (H0) is rejected. There is a statistically significant difference in the distribution of tail length among at least two trophic levels. To see which trophic levels differ in terms of tail length, Tukey's test is done, in which

* H0: The distribution of tail length is the same for the two trophic levels tested.
* H1: The distribution of tail length differs between the two trophic levels tested.

In [36]:
tukey = pairwise_tukeyhsd(
    endog=df_trophic['Tarsus.Length'],
    groups=df_trophic['Trophic.Level'],
    alpha=0.05
)

print(tukey)

   Multiple Comparison of Means - Tukey HSD, FWER=0.05    
  group1    group2  meandiff p-adj   lower   upper  reject
----------------------------------------------------------
Carnivore Herbivore   -3.891    0.0 -5.0842 -2.6977   True
Carnivore  Omnivore   0.4055 0.7825  -1.019  1.8299  False
Herbivore  Omnivore   4.2965    0.0  2.7184  5.8745   True
----------------------------------------------------------


Between carnivores and omnivores, p > 0.05, thus the null hypothesis (H0) is failed to reject. However, for the remaining pairs show statistically significant differences (p < 0.05) in tail length distribution. Finally, to illustrate the distribution, a boxplot is created.

In [37]:
fig = px.box(
    df_trophic, 
    x='Trophic.Level', 
    y='log_Tail',
    title='<b>Distribution of Tail Length by Trophic Level</b>',
    color='Trophic.Level',
    color_discrete_sequence=['firebrick', 'forestgreen', 'goldenrod'],
    category_orders={"Trophic.Level": ['Carnivore', 'Herbivore', 'Omnivore']}
)

fig.update_layout(
    title_x=0.5,
    xaxis_title='Trophic Level',
    yaxis_title='log(Length) (log(mm))',
    template='plotly_white',
    showlegend=False
)

fig.show()

## **4. Beak Size (Beak Length)**

### **4.1. Beak Length and Habitat**

Before starting, distribution of beak length within each habitat group is explored based on skewness and kurtosis.

In [38]:
conditions = [
    # Aquatic/Marine
    df_clean["Habitat"].isin(['Coastal', 'Marine', 'Riverine', 'Wetland']),
    # Closed/Forest
    df_clean["Habitat"].isin(['Forest', 'Woodland', 'Shrubland']),
    # Open/Terrestrial
    df_clean["Habitat"].isin(['Grassland', 'Desert', 'Rock', 'Human Modified'])
]
choices = ['Aquatic', 'Closed', 'Open']
df_clean["Habitat_Group"] = np.select(conditions, choices, default=None)

########################################################################

fig = make_subplots(
    rows=1, cols=3, 
    subplot_titles=("Closed Habitat", "Open Habitat", "Aquatic Habitat")
)

plot_configs = [
    ('Closed', 'forestgreen', 1),
    ('Open', 'sandybrown', 2),
    ('Aquatic', 'deepskyblue', 3)
]

for group_name, color, col_idx in plot_configs:

    subset = df_clean[df_clean['Habitat_Group'] == group_name]['log_Beak']
    s, k = skew(subset), kurtosis(subset)
    fig.add_trace(
        go.Histogram(x=subset, nbinsx=50, name=group_name, marker_color=color),
        row=1, col=col_idx
    )
    
    fig.add_annotation(
        text=f"Skewness: {s:.2f}<br>Kurtosis: {k:.2f}",
        xref="x domain", yref="y domain",
        x=1, y=0.95, showarrow=False,
        bgcolor="rgba(255, 255, 255, 0.8)", bordercolor="black", borderwidth=1,
        row=1, col=col_idx
    )
    
    fig.update_xaxes(title_text="log(Length) (log(mm))", row=1, col=col_idx)

fig.update_layout(
    title_text="<b>Distribution of Beak Length by Habitat Group</b>",
    title_x=0.5,
    showlegend=False,
    bargap=0.1,
    template="plotly_white"
)
fig.update_yaxes(title_text="Count", row=1, col=1)

fig.show()

The aquatic habitat group could be assumed to show a normal distribution of tarsus length. However, the closed and open habitat groups have high peaks (kurtosis > 1). Therefore, using a non-parametric test is is the safer option, which is Kruskal-Wallis.

* H0: The distribution of beak length is the same among habitat groups.
* H1: The distribution of beak length significantly differ among at least two habitat groups.

In [39]:
groups = sorted(df_clean['Habitat_Group'].unique())
data_groups = [df_clean[df_clean['Habitat_Group'] == g]['Beak.Length_Culmen'] for g in groups]

h_stat, p_val = kruskal(*data_groups)

print(f"H-statistic: {h_stat:.4f}")
print(f"P-value: {p_val:.4e}")

H-statistic: 636.5529
P-value: 5.9470e-139


Since p << 0.05, the null hypothesis (H0) is rejected. There is a statistically significant difference in the distribution of beak length among at least two habitat groups. To see which habitat groups differ in terms of beak length, Dunn's test is done, in which

* H0: The distribution of beak length is the same for the two habitat groups tested.
* H1: The distribution of beak length differs between the two habitat groups tested.

In [40]:
dunn_results = posthoc_dunn(a=df_clean, 
                                val_col='Beak.Length_Culmen', 
                                group_col='Habitat_Group', 
                                p_adjust='bonferroni')

print(dunn_results)

               Aquatic         Closed           Open
Aquatic   1.000000e+00  5.660210e-132  2.400852e-103
Closed   5.660210e-132   1.000000e+00   1.862251e-03
Open     2.400852e-103   1.862251e-03   1.000000e+00


Since p << 0.05, the null hypothesis (H0) is rejected; there are statistically significant differences in beak length between every single pair of habitat groups. Finally, to illustrate the distribution, a boxplot is created.

In [41]:
df_plot = df_clean[df_clean['Habitat_Group'].isin(['Closed','Open', 'Aquatic'])].copy()

fig = px.box(
    df_plot, 
    x='Habitat_Group', 
    y='log_Beak',
    title='<b>Distribution of Beak Length by Habitat Group</b>',
    color='Habitat_Group',
    color_discrete_sequence=['forestgreen', 'sandybrown', 'deepskyblue'],
    category_orders={"Habitat": ['Closed', 'Open', 'Aquatic']}
)

fig.update_layout(
    title_x=0.5,
    xaxis_title='Habitat Group',
    yaxis_title='log(Length) (log(mm))',
    template='plotly_white',
    showlegend=False
)

fig.show()

### **4.2. Beak Length and Migration**

Before starting, distribution of beak length within each migration behavior is explored based on skewness and kurtosis.

In [42]:
fig = make_subplots(
    rows=1, cols=3, 
    subplot_titles=('Sedentary', 'Partial', 'Migratory')
)

configs = [
    ('Sedentary', 'saddlebrown', 1),
    ('Partial', 'goldenrod', 2),
    ('Migratory', 'skyblue', 3)
]

for label, color, col_idx in configs:
    d_subset = df_clean[df_clean['Migration'] == label]['log_Beak']
    s, k = skew(d_subset), kurtosis(d_subset)

    fig.add_trace(
        go.Histogram(x=d_subset, nbinsx=50, name=label, marker_color=color), 
        row=1, col=col_idx
    )

    fig.add_annotation(
        text=f"Skewness: {s:.2f}<br>Kurtosis: {k:.2f}",
        xref="x domain", yref="y domain",
        x=1, y=0.95, showarrow=False,
        bgcolor="rgba(255, 255, 255, 0.8)", bordercolor="black", borderwidth=1,
        row=1, col=col_idx
    )

    fig.update_xaxes(title_text="log(Length) (log(mm))", row=1, col=col_idx)

fig.update_layout(
    title_x=0.5,
    title_text="<b>Distribution of Beak Length by Migration</b>",
    showlegend=False,
    bargap=0.1,
    template="plotly_white"
)

fig.update_yaxes(title_text="Count", row=1, col=1)

fig.show()

The partial class could be assumed to show a normal distribution of beak length. However, the sedentary and migratory classes have a high skew (skewness > 1). Therefore, using a non-parametric test is is the safer option, which is Kruskal-Wallis.

* H0: The distribution of beak length is the same among migratory classes.
* H1: The distribution of beak length significantly differ among at least two migratory classes.

In [43]:
groups = sorted(df_clean['Migration'].unique())
data_groups = [df_clean[df_clean['Migration'] == g]['Beak.Length_Culmen'] for g in groups]

h_stat, p_val = kruskal(*data_groups)

print(f"H-statistic: {h_stat:.4f}")
print(f"P-value: {p_val:.4e}")

H-statistic: 16.8386
P-value: 2.2057e-04


Since p << 0.05, the null hypothesis (H0) is rejected. There is a statistically significant difference in the distribution of beak length among at least two migratory classes. To see which migratory classes differ in terms of beak length, Dunn's test is done, in which

* H0: The distribution of beak length is the same for the two migratory classes tested.
* H1: The distribution of beak length differs between the two migratory classes tested.

In [44]:
dunn_results = posthoc_dunn(a=df_clean, 
                                val_col='Beak.Length_Culmen', 
                                group_col='Migration', 
                                p_adjust='bonferroni')

print(dunn_results)

           Migratory   Partial  Sedentary
Migratory   1.000000  0.000252   0.170426
Partial     0.000252  1.000000   0.001878
Sedentary   0.170426  0.001878   1.000000


Between migratory and sedentary behaviors, p > 0.05, thus the null hypothesis (H0) is failed to reject. However, for the remaining pairs show statistically significant differences (p < 0.05) in beak length distribution. Finally, to illustrate the distribution, a boxplot is created.

In [45]:
df_plot = df_clean[df_clean['Migration'].isin(['Sedentary', 'Partial', 'Migratory'])].copy()

fig = px.box(
    df_plot, 
    x='Migration', 
    y='log_Beak',
    title='<b>Distribution of Beak Length by Migration</b>',
    color='Migration',
    color_discrete_sequence=['saddlebrown', 'goldenrod', 'skyblue'],
    category_orders={"Migration": ['Sedentary', 'Partial', 'Migratory']}
)

fig.update_layout(
    title_x=0.5,
    xaxis_title='Migration Behavior',
    yaxis_title='log(Length) (log(mm))',
    template='plotly_white',
    showlegend=False
)

fig.show()

### **4.3. Beak Length and Nutrition (Trophic Level)**

Before starting, distribution of beak length within each trophic level is explored based on skewness and kurtosis.

In [46]:
fig = make_subplots(
    rows=1, cols=3, 
    subplot_titles=('Carnivore', 'Herbivore', 'Omnivore')
)

configs = [
    ('Carnivore', 'firebrick', 1),
    ('Herbivore', 'forestgreen', 2),
    ('Omnivore', 'goldenrod', 3)
]

for label, color, col_idx in configs:
    d_subset = df_clean[df_clean['Trophic.Level'] == label]['log_Beak']
    s, k = skew(d_subset), kurtosis(d_subset)

    fig.add_trace(
        go.Histogram(x=d_subset, nbinsx=50, name=label, marker_color=color), 
        row=1, col=col_idx
    )

    fig.add_annotation(
        text=f"Skewness: {s:.2f}<br>Kurtosis: {k:.2f}",
        xref="x domain", yref="y domain",
        x=1, y=0.95, showarrow=False,
        bgcolor="rgba(255, 255, 255, 0.8)", bordercolor="black", borderwidth=1,
        row=1, col=col_idx
    )

    fig.update_xaxes(title_text="log(Length) (log(mm))", row=1, col=col_idx)

fig.update_layout(
    title_x=0.5,
    title_text="<b>Distribution of Beak Length by Trophic Level</b>",
    showlegend=False,
    bargap=0.1,
    template="plotly_white"
)

fig.update_yaxes(title_text="Count", row=1, col=1)

fig.show()

All the trophic levels have high peaks (kurtosis > 1). Therefore, using a non-parametric test is is the safer option, which is Kruskal-Wallis.

* H0: The distribution of beak length is the same among trophic levels.
* H1: The distribution of beak length significantly differ among at least two trophic levels.

In [47]:
df_trophic = df_clean[df_clean['Trophic.Level'].isin(['Carnivore', 'Herbivore', 'Omnivore'])]
data_groups = [df_trophic[df_trophic['Trophic.Level'] == g]['Beak.Length_Culmen'] for g in ['Carnivore', 'Herbivore', 'Omnivore']]

h_stat, p_val = kruskal(*data_groups)

print(f"H-statistic: {h_stat:.4f}")
print(f"P-value: {p_val:.4e}")

H-statistic: 9.2636
P-value: 9.7370e-03


Since p < 0.05, the null hypothesis (H0) is rejected. There is a statistically significant difference in the distribution of beak length among at least two trophic levels. To see which trophic levels differ in terms of beak length, Dunn's test is done, in which

* H0: The distribution of beak length is the same for the two trophic levels tested.
* H1: The distribution of beak length differs between the two trophic levels tested.

In [48]:
dunn_results = posthoc_dunn(a=df_trophic, 
                                val_col='Tarsus.Length', 
                                group_col='Trophic.Level', 
                                p_adjust='bonferroni')

print(dunn_results)

              Carnivore     Herbivore      Omnivore
Carnivore  1.000000e+00  1.378995e-42  9.724796e-05
Herbivore  1.378995e-42  1.000000e+00  5.274888e-45
Omnivore   9.724796e-05  5.274888e-45  1.000000e+00


Since p << 0.05, the null hypothesis (H0) is rejected; there are statistically significant differences in beak length between every single pair of trophic levels. Finally, to illustrate the distribution, a boxplot is created.

In [49]:
fig = px.box(
    df_trophic, 
    x='Trophic.Level', 
    y='log_Beak',
    title='<b>Distribution of Beak Length by Trophic Level</b>',
    color='Trophic.Level',
    color_discrete_sequence=['firebrick', 'forestgreen', 'goldenrod'],
    category_orders={"Trophic.Level": ['Carnivore', 'Herbivore', 'Omnivore']}
)

fig.update_layout(
    title_x=0.5,
    xaxis_title='Trophic Level',
    yaxis_title='log(Length) (log(mm))',
    template='plotly_white',
    showlegend=False
)

fig.show()