# **Exploratory Data Analysis of Ecologic Variables**

In [1]:
# Libraries
import pandas as pd
import numpy as np
from scipy.stats import skew, kurtosis, kruskal
from scikit_posthocs import posthoc_dunn
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Data
# Load the raw table, keep only rows whose morphometric measurements fall
# strictly inside the accepted ranges, then append log10-transformed columns.
df = pd.read_csv('AVONETplusClim.csv')

# Exclusive (lower, upper) bounds per trait; an upper bound of None means the
# trait is only bounded from below.
trait_bounds = {
    'Mass': (0, 11500),
    'Tail.Length': (0.1, 535),
    'Tarsus.Length': (0, 350),
    'Wing.Length': (0.1, 650),
    'Hand-Wing.Index': (3, None),
}

# Build one combined row mask. Comparisons against NaN evaluate to False, so
# rows with a missing value in any bounded trait are dropped as well.
keep_row = pd.Series(True, index=df.index)
for trait, (lower, upper) in trait_bounds.items():
    keep_row &= df[trait] > lower
    if upper is not None:
        keep_row &= df[trait] < upper

# Independent copy so the new columns never touch the raw frame.
df_clean = df.loc[keep_row].copy()

# New column name -> source trait, in the order the columns are appended.
# NOTE(review): 'Beak.Length_Culmen' is the only logged trait that is not
# range-filtered above; confirm it has no non-positive values, otherwise
# Log_Beak will contain -inf/NaN.
log_columns = {
    'Log_Mass': 'Mass',
    'Log_Tail': 'Tail.Length',
    'Log_Tarsus': 'Tarsus.Length',
    'Log_Beak': 'Beak.Length_Culmen',
    'Log_Wing': 'Wing.Length',
    'Log_HWI': 'Hand-Wing.Index',
}
for log_name, trait in log_columns.items():
    df_clean[log_name] = np.log10(df_clean[trait])

## **1. Body Size (Mass)**

### **1.1. Mass and Habitat**

Before starting, the distribution of mass within each habitat group is explored based on skewness and kurtosis.

In [2]:
# One histogram of log10(Mass) per habitat group, each annotated with the
# sample skewness and kurtosis of the plotted values.
# Each entry is (habitat label, bar colour), listed in left-to-right panel order.
habitat_panels = [
    ('Closed', 'forestgreen'),
    ('Open', 'sandybrown'),
    ('Aquatic', 'deepskyblue'),
]

fig = make_subplots(
    rows=1,
    cols=len(habitat_panels),
    subplot_titles=[f"{label} Habitat" for label, _ in habitat_panels],
)

for col_idx, (label, colour) in enumerate(habitat_panels, start=1):
    log_mass = df_clean.loc[df_clean['Habitat'] == label, 'Log_Mass']
    skewness = skew(log_mass)
    kurt = kurtosis(log_mass)

    fig.add_trace(
        go.Histogram(x=log_mass, nbinsx=50, name=label, marker_color=colour),
        row=1, col=col_idx,
    )

    # Summary box anchored to the panel's own plotting area (domain
    # coordinates) rather than to data values.
    fig.add_annotation(
        text=f"Skewness: {skewness:.2f}<br>Kurtosis: {kurt:.2f}",
        xref="x domain", yref="y domain",
        x=1, y=0.95, showarrow=False,
        bgcolor="rgba(255, 255, 255, 0.8)", bordercolor="black", borderwidth=1,
        row=1, col=col_idx,
    )

    fig.update_xaxes(title_text="Log(Mass) (Log(g))", row=1, col=col_idx)

# Only the left-most panel carries the y-axis title.
fig.update_yaxes(title_text="Count", row=1, col=1)

fig.update_layout(
    title_text="Distributions of Mass by Habitat",
    title_x=0.5,
    showlegend=False,
    bargap=0.1,
    template="plotly_white",
)

fig.show()

The closed habitat group could be assumed to show a normal distribution of mass. However, the open habitat group has a high skew (skewness = 1) and the aquatic habitat group shows a flat distribution (kurtosis < -1). Therefore, a non-parametric test, Kruskal-Wallis, is the safer option.

In [3]:
# Kruskal-Wallis H-test: does Mass differ across the habitat groups?
# Missing labels are dropped before sorting: sorted() cannot compare NaN
# (a float) with the string labels and would raise TypeError.
groups = sorted(df_clean['Habitat'].dropna().unique())

# One Series of Mass values per habitat group, in the same order as `groups`.
data_groups = [df_clean.loc[df_clean['Habitat'] == g, 'Mass'] for g in groups]

h_stat, p_val = kruskal(*data_groups)

print(f"H-statistic: {h_stat:.4f}")
print(f"P-value: {p_val:.4e}")

H-statistic: 601.7524
P-value: 2.1435e-131


Since p << 0.05, the null hypothesis (H0) is rejected. There is a statistically significant difference in the distribution of mass among at least two habitat groups. To see which habitat groups differ in terms of mass, Dunn's test is done, in which

* H0: The distribution of Mass is the same for the two Habitat groups tested.
* H1: The distribution of Mass differs between the two Habitat groups tested.

In [4]:
# Pairwise Dunn's post-hoc test on Mass between habitat groups, with
# Bonferroni-adjusted p-values.
dunn_kwargs = dict(val_col='Mass', group_col='Habitat', p_adjust='bonferroni')
dunn_results = posthoc_dunn(a=df_clean, **dunn_kwargs)

print(dunn_results)

               Aquatic         Closed          Open
Aquatic   1.000000e+00  7.062095e-129  5.282201e-46
Closed   7.062095e-129   1.000000e+00  1.449479e-10
Open      5.282201e-46   1.449479e-10  1.000000e+00


Since p << 0.05, the null hypothesis (H0) is rejected; there are statistically significant differences in mass distribution between every single pair of habitat groups. Finally, to illustrate the distribution, a boxplot is created.

In [5]:
# Boxplot of log10(Mass) per habitat group.
# The order list drives both the row filter and the left-to-right box order,
# so the colour sequence below lines up with it one-to-one.
habitat_order = ['Closed', 'Open', 'Aquatic']
in_habitats = df_clean['Habitat'].isin(habitat_order)
df_plot = df_clean.loc[in_habitats].copy()

box_args = dict(
    x='Habitat',
    y='Log_Mass',
    title='Log(Mass) by Habitat',
    color='Habitat',
    color_discrete_sequence=['forestgreen', 'sandybrown', 'deepskyblue'],
    category_orders={"Habitat": habitat_order},
)
fig = px.box(df_plot, **box_args)

layout_args = dict(
    title_x=0.5,
    xaxis_title='Habitat Group',
    yaxis_title='Log(Mass) (log(g))',
    template='plotly_white',
    showlegend=False,
)
fig.update_layout(**layout_args)

fig.show()

### **1.2. Mass and Migration**

Before starting, the distribution of mass within each migration behavior is explored based on skewness and kurtosis.

In [6]:
# One histogram of log10(Mass) per migration class, each annotated with the
# sample skewness and kurtosis of the plotted values.
# The panels are generated from a single table instead of three copy-pasted
# stanzas, so no variable carries a habitat-specific name (d_closed, s_open,
# ...) while actually holding migration data.
# Each entry is (migration class, bar colour), in left-to-right panel order.
migration_panels = [
    ('Sedentary', 'saddlebrown'),
    ('Partial', 'goldenrod'),
    ('Migratory', 'skyblue'),
]

fig = make_subplots(
    rows=1,
    cols=len(migration_panels),
    subplot_titles=[label for label, _ in migration_panels],
)

for col_idx, (label, colour) in enumerate(migration_panels, start=1):
    log_mass = df_clean.loc[df_clean['Migration'] == label, 'Log_Mass']
    skewness = skew(log_mass)
    kurt = kurtosis(log_mass)

    fig.add_trace(
        go.Histogram(x=log_mass, nbinsx=50, name=label, marker_color=colour),
        row=1, col=col_idx,
    )

    # Summary box anchored to the panel's own plotting area (domain
    # coordinates) rather than to data values.
    fig.add_annotation(
        text=f"Skewness: {skewness:.2f}<br>Kurtosis: {kurt:.2f}",
        xref="x domain", yref="y domain",
        x=1, y=0.95, showarrow=False,
        bgcolor="rgba(255, 255, 255, 0.8)", bordercolor="black", borderwidth=1,
        row=1, col=col_idx,
    )

    fig.update_xaxes(title_text="Log(Mass) (Log(g))", row=1, col=col_idx)

# Only the left-most panel carries the y-axis title.
fig.update_yaxes(title_text="Count", row=1, col=1)

fig.update_layout(
    title_text="Distributions of Mass by Migration",
    title_x=0.5,
    showlegend=False,
    bargap=0.1,
    template="plotly_white",
)

fig.show()

The sedentary and partial classes could be assumed to show a normal distribution of mass. However, the migratory class has a high skew (skewness > 1). Therefore, a non-parametric test, Kruskal-Wallis, is the safer option.

In [7]:
# Kruskal-Wallis H-test: does Mass differ across the migration classes?
# Missing labels are dropped before sorting: sorted() cannot compare NaN
# (a float) with the string labels and would raise TypeError.
groups = sorted(df_clean['Migration'].dropna().unique())

# One Series of Mass values per migration class, in the same order as `groups`.
data_groups = [df_clean.loc[df_clean['Migration'] == g, 'Mass'] for g in groups]

h_stat, p_val = kruskal(*data_groups)

print(f"H-statistic: {h_stat:.4f}")
print(f"P-value: {p_val:.4e}")

H-statistic: 93.4030
P-value: 5.2215e-21


Since p << 0.05, the null hypothesis (H0) is rejected. There is a statistically significant difference in the distribution of mass among at least two migratory classes. To see which migratory classes differ in terms of mass, Dunn's test is done, in which

* H0: The distribution of mass is the same for the two migratory classes tested.
* H1: The distribution of mass differs between the two migratory classes tested.

In [8]:
# Pairwise Dunn's post-hoc test on Mass between migration classes, with
# Bonferroni-adjusted p-values.
dunn_kwargs = dict(val_col='Mass', group_col='Migration', p_adjust='bonferroni')
dunn_results = posthoc_dunn(a=df_clean, **dunn_kwargs)

print(dunn_results)

              Migratory       Partial     Sedentary
Migratory  1.000000e+00  9.534305e-07  6.351779e-02
Partial    9.534305e-07  1.000000e+00  3.141202e-21
Sedentary  6.351779e-02  3.141202e-21  1.000000e+00


Between sedentary and migratory classes, p > 0.05, thus we fail to reject the null hypothesis (H0). However, for the pairs partial-sedentary and partial-migratory, p << 0.05, thus there are statistically significant differences in mass distribution between these migratory classes. Finally, to illustrate the distribution, a boxplot is created.

In [9]:
# Boxplot of log10(Mass) per migration class.
# The order list drives both the row filter and the left-to-right box order,
# so the colour sequence below lines up with it one-to-one.
migration_order = ['Sedentary', 'Partial', 'Migratory']
in_classes = df_clean['Migration'].isin(migration_order)
df_plot = df_clean.loc[in_classes].copy()

box_args = dict(
    x='Migration',
    y='Log_Mass',
    title='Log(Mass) by Migration',
    color='Migration',
    color_discrete_sequence=['saddlebrown', 'goldenrod', 'skyblue'],
    category_orders={"Migration": migration_order},
)
fig = px.box(df_plot, **box_args)

layout_args = dict(
    title_x=0.5,
    xaxis_title='Migration Behavior',
    yaxis_title='Log(Mass) (log(g))',
    template='plotly_white',
    showlegend=False,
)
fig.update_layout(**layout_args)

fig.show()

### **1.3. Mass and Nutrition (Trophic Level)**

Before starting, the distribution of mass within each trophic level is explored based on skewness and kurtosis.

In [10]:
# One histogram of log10(Mass) per trophic level, each annotated with the
# sample skewness and kurtosis of the plotted values.
# The panels are generated from a single table instead of three copy-pasted
# stanzas, so no variable carries a habitat-specific name (d_closed, s_open,
# ...) while actually holding trophic-level data.
# Each entry is (trophic level, bar colour), in left-to-right panel order.
# The colours are the ones used by the trophic-level boxplot, so a level keeps
# the same colour across both figures (previously the migration palette was
# reused here).
trophic_panels = [
    ('Carnivore', 'firebrick'),
    ('Herbivore', 'forestgreen'),
    ('Omnivore', 'goldenrod'),
]

fig = make_subplots(
    rows=1,
    cols=len(trophic_panels),
    subplot_titles=[label for label, _ in trophic_panels],
)

for col_idx, (label, colour) in enumerate(trophic_panels, start=1):
    log_mass = df_clean.loc[df_clean['Trophic.Level'] == label, 'Log_Mass']
    skewness = skew(log_mass)
    kurt = kurtosis(log_mass)

    fig.add_trace(
        go.Histogram(x=log_mass, nbinsx=50, name=label, marker_color=colour),
        row=1, col=col_idx,
    )

    # Summary box anchored to the panel's own plotting area (domain
    # coordinates) rather than to data values.
    fig.add_annotation(
        text=f"Skewness: {skewness:.2f}<br>Kurtosis: {kurt:.2f}",
        xref="x domain", yref="y domain",
        x=1, y=0.95, showarrow=False,
        bgcolor="rgba(255, 255, 255, 0.8)", bordercolor="black", borderwidth=1,
        row=1, col=col_idx,
    )

    fig.update_xaxes(title_text="Log(Mass) (Log(g))", row=1, col=col_idx)

# Only the left-most panel carries the y-axis title.
fig.update_yaxes(title_text="Count", row=1, col=1)

fig.update_layout(
    title_text="Distributions of Mass by Trophic Level",
    title_x=0.5,
    showlegend=False,
    bargap=0.1,
    template="plotly_white",
)

fig.show()

The herbivores could be assumed to show a normal distribution of mass. However, the carnivores and omnivores have a high skew (skewness > 1). Therefore, a non-parametric test, Kruskal-Wallis, is the safer option.

In [11]:
# Kruskal-Wallis H-test of Mass across the three trophic levels of interest.
# Rows with any other trophic level are excluded; df_trophic is reused by the
# post-hoc test and the boxplot in the following cells.
trophic_levels = ['Carnivore', 'Herbivore', 'Omnivore']
is_selected_level = df_clean['Trophic.Level'].isin(trophic_levels)
df_trophic = df_clean[is_selected_level]

# One Series of Mass values per trophic level, in the order listed above.
data_groups = [
    df_trophic.loc[df_trophic['Trophic.Level'] == level, 'Mass']
    for level in trophic_levels
]

h_stat, p_val = kruskal(*data_groups)

print(f"H-statistic: {h_stat:.4f}")
print(f"P-value: {p_val:.4e}")

H-statistic: 72.4215
P-value: 1.8788e-16


Since p << 0.05, the null hypothesis (H0) is rejected. There is a statistically significant difference in the distribution of mass among at least two trophic levels. To see which trophic levels differ in terms of mass, Dunn's test is done, in which

* H0: The distribution of mass is the same for the two trophic levels tested.
* H1: The distribution of mass differs between the two trophic levels tested.

In [12]:
# Pairwise Dunn's post-hoc test on Mass between the selected trophic levels,
# with Bonferroni-adjusted p-values.
dunn_kwargs = dict(val_col='Mass', group_col='Trophic.Level', p_adjust='bonferroni')
dunn_results = posthoc_dunn(a=df_trophic, **dunn_kwargs)

print(dunn_results)

              Carnivore     Herbivore  Omnivore
Carnivore  1.000000e+00  4.122494e-16  0.000055
Herbivore  4.122494e-16  1.000000e+00  0.052144
Omnivore   5.539712e-05  5.214397e-02  1.000000


Between herbivores and omnivores, p > 0.05, thus we fail to reject the null hypothesis (H0). However, for the pairs carnivore-herbivore and carnivore-omnivore, p << 0.05, thus there are statistically significant differences in mass distribution between these trophic levels. Finally, to illustrate the distribution, a boxplot is created.

In [13]:
# Boxplot of log10(Mass) per trophic level, drawn from the pre-filtered
# df_trophic frame. The order list fixes the left-to-right box order, so the
# colour sequence below lines up with it one-to-one.
trophic_order = ['Carnivore', 'Herbivore', 'Omnivore']

box_args = dict(
    x='Trophic.Level',
    y='Log_Mass',
    title='Log(Mass) by Trophic Level',
    color='Trophic.Level',
    color_discrete_sequence=['firebrick', 'forestgreen', 'goldenrod'],
    category_orders={"Trophic.Level": trophic_order},
)
fig = px.box(df_trophic, **box_args)

layout_args = dict(
    title_x=0.5,
    xaxis_title='Trophic Level',
    yaxis_title='Log(Mass) (log(g))',
    template='plotly_white',
    showlegend=False,
)
fig.update_layout(**layout_args)

fig.show()