# **EDA and Hypothesis Testing of Climatic Variables**

In [1]:
# Libraries
import pandas as pd
import numpy as np
from scipy.stats import skew, kurtosis, spearmanr, pearsonr
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Data
df = pd.read_csv('AVONETplusClim.csv' )

## **1. Body Size (Mass)**

First of all, distribution of mass is explored using histograms. For the raw data: 

In [2]:
# 1. Raw Data Plot (Histogram + Box)
fig1 = px.histogram(
    df, 
    x='Mass', 
    title='Raw Mass Distribution', 
    nbins=50, 
    marginal='box',
    color_discrete_sequence=['salmon']
)
fig1.update_layout(title_x=0.5, xaxis_title="Mass (g)", yaxis_title="Count", template="plotly_white")
fig1.show()

The data is extremely right-skewed. In addition, there are 11 outliers that disrupt the visualization (>10000 g). To approach a biologically meaningful distribution and improve the visualization, log-transformation is applied. Additionally, the outliers are excluded since they make up a negligible fraction of the dataset (11/9878):

In [3]:
# 2. Filter & Log Transform
df_clean = df[(df['Mass'] < 11500) & (df['Mass'] > 0)].copy()
df_clean['Log_Mass'] = np.log10(df_clean['Mass'])

# 3. Log Data Plot (Histogram + Box)
fig2 = px.histogram(
    df_clean, 
    x='Log_Mass', 
    title='Log10(Mass) Distribution', 
    nbins=50, 
    marginal='box',
    color_discrete_sequence=['salmon']
)
fig2.update_layout(title_x=0.5, xaxis_title="Log10(Mass) (Log10(g))", yaxis_title="Count", template="plotly_white")
fig2.show()

The data is less but still visibly right-skewed. To assess the normality of distribution, skewness and kurtosis are calculated since Shapiro-Wilk test is unreliable for large sample sizes (>5000):

In [4]:
skew_val = skew(df_clean['Log_Mass'])
kurt_val = kurtosis(df_clean['Log_Mass'])

print(f"Skewness: {skew_val:.4f}")
print(f"Kurtosis: {kurt_val:.4f}")

Skewness: 0.8518
Kurtosis: 0.3485


Both skewness and kurtosis fall between -1 and 1, thus the distribution is safe to assume to be normal.

### **1.1. Mass and Temperature**

Before testing with temperature, similar to mass, distribution of temperature is explored based on skewness and kurtosis. For minimum, maximum, and average temperatures:

In [5]:
fig = make_subplots(
    rows=1, cols=3, 
    subplot_titles=("Minimum Temperature", "Maximum Temperature", "Average Temperature")
)

d6 = df_clean['Min.Temperature']
s6, k6 = skew(d6), kurtosis(d6)

fig.add_trace(go.Histogram(x=d6, nbinsx=50, name='Min.Temperature', marker_color='blue'), row=1, col=1)

fig.add_annotation(
    text=f"Skewness: {s6:.2f}<br>Kurtosis: {k6:.2f}",
    xref="x domain", yref="y domain",
    x=0.05, y=0.95, showarrow=False,
    bgcolor="rgba(255, 255, 255, 0.8)", bordercolor="black", borderwidth=1,
    row=1, col=1
)

d5 = df_clean['Max.Temperature']
s5, k5 = skew(d5), kurtosis(d5)

fig.add_trace(go.Histogram(x=d5, nbinsx=50, name='Max.Temperature', marker_color='red'), row=1, col=2)

fig.add_annotation(
    text=f"Skewness: {s5:.2f}<br>Kurtosis: {k5:.2f}",
    xref="x domain", yref="y domain",
    x=0.05, y=0.95, showarrow=False,
    bgcolor="rgba(255, 255, 255, 0.8)", bordercolor="black", borderwidth=1,
    row=1, col=2
)

d1 = df_clean['Mean.Temperature']
s1, k1 = skew(d1), kurtosis(d1)

fig.add_trace(go.Histogram(x=d1, nbinsx=50, name='Mean.Temperature', marker_color='green'), row=1, col=3)

fig.add_annotation(
    text=f"Skewness: {s1:.2f}<br>Kurtosis: {k1:.2f}",
    xref="x domain", yref="y domain",
    x=0.05, y=0.95, showarrow=False,
    bgcolor="rgba(255, 255, 255, 0.8)", bordercolor="black", borderwidth=1,
    row=1, col=3
)

fig.update_layout(
    title_x=0.5,
    title_text="Distributions of Temperature Variables",
    showlegend=False,
    bargap=0.1,
    template="plotly_white"
)

fig.update_xaxes(title_text="Temperature (°C)", row=1, col=1)
fig.update_xaxes(title_text="Temperature (°C)", row=1, col=2)
fig.update_xaxes(title_text="Temperature (°C)", row=1, col=3)
fig.update_yaxes(title_text="Count", row=1, col=1)

fig.show()

The data is highly left-skewed. Attempts to normalize the data, such as shifting and squaring the values, could work but the biological meaning would be lost since temperature is measured as an intensity, which does not follow multiplicative scaling (i.e. 20 °C is not twice as hot as 10 °C). Therefore, to be able to directly interpret the values, raw scale is kept for temperature variables. Consequently, rejecting a normal distribution, a non-parametric measure is used to assess correlation, which is Spearman's rank correlation coefficient.

For minimum temperature:

* H0: There is no correlation between mass and minimum temperature.
* H1: There is a correlation between mass and minimum temperature. 

In [6]:
correlation_rho, p_value_spearman = spearmanr(df['Min.Temperature'], df['Mass'])

print(f"Spearman rho: {correlation_rho:.4f}")
print(f"p-value: {p_value_spearman:.4e}")

Spearman rho: -0.0346
p-value: 5.9205e-04


For maximum temperature:

* H0: There is no correlation between mass and maximum temperature.
* H1: There is a correlation between mass and maximum temperature. 

In [7]:
correlation_rho, p_value_spearman = spearmanr(df['Max.Temperature'], df['Mass'])

print(f"Spearman rho: {correlation_rho:.4f}")
print(f"p-value: {p_value_spearman:.4e}")

Spearman rho: 0.0285
p-value: 4.6358e-03


For average temperature:

* H0: There is no correlation between mass and average temperature.
* H1: There is a correlation between mass and average temperature. 

In [8]:
correlation_rho, p_value_spearman = spearmanr(df['Mean.Temperature'], df['Mass'])

print(f"Spearman rho: {correlation_rho:.4f}")
print(f"p-value: {p_value_spearman:.4e}")

Spearman rho: -0.0120
p-value: 2.3125e-01


For minimum and maximum temperature, p < 0.05, thus the null hypotheses are rejected. There is a statistically significant relationship between mass and minimum temperature, as well as maximum temperature. However, for average temperature, p > 0.05, thus the null hypothesis is failed to reject. To illustrate the relationships, scatter plots are created:

In [9]:
color_map = {
    'Min.Temperature': 'blue',
    'Max.Temperature': 'red',
    'Mean.Temperature': 'green'
}

fig = make_subplots(
    rows=1, cols=3,
    subplot_titles=("Minimum Temperature", "Maximum Temperature", "Average Temperature"),
    horizontal_spacing=0.05
)

for i, var in enumerate(['Min.Temperature', 'Max.Temperature', 'Mean.Temperature']):
    x_data = df_clean[var]
    y_data = df_clean['Log_Mass']

    rho, p_val = spearmanr(x_data, y_data)
    p_text = "< 0.001" if p_val < 0.001 else f"{p_val:.3f}"

    fig.add_trace(
        go.Scatter(
            x=x_data, y=y_data, 
            mode='markers', 
            name='Data',
            marker=dict(size=3, color=color_map[var], opacity=0.3),
            showlegend=False
        ),
        row=1, col=i+1
    )

    m, b = np.polyfit(x_data, y_data, 1)
    x_line = np.linspace(x_data.min(), x_data.max(), 100)
    y_line = m * x_line + b
    
    fig.add_trace(
        go.Scatter(
            x=x_line, y=y_line, 
            mode='lines', 
            name='Trend',
            line=dict(color='black', width=2),
            showlegend=False
        ),
        row=1, col=i+1
    )

    fig.update_xaxes(title_text="Temperature (°C)", row=1, col=i+1)

fig.update_yaxes(title_text="Log10(Mass)", row=1, col=1)
fig.update_layout(
    title_x=0.5,
    title_text="Log10(Mass) vs Temperature Variables",
    height=500,
    width=1200,
    template="plotly_white"
)

fig.show()

### **1.2. Mass and Precipitation**

Before testing with precipitation, distribution of precipitation is explored based on skewness and kurtosis:

In [10]:
fig = px.histogram(
    df_clean, 
    x='Mean.Precipitation', 
    nbins=50, 
    title='Distribution of Precipitation',
    labels={'Mean.Temperature': 'Precipitation (mm)'},
    marginal='box',
    opacity=0.7,
    color_discrete_sequence=['royalblue']
)

s = skew(df_clean['Mean.Temperature'])
k = kurtosis(df_clean['Mean.Temperature'])

fig.add_annotation(
    text=f"Skewness: {s:.4f} | Kurtosis: {k:.4f}",
    xref="paper", yref="paper",
    x=0.95, y=1.1,
    showarrow=False,
    font=dict(size=12)
)

fig.update_layout(
    title_x=0.5,
    yaxis_title="Count",
    xaxis_title="Precipitation (mm)",
    template="plotly_white"
)

fig.show()

The data is left-skewed. This time, the data can be normalized since precipitation is measured as a quantity, similar to mass. To normalize, square-root transformation is used:

In [11]:
precip_sqrt = np.sqrt(df_clean['Mean.Precipitation'])

s = skew(precip_sqrt)
k = kurtosis(precip_sqrt)

fig = px.histogram(
    x=precip_sqrt, 
    nbins=50, 
    title='Distribution of Square-Root Transformed Precipitation',
    labels={'x': 'Sqrt(Precipitation)'},
    marginal='box', 
    color_discrete_sequence=['royalblue']
)

fig.add_annotation(
    text=f"Skewness: {s:.4f} | Kurtosis: {k:.4f}",
    xref="paper", yref="paper",
    x=0.95, y=1.1,
    showarrow=False,
    font=dict(size=12)
)

fig.update_layout(title_x=0.5, yaxis_title="Count", template="plotly_white")
fig.show()

Now, since both mass and precipitation are normally distributed, a parametric measure is used, which is Pearson's correlation coefficient.

* H0: There is no correlation between mass and precipitation.
* H1: There is a correlation between mass and precipitation. 

In [12]:
df_clean['Sqrt_Precip'] = np.sqrt(df_clean['Mean.Precipitation'])

corr, p_val = pearsonr(df_clean['Log_Mass'], df_clean['Sqrt_Precip'])

print(f"Pearson Correlation Coefficient (r): {corr:.4f}")
print(f"P-value: {p_val:.4e}")

Pearson Correlation Coefficient (r): -0.0639
P-value: 2.1410e-10


Since p < 0.05, the null hypothesis is rejected. There is a statistically significant relationship between mass and precipitation. To illustrate the relationship, a scatter plot is created:

In [13]:
plot_data = df_clean[['Sqrt_Precip', 'Log_Mass']]

corr, p_val = pearsonr(df_clean['Log_Mass'], df_clean['Sqrt_Precip'])
p_text = "< 0.001" if p_val < 0.001 else f"{p_val:.3f}"

fig = px.scatter(
    plot_data, 
    x='Sqrt_Precip', 
    y='Log_Mass', 
    trendline='ols', 
    trendline_color_override='black',
    title='Log10(Mass) vs Sqrt(Precipitation)',
    labels={'Sqrt_Precip': 'Sqrt(Precipitation) (Sqrt(mm))', 'Log_Mass': 'Log10(Mass) (Log(g))'},
    opacity=0.3,
    template='plotly_white'
)

fig.update_layout(title_x=0.5)

fig.show()

## **2. Leg Size (Tarsus Length)**

First of all, distribution of tarsus length is explored using histograms, as well as skewness and kurtosis. For the raw data: 

In [14]:
# 1. Raw Data Plot (Histogram + Box)
fig1 = px.histogram(
    df, 
    x='Tarsus.Length', 
    title='Raw Tarsus Length Distribution', 
    nbins=50, 
    marginal='box',
    color_discrete_sequence=['salmon']
)
fig1.update_layout(title_x=0.5, xaxis_title="Tarsus Length (mm)", yaxis_title="Count", template="plotly_white")
fig1.show()

The data is extremely right-skewed. In addition, there are 3 outliers that disrupt the visualization (>350). To approach a biologically meaningful distribution and improve the visualization, log-transformation is applied. Additionally, the outliers are excluded since they make up a negligible fraction of the dataset (3/9878). Skewness and kurtosis are calculated.

In [15]:
# 2. Filter & Log Transform
df_clean = df[(df['Tarsus.Length'] < 350) & (df['Tarsus.Length'] > 0)].copy()
df_clean['Log_Tarsus'] = np.log10(df_clean['Tarsus.Length'])

# 3. Log Data Plot (Histogram + Box)
fig2 = px.histogram(
    df_clean, 
    x='Log_Tarsus', 
    title='Log10(Tarsus Length) Distribution', 
    nbins=50, 
    marginal='box',
    color_discrete_sequence=['salmon']
)

s = skew(df_clean['Log_Tarsus'])
k = kurtosis(df_clean['Log_Tarsus'])

fig2.add_annotation(
    text=f"Skewness: {s:.4f} | Kurtosis: {k:.4f}",
    xref="paper", yref="paper",
    x=0.95, y=1.1,
    showarrow=False,
    font=dict(size=12)
)

fig2.update_layout(title_x=0.5, xaxis_title="Log10(Tarsus Length) (Log10(mm))", yaxis_title="Count", template="plotly_white")
fig2.show()

Since kurtosis > 1, it is not safe to assume normal distribution. Therefore, log transformation is kept only for visualization.

### **2.1. Tarsus Length and Temperature**

Since neither tarsus length nor temperature is normally distributed, Spearman's rank correlation coefficient is used.

For minimum temperature:

* H0: There is no correlation between tarsus length and minimum temperature.
* H1: There is a correlation between tarsus length and minimum temperature. 

In [16]:
correlation_rho, p_value_spearman = spearmanr(df_clean['Min.Temperature'], df_clean['Tarsus.Length'])

print(f"Spearman rho: {correlation_rho:.4f}")
print(f"p-value: {p_value_spearman:.4e}")

Spearman rho: -0.1210
p-value: 1.5550e-33


For maximum temperature:

* H0: There is no correlation between tarsus length and maximum temperature.
* H1: There is a correlation between tarsus length and maximum temperature.

In [17]:
correlation_rho, p_value_spearman = spearmanr(df_clean['Max.Temperature'], df_clean['Tarsus.Length'])

print(f"Spearman rho: {correlation_rho:.4f}")
print(f"p-value: {p_value_spearman:.4e}")

Spearman rho: -0.0421
p-value: 2.8860e-05


For average temperature:

* H0: There is no correlation between tarsus length and average temperature.
* H1: There is a correlation between tarsus length and average temperature.

In [18]:
correlation_rho, p_value_spearman = spearmanr(df_clean['Mean.Temperature'], df_clean['Tarsus.Length'])

print(f"Spearman rho: {correlation_rho:.4f}")
print(f"p-value: {p_value_spearman:.4e}")

Spearman rho: -0.1032
p-value: 8.6557e-25


For minimum, maximum, and average temperature, p << 0.05, thus the null hypotheses are rejected. Overall, there is a statistically significant relationship between tarsus length and temperature. To illustrate the relationships, scatter plots are created:

In [19]:
color_map = {
    'Min.Temperature': 'blue',
    'Max.Temperature': 'red',
    'Mean.Temperature': 'green'
}

fig = make_subplots(
    rows=1, cols=3,
    subplot_titles=("Minimum Temperature", "Maximum Temperature", "Average Temperature"),
    horizontal_spacing=0.05
)

for i, var in enumerate(['Min.Temperature', 'Max.Temperature', 'Mean.Temperature']):
    x_data = df_clean[var]
    y_data = df_clean['Log_Tarsus']

    rho, p_val = spearmanr(x_data, y_data)
    p_text = "< 0.001" if p_val < 0.001 else f"{p_val:.3f}"

    fig.add_trace(
        go.Scatter(
            x=x_data, y=y_data, 
            mode='markers', 
            name='Data',
            marker=dict(size=3, color=color_map[var], opacity=0.3),
            showlegend=False
        ),
        row=1, col=i+1
    )

    m, b = np.polyfit(x_data, y_data, 1)
    x_line = np.linspace(x_data.min(), x_data.max(), 100)
    y_line = m * x_line + b
    
    fig.add_trace(
        go.Scatter(
            x=x_line, y=y_line, 
            mode='lines', 
            name='Trend',
            line=dict(color='black', width=2),
            showlegend=False
        ),
        row=1, col=i+1
    )

    fig.update_xaxes(title_text="Temperature (°C)", row=1, col=i+1)

fig.update_yaxes(title_text="Log10(Tarsus Length)", row=1, col=1)
fig.update_layout(
    title_x=0.5,
    title_text="Log10(Tarsus Length) vs Temperature Variables",
    height=500,
    width=1200,
    template="plotly_white"
)

fig.show()

## **3. Tail Size (Tail Length)**

First of all, distribution of tail length is explored using histograms, as well as skewness and kurtosis. For the raw data: 

In [20]:
# 1. Raw Data Plot (Histogram + Box)
fig1 = px.histogram(
    df, 
    x='Tail.Length', 
    title='Raw Tail Length Distribution', 
    nbins=50, 
    marginal='box',
    color_discrete_sequence=['salmon']
)
fig1.update_layout(title_x=0.5, xaxis_title="Tail Length (mm)", yaxis_title="Count", template="plotly_white")
fig1.show()

The data is extremely right-skewed. In addition, there are 13 outliers that disrupt the visualization (>535 mm). To approach a biologically meaningful distribution and improve the visualization, log-transformation is applied. Additionally, the outliers are excluded since they make up a negligible fraction of the dataset (13/9878). Skewness and kurtosis are calculated.

In [21]:
# 2. Filter & Log Transform
df_clean = df[(df['Tail.Length'] > 0.1) & (df['Tail.Length'] < 535)].copy()
df_clean['Log_Tail'] = np.log10(df_clean['Tail.Length'])

# 3. Log Data Plot (Histogram + Box)
fig2 = px.histogram(
    df_clean, 
    x='Log_Tail', 
    title='Log10(Tail Length) Distribution', 
    nbins=50, 
    marginal='box',
    color_discrete_sequence=['salmon']
)

s = skew(df_clean['Log_Tail'])
k = kurtosis(df_clean['Log_Tail'])

fig2.add_annotation(
    text=f"Skewness: {s:.4f} | Kurtosis: {k:.4f}",
    xref="paper", yref="paper",
    x=0.95, y=1.1,
    showarrow=False,
    font=dict(size=12)
)

fig2.update_layout(title_x=0.5, xaxis_title="Log10(Tail Length) (Log10(mm))", yaxis_title="Count", template="plotly_white")
fig2.show()

Both skewness and kurtosis fall between -1 and 1, thus the distribution is safe to assume to be normal.

### **3.1. Tail Length and Temperature**

Since temperature is not normally distributed, Spearman's rank correlation coefficient is used.

For minimum temperature:

* H0: There is no correlation between tail length and minimum temperature.
* H1: There is a correlation between tail length and minimum temperature. 

In [22]:
correlation_rho, p_value_spearman = spearmanr(df_clean['Min.Temperature'], df_clean['Tail.Length'])

print(f"Spearman rho: {correlation_rho:.4f}")
print(f"p-value: {p_value_spearman:.4e}")

Spearman rho: -0.0636
p-value: 2.6299e-10


For maximum temperature:

* H0: There is no correlation between tail length and maximum temperature.
* H1: There is a correlation between tail length and maximum temperature.

In [23]:
correlation_rho, p_value_spearman = spearmanr(df_clean['Max.Temperature'], df_clean['Tail.Length'])

print(f"Spearman rho: {correlation_rho:.4f}")
print(f"p-value: {p_value_spearman:.4e}")

Spearman rho: 0.0289
p-value: 4.1619e-03


For average temperature:

* H0: There is no correlation between tail length and average temperature.
* H1: There is a correlation between tail length and average temperature.

In [24]:
correlation_rho, p_value_spearman = spearmanr(df_clean['Mean.Temperature'], df_clean['Tail.Length'])

print(f"Spearman rho: {correlation_rho:.4f}")
print(f"p-value: {p_value_spearman:.4e}")

Spearman rho: -0.0295
p-value: 3.3999e-03


For minimum, maximum, and average temperature, p << 0.05, thus the null hypotheses are rejected. Overall, there is a statistically significant relationship between tail length and temperature. To illustrate the relationships, scatter plots are created:

In [25]:
color_map = {
    'Min.Temperature': 'blue',
    'Max.Temperature': 'red',
    'Mean.Temperature': 'green'
}

fig = make_subplots(
    rows=1, cols=3,
    subplot_titles=("Minimum Temperature", "Maximum Temperature", "Average Temperature"),
    horizontal_spacing=0.05
)

for i, var in enumerate(['Min.Temperature', 'Max.Temperature', 'Mean.Temperature']):
    x_data = df_clean[var]
    y_data = df_clean['Log_Tail']

    rho, p_val = spearmanr(x_data, y_data)
    p_text = "< 0.001" if p_val < 0.001 else f"{p_val:.3f}"

    fig.add_trace(
        go.Scatter(
            x=x_data, y=y_data, 
            mode='markers', 
            name='Data',
            marker=dict(size=3, color=color_map[var], opacity=0.3),
            showlegend=False
        ),
        row=1, col=i+1
    )

    m, b = np.polyfit(x_data, y_data, 1)
    x_line = np.linspace(x_data.min(), x_data.max(), 100)
    y_line = m * x_line + b
    
    fig.add_trace(
        go.Scatter(
            x=x_line, y=y_line, 
            mode='lines', 
            name='Trend',
            line=dict(color='black', width=2),
            showlegend=False
        ),
        row=1, col=i+1
    )

    fig.update_xaxes(title_text="Temperature (°C)", row=1, col=i+1)

fig.update_yaxes(title_text="Log10(Tail Length)", row=1, col=1)
fig.update_layout(
    title_x=0.5,
    title_text="Log10(Tail Length) vs Temperature Variables",
    height=500,
    width=1200,
    template="plotly_white"
)

fig.show()

## **4. Beak Size (Beak Length)**

First of all, distribution of beak length is explored using histograms, as well as skewness and kurtosis. For the raw data: 

In [26]:
# 1. Raw Data Plot (Histogram + Box)
fig1 = px.histogram(
    df, 
    x='Beak.Length_Culmen', 
    title='Raw Beak Length Distribution', 
    nbins=50, 
    marginal='box',
    color_discrete_sequence=['salmon']
)
fig1.update_layout(title_x=0.5, xaxis_title="Beak Length (mm)", yaxis_title="Count", template="plotly_white")
fig1.show()

The data is extremely right-skewed. To approach a biologically meaningful distribution and improve the visualization, log-transformation is applied. Skewness and kurtosis are calculated.

In [27]:
# 2. Filter & Log Transform
df_clean['Log_Beak'] = np.log10(df_clean['Beak.Length_Culmen'])

# 3. Log Data Plot (Histogram + Box)
fig2 = px.histogram(
    df_clean, 
    x='Log_Beak', 
    title='Log10(Beak Length) Distribution', 
    nbins=50, 
    marginal='box',
    color_discrete_sequence=['salmon']
)

s = skew(df_clean['Log_Beak'])
k = kurtosis(df_clean['Log_Beak'])

fig2.add_annotation(
    text=f"Skewness: {s:.4f} | Kurtosis: {k:.4f}",
    xref="paper", yref="paper",
    x=0.95, y=1.1,
    showarrow=False,
    font=dict(size=12)
)

fig2.update_layout(title_x=0.5, xaxis_title="Log10(Beak Length) (Log10(mm))", yaxis_title="Count", template="plotly_white")
fig2.show()

Since kurtosis > 2, it is not safe to assume normal distribution. Therefore, log transformation is kept only for visualization.

### **4.1. Beak Length and Temperature**

Since neither beak length nor temperature is normally distributed, Spearman's rank correlation coefficient is used.

For minimum temperature:

* H0: There is no correlation between beak length and minimum temperature.
* H1: There is a correlation between beak length and minimum temperature. 

In [28]:
correlation_rho, p_value_spearman = spearmanr(df_clean['Min.Temperature'], df_clean['Beak.Length_Culmen'])

print(f"Spearman rho: {correlation_rho:.4f}")
print(f"p-value: {p_value_spearman:.4e}")

Spearman rho: 0.0509
p-value: 4.2710e-07


For maximum temperature:

* H0: There is no correlation between tail length and maximum temperature.
* H1: There is a correlation between tail length and maximum temperature.

In [29]:
correlation_rho, p_value_spearman = spearmanr(df_clean['Max.Temperature'], df_clean['Beak.Length_Culmen'])

print(f"Spearman rho: {correlation_rho:.4f}")
print(f"p-value: {p_value_spearman:.4e}")

Spearman rho: 0.0265
p-value: 8.4965e-03


For average temperature:

* H0: There is no correlation between tail length and average temperature.
* H1: There is a correlation between tail length and average temperature.

In [30]:
correlation_rho, p_value_spearman = spearmanr(df_clean['Mean.Temperature'], df_clean['Beak.Length_Culmen'])

print(f"Spearman rho: {correlation_rho:.4f}")
print(f"p-value: {p_value_spearman:.4e}")

Spearman rho: 0.0478
p-value: 2.0996e-06


For minimum, maximum, and average temperature, p << 0.05, thus the null hypotheses are rejected. Overall, there is a statistically significant relationship between beak length and temperature. To illustrate the relationships, scatter plots are created:

In [31]:
color_map = {
    'Min.Temperature': 'blue',
    'Max.Temperature': 'red',
    'Mean.Temperature': 'green'
}

fig = make_subplots(
    rows=1, cols=3,
    subplot_titles=("Minimum Temperature", "Maximum Temperature", "Average Temperature"),
    horizontal_spacing=0.05
)

for i, var in enumerate(['Min.Temperature', 'Max.Temperature', 'Mean.Temperature']):
    x_data = df_clean[var]
    y_data = df_clean['Log_Beak']

    rho, p_val = spearmanr(x_data, y_data)
    p_text = "< 0.001" if p_val < 0.001 else f"{p_val:.3f}"

    fig.add_trace(
        go.Scatter(
            x=x_data, y=y_data, 
            mode='markers', 
            name='Data',
            marker=dict(size=3, color=color_map[var], opacity=0.3),
            showlegend=False
        ),
        row=1, col=i+1
    )

    m, b = np.polyfit(x_data, y_data, 1)
    x_line = np.linspace(x_data.min(), x_data.max(), 100)
    y_line = m * x_line + b
    
    fig.add_trace(
        go.Scatter(
            x=x_line, y=y_line, 
            mode='lines', 
            name='Trend',
            line=dict(color='black', width=2),
            showlegend=False
        ),
        row=1, col=i+1
    )

    fig.update_xaxes(title_text="Temperature (°C)", row=1, col=i+1)

fig.update_yaxes(title_text="Log10(Beak Length)", row=1, col=1)
fig.update_layout(
    title_x=0.5,
    title_text="Log10(Beak Length) vs Temperature Variables",
    height=500,
    width=1200,
    template="plotly_white"
)

fig.show()

### **4.2. Beak Length and Precipitation**

Since beak length is not normally distributed, Spearman test is used, in which

* H0: There is no monotonic relationship between beak length and precipitation.
* H1: There is a monotonic relationship between beak length and precipitation.

In [32]:
correlation_rho, p_value_spearman = spearmanr(df['Mean.Precipitation'], df['Beak.Length_Culmen'])

print(f"Spearman rho: {correlation_rho:.4f}")
print(f"p-value: {p_value_spearman:.4e}")

Spearman rho: 0.0499
p-value: 7.1455e-07


Since p < 0.05, the null hypothesis (H0) is rejected. There is a statistically significant relationship between beak length and precipitation. To illustrate the relationship, a scatter plot is created.

In [33]:
df_clean['Sqrt_Precip'] = np.sqrt(df_clean['Mean.Precipitation'])
plot_data = df_clean[['Sqrt_Precip', 'Log_Beak']]

corr, p_val = pearsonr(df_clean['Log_Beak'], df_clean['Sqrt_Precip'])
p_text = "< 0.001" if p_val < 0.001 else f"{p_val:.3f}"

fig = px.scatter(
    plot_data, 
    x='Sqrt_Precip', 
    y='Log_Beak', 
    trendline='ols', 
    trendline_color_override='black',
    title='Log10(Beak Length) vs Sqrt(Precipitation)',
    labels={'Sqrt_Precip': 'Sqrt(Precipitation) (Sqrt(mm))', 'Log_Beak': 'Log10(Beak Length) (Log(mm))'},
    opacity=0.3,
    template='plotly_white'
)

fig.update_layout(title_x=0.5)

fig.show()

## **5. Wing Size and Shape (Wing Length and Hand-Wing Index)**

### **5.1. Wing Length and Temperature**

First of all, distribution of wing length is explored using histograms, as well as skewness and kurtosis. For the raw data: 

In [34]:
# 1. Raw Data Plot (Histogram + Box)
fig1 = px.histogram(
    df, 
    x='Wing.Length', 
    title='Raw Wing Length Distribution', 
    nbins=50, 
    marginal='box',
    color_discrete_sequence=['salmon']
)
fig1.update_layout(title_x=0.5, xaxis_title="Wing Length (mm)", yaxis_title="Count", template="plotly_white")
fig1.show()

The data is extremely right-skewed. In addition, there are 10 outliers that disrupt the visualization (>650 mm). To approach a biologically meaningful distribution and improve the visualization, log-transformation is applied. Additionally, the outliers are excluded since they make up a negligible fraction of the dataset (10/9878). Skewness and kurtosis are calculated.

In [35]:
# 2. Filter & Log Transform
df_clean = df[(df['Wing.Length'] > 0.1) & (df['Wing.Length'] < 650)].copy()
df_clean['Log_Wing'] = np.log10(df_clean['Wing.Length'])

# 3. Log Data Plot (Histogram + Box)
fig2 = px.histogram(
    df_clean, 
    x='Log_Wing', 
    title='Log10(Wing Length) Distribution', 
    nbins=50, 
    marginal='box',
    color_discrete_sequence=['salmon']
)

s = skew(df_clean['Log_Wing'])
k = kurtosis(df_clean['Log_Wing'])

fig2.add_annotation(
    text=f"Skewness: {s:.4f} | Kurtosis: {k:.4f}",
    xref="paper", yref="paper",
    x=0.95, y=1.1,
    showarrow=False,
    font=dict(size=12)
)

fig2.update_layout(title_x=0.5, xaxis_title="Log10(Wing Length) (Log10(mm))", yaxis_title="Count", template="plotly_white")
fig2.show()

Both skewness and kurtosis fall between -1 and 1, thus the distribution is safe to assume to be normal.

Since temperature is not normally distributed, Spearman's rank correlation coefficient is used.

For minimum temperature:

* H0: There is no correlation between wing length and minimum temperature.
* H1: There is a correlation between wing length and minimum temperature. 

In [36]:
correlation_rho, p_value_spearman = spearmanr(df_clean['Min.Temperature'], df_clean['Wing.Length'])

print(f"Spearman rho: {correlation_rho:.4f}")
print(f"p-value: {p_value_spearman:.4e}")

Spearman rho: -0.0643
p-value: 1.6197e-10


For maximum temperature:

* H0: There is no correlation between wing length and maximum temperature.
* H1: There is a correlation between wing length and maximum temperature.

In [37]:
correlation_rho, p_value_spearman = spearmanr(df_clean['Max.Temperature'], df_clean['Wing.Length'])

print(f"Spearman rho: {correlation_rho:.4f}")
print(f"p-value: {p_value_spearman:.4e}")

Spearman rho: 0.0313
p-value: 1.9102e-03


For average temperature:

* H0: There is no correlation between wing length and average temperature.
* H1: There is a correlation between wing length and average temperature.

In [38]:
correlation_rho, p_value_spearman = spearmanr(df_clean['Mean.Temperature'], df_clean['Wing.Length'])

print(f"Spearman rho: {correlation_rho:.4f}")
print(f"p-value: {p_value_spearman:.4e}")

Spearman rho: -0.0317
p-value: 1.6321e-03


For minimum, maximum, and average temperature, p << 0.05, thus the null hypotheses are rejected. Overall, there is a statistically significant relationship between wing length and temperature. To illustrate the relationships, scatter plots are created:

In [39]:
color_map = {
    'Min.Temperature': 'blue',
    'Max.Temperature': 'red',
    'Mean.Temperature': 'green'
}

fig = make_subplots(
    rows=1, cols=3,
    subplot_titles=("Minimum Temperature", "Maximum Temperature", "Average Temperature"),
    horizontal_spacing=0.05
)

for i, var in enumerate(['Min.Temperature', 'Max.Temperature', 'Mean.Temperature']):
    x_data = df_clean[var]
    y_data = df_clean['Log_Wing']

    rho, p_val = spearmanr(x_data, y_data)
    p_text = "< 0.001" if p_val < 0.001 else f"{p_val:.3f}"

    fig.add_trace(
        go.Scatter(
            x=x_data, y=y_data, 
            mode='markers', 
            name='Data',
            marker=dict(size=3, color=color_map[var], opacity=0.3),
            showlegend=False
        ),
        row=1, col=i+1
    )

    m, b = np.polyfit(x_data, y_data, 1)
    x_line = np.linspace(x_data.min(), x_data.max(), 100)
    y_line = m * x_line + b
    
    fig.add_trace(
        go.Scatter(
            x=x_line, y=y_line, 
            mode='lines', 
            name='Trend',
            line=dict(color='black', width=2),
            showlegend=False
        ),
        row=1, col=i+1
    )

    fig.update_xaxes(title_text="Temperature (°C)", row=1, col=i+1)

fig.update_yaxes(title_text="Log10(Wing Length)", row=1, col=1)
fig.update_layout(
    title_x=0.5,
    title_text="Log10(Wing Length) vs Temperature Variables",
    height=500,
    width=1200,
    template="plotly_white"
)

fig.show()

### **5.2. Hand-Wing Index (HWI) and Temperature**

First of all, distribution of HWI is explored using histograms, as well as skewness and kurtosis. For the raw data:

In [40]:
# 1. Raw Data Plot (Histogram + Box)
fig1 = px.histogram(
    df, 
    x='Hand-Wing.Index', 
    title='Raw HWI Distribution', 
    nbins=50, 
    marginal='box',
    color_discrete_sequence=['salmon']
)
fig1.update_layout(title_x=0.5, xaxis_title="HWI", yaxis_title="Count", template="plotly_white")
fig1.show()

The data is right-skewed.To approach a biologically meaningful distribution and improve the visualization, log-transformation is applied. Skewness and kurtosis are calculated.

In [41]:
# 2. Filter & Log Transform
df_clean = df[ 3 < (df['Hand-Wing.Index'])].copy()
df_clean['Log_HWI'] = np.log10(df_clean['Hand-Wing.Index'])

# 3. Log Data Plot (Histogram + Box)
fig2 = px.histogram(
    df_clean, 
    x='Log_HWI', 
    title='Log10(HWI) Distribution', 
    nbins=50, 
    marginal='box',
    color_discrete_sequence=['salmon']
)

s = skew(df_clean['Log_HWI'])
k = kurtosis(df_clean['Log_HWI'])

fig2.add_annotation(
    text=f"Skewness: {s:.4f} | Kurtosis: {k:.4f}",
    xref="paper", yref="paper",
    x=0.95, y=1.1,
    showarrow=False,
    font=dict(size=12)
)

fig2.update_layout(title_x=0.5, xaxis_title="Log10(Log_HWI)", yaxis_title="Count", template="plotly_white")
fig2.show()

Both skewness and kurtosis fall between -1 and 1, thus the distribution is safe to assume to be normal.

Since temperature is not normally distributed, Spearman's rank correlation coefficient is used.

For minimum temperature:

* H0: There is no correlation between HWI and minimum temperature.
* H1: There is a correlation between HWI and minimum temperature. 

In [42]:
correlation_rho, p_value_spearman = spearmanr(df_clean['Min.Temperature'], df_clean['Hand-Wing.Index'])

print(f"Spearman rho: {correlation_rho:.4f}")
print(f"p-value: {p_value_spearman:.4e}")

Spearman rho: -0.1690
p-value: 4.3279e-64


For maximum temperature:

* H0: There is no correlation between HWI and maximum temperature.
* H1: There is a correlation between HWI and maximum temperature.

In [43]:
correlation_rho, p_value_spearman = spearmanr(df_clean['Max.Temperature'], df_clean['Hand-Wing.Index'])

print(f"Spearman rho: {correlation_rho:.4f}")
print(f"p-value: {p_value_spearman:.4e}")

Spearman rho: -0.0004
p-value: 9.6946e-01


For average temperature:

* H0: There is no correlation between wing length and average temperature.
* H1: There is a correlation between wing length and average temperature.

In [44]:
correlation_rho, p_value_spearman = spearmanr(df_clean['Mean.Temperature'], df_clean['Hand-Wing.Index'])

print(f"Spearman rho: {correlation_rho:.4f}")
print(f"p-value: {p_value_spearman:.4e}")

Spearman rho: -0.1244
p-value: 2.7667e-35


For minimum and average temperature, p < 0.05, thus the null hypotheses are rejected. There is a statistically significant relationship between mass and minimum temperature, as well as average temperature. However, for maximum temperature, p > 0.05, thus the null hypothesis is failed to reject. To illustrate the relationships, scatter plots are created:

In [45]:
color_map = {
    'Min.Temperature': 'blue',
    'Max.Temperature': 'red',
    'Mean.Temperature': 'green'
}

fig = make_subplots(
    rows=1, cols=3,
    subplot_titles=("Minimum Temperature", "Maximum Temperature", "Average Temperature"),
    horizontal_spacing=0.05
)

for i, var in enumerate(['Min.Temperature', 'Max.Temperature', 'Mean.Temperature']):
    x_data = df_clean[var]
    y_data = df_clean['Log_HWI']

    rho, p_val = spearmanr(x_data, y_data)
    p_text = "< 0.001" if p_val < 0.001 else f"{p_val:.3f}"

    fig.add_trace(
        go.Scatter(
            x=x_data, y=y_data, 
            mode='markers', 
            name='Data',
            marker=dict(size=3, color=color_map[var], opacity=0.3),
            showlegend=False
        ),
        row=1, col=i+1
    )

    m, b = np.polyfit(x_data, y_data, 1)
    x_line = np.linspace(x_data.min(), x_data.max(), 100)
    y_line = m * x_line + b
    
    fig.add_trace(
        go.Scatter(
            x=x_line, y=y_line, 
            mode='lines', 
            name='Trend',
            line=dict(color='black', width=2),
            showlegend=False
        ),
        row=1, col=i+1
    )

    fig.update_xaxes(title_text="Temperature (°C)", row=1, col=i+1)

fig.update_yaxes(title_text="Log10(HWI)", row=1, col=1)
fig.update_layout(
    title_x=0.5,
    title_text="Log10(HWI) vs Temperature Variables",
    height=500,
    width=1200,
    template="plotly_white"
)

fig.show()