In [1]:
import itertools
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

In [2]:
# Location of the AVONET workbook. The default is the machine-specific path this
# notebook was originally written against; set the AVONET_XLSX environment
# variable to point at your own copy instead of editing this cell.
file_path = os.environ.get(
    'AVONET_XLSX',
    'C:\\Users\\Mahika\\Desktop\\NMIMS\\Sem 7\\RL\\project\\ELEData\\ELEData\\TraitData\\AVONET3_EBird.xlsx',
)

# Fail early with an actionable message: every later cell depends on `df`.
if not os.path.isfile(file_path):
    raise FileNotFoundError(
        f"AVONET workbook not found at {file_path!r}. "
        "Set the AVONET_XLSX environment variable to the location of the .xlsx file."
    )

# Read the BirdTree-taxonomy sheet and preview the first rows.
df = pd.read_excel(file_path, sheet_name='AVONET3_BirdTree')
df.head()

FileNotFoundError: [Errno 2] No such file or directory: 'C:\\Users\\Mahika\\Desktop\\NMIMS\\Sem 7\\RL\\project\\ELEData\\ELEData\\TraitData\\AVONET3_EBird.xlsx'

In [None]:
# Show the column labels of the freshly loaded table.
df.columns

In [None]:
# Columns excluded from the analysis.
columns_to_drop = ['Mass.Source', 'Mass.Refs.Other', 'Inference', 'Traits.inferred', 'Reference.species']

# Normalise the labels first so the drop below matches even when a sheet
# header carries stray leading/trailing whitespace.
df.columns = df.columns.str.strip()

# errors='ignore' keeps this cell re-runnable: on a second run the columns are
# already gone and a strict drop would raise KeyError.
df = df.drop(columns=columns_to_drop, errors='ignore')

# Keep only rows that have no missing value in any remaining column.
df = df.dropna()

In [None]:
# (rows, columns) of the table at this point in the notebook.
df.shape

In [None]:
# Strip leading/trailing whitespace from the column labels, then display them.
# NOTE(review): the cleaning cell earlier applies the same strip, so this
# assignment is expected to be a no-op when the cells run in order.
df.columns = df.columns.str.strip()
df.columns

In [None]:
# Distinct families within each order, largest orders first.
families_per_order = (
    df.groupby('Order3')['Family3']
    .nunique()
    .sort_values(ascending=False)
)

colors = plt.cm.tab20.colors

fig, ax = plt.subplots(figsize=(12, 6))
bars = ax.bar(
    families_per_order.index,
    families_per_order.values,
    color=colors[:len(families_per_order)],
)

# Print each bar's count centred just above its top edge.
for rect in bars:
    count = rect.get_height()
    x_centre = rect.get_x() + rect.get_width() / 2
    ax.text(x_centre, count, f'{int(count)}',
            ha='center', va='bottom', fontsize=10, fontweight='bold')

ax.set_ylabel('Number of Families')
ax.set_title('Number of Families per Order')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()

In [None]:
# Distinct species within each family, largest families first.
species_per_family = (
    df.groupby('Family3')['Species3']
    .nunique()
    .sort_values(ascending=False)
)

# tab20 offers 20 colours; repeat them in order when there are more bars than that.
colors = plt.cm.tab20.colors
n_families = len(species_per_family)
if n_families > 20:
    colors = list(itertools.islice(itertools.cycle(colors), n_families))

fig, ax = plt.subplots(figsize=(14, 6))
bars = ax.bar(species_per_family.index, species_per_family.values, color=colors)

# Print each bar's count centred just above its top edge.
for rect in bars:
    count = rect.get_height()
    x_centre = rect.get_x() + rect.get_width() / 2
    ax.text(x_centre, count, f'{int(count)}',
            ha='center', va='bottom', fontsize=9, fontweight='bold')

ax.set_ylabel('Number of Species')
ax.set_title('Number of Species per Family')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()

In [None]:
# Show the current column labels.
df.columns

In [None]:
# Numeric columns bundled by theme: group title -> list of column labels.
# The correlation cells iterate over this mapping in insertion order.
numerical_metric_groups = {
    'Beak Measurements': [
        'Beak.Length_Culmen',
        'Beak.Length_Nares',
        'Beak.Width',
        'Beak.Depth',
    ],
    # Disabled group, kept for reference:
    # 'Tarsus Length': ['Tarsus.Length'],
    'Wing Measurements': [
        'Wing.Length',
        'Kipps.Distance',
        'Secondary1',
        'Hand-Wing.Index',
    ],
    'Tail Length': [
        'Tail.Length',
    ],
    # Disabled group, kept for reference:
    # 'Mass': ['Mass'],
    'Geographical Coordinates': [
        'Min.Latitude',
        'Max.Latitude',
        'Centroid.Latitude',
        'Centroid.Longitude',
    ],
    'Individual Counts': [
        'Total.individuals',
        'Female',
        'Male',
        'Unknown',
        'Complete.measures',
    ],
}

In [None]:
# Columns whose value frequencies are plotted by the frequency-analysis cell.
categorical_columns_for_freq = [
    'Habitat', 'Habitat.Density',
    'Migration',
    'Trophic.Level', 'Trophic.Niche',
    'Primary.Lifestyle',
    'Species.Status',
]

In [None]:
# Print one correlation matrix (as a markdown table) per metric group.
print("\n--- Correlation Analysis for Numerical Metric Groups ---")
for group_name, metrics in numerical_metric_groups.items():
    # Only the group's columns that actually exist in df take part.
    current_numerical_metrics = [name for name in metrics if name in df.columns]

    if current_numerical_metrics:
        numeric_group_df = df[current_numerical_metrics].copy()
        correlation_matrix = numeric_group_df.corr()

        print(f"\nCorrelation Matrix for {group_name}:")
        print(correlation_matrix.to_markdown(numalign="left", stralign="left"))
    else:
        print(f"Skipping {group_name}: No relevant columns found in the DataFrame.")

In [None]:
# Draw one correlation heatmap per metric group.
# Each matrix is recomputed here instead of being read from variables left
# behind by an earlier loop; relying on those leftovers would plot only the
# last group (and raise NameError if that loop never assigned them).
for group_name, metrics in numerical_metric_groups.items():
    # Only the group's columns that actually exist in df take part.
    current_numerical_metrics = [m for m in metrics if m in df.columns]

    # With fewer than two columns the matrix is at most a single cell.
    if len(current_numerical_metrics) < 2:
        print(f"Skipping heatmap for {group_name}: fewer than two columns available.")
        continue

    correlation_matrix = df[current_numerical_metrics].corr()

    # Scale the figure with the number of columns so annotations stay legible.
    plt.figure(figsize=(len(current_numerical_metrics)*1.5 + 2, len(current_numerical_metrics)*1.5))
    sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f", linewidths=.5)
    plt.title(f'Correlation Matrix for {group_name}')
    plt.tight_layout()
    # plt.savefig(f'correlation_heatmap_{group_name.replace(" ", "_").lower()}.png')
    plt.show()

In [None]:
# Bar chart of value frequencies for each configured categorical column.
print("\n--- Frequency Analysis for Categorical Metrics ---")

# Columns that hold numeric codes but are categorical in meaning.
numeric_coded_categoricals = ['Migration', 'Habitat.Density']

for col_name in categorical_columns_for_freq:
    if col_name not in df.columns:
        print(f"Skipping '{col_name}': Column not found in the DataFrame.")
        continue

    column = df[col_name]

    # Accept text columns whether pandas stores them as generic `object` or as a
    # dedicated string dtype; a bare `dtype == 'object'` test misses the latter.
    is_text = pd.api.types.is_object_dtype(column) or pd.api.types.is_string_dtype(column)
    if not ((is_text or col_name in numeric_coded_categoricals) and column.nunique() < 50):
        print(f"\nSkipping frequency barplot for '{col_name}' as it appears to be numerical with many unique values or an unsupported categorical type ({column.nunique()} unique values).")
        continue

    value_counts = column.value_counts()
    n_categories = len(value_counts)

    # Nothing to draw for a column with no non-null values.
    if n_categories == 0:
        print(f"\nSkipping frequency barplot for '{col_name}': no values to count.")
        continue

    # Sample tab20 at evenly spaced positions in [0, 1]; a single category
    # uses position 0 to avoid dividing by zero.
    colors_cat_map = plt.get_cmap('tab20')
    palette = [colors_cat_map(i / (n_categories - 1) if n_categories > 1 else 0) for i in range(n_categories)]

    fig, ax = plt.subplots(figsize=(10, 6))
    sns.barplot(x=value_counts.index, y=value_counts.values, hue=value_counts.index,
                palette=palette, legend=False, ax=ax)

    # Print each bar's count centred just above its top edge.
    for bar in ax.patches:
        ax.text(
            bar.get_x() + bar.get_width() / 2,
            bar.get_height(),
            f'{int(bar.get_height())}',
            ha='center',
            va='bottom',
            fontsize=10,
            color='black'
        )

    ax.set_title(f'Frequency of {col_name}')
    ax.set_xlabel(col_name)
    ax.set_ylabel('Count')
    plt.xticks(rotation=45, ha='right')
    plt.tight_layout()
    plt.show()

**Habitat Density:**\
1 = Dense habitats. Species primarily lives in the lower or middle storey of forest, or in dense thickets, dense shrubland etc.\
2 = Semi-open habitats. Species primarily lives in open shrubland, scattered bushes, parkland, low dry or deciduous forest, thorn forest.\
3 = Open habitats. Species primarily lives in desert, grassland, open water, low shrubs, rocky habitats, seashores, cities. Also applies to species living mainly on top of forest canopy (i.e. mostly in the open).

**Migration:**\
1 = Sedentary.\
2 = Partially migratory, i.e. minority of population migrates long distances, or most of population undergoes short-distance migration, nomadic movements, distinct altitudinal migration, etc.\
3 = Migratory, i.e. majority of population undertakes long-distance migration.