# Sample size

In [1]:
# Enable IPython's autoreload extension so edits to imported modules are
# picked up without restarting the kernel (mode 2: reload all modules before
# each cell execution).
%load_ext autoreload
%autoreload 2

In [None]:
import pandas as pd

# Base directory of the scratch data for this project.
scratch_dir = "~/scratch/birdclef/data/2025"
# Name of the embedding model; it selects the sub-directory read below.
model_name = "Perch"
# Location of the embedding parquet parts that preprocess_data() loads.
embed_dir = f"{scratch_dir}/train_audio-infer-soundscape/{model_name}/parts/embed/"


def preprocess_data(input_path: str, embed_dim: int = 1280) -> pd.DataFrame:
    """Load embeddings from parquet and return one labelled vector per row.

    The parquet data must contain a ``file`` column whose value includes
    ``train_audio/<species>/...`` and one column per embedding dimension,
    named ``"0"``, ``"1"``, ..., ``str(embed_dim - 1)``.

    Args:
        input_path: Path handed to ``pd.read_parquet``.
        embed_dim: Number of embedding columns to pack into each vector.
            Defaults to 1280, the width previously hard-coded here.

    Returns:
        A DataFrame with two columns: ``species_name`` (the path component
        right after ``train_audio/``) and ``embeddings`` (a list of
        ``embed_dim`` values). Species with fewer than 2 rows are dropped.
        The frame may be empty when no species has at least 2 rows.

    Raises:
        IndexError: If a ``file`` value does not contain ``train_audio/``.
        KeyError: If an expected embedding column is missing.
    """
    df = pd.read_parquet(input_path)
    # the species label is the directory directly under "train_audio/"
    df["species_name"] = df["file"].apply(
        lambda x: x.split("train_audio/")[1].split("/")[0]
    )
    # train/test split requires each y label to have at least 2 samples,
    # so remove species with fewer than 2 samples
    species_count = df["species_name"].value_counts()
    valid_species = species_count[species_count >= 2].index
    filtered_df = df[df["species_name"].isin(valid_species)].reset_index(drop=True)
    # pack the per-dimension columns into a single list-valued column
    embed_cols = [str(i) for i in range(embed_dim)]
    filtered_df["embeddings"] = filtered_df[embed_cols].to_numpy().tolist()
    df_embs = filtered_df[["species_name", "embeddings"]].copy()
    print(f"DataFrame shape: {df_embs.shape}")
    # Every vector has exactly len(embed_cols) entries; reporting that instead
    # of inspecting the first row keeps this safe when the result is empty.
    print(f"Embedding size: {len(embed_cols)}")
    return df_embs


# Load the labelled embeddings and preview the first rows.
# `df` is reused by the later sample-size cell as the population.
df = preprocess_data(embed_dir)
df.head(5)

DataFrame shape: (187838, 2)
Embedding size: 1280


Unnamed: 0,species_name,embeddings
0,1139490,"[0.035192787647247314, 0.312198281288147, -0.1..."
1,1139490,"[-0.04997699335217476, -0.03501279279589653, 0..."
2,1139490,"[-0.01223329920321703, -0.025818223133683205, ..."
3,1139490,"[-0.027199307456612587, -0.029882565140724182,..."
4,1139490,"[-0.06566715240478516, -0.08326669782400131, -..."


In [3]:
import math

# SUPPORTED CONFIDENCE LEVELS: 50%, 68%, 90%, 95%, and 99%
# Maps a confidence level (in percent) to its two-sided standard-normal
# z-score, kept to two decimals. The 99% entry is truncated (the exact value
# is about 2.576); it is left as-is so previously reported numbers stay
# reproducible.
confidence_level_constant = {
    50: 0.67,
    68: 0.99,
    90: 1.64,
    95: 1.96,
    99: 2.57,
}


def sample_size(population_size, confidence_level, confidence_interval):
    """Return the sample size needed to estimate a proportion.

    Uses Cochran's formula with the finite population correction:

        n_0 = Z^2 * p * (1 - p) / e^2
        n   = n_0 / (1 + (n_0 - 1) / N)

    with ``p = 0.5``, which maximises ``p * (1 - p)`` and therefore gives the
    most conservative (largest) sample size.

    Args:
        population_size: Size ``N`` of the population; must be positive.
        confidence_level: Confidence level in percent. Must be exactly one of
            the keys of ``confidence_level_constant`` (``95`` and ``95.0`` are
            both accepted).
        confidence_interval: Margin of error in percentage points (``2.0``
            means +/-2%); must be positive.

    Returns:
        The required sample size, rounded up to an int.

    Raises:
        ValueError: If the confidence level is unsupported, or if
            ``population_size`` or ``confidence_interval`` is not positive.
    """
    # Look the level up by value rather than truncating with int(): a level
    # such as 95.7 must be rejected, not silently treated as 95. A float key
    # like 95.0 hashes and compares equal to the int key 95.
    Z = confidence_level_constant.get(float(confidence_level))
    if Z is None:
        raise ValueError(f"Unsupported confidence level: {confidence_level}")
    if population_size <= 0:
        raise ValueError(f"population_size must be positive, got {population_size}")
    if confidence_interval <= 0:
        raise ValueError(
            f"confidence_interval must be positive, got {confidence_interval}"
        )

    p = 0.5
    e = confidence_interval / 100.0
    N = population_size

    # sample size for an infinite population ...
    n_0 = ((Z**2) * p * (1 - p)) / (e**2)
    # ... shrunk by the finite population correction
    n = n_0 / (1 + ((n_0 - 1) / float(N)))
    return int(math.ceil(n))


def compute_sample_size(
    population_size: int,
    confidence_level: float = 95.0,
    confidence_interval: float = 2.0,
):
    """Compute, report and return the required sample size.

    Thin convenience wrapper around ``sample_size`` that also prints the
    result so it shows up in the cell output.

    Args:
        population_size: Number of items in the population.
        confidence_level: Confidence level in percent (default 95.0).
        confidence_interval: Margin of error in percentage points
            (default 2.0).

    Returns:
        The value returned by ``sample_size`` for the given arguments.
    """
    required = sample_size(
        population_size=population_size,
        confidence_level=confidence_level,
        confidence_interval=confidence_interval,
    )
    print(f"SAMPLE SIZE: {required}")
    return required


# Sanity check on a round population of 100,000 using the defaults
# (95% confidence level, 2-point confidence interval).
compute_sample_size(population_size=100000)

SAMPLE SIZE: 2345


2345

In [4]:
# Use every row of the preprocessed embeddings frame as the population.
n_samples = len(df)
compute_sample_size(
    population_size=n_samples,
    confidence_level=95.0,
    confidence_interval=2.0,
)

SAMPLE SIZE: 2371


2371