# EDA on Shaye Annotations

This notebook explores the dataset `shaye_annotations_3.1.26_extended.csv`.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
from sklearn.model_selection import train_test_split

# Plot defaults: seaborn "whitegrid" theme and a 10x6 default figure size.
sns.set_theme(style="whitegrid")
plt.rcParams.update({'figure.figsize': (10, 6)})

## Load Data

In [None]:
# Resolve the annotations CSV under the current user's home directory.
file_path = os.path.expanduser("~/soundbay/shaye_annotations_3.1.26_extended.csv")

# Fail fast: every later cell needs `df`, so a missing file must stop the
# notebook here instead of surfacing later as a confusing NameError.
if not os.path.exists(file_path):
    raise FileNotFoundError(f"File not found: {file_path}")

df = pd.read_csv(file_path)
print("Data loaded successfully.")

## Overview

In [None]:
# Preview the first rows of the loaded annotations.
df.head()

In [None]:
# Print the DataFrame summary (columns, dtypes, non-null counts).
df.info()

In [None]:
# Descriptive statistics using pandas' default `describe()` settings.
df.describe()

## Missing Values

In [None]:
# Count missing entries per column, then display only the columns that have any.
missing_vals = df.isna().sum()
missing_vals.loc[missing_vals.gt(0)]

In [None]:
# Heatmap of the boolean missing-value mask (rows x columns), without a colour bar.
fig, ax = plt.subplots(figsize=(10, 6))
sns.heatmap(df.isnull(), cbar=False, cmap='viridis', ax=ax)
ax.set_title('Missing Values Heatmap')
plt.show()

## Distributions

In [None]:
# Histogram (30 bins) of call_length with a KDE curve overlaid.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(df['call_length'], kde=True, bins=30, ax=ax)
ax.set_title('Distribution of Call Lengths')
ax.set_xlabel('Call Length (seconds)')
plt.show()

In [None]:
# Horizontal bar chart of label frequencies (most frequent first), with the
# count written just to the right of each bar.
if 'label' in df.columns:
    plt.figure(figsize=(10, 6))
    label_order = df['label'].value_counts().index
    ax = sns.countplot(y=df['label'], order=label_order)
    for bar in ax.patches:
        count = bar.get_width()
        y_mid = bar.get_y() + bar.get_height() / 2.
        ax.text(count + 0.01, y_mid, str(int(count)),
                ha='left', va='center', fontsize=12)
    ax.set_title('Count of Labels')
    plt.show()

In [None]:
# Horizontal bar chart of Species frequencies, most frequent first.
# Skipped entirely when the column is not present.
if 'Species' in df.columns:
    plt.figure(figsize=(10, 6))
    species_order = df['Species'].value_counts().index
    ax = sns.countplot(y=df['Species'], order=species_order)
    ax.set_title('Count of Species')
    plt.show()

## Data Cleaning and Train/Validation Split

In [None]:
# Remove the 'Species' column in place.
# errors='ignore' keeps this cell safe to re-run and consistent with the
# 'Species' plot, which already tolerates the column being absent.
df.drop(columns=['Species'], inplace=True, errors='ignore')

In [None]:
# (rows, columns) of the DataFrame at this point.
df.shape

In [None]:
# Count missing values per column.
# NOTE(review): this result is not displayed, and `df_null` is reassigned in
# the following cell to the rows that contain missing values.
df_null = df.isnull().sum()
# TODO: analyze which categories appear among the rows with missing values.


In [None]:
# Select every row that has at least one missing value and preview it.
has_missing = df.isna().any(axis='columns')
df_null = df.loc[has_missing]
df_null.head()


In [None]:
# (rows, columns) of the subset of rows that contain missing values.
df_null.shape

In [None]:
# Drop rows where s3_path is NaN.
df = df.dropna(subset=['s3_path'])
# Keep only rows whose call_length is at most 1.
# NOTE(review): the previous comment said "more than 10 seconds", but the
# threshold in the code is 1 — confirm which value is intended.
df = df[df['call_length'] <= 1]
# TODO: inspect the 5 largest call_length values (not implemented here).

In [None]:
from soundbay.utils.metadata_processing import bg_from_non_overlap_calls

# Partition the annotations by label value: 1 -> positive, 0 -> negative.
df_positive = df.loc[df['label'].eq(1)]
df_negative = df.loc[df['label'].eq(0)]

# Build the background frame from the positive calls only.
# NOTE(review): presumably derives background rows from the gaps between
# calls — confirm against soundbay.utils.metadata_processing.
df_bg = bg_from_non_overlap_calls(df_positive)



In [None]:
# (rows, columns) of the background, negative and positive frames, in that order.
df_bg.shape, df_negative.shape, df_positive.shape

In [None]:
# Stack background, negative and positive rows (in that order) into one frame;
# each source frame's original index values are kept.
all_df = pd.concat(objs=(df_bg, df_negative, df_positive), axis=0)

In [None]:

# Order all rows by ascending begin_time before regenerating background rows.
# NOTE(review): the sort key is begin_time only; rows are not grouped per file
# first — confirm this is what bg_from_non_overlap_calls expects.
sorted_dfs = all_df.sort_values('begin_time', ascending=True)
# NOTE(review): all_df already contains df_bg (itself an output of
# bg_from_non_overlap_calls) — confirm that calling the helper again on the
# combined frame is intended.
newdf = bg_from_non_overlap_calls(sorted_dfs)

# Drop the last 4 characters of each filename.
# Assumes every filename ends in a 4-character extension such as '.wav' — TODO confirm.
newdf['filename'] = newdf['filename'].apply(lambda x: x[:-4])
# Rebuild s3_path as "<s3_path>/<filename>.wav" using the trimmed filename,
# so this line must run after the trimming above.
newdf['s3_path'] = newdf['s3_path'] + '/' + newdf['filename'] + '.wav'
# Cast both columns to str.
newdf['filename'] = newdf['filename'].astype('str')
newdf['s3_path'] = newdf['s3_path'].astype('str')


# One representative label per annotations file: the most frequent label
# (the first mode returned when several labels tie).
group_labels = newdf.groupby('annotations_filename')['label'].agg(lambda x: x.mode()[0])

# Split the annotation files (not the individual rows) 80/20, stratified on the
# per-file label, with a fixed seed for reproducibility.
train_groups, test_groups = train_test_split(
    group_labels.index,
    stratify=group_labels.values,
    test_size=0.2,
    random_state=42
)

# Assign rows by their annotations file, so each file lands entirely in one split.
df_train = newdf[newdf['annotations_filename'].isin(train_groups)].reset_index(drop=True)
df_val = newdf[newdf['annotations_filename'].isin(test_groups)].reset_index(drop=True)

In [None]:
# Report the split sizes explicitly: a bare expression that is not the last
# statement of a cell is never displayed.
print(f"df_val: {df_val.shape}, df_train: {df_train.shape}")

# Save both splits to CSV in /home/ubuntu/soundbay/datasets/shaye_dfs,
# creating the directory first so to_csv cannot fail on a missing folder.
output_dir = '/home/ubuntu/soundbay/datasets/shaye_dfs'
os.makedirs(output_dir, exist_ok=True)

# NOTE(review): the DataFrame index is written as the first CSV column (pandas
# default); pass index=False if downstream readers do not expect it.
df_train.to_csv(os.path.join(output_dir, 'shaye_train_10_1_26.csv'))
df_val.to_csv(os.path.join(output_dir, 'shaye_val_10_1_26.csv'))