In [1]:
import pandas as pd

# Training metadata, addressed relative to the notebook through the "../Data"
# folder instead of an absolute, machine-specific path, so the notebook can be
# re-run from any checkout of the project.
TRAIN_CSV_PATH = "../Data/train.csv"

# Only these metadata columns are loaded from the CSV.
METADATA_COLUMNS = [
    'ebird_code', 'channels', 'duration', 'filename', 'species',
    'secondary_labels', 'sci_name', 'sampling_rate', 'bitrate_of_mp3',
    'file_type',
]

# Read the CSV file into a DataFrame
df = pd.read_csv(TRAIN_CSV_PATH, usecols=METADATA_COLUMNS)

# Print the names of the loaded columns as a sanity check
print(df.columns)

Index(['ebird_code', 'channels', 'duration', 'filename', 'species',
       'secondary_labels', 'sci_name', 'sampling_rate', 'bitrate_of_mp3',
       'file_type'],
      dtype='object')


In [2]:
# Keep only rows whose file_type is 'mp3'. The column is constant afterwards,
# so it is dropped. A non-inplace drop returns a new frame, which avoids
# mutating a filtered slice in place (a SettingWithCopyWarning hazard).
df = df[df['file_type'] == 'mp3']
print(df.shape[0])
df = df.drop(columns='file_type')

# Keep only rows whose secondary_labels value is the literal string '[]',
# then drop that (now constant) column as well.
df = df[df['secondary_labels'] == '[]']
df = df.drop(columns='secondary_labels')
print(df.shape[0])

21367
13144


In [3]:
# Preview the first five rows of the filtered metadata
df.head(n=5)

Unnamed: 0,ebird_code,channels,duration,filename,species,sci_name,sampling_rate,bitrate_of_mp3
1,aldfly,2 (stereo),36,XC135454.mp3,Alder Flycatcher,Empidonax alnorum,44100 (Hz),128000 (bps)
2,aldfly,2 (stereo),39,XC135455.mp3,Alder Flycatcher,Empidonax alnorum,44100 (Hz),128000 (bps)
4,aldfly,2 (stereo),36,XC135457.mp3,Alder Flycatcher,Empidonax alnorum,44100 (Hz),128000 (bps)
5,aldfly,2 (stereo),7,XC135459.mp3,Alder Flycatcher,Empidonax alnorum,44100 (Hz),128000 (bps)
6,aldfly,2 (stereo),45,XC135460.mp3,Alder Flycatcher,Empidonax alnorum,44100 (Hz),128000 (bps)


In [4]:
from tabulate import tabulate
import os
import soundfile as sf

# Folder holding one sub-folder of recordings per ebird_code.
TRAIN_AUDIO_DIR = "../Data/train_audio"

# Thresholds used by the filters below (all in seconds).
MAX_DURATION_MISMATCH = 10          # allowed |measured - listed| duration gap
MIN_CLIP_DURATION = 3               # shortest recording kept
MAX_CLIP_DURATION = 300             # longest recording kept
MIN_SPECIES_TOTAL_DURATION = 3000   # lower bound on a species' summed duration
MAX_SPECIES_TOTAL_DURATION = 4500   # upper bound on a species' summed duration


def read_audio_duration(filepath):
    """Return the length of the audio file at `filepath` in seconds.

    The length is the number of frames divided by the sample rate, as
    reported by soundfile. A missing or unreadable file is reported on
    stdout and yields NaN instead of raising, so one bad recording does not
    abort the whole scan.
    """
    if not os.path.exists(filepath):
        print(filepath, "does not exist")
        return float("nan")
    try:
        with sf.SoundFile(filepath) as f:
            return len(f) / f.samplerate
    except Exception as e:
        print(f"Error reading {filepath}: {e}")
        return float("nan")


# Rows of the report table: [description, number of rows remaining].
# (Deliberately not called `list`, which would shadow the builtin.)
filter_report = []

# Measure every recording. Unmeasurable files get NaN (not None), so the
# column stays numeric and the arithmetic below cannot raise a TypeError;
# NaN fails every comparison, so those rows are removed by the first filter.
df['actual_duration'] = [
    read_audio_duration(os.path.join(TRAIN_AUDIO_DIR, code, name))
    for code, name in zip(df['ebird_code'], df['filename'])
]

# Per-species totals: summed measured duration and number of recordings.
# NOTE: these are computed BEFORE the per-file filters below, so they describe
# every recording of the species present at this point, not only the survivors.
summary_stats = (
    df.assign(**{'File Count': 1})
      .groupby('ebird_code')
      .agg({'actual_duration': 'sum', 'File Count': 'count'})
      .reset_index()
      .rename(columns={'actual_duration': 'Total_duration'})
)
df = pd.merge(df, summary_stats, on='ebird_code', how='left')

filter_report.append(["Initial Dataset Shape", df.shape[0]])

# 1) Measured duration must agree with the listed duration.
df = df[(df['actual_duration'] - df['duration']).abs() <= MAX_DURATION_MISMATCH]
filter_report.append(["Shape after duration filter", df.shape[0]])

# 2) Recording length must fall inside the accepted range.
df = df[(df['actual_duration'] >= MIN_CLIP_DURATION) & (df['actual_duration'] <= MAX_CLIP_DURATION)]
filter_report.append(["Shape after actual duration range filter", df.shape[0]])

# 3) The species' total duration must fall inside the accepted range.
df = df[(df['Total_duration'] >= MIN_SPECIES_TOTAL_DURATION) & (df['Total_duration'] <= MAX_SPECIES_TOTAL_DURATION)]
filter_report.append(["Shape after total duration range filter", df.shape[0]])

print(tabulate(filter_report, tablefmt='fancy_grid'))

╒══════════════════════════════════════════╤═══════╕
│ Initial Dataset Shape                    │ 13144 │
├──────────────────────────────────────────┼───────┤
│ Shape after duration filter              │ 13100 │
├──────────────────────────────────────────┼───────┤
│ Shape after actual duration range filter │ 12613 │
├──────────────────────────────────────────┼───────┤
│ Shape after total duration range filter  │  3069 │
╘══════════════════════════════════════════╧═══════╛


In [5]:
# Preview the first ten rows, including the duration and count columns
df.head(n=10)

Unnamed: 0,ebird_code,channels,duration,filename,species,sci_name,sampling_rate,bitrate_of_mp3,actual_duration,Total_duration,File Count
322,amered,2 (stereo),21,XC125512.mp3,American Redstart,Setophaga ruticilla,44100 (Hz),128000 (bps),22.079138,4041.620156,73
323,amered,2 (stereo),13,XC134496.mp3,American Redstart,Setophaga ruticilla,44100 (Hz),128000 (bps),14.139297,4041.620156,73
324,amered,2 (stereo),74,XC134499.mp3,American Redstart,Setophaga ruticilla,44100 (Hz),128000 (bps),74.372971,4041.620156,73
325,amered,2 (stereo),72,XC135440.mp3,American Redstart,Setophaga ruticilla,44100 (Hz),128000 (bps),72.85551,4041.620156,73
326,amered,2 (stereo),32,XC135462.mp3,American Redstart,Setophaga ruticilla,44100 (Hz),128000 (bps),32.602608,4041.620156,73
327,amered,2 (stereo),18,XC135463.mp3,American Redstart,Setophaga ruticilla,44100 (Hz),128000 (bps),18.837392,4041.620156,73
328,amered,2 (stereo),12,XC135466.mp3,American Redstart,Setophaga ruticilla,44100 (Hz),128000 (bps),12.780862,4041.620156,73
329,amered,2 (stereo),23,XC135468.mp3,American Redstart,Setophaga ruticilla,44100 (Hz),128000 (bps),23.725102,4041.620156,73
330,amered,1 (mono),64,XC137604.mp3,American Redstart,Setophaga ruticilla,48000 (Hz),110260 (bps),64.948521,4041.620156,73
331,amered,1 (mono),11,XC137610.mp3,American Redstart,Setophaga ruticilla,48000 (Hz),128000 (bps),12.131813,4041.620156,73


In [6]:
# Distinct ebird codes still present in df: first how many, then which ones
remaining_codes = df['ebird_code'].unique()
print(len(remaining_codes))
print(remaining_codes)

49
['amered' 'amewoo' 'bewwre' 'bkhgro' 'bktspa' 'brespa' 'brncre' 'buggna'
 'buhvir' 'carwre' 'casvir' 'chispa' 'comred' 'comter' 'comyel' 'eastow'
 'foxspa' 'gnttow' 'gockin' 'gocspa' 'grtgra' 'hoowar' 'houfin' 'indbun'
 'lesgol' 'linspa' 'logshr' 'marwre' 'norcar' 'norpar' 'norwat' 'orcwar'
 'pinwar' 'purfin' 'redcro' 'reevir1' 'renpha' 'rocwre' 'ruckin' 'spotow'
 'stejay' 'tuftit' 'veery' 'westan' 'whbnut' 'whcspa' 'whtspa' 'woothr'
 'yerwar']


In [7]:
# Keep rows whose 'File Count' is strictly greater than 50 and strictly less than 80
df_to_train = df.loc[df['File Count'].gt(50) & df['File Count'].lt(80)]

In [8]:
# Distinct ebird codes present in df_to_train: first how many, then which ones
training_codes = df_to_train['ebird_code'].unique()
print(len(training_codes))
print(training_codes)

40
['amered' 'amewoo' 'bewwre' 'bkhgro' 'bktspa' 'brespa' 'brncre' 'buggna'
 'chispa' 'comter' 'comyel' 'eastow' 'foxspa' 'gnttow' 'gockin' 'grtgra'
 'hoowar' 'houfin' 'indbun' 'lesgol' 'linspa' 'logshr' 'marwre' 'norcar'
 'norpar' 'norwat' 'orcwar' 'pinwar' 'purfin' 'reevir1' 'renpha' 'stejay'
 'tuftit' 'veery' 'westan' 'whbnut' 'whcspa' 'whtspa' 'woothr' 'yerwar']


In [9]:
import shutil
import numpy as np
# Destination: one sub-folder per copied species is created under this path.
actual_train_audio_path = r'./train_audio/'
# Source: the full set of recordings, one sub-folder per ebird_code.
source_train_audio_path = "../Data/train_audio"
num_input_species = int(input("Enter number of input species: "))


def _existing_species_folders(root):
    """Return the names of the sub-directories currently present under `root`."""
    return [name for name in os.listdir(root) if os.path.isdir(os.path.join(root, name))]


os.makedirs(actual_train_audio_path, exist_ok=True)

# Candidate species in random order (unseeded, so the order differs per run).
species = df_to_train['ebird_code'].unique()
np.random.shuffle(species)

# A species counts as already copied when its folder exists in the
# destination. The destination is listed once up front instead of once per
# loop iteration.
already_copied = set(_existing_species_folders(actual_train_audio_path))
species = [code for code in species if code not in already_copied]

# Copy the recordings of the first `num_input_species` pending species.
# max(..., 0) keeps a negative request from slicing from the end of the list.
for code in species[:max(num_input_species, 0)]:
    filenames = df_to_train.loc[df_to_train['ebird_code'] == code, 'filename'].tolist()
    target_dir = os.path.join(actual_train_audio_path, code)
    os.makedirs(target_dir, exist_ok=True)
    for filename in filenames:
        shutil.copyfile(
            os.path.join(source_train_audio_path, code, filename),
            os.path.join(target_dir, filename),
        )

# Fewer species were pending than requested: report the current folder count.
if len(species) < num_input_species:
    folder_names = _existing_species_folders(actual_train_audio_path)
    print("Currently there are", len(folder_names), "species in the dataset.\nNo more species left to copy.")

Currently there are 40 species in the dataset.
No more species left to copy.
