In [None]:
!pip install torchaudio pandas

import os
import torch
from torch.utils.data import Dataset, DataLoader, SubsetRandomSampler, random_split
import pandas as pd
import torchaudio
import numpy as np
from sklearn.model_selection import train_test_split

In [None]:
# Load the competition metadata tables.
train_metadata = pd.read_csv('/kaggle/input/birdclef-2024/train_metadata.csv')
taxonomy = pd.read_csv('/kaggle/input/birdclef-2024/eBird_Taxonomy_v2021.csv')
sample_submission = pd.read_csv('/kaggle/input/birdclef-2024/sample_submission.csv')

# Preview the first rows of each table, in the order they were loaded.
for preview_frame in (train_metadata, taxonomy, sample_submission):
    print(preview_frame.head())

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
import torchaudio
import pandas as pd
import os

# Read the training metadata; the class count is the number of distinct
# values in its 'primary_label' column.
train_metadata = pd.read_csv('/kaggle/input/birdclef-2024/train_metadata.csv')
primary_labels = train_metadata['primary_label']
num_classes = primary_labels.nunique()

class CNNNetwork(nn.Module):
    """Four-stage convolutional classifier for single-channel 2-D input.

    Each stage is ``Conv2d(3x3, padding=1) -> BatchNorm2d -> ReLU -> MaxPool2d(2)``
    with 32, 64, 128 and 256 output channels respectively. The resulting feature
    map is flattened, passed through dropout (p=0.5) and a linear layer, and
    finally through a softmax over the class dimension.

    Args:
        num_classes: Number of output classes (width of the final linear layer).

    Notes:
        * The classifier is an ``nn.LazyLinear``: it is registered at
          construction time (so it appears in ``parameters()`` and
          ``state_dict()``), and its input width is inferred from the first
          batch that goes through ``forward``. Run one forward pass with a
          representative input before constructing an optimizer.
        * ``forward`` returns probabilities, not logits. ``nn.CrossEntropyLoss``
          applies its own log-softmax, so it should not be fed this output
          directly.
    """

    def __init__(self, num_classes):
        super(CNNNetwork, self).__init__()
        # Keep the class count on the instance instead of reading a global.
        self.num_classes = num_classes
        self.conv1 = self._conv_block(1, 32)
        self.conv2 = self._conv_block(32, 64)
        self.conv3 = self._conv_block(64, 128)
        self.conv4 = self._conv_block(128, 256)
        self.flatten = nn.Flatten()
        self.dropout = nn.Dropout(0.5)
        # in_features is inferred on the first forward pass.
        self.linear = nn.LazyLinear(num_classes)
        self.softmax = nn.Softmax(dim=1)

    @staticmethod
    def _conv_block(in_channels, out_channels):
        """Build one Conv2d -> BatchNorm2d -> ReLU -> MaxPool2d(2) stage."""
        return nn.Sequential(
            nn.Conv2d(in_channels=in_channels, out_channels=out_channels,
                      kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(out_channels),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2)
        )

    def forward(self, input_data):
        """Return per-class probabilities of shape ``(batch, num_classes)``.

        Args:
            input_data: Batch of single-channel inputs, ``(batch, 1, H, W)``.
        """
        x = self.conv1(input_data)
        x = self.conv2(x)
        x = self.conv3(x)
        x = self.conv4(x)
        x = self.flatten(x)
        x = self.dropout(x)

        logits = self.linear(x)
        predictions = self.softmax(logits)
        return predictions

In [None]:
import os
import torch
import torchaudio
import pandas as pd
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn

# Run configuration: data locations, audio settings and training hyper-parameters.
ANNOTATIONS_FILE = '/kaggle/input/birdclef-2024/train_metadata.csv'
AUDIO_DIR = '/kaggle/input/birdclef-2024/train_audio'
# Only recordings from these folders are used.
FOLDER_FILTERS = ['barfly1', 'asbfly', 'bkrfla1', 'brakit1', 'categr']

SAMPLE_RATE = 16_000   # target sample rate every clip is resampled to
NUM_SAMPLES = 16_000   # clip length in samples after cutting / padding

BATCH_SIZE = 32
EPOCHS = 20
LEARNING_RATE = 1e-3

# Prefer the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Dataset definition
class BirdCLEFDataset(Dataset):
    """Fixed-length, transformed audio clips described by a metadata CSV.

    Indexing returns ``(features, label_index)`` where ``features`` is the
    output of ``transformation`` applied to the clip after it has been
    resampled, mixed down to one channel and cut / zero-padded to
    ``num_samples`` samples, and ``label_index`` is the integer assigned to the
    row's ``primary_label``.

    Args:
        annotations_file: Anything ``pandas.read_csv`` accepts; the table must
            provide ``filename`` and ``primary_label`` columns.
        audio_dir: Directory that the ``filename`` values are relative to.
        transformation: Module applied to the prepared waveform; it is moved to
            ``device``.
        target_sample_rate: Sample rate every clip is resampled to.
        num_samples: Number of samples every clip is cut or padded to.
        device: Device the waveform and the transforms are placed on.
        folder_filters: Optional collection of folder names. Only rows whose
            ``filename`` starts with one of these folders (first path
            component, exact match) are kept. ``None`` or an empty collection
            keeps every row.

    Raises:
        ValueError: If ``folder_filters`` is given and no row matches.

    Notes:
        Label indices follow the order in which labels first appear in the
        (filtered) table, so they depend on the row order of the CSV.
        NOTE(review): waveforms are moved to ``device`` inside ``__getitem__``;
        with a CUDA device this is presumably incompatible with DataLoader
        worker processes - confirm before raising ``num_workers``.
    """

    def __init__(self, annotations_file, audio_dir, transformation, target_sample_rate, num_samples, device, folder_filters=None):
        self.annotations = pd.read_csv(annotations_file)
        self.audio_dir = audio_dir
        self.device = device
        self.transformation = transformation.to(self.device)
        self.target_sample_rate = target_sample_rate
        self.num_samples = num_samples
        # Resample modules keyed by source sample rate, built on first use.
        self._resamplers = {}

        if folder_filters is not None and len(folder_filters) > 0:
            folder_filters = list(folder_filters)
            print(f"Applying folder filters: {', '.join(folder_filters)}")
            initial_count = len(self.annotations)
            # Compare the leading folder exactly; a regex substring search
            # would also keep files from folders that merely contain a filter
            # name and would misread regex metacharacters.
            top_folder = self.annotations['filename'].str.split('/').str[0]
            self.annotations = self.annotations[top_folder.isin(folder_filters)]
            filtered_count = len(self.annotations)
            if filtered_count == 0:
                raise ValueError(f"No files found in folders: {', '.join(folder_filters)}")
            print(f"Filtered dataset from {initial_count} to {filtered_count} samples.")

        # Positional lookups (.iloc) below rely on a clean 0..n-1 index.
        self.annotations = self.annotations.reset_index(drop=True)
        self.label_to_index = {label: idx for idx, label in enumerate(self.annotations['primary_label'].unique())}

    def __len__(self):
        """Number of rows kept from the annotations table."""
        return len(self.annotations)

    def __getitem__(self, index):
        """Load, normalise and transform the clip at ``index``.

        Returns:
            Tuple ``(features, label_index)``.
        """
        audio_sample_path = self._get_audio_sample_path(index)
        label = self._get_audio_sample_label(index)
        signal, sr = torchaudio.load(audio_sample_path)
        signal = signal.to(self.device)
        signal = self._resample_if_necessary(signal, sr)
        signal = self._mix_down_if_necessary(signal)
        signal = self._cut_if_necessary(signal)
        signal = self._right_pad_if_necessary(signal)
        signal = self.transformation(signal)
        return signal, label

    def _cut_if_necessary(self, signal):
        """Keep only the first ``num_samples`` samples of a longer signal."""
        if signal.shape[1] > self.num_samples:
            signal = signal[:, :self.num_samples]
        return signal

    def _right_pad_if_necessary(self, signal):
        """Zero-pad a shorter signal on the right up to ``num_samples``."""
        length_signal = signal.shape[1]
        if length_signal < self.num_samples:
            num_missing_samples = self.num_samples - length_signal
            last_dim_padding = (0, num_missing_samples)
            signal = torch.nn.functional.pad(signal, last_dim_padding)
        return signal

    def _resample_if_necessary(self, signal, sr):
        """Resample ``signal`` from ``sr`` to ``target_sample_rate`` if they differ."""
        if sr != self.target_sample_rate:
            resampler = self._resamplers.get(sr)
            if resampler is None:
                # Build the resampler once per source rate instead of per item.
                resampler = torchaudio.transforms.Resample(sr, self.target_sample_rate).to(self.device)
                self._resamplers[sr] = resampler
            signal = resampler(signal)
        return signal

    def _mix_down_if_necessary(self, signal):
        """Average multiple channels into one, keeping the channel dimension."""
        if signal.shape[0] > 1:
            signal = torch.mean(signal, dim=0, keepdim=True)
        return signal

    def _get_audio_sample_path(self, index):
        """Full path of the audio file for row ``index``."""
        filename = self.annotations.iloc[index]['filename']
        path = os.path.join(self.audio_dir, filename)
        return path

    def _get_audio_sample_label(self, index):
        """Integer class index for row ``index``."""
        label = self.annotations.iloc[index]['primary_label']
        label_index = self.label_to_index[label]
        return label_index

# Mel-spectrogram front end; the dataset applies it to every prepared waveform.
mel_spectrogram = torchaudio.transforms.MelSpectrogram(
    sample_rate=SAMPLE_RATE,
    n_mels=64,
    n_fft=1024,
    hop_length=512
)

# Load the annotations, keeping only the folders listed in FOLDER_FILTERS.
dataset = BirdCLEFDataset(ANNOTATIONS_FILE, AUDIO_DIR, mel_spectrogram, SAMPLE_RATE, NUM_SAMPLES, device, folder_filters=FOLDER_FILTERS)

# Split into 80% training / 20% validation. The generator is seeded so the
# same rows land in each split on every run, which keeps validation scores
# comparable between runs.
SPLIT_SEED = 42
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = torch.utils.data.random_split(
    dataset,
    [train_size, val_size],
    generator=torch.Generator().manual_seed(SPLIT_SEED),
)

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)

class CNNNetwork(nn.Module):
    """Small two-convolution classifier that returns raw class scores (logits).

    Data flow: conv(1->32) -> ReLU -> 2x2 max-pool -> conv(32->64) -> ReLU ->
    2x2 max-pool -> flatten -> linear(8192->128) -> ReLU -> linear(128->num_classes).
    The first linear layer is sized for a flattened 64 x 16 x 8 feature map.

    Args:
        num_classes: Number of output scores per sample.
    """

    def __init__(self, num_classes):
        super().__init__()
        flat_features = 64 * 16 * 8  # channels x height x width entering fc1
        self.conv1 = nn.Conv2d(1, 32, kernel_size=3, stride=1, padding=1)
        self.relu = nn.ReLU()
        self.maxpool = nn.MaxPool2d(kernel_size=2)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=3, stride=1, padding=1)
        self.fc1 = nn.Linear(flat_features, 128)
        self.fc2 = nn.Linear(128, num_classes)

    def forward(self, x):
        """Map a ``(batch, 1, H, W)`` input to ``(batch, num_classes)`` scores."""
        features = self.maxpool(self.relu(self.conv1(x)))
        features = self.maxpool(self.relu(self.conv2(features)))
        flat = features.view(len(features), -1)
        hidden = self.relu(self.fc1(flat))
        return self.fc2(hidden)
# Instantiate the model with one output per label known to the dataset,
# then create the loss and the optimizer that train it.
num_classes = len(dataset.label_to_index)
model = CNNNetwork(num_classes=num_classes)
model = model.to(device)
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

# Training routine
def train(dataloader, model, loss_fn, optimizer):
    """Run one optimisation pass over every batch of ``dataloader``.

    Each batch is moved to the module-level ``device``, pushed through
    ``model``, scored with ``loss_fn`` and used for one optimizer step. The
    loss and the number of samples seen so far are printed every 10th batch.

    Args:
        dataloader: Iterable of ``(inputs, targets)`` batches.
        model: Module being trained; switched to training mode.
        loss_fn: Callable returning the loss for ``(predictions, targets)``.
        optimizer: Optimizer holding the model's parameters.
    """
    size = len(dataloader.dataset)
    model.train()
    for step, (inputs, targets) in enumerate(dataloader):
        inputs = inputs.to(device)
        targets = targets.to(device)

        # Forward pass and loss
        batch_loss = loss_fn(model(inputs), targets)

        # Backward pass and parameter update
        optimizer.zero_grad()
        batch_loss.backward()
        optimizer.step()

        if step % 10 != 0:
            continue
        loss = batch_loss.item()
        current = step * len(inputs)
        print(f"loss: {loss:>7f}  [{current:>5d}/{size:>5d}]")

# Evaluation routine
def test(dataloader, model, loss_fn):
    """Evaluate ``model`` on ``dataloader`` and print accuracy and mean loss.

    Batches are moved to the module-level ``device`` and evaluated with
    gradients disabled and the model in evaluation mode.

    Args:
        dataloader: Iterable of ``(inputs, targets)`` batches.
        model: Module to evaluate.
        loss_fn: Callable returning the loss for ``(predictions, targets)``.

    Returns:
        Fraction of correctly classified samples, in ``[0, 1]``. ``nan`` when
        the dataloader holds no samples, because accuracy is undefined then.
    """
    size = len(dataloader.dataset)
    num_batches = len(dataloader)
    model.eval()

    # An empty split (e.g. a very small filtered dataset) would otherwise
    # fail with ZeroDivisionError in the averages below.
    if size == 0 or num_batches == 0:
        print("Test Error: \n no samples to evaluate \n")
        return float("nan")

    test_loss, correct = 0, 0

    with torch.no_grad():
        for X, y in dataloader:
            X, y = X.to(device), y.to(device)
            pred = model(X)
            test_loss += loss_fn(pred, y).item()
            correct += (pred.argmax(1) == y).type(torch.float).sum().item()

    test_loss /= num_batches  # mean of the per-batch losses
    correct /= size           # fraction of correct samples
    print(f"Test Error: \n Accuracy: {(100*correct):>0.1f}%, Avg loss: {test_loss:>8f} \n")
    return correct

# Train for EPOCHS passes over the training data, validating after each one.
for epoch in range(EPOCHS):
    print(f"Epoch {epoch+1}\n-------------------------------")
    train(train_loader, model, loss_fn, optimizer)
    accuracy = test(val_loader, model, loss_fn)
    print(f"Validation Accuracy: {accuracy:.2f}")

# Persist the trained weights (state dict only, not the whole module).
trained_weights = model.state_dict()
torch.save(trained_weights, 'birdclef_cnn_model.pth')

In [None]:
import os
import torch
import torchaudio
import pandas as pd
import torch.nn as nn

class CNNNetwork(nn.Module):
    """Two-convolution classifier producing unnormalised class scores.

    Layers, in order: conv(1->32), ReLU, 2x2 max-pool, conv(32->64), ReLU,
    2x2 max-pool, flatten, linear(64*16*8 -> 128), ReLU, linear(128 -> num_classes).
    Attribute names determine the state-dict keys that saved weights are
    matched against when loading.

    Args:
        num_classes: Number of output scores per sample.
    """

    def __init__(self, num_classes):
        super().__init__()
        conv_options = dict(kernel_size=3, stride=1, padding=1)
        self.conv1 = nn.Conv2d(1, 32, **conv_options)
        self.relu = nn.ReLU()
        self.maxpool = nn.MaxPool2d(kernel_size=2)
        self.conv2 = nn.Conv2d(32, 64, **conv_options)
        self.fc1 = nn.Linear(64 * 16 * 8, 128)
        self.fc2 = nn.Linear(128, num_classes)

    def forward(self, x):
        """Return ``(batch, num_classes)`` scores for a ``(batch, 1, H, W)`` input."""
        for conv in (self.conv1, self.conv2):
            x = self.maxpool(self.relu(conv(x)))
        batch_size = x.size(0)
        x = x.view(batch_size, -1)
        return self.fc2(self.relu(self.fc1(x)))

def process_audio_file(file_path, transformation, target_sample_rate, num_samples, device):
    """Load one audio file and convert it into model input features.

    The waveform is loaded, moved to ``device``, resampled to
    ``target_sample_rate`` when needed, averaged down to a single channel,
    cut or right-padded with zeros to exactly ``num_samples`` samples, and
    finally passed through ``transformation``.

    Args:
        file_path: Path of the audio file to read.
        transformation: Callable applied to the prepared waveform.
        target_sample_rate: Sample rate the waveform is resampled to.
        num_samples: Exact number of samples kept per clip.
        device: Device the waveform and the resampler are placed on.

    Returns:
        The output of ``transformation`` for the prepared one-channel waveform.
    """
    signal, sr = torchaudio.load(file_path)
    signal = signal.to(device)
    if sr != target_sample_rate:
        resampler = torchaudio.transforms.Resample(sr, target_sample_rate).to(device)
        signal = resampler(signal)
    # Mix multi-channel recordings down to mono. Without this a stereo file
    # yields a 2-channel feature map, which a 1-input-channel network rejects.
    if signal.shape[0] > 1:
        signal = torch.mean(signal, dim=0, keepdim=True)
    if signal.shape[1] > num_samples:
        signal = signal[:, :num_samples]
    elif signal.shape[1] < num_samples:
        num_missing_samples = num_samples - signal.shape[1]
        signal = torch.nn.functional.pad(signal, (0, num_missing_samples))
    signal = transformation(signal)
    return signal

def predict(model, input, class_mapping):
    """Return the mapped class for the first sample in ``input``.

    The model is put in evaluation mode and run without gradient tracking;
    the index of the highest score of the first sample is looked up in
    ``class_mapping``.

    Args:
        model: Callable module producing per-class scores for a batch.
        input: Batch passed to ``model``; only its first sample is used.
        class_mapping: Lookup from integer class index to class value.

    Returns:
        ``class_mapping`` entry for the top-scoring class index.
    """
    model.eval()
    with torch.no_grad():
        scores = model(input)
    best_index = torch.argmax(scores[0], dim=0)
    return class_mapping[best_index.item()]

def main():
    """Predict a species for every ``.ogg`` file in the selected folders.

    Rebuilds the label <-> index mapping from the training metadata, loads the
    weights saved in ``birdclef_cnn_model.pth``, runs the model on each audio
    file found under the selected folders and writes one row per file
    (``filename``, ``bird_species``) to ``submission.csv``.

    The mapping is derived from the order in which labels first appear in the
    filtered metadata; it must be built the same way as when the model was
    trained, otherwise predicted indices map to the wrong species.
    """
    # Folders (one per species) to run inference on
    FOLDER_FILTERS = ['barfly1', 'asbfly', 'bkrfla1', 'brakit1', 'categr']

    # Load the metadata to determine the classes the model was trained on
    ANNOTATIONS_FILE = '/kaggle/input/birdclef-2024/train_metadata.csv'
    train_metadata = pd.read_csv(ANNOTATIONS_FILE)
    # Match the leading folder of each filename exactly. A regex substring
    # search would also select files from folders that merely contain one of
    # the names, which would change the class list.
    top_folder = train_metadata['filename'].str.split('/').str[0]
    filtered_metadata = train_metadata[top_folder.isin(FOLDER_FILTERS)]
    num_classes = filtered_metadata['primary_label'].nunique()
    class_mapping = {label: i for i, label in enumerate(filtered_metadata['primary_label'].unique())}
    inv_class_mapping = {v: k for k, v in class_mapping.items()}

    # Build the model
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = CNNNetwork(num_classes=num_classes).to(device)

    # Restore the trained weights.
    # NOTE: torch.load unpickles the file; only load checkpoints you trust.
    state_dict = torch.load('birdclef_cnn_model.pth', map_location=device)
    model.load_state_dict(state_dict)
    model.eval()

    # Feature extraction
    SAMPLE_RATE = 16000
    NUM_SAMPLES = 16000
    mel_spectrogram = torchaudio.transforms.MelSpectrogram(
        sample_rate=SAMPLE_RATE,
        n_mels=64,
        n_fft=1024,
        hop_length=512
    ).to(device)

    # Walk the selected folders and predict a class for every audio file
    audio_dir = '/kaggle/input/birdclef-2024/train_audio'
    results = []

    for folder in FOLDER_FILTERS:
        folder_path = os.path.join(audio_dir, folder)
        if not os.path.exists(folder_path):
            print(f"Folder {folder} does not exist in the audio directory.")
            continue

        print(f"Processing folder: {folder}")  # Log the folder being processed
        for root, dirs, files in os.walk(folder_path):
            # os.walk yields entries in arbitrary order; sort so the rows of
            # the output file come out in the same order on every run.
            dirs.sort()
            for file in sorted(files):
                if not file.endswith('.ogg'):
                    continue
                file_path = os.path.join(root, file)
                print(f"Processing file: {file_path}")  # Log the file being processed
                audio_tensor = process_audio_file(file_path, mel_spectrogram, SAMPLE_RATE, NUM_SAMPLES, device)
                audio_tensor = audio_tensor.unsqueeze(0)  # Add the batch dimension
                predicted_class = predict(model, audio_tensor, inv_class_mapping)
                filename = os.path.basename(file_path)  # Keep only the file name
                results.append({'filename': filename, 'bird_species': predicted_class})
                print(f"Predicted class for {file_path}: {predicted_class}")  # Log the prediction

    # Write the submission file
    submission = pd.DataFrame(results)
    submission.to_csv('submission.csv', index=False)

    print("Fichier de soumission créé : submission.csv")

# Run the inference pipeline when this code is executed as the main program
# (which includes running the cell in a notebook kernel), but not on import.
if __name__ == "__main__":
    main()