In [1]:
import glob
import os
import random
import warnings

import IPython.display as ipd
import librosa
import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchaudio
import torchaudio.transforms as AT
import torchvision
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm

warnings.filterwarnings("ignore")

# Run on the GPU when PyTorch can see one, otherwise on the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Root folder of the raw recordings; one sub-folder per vessel class.
DATA_ROOT = '/data/hrjung/DeepShip_Raw'

# glob.glob() returns matches in arbitrary, filesystem-dependent order.
# The train/val/test split made later is purely positional, so sort the file
# lists to get the same split on every run and on every machine.
tug_path = sorted(glob.glob(os.path.join(DATA_ROOT, 'Tug', '*.wav')))
cargo_path = sorted(glob.glob(os.path.join(DATA_ROOT, 'Cargo', '*.wav')))
passenger_path = sorted(glob.glob(os.path.join(DATA_ROOT, 'Passengership', '*.wav')))
tanker_path = sorted(glob.glob(os.path.join(DATA_ROOT, 'Tanker', '*.wav')))

# Number of recordings found per class (tug, cargo, passenger, tanker).
print(len(tug_path), len(cargo_path), len(passenger_path), len(tanker_path))

69 109 190 237


In [None]:
import numpy as np
import glob

def slice_path(path):
    """Split a sequence positionally into train / validation / test parts.

    The first 70% of the items (rounded down) become the training part, the
    following 20% (rounded down) the validation part, and everything that is
    left over the test part. The input order is preserved; nothing is shuffled.

    Args:
        path: Any sliceable sequence with a length (here: a list of file paths).

    Returns:
        A ``(train, val, test)`` tuple of slices of ``path``.
    """
    total = len(path)
    train_end = int(total * 0.7)
    val_end = train_end + int(total * 0.2)

    return path[:train_end], path[train_end:val_end], path[val_end:]


def _clips_from_waveform(audio, sr, clip_seconds):
    """Turn one loaded waveform into a list of equal-length 1-D clips (may be empty)."""
    # torchaudio returns (channels, samples); reduce to a single 1-D signal.
    if audio.dim() > 1:
        audio = audio.mean(dim=0) if audio.size(0) > 1 else audio[0]
    if audio.numel() == 0:
        return []

    # Peak-normalise by the largest *magnitude*. Dividing by the plain maximum
    # ignores negative peaks and divides by zero for signals that never go positive.
    peak = audio.abs().max()
    if peak == 0:
        # Completely silent recording: nothing to trim or slice.
        return []
    audio = audio / peak

    # Drop the leading / trailing run of exact zeros (empty padding).
    nonzero_indices = torch.nonzero(audio)
    start_idx = nonzero_indices[0].item()
    end_idx = nonzero_indices[-1].item()
    audio = audio[start_idx:end_idx + 1]

    # Keep only whole clips; a shorter remainder at the end is discarded.
    clip_len = int(sr * clip_seconds)
    if clip_len <= 0:
        return []
    n_full = len(audio) // clip_len
    return [audio[k * clip_len:(k + 1) * clip_len] for k in range(n_full)]


def slice_audio(path, clip_seconds=5):
    """Load every file in ``path`` and cut it into fixed-length clips.

    Each recording is reduced to one channel, scaled so that its largest
    absolute sample is 1, stripped of leading/trailing zero samples and then
    cut into consecutive, non-overlapping clips of ``clip_seconds`` seconds.
    A final remainder shorter than one clip is dropped, and recordings that
    contain only zeros are skipped.

    Args:
        path: Sequence of audio file paths readable by ``torchaudio.load``.
        clip_seconds: Clip duration in seconds (default 5).

    Returns:
        A 2-D NumPy array of shape ``(n_clips, clip_len)`` whose dtype follows
        the loaded tensors, or an empty 1-D array when no clip was produced.
        All files are assumed to share one sample rate; otherwise the clips
        have different lengths and cannot be stacked.

    Raises:
        ValueError: If ``clip_seconds`` is not positive.
    """
    if clip_seconds <= 0:
        raise ValueError("clip_seconds must be positive")

    arr = []
    for file_path in tqdm(path):
        audio, sr = torchaudio.load(file_path)
        arr.extend(_clips_from_waveform(audio, sr, clip_seconds))

    if not arr:
        return np.array(arr)
    return torch.stack(arr).numpy()

def extract_random_frames(audio_sample, num_samples=20480):
    """Return one randomly positioned window of consecutive samples.

    The window start is drawn uniformly (via ``np.random.randint``) from all
    positions at which a full window of ``num_samples`` still fits.

    Args:
        audio_sample: 1-D sliceable sequence of samples with a length.
        num_samples: Window length in samples (default 20480).

    Returns:
        The slice ``audio_sample[start:start + num_samples]``.
    """
    last_valid_start = len(audio_sample) - num_samples
    # randint's upper bound is exclusive, hence the + 1.
    offset = np.random.randint(0, last_valid_start + 1)
    return audio_sample[offset:offset + num_samples]

class AudioDataset(Dataset):
    """Dataset that turns stored audio clips into stacked feature tensors.

    For every item a random window is cut from the clip, the module-level
    ``feature_extract`` object converts it into a stacked feature tensor, and
    (optionally) the module-level ``spec`` transform is applied to channel 1
    of that tensor. ``feature_extract`` and ``spec`` are created in later
    cells and must exist before the first item is requested.

    Args:
        file_paths: Indexable collection of clips. Despite the name, the
            callers in this notebook pass the audio arrays themselves, not
            file names.
        label: Indexable collection of class ids aligned with ``file_paths``.
        n_mfcc: Stored on the instance but not used by this class.
        augment: When True (default, the previous behaviour) the ``spec``
            masking transform is applied to channel 1. Pass False for
            validation / test data so evaluation is not run on masked inputs.
    """

    def __init__(self, file_paths, label, n_mfcc=60, augment=True):
        self.file_paths = file_paths
        self.label = label
        self.n_mfcc = n_mfcc
        self.augment = augment

    def __len__(self):
        """Number of clips in the dataset."""
        return len(self.file_paths)

    def __getitem__(self, idx):
        """Return ``(features, label)`` for the clip at position ``idx``.

        ``features`` is the float32 version of the stacked feature tensor and
        ``label`` is a Python int. The window position is random, so repeated
        calls with the same ``idx`` generally return different features.
        """
        # asarray avoids copying the whole clip when it already is an ndarray.
        clip = np.asarray(self.file_paths[idx])
        frame = extract_random_frames(clip)
        features = feature_extract.stack_features(waveform=frame)
        if self.augment:
            features[1] = spec(features[1])

        return features.float(), int(self.label[idx])

def load_data():
    """Placeholder for loading the data paths as in the original code.

    Not implemented: the function does nothing and returns ``None``.
    """
    return None


In [3]:
# Split each class's file list into train / validation / test subsets.
(
    (train_tug, val_tug, test_tug),
    (train_cargo, val_cargo, test_cargo),
    (train_passenger, val_passenger, test_passenger),
    (train_tanker, val_tanker, test_tanker),
) = (
    slice_path(class_paths)
    for class_paths in (tug_path, cargo_path, passenger_path, tanker_path)
)

In [None]:
# Cut every recording into fixed-length clips, per class and per split.
# This has to call `slice_audio`: the bare name `slice` is Python's built-in
# and would only build a slice object, which breaks the len() / concatenate
# calls in the following cells.
sliced_tug_train = slice_audio(train_tug)
sliced_tug_val = slice_audio(val_tug)
sliced_tug_test = slice_audio(test_tug)

sliced_cargo_train = slice_audio(train_cargo)
sliced_cargo_val = slice_audio(val_cargo)
sliced_cargo_test = slice_audio(test_cargo)

sliced_passenger_train = slice_audio(train_passenger)
sliced_passenger_val = slice_audio(val_passenger)
sliced_passenger_test = slice_audio(test_passenger)

sliced_tanker_train = slice_audio(train_tanker)
sliced_tanker_val = slice_audio(val_tanker)
sliced_tanker_test = slice_audio(test_tanker)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 48/48 [00:14<00:00,  3.37it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 13/13 [00:03<00:00,  3.91it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 8/8 [00:02<00:00,  3.26it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 76/76 [00:13<00:00,  5.78it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 21/21 [00:03<00:00,  6.20it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 12/12 [00:01<00:00,  6.66it/s]
100%|███████████████████████████████████████████████

In [None]:
# Stack the clips of all four classes, split by split.
# Class order is tug, cargo, passenger, tanker in every split.
train_sliced_audio = np.concatenate(
    [sliced_tug_train, sliced_cargo_train, sliced_passenger_train, sliced_tanker_train],
    axis=0,
)
val_sliced_audio = np.concatenate(
    [sliced_tug_val, sliced_cargo_val, sliced_passenger_val, sliced_tanker_val],
    axis=0,
)
test_sliced_audio = np.concatenate(
    [sliced_tug_test, sliced_cargo_test, sliced_passenger_test, sliced_tanker_test],
    axis=0,
)

In [None]:
# Class ids, stored as floats: tug = 0, cargo = 1, passenger = 2, tanker = 3.
# One label per clip, concatenated in the same class order as the audio arrays.

# --- training labels ---
sliced_tug_label = np.zeros(len(sliced_tug_train))
sliced_cargo_label = np.full(len(sliced_cargo_train), 1.0)
sliced_passenger_label = np.full(len(sliced_passenger_train), 2.0)
sliced_tanker_label = np.full(len(sliced_tanker_train), 3.0)

train_label = np.concatenate(
    [sliced_tug_label, sliced_cargo_label, sliced_passenger_label, sliced_tanker_label],
    axis=0,
)

# --- validation labels ---
sliced_tug_label = np.zeros(len(sliced_tug_val))
sliced_cargo_label = np.full(len(sliced_cargo_val), 1.0)
sliced_passenger_label = np.full(len(sliced_passenger_val), 2.0)
sliced_tanker_label = np.full(len(sliced_tanker_val), 3.0)

val_label = np.concatenate(
    [sliced_tug_label, sliced_cargo_label, sliced_passenger_label, sliced_tanker_label],
    axis=0,
)

# --- test labels ---
sliced_tug_label = np.zeros(len(sliced_tug_test))
sliced_cargo_label = np.full(len(sliced_cargo_test), 1.0)
sliced_passenger_label = np.full(len(sliced_passenger_test), 2.0)
sliced_tanker_label = np.full(len(sliced_tanker_test), 3.0)

test_label = np.concatenate(
    [sliced_tug_label, sliced_cargo_label, sliced_passenger_label, sliced_tanker_label],
    axis=0,
)

# Shuffle all segments

In [None]:
# Pool the three file-level splits back into one array of clips and one
# matching array of labels (train, then val, then test, in both).
sliced_audio = np.concatenate([train_sliced_audio, val_sliced_audio, test_sliced_audio])
sliced_label = np.concatenate([train_label, val_label, test_label])

In [None]:
# def extract_random_frames(audio_sample, num_samples=20480):
#     total_samples = len(audio_sample)

#     start_index = np.random.randint(0, total_samples - num_samples+1)
#     extracted_frame = audio_sample[start_index:start_index + num_samples]
    
#     return extracted_frame

In [None]:
class preprocessor:
    """Computes librosa features for a waveform and stacks them into one tensor.

    Args:
        sr: Sample rate handed to librosa (default 22050).
            NOTE(review): nothing in this notebook resamples the audio, and
            ``torchaudio.load`` returns the file's native rate. Confirm the
            recordings really are 22050 Hz or pass the true rate here.
        n_mfcc: Number of MFCC coefficients (default 60).
        n_mels: Number of mel bands (default 60).
        hop_length: Hop length in samples shared by all features (default 512).
    """

    def __init__(self, sr=22050, n_mfcc=60, n_mels=60, hop_length=512):
        self.n_mfcc = n_mfcc
        self.n_mels = n_mels
        self.sr = sr
        self.hop_length = hop_length

    def extract_mfcc(self, waveform):
        """Return the MFCC matrix of ``waveform`` as a tensor."""
        mfcc_feature = librosa.feature.mfcc(
            y=waveform, sr=self.sr, n_mfcc=self.n_mfcc, hop_length=self.hop_length
        )
        return torch.tensor(mfcc_feature)

    def extract_log_mel(self, waveform):
        """Return the mel spectrogram of ``waveform`` in dB relative to its maximum."""
        mel_spectrogram = librosa.feature.melspectrogram(
            y=waveform, sr=self.sr, n_mels=self.n_mels, hop_length=self.hop_length
        )
        log_mel_spectrogram = librosa.power_to_db(mel_spectrogram, ref=np.max)
        return torch.tensor(log_mel_spectrogram)

    def extract_cctz(self, waveform):
        """Return chroma, spectral contrast, tonnetz and zero-crossing rate.

        The four feature matrices are concatenated along the feature (row)
        axis, in that order.
        """
        chroma = librosa.feature.chroma_stft(y=waveform, sr=self.sr, hop_length=self.hop_length)
        contrast = librosa.feature.spectral_contrast(y=waveform, sr=self.sr, hop_length=self.hop_length)
        tonnetz = librosa.feature.tonnetz(y=waveform, sr=self.sr, hop_length=self.hop_length)
        zero_cross_rate = librosa.feature.zero_crossing_rate(waveform, hop_length=self.hop_length)

        parts = [chroma, contrast, tonnetz, zero_cross_rate]
        return torch.cat([torch.tensor(part) for part in parts], dim=0)

    @staticmethod
    def _center_pad_rows(feature, target_rows):
        """Zero-pad ``feature`` along dim 0 to ``target_rows``, centred.

        When the number of missing rows is odd, the extra row goes at the end.
        """
        missing = target_rows - feature.size(0)
        return torch.nn.functional.pad(feature, (0, 0, missing // 2, (missing + 1) // 2))

    def stack_features(self, waveform):
        """Return MFCC, log-mel and CCTZ features stacked on a new leading axis.

        The three matrices are zero-padded (centred) along the feature axis
        to the largest row count among them, then stacked in the order
        ``[mfcc, log_mel, cctz]``.
        """
        features = [
            self.extract_mfcc(waveform=waveform),
            self.extract_log_mel(waveform=waveform),
            self.extract_cctz(waveform=waveform),
        ]

        # Align every matrix to the tallest one so they can be stacked.
        max_feature_dim = max(feature.size(0) for feature in features)
        padded = [self._center_pad_rows(feature, max_feature_dim) for feature in features]

        return torch.stack(padded, dim=0)


# Shared extractor instance with the default settings.
feature_extract = preprocessor()

In [None]:
class spec_transform(nn.Module):
    """Applies torchaudio time masking followed by frequency masking.

    Args:
        time_mask_param: Value forwarded to ``AT.TimeMasking`` (default 3).
        freq_mask_param: Value forwarded to ``AT.FrequencyMasking`` (default 5).
    """

    def __init__(self, time_mask_param=3, freq_mask_param=5):
        super().__init__()

        self.time = AT.TimeMasking(time_mask_param=time_mask_param)
        self.freq = AT.FrequencyMasking(freq_mask_param=freq_mask_param)

    def forward(self, spec):
        """Return ``spec`` after time masking and then frequency masking."""
        return self.freq(self.time(spec))
    
# Shared augmentation instance; AudioDataset.__getitem__ calls it on channel 1 of the stacked features.
spec = spec_transform()

In [None]:
# Clip-level 70 / 20 / 10 split of the pooled data.
# NOTE(review): the shuffle is over individual clips, so clips cut from the
# same recording can end up in different splits. Confirm this is intended
# before reading the test metrics as generalisation to unseen recordings.

# Fixed seed so the split is identical after every kernel restart.
SPLIT_SEED = 42

assert len(sliced_audio) == len(sliced_label), "every clip needs exactly one label"

num_total_samples = len(sliced_audio)
x = np.random.default_rng(SPLIT_SEED).permutation(num_total_samples)

num_train_samples = int(num_total_samples * 0.7)
num_val_samples = int(num_total_samples * 0.2)

train_x = x[:num_train_samples]
val_x = x[num_train_samples:num_train_samples + num_val_samples]
test_x = x[num_train_samples + num_val_samples:]

# Despite the "_path" names these hold the audio clips themselves.
# Row-by-row indexing keeps each clip as a view into `sliced_audio`;
# fancy indexing would copy the whole array.
train_file_path = [sliced_audio[i] for i in train_x]
val_file_path = [sliced_audio[i] for i in val_x]
test_file_path = [sliced_audio[i] for i in test_x]

train_file_label = [sliced_label[i] for i in train_x]
val_file_label = [sliced_label[i] for i in val_x]
test_file_label = [sliced_label[i] for i in test_x]

# Datasets and loaders. Only the training loader reshuffles every epoch;
# evaluation loaders iterate in a fixed order.
train_dataset = AudioDataset(train_file_path, train_file_label)
train_dataloader = DataLoader(train_dataset, batch_size=128, shuffle=True, num_workers=10)

val_dataset = AudioDataset(val_file_path, val_file_label)
val_dataloader = DataLoader(val_dataset, batch_size=128, shuffle=False, num_workers=10)

test_dataset = AudioDataset(test_file_path, test_file_label)
test_dataloader = DataLoader(test_dataset, batch_size=128, shuffle=False, num_workers=10)

In [None]:
# Sanity check: shape of the feature tensor returned for the first training item.
train_dataset[0][0].shape

In [None]:
# Number of clips in the training and validation datasets.
tuple(map(len, (train_dataset, val_dataset)))