In [12]:
import os
import pandas as pd
import librosa  
import librosa.display
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision.models import resnet34
import numpy as np
import matplotlib.pyplot as plt
from torch.utils.data import Dataset, DataLoader 
from tqdm.notebook import tqdm





In [6]:
# Load the ESC-50 metadata table. As the preview below shows, each row describes one
# audio file (filename, fold, target, category, esc10, src_file, take).
df = pd.read_csv('meta/esc50.csv')

In [22]:
# Preview the first 10 metadata rows to check the column names used later on.
df.head(10)

Unnamed: 0,filename,fold,target,category,esc10,src_file,take
0,1-100032-A-0.wav,1,0,dog,True,100032,A
1,1-100038-A-14.wav,1,14,chirping_birds,False,100038,A
2,1-100210-A-36.wav,1,36,vacuum_cleaner,False,100210,A
3,1-100210-B-36.wav,1,36,vacuum_cleaner,False,100210,B
4,1-101296-A-19.wav,1,19,thunderstorm,False,101296,A
5,1-101296-B-19.wav,1,19,thunderstorm,False,101296,B
6,1-101336-A-30.wav,1,30,door_wood_knock,False,101336,A
7,1-101404-A-34.wav,1,34,can_opening,False,101404,A
8,1-103298-A-9.wav,1,9,crow,False,103298,A
9,1-103995-A-30.wav,1,30,door_wood_knock,False,103995,A


In [9]:
# Hold out fold 5 for validation and keep every other fold for training.
# NOTE(review): a later cell defines a function that is also named `train`; once that
# cell has run, this DataFrame is no longer reachable under this name, so cells that
# consume it must be executed first (or one of the two names should be changed).
train = df[df['fold']!=5]
valid = df[df['fold']==5]

In [19]:
# Load one clip at its native sampling rate (sr=None asks librosa not to resample),
# so len(wav) == sampling rate * duration in seconds.
wav, sr = librosa.load('audio/1-100032-A-0.wav', sr=None)
# The sample count is separated from the duration so the message no longer reads "seconds220500".
print(f'Sampling rate of the audio is {sr} and length of the audio is {len(wav)/sr} seconds ({len(wav)} samples)')

Sampling rate of the audio is 44100 and length of the audio is 5.0 seconds220500


In [None]:
#making spectrogram using librosa
# input_nfft = int(round(sr*frame_length))
# frame_length = input_nfft/sr

def get_spectrogram(path, sr=None, n_fft=2048, hop_length=512, n_mels=128, fmin=20, fmax=830, top_db=80):
    """Load an audio file and return its mel spectrogram on a decibel scale.

    Parameters
    ----------
    path : str
        Path of the audio file, passed to ``librosa.load``.
    sr : int or None
        Target sampling rate; ``None`` keeps the file's native rate.
    n_fft, hop_length, n_mels, fmin, fmax
        Forwarded unchanged to ``librosa.feature.melspectrogram``.
    top_db : float
        Dynamic-range limit forwarded to ``librosa.power_to_db``.

    Returns
    -------
    numpy.ndarray
        The dB-scaled mel spectrogram produced by ``librosa.power_to_db``.
    """
    # NOTE(review): fmax=830 Hz leaves a very narrow band for 128 mel filters and looks
    # like a typo (8300?). The default is left unchanged so existing callers get the
    # same features -- confirm the intended value.
    wav, sr = librosa.load(path, sr=sr)
    # Pass the loaded waveform (the original referenced an undefined name `signal`) and
    # use keyword arguments, which recent librosa releases require for `y` and `sr`.
    melspec = librosa.feature.melspectrogram(
        y=wav,
        sr=sr,
        n_fft=n_fft,
        hop_length=hop_length,
        n_mels=n_mels,
        fmin=fmin,
        fmax=fmax,
    )
    melspec_db = librosa.power_to_db(melspec, top_db=top_db)
    return melspec_db
  
    
    

In [21]:
def spec_to_image(spec, eps=1e-6):
    """Convert a spectrogram array into an 8-bit image array.

    The input is standardised (zero mean, unit variance; ``eps`` keeps the division
    finite when the standard deviation is zero), then min-max scaled to 0..255 and
    cast to ``uint8`` (the cast truncates fractional values).

    Parameters
    ----------
    spec : numpy.ndarray
        Spectrogram values.
    eps : float
        Small constant added to the standard deviation.

    Returns
    -------
    numpy.ndarray
        ``uint8`` array with the same shape as ``spec``. A constant input (no dynamic
        range) yields an all-zero image.
    """
    spec_norm = (spec - spec.mean()) / (spec.std() + eps)
    spec_min, spec_max = spec_norm.min(), spec_norm.max()
    value_range = spec_max - spec_min
    if value_range == 0:
        # A flat spectrogram would otherwise evaluate 0/0 -> NaN, and casting NaN to
        # uint8 is undefined; return a well-defined black image instead.
        return np.zeros(spec.shape, dtype=np.uint8)
    spec_scaled = 255 * (spec_norm - spec_min) / value_range  # min-max scale to 0..255
    return spec_scaled.astype(np.uint8)


In [23]:
class ESC50(Dataset):
    """In-memory dataset of spectrogram images and integer class labels.

    ``Dataset`` is the abstract base class for datasets: subclass it and override
    ``__len__`` and ``__getitem__``.

    Every file listed in ``df`` is converted eagerly in ``__init__`` via
    ``spec_to_image(get_spectrogram(path))``, so construction cost grows with the
    number of rows.

    Parameters
    ----------
    base : str
        Directory that contains the audio files.
    df : pandas.DataFrame
        Metadata table with one row per audio file.
    in_col : str
        Name of the column holding the file names (e.g. ``'filename'``).
    out_col : str
        Name of the column holding the class of each file.

    Attributes
    ----------
    categories : sorted list of the unique values found in ``out_col``.
    c2i / i2c : mappings category -> index and index -> category.
    data / labels : parallel lists of image arrays and integer labels.
    """

    def __init__(self, base, df, in_col, out_col):
        self.df = df
        self.data = []
        self.labels = []
        self.categories = sorted(df[out_col].unique())
        self.i2c = dict(enumerate(self.categories))
        self.c2i = {category: index for index, category in self.i2c.items()}
        for file_name, category in tqdm(zip(df[in_col], df[out_col]), total=len(df)):
            file_path = os.path.join(base, file_name)  # join directory and file name
            # NOTE(review): the stored image has no channel axis; a Conv2d-based model
            # will need one added somewhere -- confirm where that should happen.
            self.data.append(spec_to_image(get_spectrogram(file_path)))
            # Look the label up in `out_col`; the original hard-coded 'category' here,
            # which broke (or silently mislabelled) any other choice of `out_col`.
            self.labels.append(self.c2i[category])

    def __len__(self):
        """Return the number of samples held in memory."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the ``(image, label)`` pair stored at position ``idx``."""
        return self.data[idx], self.labels[idx]
                
            

In [None]:
# Build the training dataset. The class is `ESC50` (not `ECC50`) and the metadata
# column holding the file names is `filename` (see the df.head() preview).
train_data = ESC50('audio', train, 'filename', 'category')

In [None]:
# Build the validation dataset. The class is `ESC50` (not `ECC50`) and the metadata
# column holding the file names is `filename` (see the df.head() preview).
valid_data = ESC50('audio', valid, 'filename', 'category')

In [None]:
# Mini-batch loaders. Training batches are reshuffled every epoch; validation keeps a
# fixed order because shuffling adds nothing to evaluation and makes per-batch results
# irreproducible between runs.
train_loader = DataLoader(train_data, batch_size=16, shuffle=True)
valid_loader = DataLoader(valid_data, batch_size=16, shuffle=False)

In [None]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
class PlainClassifier(nn.Module):
    """Placeholder for a baseline classifier; its architecture is not written yet.

    The original cell ended at an empty ``__init__`` and therefore could not even be
    parsed. This version is a valid, explicit stub: it can be instantiated, but calling
    it raises ``NotImplementedError`` until layers and a forward pass are defined.
    """

    def __init__(self, ):
        super().__init__()
        # TODO: define the layers of the baseline model here.

    def forward(self, x):
        """Forward pass -- not implemented yet."""
        raise NotImplementedError('PlainClassifier has no layers defined yet')

In [None]:
def train(model, n_epochs=None):
    """Train ``model`` and evaluate it on the validation set after every epoch.

    The function reads these notebook-level names: ``train_loader``, ``valid_loader``,
    ``optimizer``, ``device`` and, when ``n_epochs`` is not given, ``epochs``.

    NOTE(review): this function shares its name with the ``train`` DataFrame created
    in the fold-split cell; defining it rebinds that name.

    Parameters
    ----------
    model : torch.nn.Module
        Network to optimise; expected to already live on ``device``.
    n_epochs : int, optional
        Number of passes over the training data. Defaults to the global ``epochs``.

    Returns
    -------
    dict
        ``{'train_loss': [...], 'valid_loss': [...]}`` with one mean loss per epoch
        (``nan`` for an epoch in which the corresponding loader produced no batches).
    """

    def _prepare(x, y):
        # Move a batch to the target device with the dtypes the loss expects.
        x = x.to(device, dtype=torch.float32)
        if x.dim() == 3:
            # (batch, H, W) -> (batch, 1, H, W): convolutional models need a channel axis.
            x = x.unsqueeze(1)
        return x, y.to(device, dtype=torch.long)

    def _mean(values):
        return sum(values) / len(values) if values else float('nan')

    total_epochs = epochs if n_epochs is None else n_epochs
    history = {'train_loss': [], 'valid_loss': []}
    for epoch in tqdm(range(1, total_epochs + 1)):
        model.train()
        train_losses = []
        for x, y in train_loader:
            x, y = _prepare(x, y)
            optimizer.zero_grad()
            loss = F.cross_entropy(model(x), y)  # `F.CrossEntropy` does not exist
            loss.backward()
            optimizer.step()
            train_losses.append(loss.item())

        # Validation only measures the loss: no gradients and no optimizer steps,
        # otherwise the model would be fitted on the held-out fold.
        model.eval()
        valid_losses = []
        with torch.no_grad():
            for x, y in valid_loader:
                x, y = _prepare(x, y)
                valid_losses.append(F.cross_entropy(model(x), y).item())

        history['train_loss'].append(_mean(train_losses))
        history['valid_loss'].append(_mean(valid_losses))
    return history

In [None]:
# Global switch: set to False to force CPU execution even when CUDA is present.
USE_GPU = True

# Floating-point type used for tensors throughout the notebook.
dtype = torch.float32

# Use CUDA only when it is both requested and actually available.
device = torch.device('cuda' if (USE_GPU and torch.cuda.is_available()) else 'cpu')

In [None]:
# Fine-tune an ImageNet-pretrained ResNet-34 on single-channel spectrogram images.
# (`**kwargs` was undefined at notebook level and has been dropped.)
# NOTE(review): newer torchvision releases deprecate `pretrained=True` in favour of
# the `weights=` argument; kept as-is for compatibility with older installs.
resnet_34 = resnet34(pretrained=True, progress=True)
resnet_34.fc = nn.Linear(512, 50)  # new classification head with 50 output logits
# Replace the stem so the network accepts 1-channel input instead of RGB.
resnet_34.conv1 = nn.Conv2d(1, 64, kernel_size=7, stride=2, padding=3, bias=False)
# Rebind the name instead of assigning `resnet_34.model = resnet_34.to(device)`:
# that registered the module as its own child and made `.train()` recurse forever.
resnet_34 = resnet_34.to(device)  # move model to CPU or GPU

learning_rate=2e-4
epoch = 50
optimizer = optim.Adam(resnet_34.parameters(), lr=learning_rate, betas=(0.9,0.999), eps=1e-8, weight_decay=5e-4, amsgrad=True)

for i in range(epoch):
    # 1. Train mode: layers such as batch norm behave differently in eval mode.
    resnet_34.train()

    # `x` and `label` were never defined; take them from the training loader.
    for x, label in train_loader:
        x = x.to(device, dtype=torch.float32)
        if x.dim() == 3:
            # (batch, H, W) -> (batch, 1, H, W) to match the 1-channel conv1 above.
            x = x.unsqueeze(1)
        label = label.to(device, dtype=torch.long)

        pred = resnet_34(x)                  # 2. forward pass
        loss = F.cross_entropy(pred, label)  # 3. loss

        optimizer.zero_grad()                # 4. clear stale gradients

        loss.backward()                      # 5. backpropagate
        optimizer.step()                     # 6. update the weights







