In [1]:
#from getdata import getdataset
from torch.utils.data import DataLoader
from torch import nn
import torch
import json
from tqdm import tqdm
from model import CNN_BLSTM


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
with open('config.json') as f:
    data = f.read()
config = json.loads(data)
loaddata_config = config["loaddata_config"]
train_config = config["train_config"]

In [3]:
loaddata_config

{'num_train': 13580,
 'num_test': 4000,
 'num_valid': 3000,
 'bin_root': './data/wav/',
 'data_dir': './data',
 'fft_size': 512,
 'max_timestep': 521}

In [35]:
from scipy.stats.stats import _first
import torch
from utils import read_list
import os
import h5py
import numpy as np
import random
import torch.utils.data
import torchaudio
import torchaudio.transforms as T

MAX_WAV_VALUE = 32768.0

def load_wav_to_torch(full_path):
    signal, sr = torchaudio.load(full_path)
    return signal, sr

class getdataset(torch.utils.data.Dataset):
    def __init__(self, config, seed, mode):

        self.config = config
        mos_list = read_list(os.path.join(config["data_dir"],'mos_list.txt'))
        random.seed(seed)
        random.shuffle(mos_list)
        self.max_wav_value = MAX_WAV_VALUE
        self.spec_fn = T.Spectrogram(
            n_fft=512,
            win_length=512,
            hop_length=256,
            center=True,
            pad_mode="reflect",
            power=2.0,
        )

        self.max_timestep = self.getmax_timestep(config,seed)
        if mode == "train":
            self.filelist = mos_list[0:-(config["num_test"]+config["num_valid"])]
        elif mode == "valid":
            self.filelist = mos_list[-(config["num_test"]+config["num_valid"]):-config["num_test"]]
        elif mode == "test":
            self.filelist= mos_list[-config["num_test"]:]

    def get_magnitude_spec(self, filename):
        audio, sampling_rate = load_wav_to_torch(filename)
        audio_norm = audio / self.max_wav_value
        audio_norm = audio_norm.unsqueeze(0)
        audio_norm = torch.autograd.Variable(audio_norm, requires_grad=False)
        mag_spec = self.spec_fn(audio_norm)
        mag_spec = torch.squeeze(mag_spec, 0)
        return mag_spec

    def read(self, file_path):
        mag_spec = self.get_magnitude_spec(file_path)
        time_step = mag_spec.shape[2]
        spec_dim = self.config["fft_size"] // 2 + 1
        #mag_spec = np.reshape(mag_spec,(1, time_step, spec_dim))
        return {
            'mag_spec': mag_spec,
        }  

    def pad(self,array, reference_shape):
        
        result = np.zeros(reference_shape)
        result[:array.shape[0],:array.shape[1],:array.shape[2]] = array

        return result

    def getmax_timestep(self,config,seed):
        file_list = read_list(os.path.join(config["data_dir"],'mos_list.txt'))
        random.seed(seed)
        random.shuffle(file_list)
        filename = [file_list[x].split(',')[0] for x in range(len(file_list))]
        for i in range(len(filename)):
            all_feat = self.read(os.path.join(config["bin_root"],filename[i]))
            mag_spec = all_feat['mag_spec']
            if i == 0:
                feat = mag_spec
                max_timestep = feat.shape[2]
            else:
                if mag_spec.shape[2] > max_timestep:
                    max_timestep = mag_spec.shape[2]
        print(max_timestep)
        return max_timestep

    def __getitem__(self, index):
        # Read audio
        filename,mos = self.filelist[index].split(',')
        all_feat = self.read(os.path.join(self.config["bin_root"],filename))
        mag_spec = all_feat['mag_spec']
        ref_shape = [mag_spec.shape[0], mag_spec.shape[1], self.max_timestep]
        
        mag_spec = self.pad(mag_spec,ref_shape)
        print(mag_spec.shape)
        mos=np.asarray(float(mos)).reshape([1])
        frame_mos = np.array([mos*np.ones([mag_spec.shape[1],1])])
        return mag_spec, [mos,frame_mos.reshape((1,-1)).transpose(1,0)]

    def __len__(self):
        return len(self.filelist)


In [36]:
trainset = getdataset(loaddata_config, train_config["seed"], "train")

492


In [37]:
train_loader = DataLoader(trainset, num_workers=0,
                          batch_size=4,
                          pin_memory=False,
                          drop_last=True)

In [None]:
data = iter(train_loader)

In [None]:
for i in data:
    print(type(i))
    print(i.shape)

In [9]:
model = CNN_BLSTM()

In [10]:
torch.cuda.is_available()

True

In [11]:
torch.cuda.current_device()

0

In [12]:
torch.cuda.get_device_name(0)

'NVIDIA GeForce RTX 3060 Laptop GPU'

In [13]:
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)

In [38]:
model.train()

CNN_BLSTM(
  (conv1): Sequential(
    (0): Conv2d(1, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): ReLU()
    (2): Conv2d(16, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (3): ReLU()
    (4): Conv2d(16, 16, kernel_size=(3, 3), stride=(1, 3), padding=(1, 1))
    (5): ReLU()
  )
  (conv2): Sequential(
    (0): Conv2d(16, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): ReLU()
    (2): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (3): ReLU()
    (4): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 3), padding=(1, 1))
    (5): ReLU()
  )
  (conv3): Sequential(
    (0): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): ReLU()
    (2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (3): ReLU()
    (4): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 3), padding=(1, 1))
    (5): ReLU()
  )
  (conv4): Sequential(
    (0): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding

In [39]:
use_cuda = torch.cuda.is_available()

device = torch.device("cuda" if use_cuda else "cpu")

In [40]:
model = model.to(device)

In [41]:
for i, batch in enumerate(tqdm(train_loader)):
    model.train()
    model.zero_grad()
    model_input, [mos_y, frame_mos_y] = batch
    model_input = torch.autograd.Variable(model_input.to(device, dtype=torch.float))
    #mos_y = mos_y.cuda()
    #frame_mos_y = frame_mos_y.cuda()

    avg_score, frame_score = model(model_input)
    fn_mse1 = nn.MSELoss()
    fn_mse2 = nn.MSELoss()
    #loss = fn_mse1(batch[1][0].cuda(), avg_score) + fn_mse2(batch[1][1].cuda(), frame_score)
    loss = fn_mse1(batch[1][0].to(device), avg_score.to(device) + fn_mse2(batch[1][1].to(device), frame_score.to(device)))
    reduced_loss = loss.item()


  0%|                                                                                                                                                                | 0/3395 [00:00<?, ?it/s]

(1, 257, 492)
(1, 257, 492)
(1, 257, 492)
(1, 257, 492)





RuntimeError: shape '[4, 257, 512]' is invalid for input of size 921088