# Mount to Google Drive

In [None]:
from google.colab import drive

# Mount point under which Google Drive becomes visible in the Colab filesystem.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [None]:
# Copy the IEMOCAP archive from the mounted Drive folder into /content.
!cp /content/drive/MyDrive/11785_Project/IEMOCAP_full_release_withoutVideos.tar.gz /content/

In [None]:
# Extract the archive copied to /content. Absolute paths keep this cell independent of the
# current working directory (a later cell runs %cd), and dropping -v avoids printing one
# line per extracted file.
!tar -xf /content/IEMOCAP_full_release_withoutVideos.tar.gz -C /content

# Pre-processing

In [1]:
from pathlib import Path
import os
import numpy as np
import scipy
import librosa
import json

In [None]:
# Root of the extracted dataset and root of the pre-processed output
# (both relative to the current working directory).
in_dir = "IEMOCAP_full_release"
out_dir = "IEMOCAP_processed"

# Create the output root up front; a no-op when it already exists.
os.makedirs(out_dir, exist_ok=True)

In [None]:
def preemphasis(x, preemph):
    """Apply the first-order pre-emphasis filter ``y[n] = x[n] - preemph * x[n-1]``.

    Args:
        x: Input signal (array-like); the filter runs along the last axis.
        preemph: Pre-emphasis coefficient applied to the previous sample.

    Returns:
        numpy.ndarray with the same shape as ``x`` holding the filtered signal.
    """
    # A bare ``import scipy`` does not guarantee that the ``signal`` submodule is
    # loaded (it raises AttributeError on SciPy versions without lazy submodule
    # loading), so import the function explicitly where it is used.
    from scipy.signal import lfilter

    return lfilter([1, -preemph], [1], x)


def mulaw_encode(x, mu):
    """Compand ``x`` with the mu-law curve and round it to integer-valued levels.

    Args:
        x: Signal values; for inputs in [-1, 1] the result lies in [0, mu - 1].
        mu: Number of quantization levels (the caller in this file passes ``2**bits``).

    Returns:
        Floating-point array (``np.floor`` output) holding the quantization level
        of every input sample.
    """
    levels = mu - 1
    magnitude = np.abs(x)
    # Logarithmic companding: maps [-1, 1] onto [-1, 1].
    companded = np.sign(x) * np.log1p(levels * magnitude) / np.log1p(levels)
    # Shift to [0, levels]; adding 0.5 before the floor rounds to the nearest level.
    shifted = (companded + 1) / 2 * levels + 0.5
    return np.floor(shifted)


def mulaw_decode(y, mu):
    """Expand a mu-law companded signal back to linear amplitude.

    Args:
        y: Companded values. The formula maps [-1, 1] onto [-1, 1]; integer levels
            such as those returned by ``mulaw_encode`` would first have to be
            rescaled to that range by the caller.
        mu: Number of quantization levels used for companding.

    Returns:
        Array of expanded (linear) amplitudes.
    """
    levels = mu - 1
    magnitude = np.abs(y)
    expanded = np.sign(y) / levels * ((1 + levels) ** magnitude - 1)
    return expanded


def process_wav(wav_path, dir_path, out_path, sr=160000, preemph=0.97, n_fft=2048, n_mels=80, hop_length=160,
                win_length=400, fmin=50, top_db=80, bits=8, offset=0.0, duration=None):
    """Convert one wav file into a log-mel spectrogram and a mu-law encoded waveform.

    The audio is loaded (and resampled to ``sr``) with librosa, peak-normalized to
    0.999, and then stored as two ``.npy`` files:

    * ``<out_path>/mel/<name>.mel.npy`` -- log-mel spectrogram, rescaled as
      ``amplitude_to_db(mel, top_db) / top_db + 1``
    * ``<out_path>/wav/<name>.wav.npy`` -- waveform encoded with ``mulaw_encode``
      using ``2**bits`` levels

    NOTE(review): the default ``sr=160000`` looks like a typo for 16000 (the
    ``hop_length=160`` / ``win_length=400`` defaults are 10 ms / 25 ms at 16 kHz).
    The default is left unchanged for compatibility; pass ``sr`` explicitly.

    Args:
        wav_path: File name of the wav file, relative to ``dir_path``.
        dir_path: Directory that contains ``wav_path``.
        out_path: Directory under which the ``mel`` and ``wav`` folders are created.
        sr: Target sampling rate handed to ``librosa.load`` and the mel filter bank.
        preemph: Pre-emphasis coefficient applied before the mel spectrogram.
        n_fft, n_mels, hop_length, win_length, fmin: Mel-spectrogram settings passed
            through to ``librosa.feature.melspectrogram``.
        top_db: Dynamic range used for ``amplitude_to_db`` and for the rescaling.
        bits: Bit depth of the mu-law encoding.
        offset, duration: Optional excerpt selection passed to ``librosa.load``.

    Returns:
        Tuple ``(out_path, n_frames)`` where ``n_frames`` is the size of the last
        axis of the log-mel spectrogram.

    Raises:
        ValueError: If the loaded audio contains no samples.
    """
    wav, _ = librosa.load(os.path.join(dir_path, wav_path), sr=sr,
                          offset=offset, duration=duration)
    if wav.size == 0:
        raise ValueError("no audio samples loaded from " + os.path.join(dir_path, wav_path))

    # Peak-normalize; skip all-zero (silent) audio, which would otherwise divide
    # by zero and fill every output with NaN.
    peak = np.abs(wav).max()
    if peak > 0:
        wav = wav / peak * 0.999

    # The audio is passed as ``y=`` because recent librosa releases accept it by
    # keyword only; the keyword form also works on older releases.
    mel = librosa.feature.melspectrogram(y=preemphasis(wav, preemph),
                                         sr=sr,
                                         n_fft=n_fft,
                                         n_mels=n_mels,
                                         hop_length=hop_length,
                                         win_length=win_length,
                                         fmin=fmin,
                                         power=1)
    logmel = librosa.amplitude_to_db(mel, top_db=top_db)
    logmel = logmel / top_db + 1

    wav = mulaw_encode(wav, mu=2**bits)

    mel_dir = Path(out_path) / "mel"
    wav_dir = Path(out_path) / "wav"
    mel_dir.mkdir(parents=True, exist_ok=True)
    wav_dir.mkdir(parents=True, exist_ok=True)

    # Each array goes into the folder matching its content (the two targets were
    # previously swapped: waveform under mel/, spectrogram under wav/).
    stem = os.path.splitext(wav_path)[0]
    np.save(wav_dir / (stem + ".wav.npy"), wav)
    np.save(mel_dir / (stem + ".mel.npy"), logmel)
    return out_path, logmel.shape[-1]

In [None]:
# Sampling rate used for pre-processing. process_wav's own default (160000) does not
# match its 160-sample hop / 400-sample window, which correspond to 10 ms / 25 ms at
# 16 kHz, so the rate is passed explicitly.
# NOTE(review): confirm 16 kHz against the dataset copy and the model configuration.
SAMPLE_RATE = 16000

for session_num in range(1, 6):
    print("Processing session: ", session_num)
    session = "Session" + str(session_num)

    # os.path.join accepts both str and Path, so this cell keeps working even if
    # in_dir / out_dir were rebound to Path objects by another cell.
    out_session_dir = os.path.join(out_dir, session)
    Path(out_session_dir).mkdir(parents=True, exist_ok=True)

    files_path = os.path.join(in_dir, session, "sentences", "wav")
    for conversation in sorted(os.listdir(files_path)):
        conversation_dir = os.path.join(files_path, conversation)
        # Skip hidden entries and stray files: only directories hold utterances,
        # and calling os.listdir on a plain file would raise.
        if conversation.startswith(".") or not os.path.isdir(conversation_dir):
            continue

        print("--Processing conversation: ", conversation)
        out_conversation_dir = os.path.join(out_session_dir, conversation)
        Path(out_conversation_dir).mkdir(parents=True, exist_ok=True)

        for file_name in sorted(os.listdir(conversation_dir)):
            # Hidden files (e.g. "._*.wav" metadata files) are not audio even when
            # they carry a .wav extension.
            if file_name.startswith(".") or file_name[-3:] != "wav":
                continue
            print("----Processing file: ", file_name)
            process_wav(file_name, conversation_dir, out_conversation_dir, sr=SAMPLE_RATE)
    print()


# Train

## Prepare a JSON Synthesis List for Each Session

In [None]:
# Build one synthesis list per session and store it as JSON.
# Every entry is [utterance_name, target_speaker_id, output_name]. The speaker letter is
# the first character of the last "_"-separated field of the file name: "M" utterances
# get speaker "V002" and the suffix "_toF", all others get "V001" and "_toM".
# NOTE(review): presumably V001 / V002 are the target voices known to the checkpoint -- confirm.
for session_num in range(1, 6):
    session_path = "/content/IEMOCAP_full_release/Session{}/sentences/wav".format(session_num)
    syn_list = []
    for sub_dir in sorted(os.listdir(session_path)):
        sub_dir_path = os.path.join(session_path, sub_dir)
        # Skip hidden entries and stray files; os.listdir on a plain file would raise.
        if sub_dir.startswith(".") or not os.path.isdir(sub_dir_path):
            continue
        for file_name in sorted(os.listdir(sub_dir_path)):
            if file_name.startswith(".") or file_name[-3:] != "wav":
                continue
            utterance = file_name[:-4]
            gender = utterance.split("_")[-1][0]
            if gender == "M":
                syn_list.append([utterance, "V002", utterance + "_toF"])
            else:
                syn_list.append([utterance, "V001", utterance + "_toM"])

    # Report a summary instead of dumping the whole (growing) list once per sub-directory.
    print("Session {}: {} utterances".format(session_num, len(syn_list)))

    syn_list_path = "/content/drive/MyDrive/11785_Project/VQ-VAE/syn_list/syn_list_session{}.txt".format(session_num)
    Path(syn_list_path).parent.mkdir(parents=True, exist_ok=True)
    # The context manager closes the file even if serialization fails.
    with open(syn_list_path, "w") as text_file:
        json.dump(syn_list, text_file)


## Train Model with each session
#### syn_list dir:

- /content/drive/MyDrive/11785_Project/VQ-VAE/syn_list
- file names: syn_list_session1.txt through syn_list_session5.txt

#### in_dir
- /content/IEMOCAP_full_release

#### out_dir
- /content/IEMOCAP_full_release_converted

### Test Code

In [16]:
# Path smoke test only: create, for every entry of the Session 1 synthesis list,
# the per-conversation output folder under the converted-dataset root.
in_dir = Path("/content/IEMOCAP_full_release")
out_dir = Path("/content/IEMOCAP_full_release_converted")

session1_wav_root = out_dir / "Session1" / "sentences" / "wav"

with open("/content/drive/MyDrive/11785_Project/VQ-VAE/syn_list/syn_list_session1.txt") as file:
  synthesis_list = json.load(file)

for wav_path, speaker_id, out_filename in synthesis_list:
  # Dropping the last five characters of the utterance name yields the folder name.
  conversation_dir = session1_wav_root / wav_path[:-5]
  conversation_dir.mkdir(parents=True, exist_ok=True)

### Training with Session

In [13]:
# Change the working directory to the VQ-VAE project folder on the mounted Drive;
# the relative paths used by the following cells are resolved from here.
%cd /content/drive/MyDrive/11785_Project/VQ-VAE

/content/drive/.shortcut-targets-by-id/1MsycVwhsdrkeTJBUYb51RZAEeHuMrsgV/11785_Project/VQ-VAE


In [14]:
# Print the current working directory to confirm that the %cd above took effect.
!pwd

/content/drive/.shortcut-targets-by-id/1MsycVwhsdrkeTJBUYb51RZAEeHuMrsgV/11785_Project/VQ-VAE


In [None]:
# Install the project requirements. %pip (rather than !pip) installs into the environment
# of the running kernel; -q keeps the installer output short.
%pip install -q -r requirements.txt

In [None]:
# Run convert.py with a saved checkpoint on the Session 1 synthesis list (no training happens here).
# checkpoint, out_dir, synthesis_list and dataset are given relative to the current directory;
# in_dir points at the extracted dataset under /content.
! python convert.py checkpoint="checkpoints/2019english/model.ckpt-500000.pt" in_dir="/content/IEMOCAP_full_release" out_dir="IEMOCAP_full_release_converted" synthesis_list="syn_list/syn_list_session1.txt" dataset=2019/english

Load checkpoint from: checkpoints/2019english/model.ckpt-500000.pt:
  0% 0/1819 [00:00<?, ?it/s]
  0% 0/127680 [00:00<?, ?it/s][A
  0% 37/127680 [00:00<05:49, 365.13it/s][A
  0% 91/127680 [00:00<05:15, 404.07it/s][A
  0% 152/127680 [00:00<04:44, 448.03it/s][A
  0% 214/127680 [00:00<04:20, 488.65it/s][A
  0% 275/127680 [00:00<04:05, 519.50it/s][A
  0% 330/127680 [00:00<04:02, 526.21it/s][A
  0% 392/127680 [00:00<03:51, 550.63it/s][A
  0% 449/127680 [00:00<03:49, 553.60it/s][A
  0% 508/127680 [00:00<03:46, 561.86it/s][A
  0% 564/127680 [00:01<03:48, 556.09it/s][A
  0% 620/127680 [00:01<04:00, 529.36it/s][A
  1% 679/127680 [00:01<03:52, 546.13it/s][A
  1% 736/127680 [00:01<03:49, 552.80it/s][A
  1% 793/127680 [00:01<03:47, 556.93it/s][A
  1% 849/127680 [00:01<03:49, 553.13it/s][A
  1% 906/127680 [00:01<03:48, 555.47it/s][A
  1% 962/127680 [00:01<03:51, 546.76it/s][A
  1% 1017/127680 [00:01<03:55, 537.76it/s][A
  1% 1071/127680 [00:01<03:55, 536.50it/s][A
  1% 1131/1276