In [4]:
import os 

import torch 
import torch.nn 
import torch.nn as nn 
from torch.utils.data import Dataset, DataLoader, random_split
from torch.nn.utils.rnn import pad_sequence
import pickle

from transformers import WhisperProcessor, WhisperTokenizer, WhisperForConditionalGeneration, AdamW, get_linear_schedule_with_warmup
import matplotlib.pyplot as plt
from tqdm.auto import tqdm
import nlptutti as metrics
import torch.optim as optim
from tqdm.auto import tqdm 

import whisper
#from my_model import Mymodel
from data_utils import MinyoDataset, custom_collate_fn, get_wer


from my_model import Mymodel

In [10]:
# Dataset locations (absolute paths on the training machine).
# NOTE(review): audio_path is not referenced by any cell visible in this notebook.
audio_path = '/home/daewoong/userdata/danbi/final_tts_audio'
lyric_path = '/home/daewoong/userdata/danbi/final_lyrics_data/'
each_lyric = '/home/daewoong/userdata/danbi/each_song_lyrics.txt'
result_path = '/home/daewoong/userdata/danbi/encoder_result'

# Read the filtered-id pickle through a context manager so the file handle is
# closed deterministically instead of being left open until garbage collection.
# NOTE: pickle.load can execute arbitrary code; only load files you trust.
with open('/home/daewoong/userdata/danbi/thirty_second_filtered_id.pkl', 'rb') as id_file:
  filtered_id_list = pickle.load(id_file)

print('download token now')

# Whisper processor configured for Korean transcription with timestamp prediction.
processor = WhisperProcessor.from_pretrained("openai/whisper-large-v2", language="ko", task="transcribe", predict_timestamps=True)
dataset = MinyoDataset(result_path, lyric_path, processor, filtered_id_list, each_lyric, max_len = 1024, random_ratio=0.0)
print('token download complete')

# train_size = int(len(dataset) * 0.8)
# valid_size = len(dataset) - train_size

# train_data, valid_data = random_split(dataset, [train_size, valid_size])

# train_dataloader = DataLoader(train_data, batch_size=4, shuffle=True, collate_fn=custom_collate_fn, num_workers=8, pin_memory=True)
# valid_dataloader = DataLoader(valid_data, batch_size=4, shuffle=False, collate_fn=custom_collate_fn, num_workers=8, pin_memory=True)


download token now
token download complete


In [18]:
# Inspect one entry of dataset.text_process. The recorded output is a 3-tuple of
# strings (presumably target line, surrounding lyric text, lyric id — TODO confirm).
dataset.text_process[3005]

('우리는 왜 이리 밤낮없이 자고나먼 땅만 파고허여도',
 '우리 어머니는 나를 날적에\n고비 나물을 잡쉈나\n우리 어머니는 나를 길러서\n무엇땜이 날 갖다 이런 데다 성궈놓고\n넘들은 이짓 않고 잘살고 잘먹고\n이지경으로 사나\n우리는 왜 이리 밤낮없이\n아이구 내신세야 내 팔자야\n이짓도 않고 팔자좋게 잘지내두만\n내가 뭣땜이 이런 데로와서\n고대광실 높은 집에 앉아서\n날 고생을 이리나 시겨 주시나야\n자고나먼 땅만 파고허여도\n이런 것을 허니라고 이고생을 하고 사나야\n내 팔자야 내 신세야',
 'lyric6241-3')

In [4]:
# Load the Whisper large-v2 checkpoint from a local file onto the CPU.
# The cells below use it as the source of pretrained weights and dimensions.
pre_model = whisper.load_model("/home/daewoong/userdata/danbi/whisper_pretrain/large-v2.pt", device='cpu')

In [5]:
# Configure the tokenizer's prefix tokens for Korean transcription with timestamps.
processor.tokenizer.set_prefix_tokens(language="ko", task="transcribe", predict_timestamps=True)
# pre_model.config.forced_decoder_ids = None
# pre_model.config.suppress_tokens = []



# Targets equal to -100 are excluded from the loss (ignore_index).
criterion = nn.CrossEntropyLoss(ignore_index = -100)

device = 'cuda'
# NOTE(review): `epoch` is not referenced by any cell visible in this notebook.
epoch = 4
# model_dims is bound to pre_model.dims without copying, so the n_ref_* attributes
# set below are written onto that same object.
model_dims = pre_model.dims
model_dims.n_ref_encoder_layer = 2
model_dims.n_ref_decoder_layer = 2
model_dims.n_ref_text_ctx = 1024
model_dims.n_ref_text_state = 240

model = Mymodel(model_dims)

# Initialise from the pretrained checkpoint: audio encoder, decoder token embedding,
# decoder blocks and decoder positional embedding. The reference encoder's token
# embedding is initialised from the decoder's token embedding as well.
# NOTE(review): no other decoder parameters (e.g. a final layer norm, if Mymodel
# has one) are copied here — confirm this is intended.
model.encoder.load_state_dict(pre_model.encoder.state_dict())
model.decoder.token_embedding.load_state_dict(pre_model.decoder.token_embedding.state_dict())
model.decoder.blocks.load_state_dict(pre_model.decoder.blocks.state_dict())
model.decoder.positional_embedding.data=pre_model.decoder.positional_embedding.data.clone()
model.ref_encoder.token_embedding.load_state_dict(pre_model.decoder.token_embedding.state_dict())


<All keys matched successfully>

In [26]:
# Pull a single batch for interactive debugging.
# NOTE(review): the only visible construction of train_dataloader is commented out
# in the data-loading cell, so this fails on a fresh kernel unless it is defined elsewhere.
audio, audio_attn, input_text, input_txt_attn, around_text, around_txt_attn = next(iter(train_dataloader))

In [27]:
# Decoder input: every token except the last one.
x_input_text = input_text[:,:-1] #[batch, seq_len-1]
# train_batch += input_text.size(0)
# Targets: the same sequence shifted left by one (every token except the first),
# so position i of x_input_text is paired with token i+1 of input_text.
true_input_text = input_text[:,1:] #[batch, seq_len-1]
x_input_text.shape

torch.Size([4, 39])

In [28]:
#model.to(device)
  
# Freeze the audio encoder and the decoder blocks: none of their parameters
# will receive gradients.
for param in [*model.encoder.parameters(), *model.decoder.blocks.parameters()]:
  param.requires_grad = False

#pred = model(audio.to(device), around_text.to(device), tokens=x_input_text.to(device))

In [29]:
# Move the model to the notebook-wide `device` instead of a hard-coded 'cuda'
# string, so the model and the inputs below always share the same device.
model.to(device)
#tokens: torch.Tensor, audio_features: torch.Tensor, ref_features:
# Forward pass on the debug batch; `pred` holds the model output for x_input_text.
pred = model(mel = audio.to(device), ref_text = around_text.to(device), tokens=x_input_text.to(device))

tensor([[[ 1.2509e+00, -1.5443e+00, -2.4126e-01,  ..., -1.1996e+00,
          -1.9290e-01,  3.1803e-01],
         [-1.1485e+00, -1.5220e-02,  9.6799e-02,  ..., -5.6689e-01,
           6.1495e-01, -1.3621e+00],
         [-1.3679e+00,  7.5911e-01, -1.5911e+00,  ...,  9.6563e-01,
          -1.0292e+00,  2.2303e+00],
         ...,
         [ 4.2903e-01, -1.5980e-01,  2.9957e-01,  ...,  7.5604e-01,
          -6.0803e-01, -8.2439e-01],
         [ 1.7943e-01, -7.7869e-01, -3.4541e+00,  ...,  1.3234e+00,
          -1.5520e+00,  1.4877e+00],
         [ 1.1394e-01, -1.8901e-01, -5.0344e-01,  ...,  1.9427e-03,
          -5.4099e-01,  2.1125e+00]],

        [[ 1.2496e+00, -1.5443e+00, -2.4209e-01,  ..., -1.1966e+00,
          -1.8885e-01,  3.1650e-01],
         [-1.1493e+00, -1.6606e-02,  9.5269e-02,  ..., -5.6438e-01,
           6.1983e-01, -1.3643e+00],
         [-1.3681e+00,  7.5855e-01, -1.5930e+00,  ...,  9.6777e-01,
          -1.0241e+00,  2.2276e+00],
         ...,
         [ 4.2799e-01, -1

In [12]:
# Displays the raw output tensor; `pred.shape` gives a more compact view.
pred

tensor([[[ 7.7596e-01, -2.0786e-01,  1.1276e+00,  ...,  1.0815e+00,
           8.8370e-01,  2.1638e+00],
         [-5.1833e-01,  2.0370e-02,  9.3164e-01,  ...,  7.9126e-01,
           1.6226e-01,  1.5217e+00],
         [ 2.2917e+00,  2.0330e+00,  2.0590e+00,  ...,  1.0363e+00,
           1.1996e+00,  1.1691e-01],
         ...,
         [ 1.1230e+00,  5.9915e-01,  1.5143e+00,  ...,  2.4483e-01,
          -2.9402e-01, -2.2606e-03],
         [ 1.1755e-01,  1.6448e-03,  5.7252e-01,  ...,  2.7672e-01,
           1.8164e-01,  2.9102e-01],
         [ 2.1178e+00,  1.0049e+00,  4.0970e-01,  ..., -1.0399e+00,
          -1.0569e+00, -2.1100e+00]]], device='cuda:0',
       grad_fn=<UnsafeViewBackward0>)

In [30]:
# Sanity check: pred and the targets should agree on the leading [batch, seq_len] axes.
pred.shape, true_input_text.shape

(torch.Size([4, 39, 51865]), torch.Size([4, 39]))

In [35]:
# Greedy decode: take the argmax over the last axis, then detokenise without special tokens.
processor.batch_decode(pred.argmax(-1), skip_special_tokens=True)

['�� 밥도 해� 준다 해를 해를 째매야 하냐 부� 부� 부� 부� 부� 부� 부� 부� 부� 부�',
 '신이 잘라�나요요 세상에 누가 잘�나 산천초목에 불��는 것은 산직기나 꺼주��만은 당신',
 '��� 벨� 때 앞서서 나간 사람들은 뒤에 처�진 사람들에게 높은 뒔로 쫓아 들어가라고 하면서 하면서 같이 가자자는 말을 한다',
 ' 이 어디 어디 어디 올라서민서 잡아당겨요 이러이� 처� 파� �으로 이 넘으로�� 올라 가자자 이러 이러 어 어 어 어 어 어 어']

In [36]:
# Decode the reference token ids for a side-by-side comparison with the prediction.
processor.batch_decode(true_input_text, skip_special_tokens=True)

['부모 밥도 해 준다 해를 해를 째매야 하냐',
 '당신이 잘났나 요 세상에 누가 잘 나 산천초목에 불붙는 것은 산지기나 꺼주련만은',
 '벼를 벨 때 앞서서 나간 사람들은 뒤에 쳐진 사람들에게 높은 디로 쫓아 들어가라 고 하면서 같이 가자는 말을 한다',
 '어 이러 어디 어디 어디 올라서민서 잡아 당기어 이러 이러 저 바우 넘으로 이 넘어로 올라가자 이러 이러']

In [39]:
# Edit statistics for this batch: word-level (n, s, d, i) followed by character-level (n, s, d, i).
wers_n, wers_s, wers_d, wers_i, cers_n, cers_s, cers_d, cers_i  = get_wer(pred, true_input_text, processor)

In [48]:
# Word-level insertion count accumulated over the batch.
wers_i

17

In [62]:
# Target shape: [batch, seq_len-1].
true_input_text.shape

torch.Size([4, 39])

In [73]:
# Flatten pred to 2-D, keeping the last axis, to match the flattened targets passed to criterion.
pred.view(-1, pred.size(-1)).shape

torch.Size([156, 51865])

In [77]:
# Replace targets whose shifted attention mask is not 1 with -100 (criterion's
# ignore_index), flatten, and check the resulting shape.
(true_input_text.masked_fill(input_txt_attn[:, 1:].ne(1) , -100)).view(-1).shape

torch.Size([156])

In [80]:
# Cross-entropy over the flattened batch; positions set to -100 are ignored by criterion.
criterion(pred.view(-1, pred.size(-1)), (true_input_text.masked_fill(input_txt_attn[:, 1:].ne(1) , -100)).view(-1).to(device))

tensor(5.4191, device='cuda:0', grad_fn=<NllLossBackward0>)

In [88]:
# Run only the audio encoder on the batch to inspect its output.
ttt = model.encoder(audio.to(device))

In [89]:
# Shape of the encoder output.
ttt.shape

torch.Size([4, 1500, 1280])

In [92]:
# dtype of the encoder output.
ttt.dtype

torch.float32

In [95]:
# NOTE(review): the recorded run of this cell raised CUDA OutOfMemoryError.
# `audio` is the same batch that is fed directly to model.encoder above, so
# audio[0] is presumably an already-computed feature tensor rather than a raw
# waveform — confirm what input Mymodel.transcribe expects.
model.transcribe(audio[0])

OutOfMemoryError: CUDA out of memory. Tried to allocate 6.71 GiB (GPU 0; 47.54 GiB total capacity; 39.73 GiB already allocated; 5.32 GiB free; 40.52 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [None]:
# Overwrite target positions whose shifted attention mask is not 1 (presumably
# padding) with -100 so that criterion ignores them. After this cell
# true_input_text contains -100 entries, which are not valid token ids.
true_input_text = true_input_text.masked_fill(input_txt_attn[:, 1:].ne(1) , -100)

# pred: [batch, seq_len, vocab_size]
loss = criterion(pred.view(-1, pred.size(-1)), true_input_text.view(-1).to(device))


In [59]:
# Total number of reference words in the batch.
wers_n

57

In [60]:
# Batch-level WER = (substitutions + deletions + insertions) / reference words.
# Raises ZeroDivisionError when wers_n is 0.
(wers_s + wers_d + wers_i) / wers_n

0.8245614035087719

In [37]:

# NOTE: once this cell runs, this definition shadows the get_wer imported from
# data_utils at the top of the notebook.
def get_wer(pred, target, processor):
  """Accumulate word- and character-level edit statistics for one batch.

  Args:
    pred: Model output scores. `argmax(-1)` is taken over the last axis to get
      the predicted token ids.
    target: Reference token ids, one row per sample. Entries equal to -100
      (the loss `ignore_index`) are replaced with the tokenizer's pad id before
      decoding, because -100 is not a valid token id.
    processor: Object providing `batch_decode(ids, skip_special_tokens=True)`
      and, optionally, `tokenizer.pad_token_id`.

  Returns:
    Tuple `(wers_n, wers_s, wers_d, wers_i, cers_n, cers_s, cers_d, cers_i)`:
    reference word count, word substitutions / deletions / insertions,
    reference character count (spaces excluded) and character
    substitutions / deletions / insertions, each summed over the batch.
  """
  # Make loss-masked labels decodable. This assumes the pad token is dropped by
  # skip_special_tokens=True (TODO confirm for the tokenizer in use).
  pad_id = getattr(getattr(processor, 'tokenizer', None), 'pad_token_id', None)
  if pad_id is not None and hasattr(target, 'masked_fill'):
    target = target.masked_fill(target.eq(-100), pad_id)

  pred_list = processor.batch_decode(pred.argmax(-1), skip_special_tokens=True)
  target_list = processor.batch_decode(target, skip_special_tokens=True)

  wers_n = wers_s = wers_d = wers_i = 0
  cers_n = cers_s = cers_d = cers_i = 0

  # Distinct loop names so the `pred` / `target` arguments are not shadowed.
  for hyp_text, ref_text in zip(pred_list, target_list):
    # Denominators: characters are counted without spaces, words by whitespace split.
    cers_n += len(ref_text.replace(" ", ""))
    wers_n += len(ref_text.split())

    result_wer = metrics.get_wer(ref_text, hyp_text)
    result_cer = metrics.get_cer(ref_text, hyp_text)

    wers_s += result_wer['substitutions']
    wers_d += result_wer['deletions']
    wers_i += result_wer['insertions']

    cers_s += result_cer['substitutions']
    cers_d += result_cer['deletions']
    cers_i += result_cer['insertions']

  return wers_n, wers_s, wers_d, wers_i, cers_n, cers_s, cers_d, cers_i 

In [19]:
# Cross-entropy over the flattened batch. The targets follow the logits' device
# instead of a hard-coded 'cuda' string, so both operands always match.
criterion(pred.view(-1, pred.size(-1)), true_input_text.view(-1).to(pred.device))

tensor(5.1554, device='cuda:0', grad_fn=<NllLossBackward0>)

In [10]:
# Move the model to the configured device and run only the reference-text encoder
# on the surrounding-lyrics tokens to inspect its output.
model.to(device)
model.ref_encoder(around_text.to(device))

tensor([[[-0.0014,  0.0030, -0.0087,  ..., -0.0003,  0.0002,  0.0159],
         [-0.0063, -0.0056, -0.0209,  ..., -0.0677, -0.0222,  0.0315],
         [-0.0130, -0.0174, -0.0148,  ..., -0.0014,  0.0152,  0.0038],
         ...,
         [-0.0122,  0.0368, -0.0029,  ...,  0.0075, -0.0381,  0.0289],
         [-0.0122,  0.0368, -0.0029,  ...,  0.0075, -0.0381,  0.0289],
         [-0.0122,  0.0368, -0.0029,  ...,  0.0075, -0.0381,  0.0289]]],
       device='cuda:0', grad_fn=<EmbeddingBackward0>)
tensor([[[ 0.6196,  1.9599,  0.7894,  ...,  0.3508, -1.2241,  0.4873],
         [ 0.1104, -0.6934,  0.3184,  ..., -0.7842, -0.1767,  0.3938],
         [ 2.1164,  0.1866,  0.1350,  ...,  0.2515,  0.5931,  0.6284],
         ...,
         [ 0.6805,  0.2545, -0.9995,  ..., -2.1879, -1.0458,  0.2098],
         [ 0.8337,  0.5682, -1.2653,  ..., -0.1606, -1.2800, -0.7900],
         [ 0.0692,  0.1358, -0.0836,  ..., -1.4638,  0.1160,  1.1889]]],
       device='cuda:0', grad_fn=<AddBackward0>)
tensor([[[-0.2

tensor([[[ 1.6014, -2.5540, -2.5017,  ..., -0.4078,  0.8544, -2.2407],
         [-0.3728, -0.9477, -0.2267,  ...,  0.8787,  1.2461,  0.1079],
         [ 0.4283,  2.0171, -0.1819,  ...,  1.8734,  1.3777, -1.2547],
         ...,
         [ 1.7700, -0.9116,  1.1887,  ..., -0.7926, -1.3691,  0.6146],
         [-1.5806, -0.8774, -0.7879,  ...,  1.7944,  0.7011, -1.1102],
         [-1.8294,  0.0158,  0.7911,  ...,  0.3893, -0.2903,  3.3937]]],
       device='cuda:0', grad_fn=<NativeLayerNormBackward0>)

In [28]:
# NOTE(review): leftover probe. The recorded run raised TypeError because forward
# requires `mel`, `ref_text` and `tokens`.
model.forward()

TypeError: forward() missing 3 required positional arguments: 'mel', 'ref_text', and 'tokens'

In [None]:
pred.shape # [batch, seq_len, vocab_size]; torch.Size([4, 39, 51865]) in the recorded run

In [None]:
# Target shape, for comparison with pred's leading axes.
true_input_text.shape

In [23]:
# Decoder-input shape: [batch, seq_len-1].
x_input_text.shape

torch.Size([1, 16])

In [27]:
# Displays the bound method's repr, which prints the whole Whisper module tree.
pre_model.forward

<bound method Whisper.forward of Whisper(
  (encoder): AudioEncoder(
    (conv1): Conv1d(80, 1280, kernel_size=(3,), stride=(1,), padding=(1,))
    (conv2): Conv1d(1280, 1280, kernel_size=(3,), stride=(2,), padding=(1,))
    (blocks): ModuleList(
      (0): ResidualAttentionBlock(
        (attn): MultiHeadAttention(
          (query): Linear(in_features=1280, out_features=1280, bias=True)
          (key): Linear(in_features=1280, out_features=1280, bias=False)
          (value): Linear(in_features=1280, out_features=1280, bias=True)
          (out): Linear(in_features=1280, out_features=1280, bias=True)
        )
        (attn_ln): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
        (mlp): Sequential(
          (0): Linear(in_features=1280, out_features=5120, bias=True)
          (1): GELU(approximate='none')
          (2): Linear(in_features=5120, out_features=1280, bias=True)
        )
        (mlp_ln): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
      )
      (

In [None]:
# Cross-entropy over the flattened batch. The targets are moved to the logits'
# device first; the other loss cells in this notebook do this, and without it a
# CUDA `pred` paired with CPU targets raises a device-mismatch error.
criterion(pred.view(-1, pred.size(-1)), true_input_text.view(-1).to(pred.device))

In [None]:
import pandas