In [20]:
import os
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, Subset
from transformers import AutoFeatureExtractor, AutoModelForAudioClassification 
import pandas as pd
import librosa
import numpy as np
from tqdm import tqdm

import sys
sys.path.append("..")
from src.models import EModel, Wav2Vec2Facebook

In [21]:
model_name_or_path = "facebook/wav2vec2-base"
feature_extractor = AutoFeatureExtractor.from_pretrained(model_name_or_path)
sampling_rate = feature_extractor.sampling_rate

path = "/home/work/joono/joono/joono/DV_DV.Deep/te7xe6lt/checkpoints/best-checkpoint_oneshot.ckpt"
model = Wav2Vec2Facebook.load_from_checkpoint(path, args={})

Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at facebook/wav2vec2-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


torch.Size([8, 768])
torch.Size([768, 8])
torch.Size([8, 768])
torch.Size([768, 8])
torch.Size([8, 768])
torch.Size([768, 8])
torch.Size([8, 768])
torch.Size([768, 8])
torch.Size([8, 768])
torch.Size([768, 8])
torch.Size([8, 768])
torch.Size([768, 8])
torch.Size([8, 768])
torch.Size([768, 8])
torch.Size([8, 768])
torch.Size([768, 8])
torch.Size([8, 768])
torch.Size([768, 8])
torch.Size([8, 768])
torch.Size([768, 8])
torch.Size([8, 768])
torch.Size([768, 8])
torch.Size([8, 768])
torch.Size([768, 8])
torch.Size([8, 768])
torch.Size([768, 8])
torch.Size([8, 768])
torch.Size([768, 8])
torch.Size([8, 768])
torch.Size([768, 8])
torch.Size([8, 768])
torch.Size([768, 8])
torch.Size([8, 768])
torch.Size([768, 8])
torch.Size([8, 768])
torch.Size([768, 8])
torch.Size([8, 768])
torch.Size([768, 8])
torch.Size([8, 768])
torch.Size([768, 8])
torch.Size([8, 768])
torch.Size([768, 8])
torch.Size([8, 768])
torch.Size([768, 8])
torch.Size([8, 768])
torch.Size([768, 8])
torch.Size([8, 768])
torch.Size([7

In [22]:
def speech_file_to_array_fn(path):
    audio, _ = librosa.load(path, sr=sampling_rate)
    inputs = feature_extractor(audio, sampling_rate=sampling_rate, return_tensors="pt", padding=True)
    return inputs.input_values.squeeze()

In [23]:
# Collate 함수 정의
def collate_fn(batch):
    signals, labels = zip(*batch)
    max_length = max([signal.size(0) for signal in signals])
    padded_signals = torch.zeros(len(signals), max_length)
    for i, signal in enumerate(signals):
        padded_signals[i, :signal.size(0)] = signal
    labels = torch.tensor(labels)
    return padded_signals, labels

In [24]:
class TestDataset(Dataset):
    def __init__(self, df):
        self.df = df

    def __len__(self):
        return self.df.shape[0]
    
    def __getitem__(self, idx):
        path = os.path.join("..", "dataset", self.df.loc[idx, 'path'])
        if not os.path.exists(path):
            raise FileNotFoundError(f"파일을 찾을 수 없습니다: {path}")     
        signal = speech_file_to_array_fn(path)           
        return signal, -1

In [25]:
test_df = pd.read_csv('../dataset/test.csv', index_col=None)
test_df['path'] = '../dataset/' + test_df['path'].str[1:]
test_dataset = TestDataset(test_df)
test_loader = DataLoader(test_dataset, shuffle=False, num_workers=24, batch_size=32, collate_fn=collate_fn)

In [26]:
def inference(model, test_loader, device):
    model.to(device)
    model = model.eval()
    predictions = []
    with torch.no_grad():
        for inputs, labels in tqdm(test_loader):
            
            inputs = inputs.to(device)
            labels = labels.to(device)
            
            probs = model(inputs)

            probs  = probs.cpu().detach().numpy()
            predictions += probs.tolist()
            
    return predictions

In [27]:
preds = inference(model=model, test_loader=test_loader, device='cuda:0')
# preds = model.inference(test_loader=test_loader)

100%|██████████| 1563/1563 [01:19<00:00, 19.63it/s]


In [28]:
submit = pd.read_csv('/home/work/joono/joono/dataset/sample_submission.csv')

max_thres = 0.999
min_thres = 0.001

for i in tqdm(range(len(preds))):
    if      preds[i][0] > max_thres : submit.iloc[i, 1] = 1
    elif    preds[i][0] < min_thres : submit.iloc[i, 1] = 0 
    else                            : submit.iloc[i, 1] = preds[i][0]
    if      preds[i][1] > max_thres : submit.iloc[i, 2] = 1
    elif    preds[i][1] < min_thres : submit.iloc[i, 2] = 0 
    else                            : submit.iloc[i, 2] = preds[i][1]

submit[1000:1050]

  0%|          | 0/50000 [00:00<?, ?it/s]

  else                            : submit.iloc[i, 1] = preds[i][0]
  else                            : submit.iloc[i, 2] = preds[i][1]
100%|██████████| 50000/50000 [00:08<00:00, 6148.72it/s]


Unnamed: 0,id,fake,real
1000,TEST_01000,0.210188,0.901941
1001,TEST_01001,0.068295,0.995438
1002,TEST_01002,0.066197,0.978072
1003,TEST_01003,0.070671,0.364453
1004,TEST_01004,0.366598,0.92162
1005,TEST_01005,0.031512,0.876438
1006,TEST_01006,0.043989,0.938361
1007,TEST_01007,0.280403,0.917152
1008,TEST_01008,0.128964,0.991209
1009,TEST_01009,0.018735,0.925891


In [10]:
submit.to_csv('joono_wav2vec2_lora_train_test_align_test_submit.csv', index=False)

In [None]:
preds