In [1]:
import librosa
import os
import sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import torch
from transformers import Wav2Vec2FeatureExtractor, Wav2Vec2Model
from tqdm import tqdm

In [2]:
# Initialising

# we use the facebook pretrained model
model_name = "facebook/wav2vec2-large-xlsr-53"
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(model_name)
model = Wav2Vec2Model.from_pretrained(model_name)

Some weights of Wav2Vec2Model were not initialized from the model checkpoint at facebook/wav2vec2-large-xlsr-53 and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [3]:
def get_feature(input_audio):
    
    # normilising the input, so that it has zero mean and unit variance
    normalised_input = feature_extractor(input_audio, return_tensors="pt", 
                                         feature_size=1, sampling_rate=sample_rate,
                                        padding=True)

    with torch.no_grad():
        # getting the representation in Wav2Vec2
        outputs = model(normalised_input.input_values, output_hidden_states=True)

    embeddings = outputs.last_hidden_state

    return np.mean(embeddings[0].numpy(), axis=0).astype(np.float32)

In [6]:
test_folder = "data/test_dataset/"
test_directory_list = os.listdir(test_folder)

files_path = []
features = []

for file_path in tqdm(test_directory_list):
    input_audio, sample_rate = librosa.load(test_folder+file_path,  sr=16000)
    features.append(get_feature(input_audio))

features = np.array(features)

100%|███████████████████████████████████████| 1550/1550 [11:47<00:00,  2.19it/s]


In [7]:
import pickle
# save the model to disk
filename = 'data/GradientBoostingClassifier_weights.sav'

# load the model from disk
clf = pickle.load(open(filename, 'rb'))

In [8]:
y_predicted = clf.predict_proba(features)

In [15]:
float(y_predicted[0])

TypeError: only length-1 arrays can be converted to Python scalars

In [21]:
output = ''

for i in range(len(y_predicted)):
    output += test_directory_list[i] + '|' + '|'.join(
        str(val) for val in y_predicted[i]) + "\n"

In [22]:
with open("answers.txt", "w") as f:
    f.write(output)