In [1]:
from transformers import Wav2Vec2ForSequenceClassification, Wav2Vec2Processor, Wav2Vec2FeatureExtractor, pipeline
import torch
import soundfile as sf

# # Load the model and processor
model_name = "hafidikhsan/Wav2vec2-large-robust-Pronounciation-Evaluation"
# model_name = "wav2vec2-english-pronunciation"
model = Wav2Vec2ForSequenceClassification.from_pretrained(model_name)
processor = Wav2Vec2FeatureExtractor.from_pretrained(model_name)

# Use a pipeline as a high-level helper
from transformers import pipeline
from huggingface_hub import snapshot_download

# model_name = "hafidikhsan/wav2vec2-large-xlsr-53-english-pronunciation-evaluation-aod-real"
# snapshot_download(model_name, local_dir="./hafidikhsan--wav2vec2-large-xlsr-53-english-pronunciation-evaluation-aod-real")

device = "cuda" if torch.cuda.is_available() else "cpu"
pipe = pipeline(
    "audio-classification", 
    model=model,
    feature_extractor=processor,
    chunk_length_s=30,
    device=device
)


  from .autonotebook import tqdm as notebook_tqdm
Some weights of the model checkpoint at hafidikhsan/Wav2vec2-large-robust-Pronounciation-Evaluation were not used when initializing Wav2Vec2ForSequenceClassification: ['wav2vec2.encoder.pos_conv_embed.conv.weight_g', 'wav2vec2.encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing Wav2Vec2ForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at hafidikhsan/Wav2vec2-large-robust-Pronounciation-Evaluation and are newly initializ

In [2]:
from pydub import AudioSegment
import numpy as np
from collections import Counter

# Load your audio file
file_path = "media/audio/24/CAMERA.wav"
audio_input = AudioSegment.from_file("media/audio/24/CAMERA.wav")

audio_input = audio_input.set_frame_rate(16000).set_channels(1)

audio_array = np.array(audio_input.get_array_of_samples()).astype(np.float32) / 32768.0

chunk_length = 30 * 16000
chunk_overlap = 5 * 16000

correctness = []
classes = ["incorrect", "correct"]
id2label = {
    0: "proficient",
    1: "advanced",
    2: "intermediate",
    3: "beginer"
  }

# for i in range(0, len(audio_array), chunk_length):
#     timestep = i // chunk_length + 1
#     chunk = audio_array[i:(i+chunk_length-chunk_overlap)]

#     inputs = processor(chunk, sampling_rate=16000, return_tensors="pt", padding=True)

#     with torch.no_grad():
#         logits = model(**inputs).logits

#     predicted_class = torch.argmax(logits, dim=-1).item()

#     correctness.append(id2label[predicted_class])
#     print(f"Pronounciation at {timestep} is {id2label[predicted_class]}")

# c = Counter(correctness)
# print(c)
# total = sum(c.values())
# percent = {key: value / total * 100 for key, value in c.items()}
# for key in id2label.values():
#     if key not in percent.keys():
#         percent[key] = 0

In [3]:
# Process the audio input
# print(percent)

In [4]:
def get_frames_from_timestamp(audio, start_timestamp_ms, stop_timestamp_ms):
    start_timestamp_ms = start_timestamp_ms * 1000
    stop_timestamp_ms = stop_timestamp_ms * 1000
    
    # Ensure the start timestamp is within the audio duration
    if start_timestamp_ms >= len(audio):
        raise ValueError("Start timestamp is beyond the audio duration")

    # Slice the audio from the start timestamp
    # sliced_audio = audio[start_timestamp_ms:]

    # Generate frames
    # for i in range(0, len(sliced_audio), frame_duration_ms):
    #     yield 
    return audio[start_timestamp_ms:stop_timestamp_ms]

In [5]:
import math
import json

# logits = pipe(audio_array.copy(), batch_size=2)
# Load your audio file
file_path = "media/audio/24/CAMERA.wav"
audio_input = AudioSegment.from_file("media/audio/24/CAMERA.wav")

audio_input = audio_input.set_frame_rate(16000).set_channels(1)

with open("temp_results.json", "r") as reader:
    timestamp_data = json.load(reader)["data"]

for data in timestamp_data:
    start_timestamp, stop_timestamp = data["timestamp"]
    if stop_timestamp is None:
        stop_timestamp = len(audio_input) / 1000
        
    # start_timestamp_ms = start_timestamp_ms * 1000
    # stop_timestamp_ms = stop_timestamp_ms * 1000
    # frames = audio_input[start_timestamp_ms:].generate_frames_as_segments(frame_duration_ms)

    # for segment, timestamp in get_frames_from_timestamp(audio_input, start_timestamp_ms, frame_duration_ms):
    segment = get_frames_from_timestamp(audio_input, start_timestamp, stop_timestamp)
    segment = np.array(segment.get_array_of_samples()).astype(np.float32) / 32768.0
    inputs = processor(segment, sampling_rate=16000, return_tensors="pt", padding=True)

    with torch.no_grad():
        logits = model(**inputs).logits

    predicted_class = torch.argmax(logits, dim=-1).item()

    correctness.append(id2label[predicted_class])
    
    print(f"Pronounciation at {math.ceil(start_timestamp)}:{int((start_timestamp - int(start_timestamp)) * 60)} - {math.ceil(stop_timestamp)}:{int((stop_timestamp - int(stop_timestamp)) * 60)} is {id2label[predicted_class]}")

Pronounciation at 0:0 - 26:16 is advanced
Pronounciation at 34:26 - 39:26 is advanced
Pronounciation at 40:45 - 44:4 is advanced
Pronounciation at 44:4 - 51:26 is advanced
Pronounciation at 51:26 - 52:26 is advanced
Pronounciation at 52:26 - 59:50 is advanced
Pronounciation at 59:50 - 61:43 is advanced
Pronounciation at 61:43 - 65:0 is advanced
Pronounciation at 66:2 - 67:52 is advanced
Pronounciation at 67:52 - 72:52 is intermediate
Pronounciation at 74:14 - 79:14 is intermediate
Pronounciation at 82:47 - 91:38 is advanced
Pronounciation at 91:38 - 114:40 is advanced
Pronounciation at 114:40 - 127:0 is intermediate
Pronounciation at 127:0 - 141:48 is intermediate
Pronounciation at 141:48 - 151:52 is advanced
Pronounciation at 151:52 - 160:0 is intermediate
Pronounciation at 161:43 - 169:16 is advanced
Pronounciation at 169:16 - 174:31 is advanced
Pronounciation at 174:31 - 180:57 is advanced
Pronounciation at 180:57 - 189:38 is advanced
Pronounciation at 190:26 - 196:45 is advanced
Pr

In [6]:
c = Counter(correctness)
print(c)
total = sum(c.values())
percent = {key: value / total * 100 for key, value in c.items()}
for key in id2label.values():
    if key not in percent.keys():
        percent[key] = 0

Counter({'advanced': 139, 'intermediate': 47, 'beginer': 10, 'proficient': 1})
