In [None]:
!pip install -r requirements.txt

In [None]:
import pandas as pd
from datasets import load_dataset
# Common Voice is made up of voice recordings that people donated online.
# By using it you agree not to attempt to determine the identity of any speaker.
dataset = load_dataset(
    path="common_voice",
    name="zh-HK",
    split="validated",
)

In [None]:
#references https://github.com/scottykwok/cantonese-selfish-project/blob/master/Part4_wav2vec2/Run_wav2vec2_Cantonese.ipynb
import numpy as np
import torchaudio
import torch
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
from functools import lru_cache

# Load the pretrained checkpoint; the processor and the CTC model are both
# fetched from the same model id.
checkpoint = "scottykwok/wav2vec2-large-xlsr-cantonese"
processor = Wav2Vec2Processor.from_pretrained(checkpoint)
model = Wav2Vec2ForCTC.from_pretrained(checkpoint)

In [None]:
# Build a DataFrame from the Common Voice "validated" split.
# `results` is not present in the data, so pandas creates it as an all-NaN
# column; it is filled in by a later cell.
df = pd.DataFrame(
    {'path': dataset['path'], 'sentence': dataset['sentence']},
    columns=['path', 'sentence', 'results'],
)

# Keep only rows whose sentence, converted to str, is longer than 8 characters.
is_long_enough = df['sentence'].map(str).str.len().gt(8)

# The original called `df.reset_index()` without using the result, so the index
# was never actually reset. Assign it back, and drop the old index instead of
# keeping it as an extra column. Note that the saved CSV index is now 0..n-1.
df = df[is_long_enough].reset_index(drop=True)

# Last expression of the cell: preview without the `path` column.
# This does not modify `df`; the `path` column is still needed for inference.
df.drop(columns='path')

In [None]:
# Work on the first rows only (DataFrame.head() returns 5 rows by default).
# `.copy()` makes this an independent frame, so assigning the `results` column
# later does not write into a slice of the full table (SettingWithCopyWarning).
df = df.head().copy()

# Build one resampling transform per distinct (input rate, target rate) pair and reuse it.
@lru_cache(maxsize=None)
def resampler(rate, target_rate=16000):
    """Return a cached ``torchaudio.transforms.Resample`` for ``rate`` -> ``target_rate``.

    Args:
        rate: Sample rate of the incoming audio, as reported by the loader.
        target_rate: Sample rate to convert to. Defaults to 16000, the rate
            this notebook passes to the processor.

    Returns:
        A ``torchaudio.transforms.Resample`` instance. Because of ``lru_cache``,
        repeated calls with the same arguments return the same object.
    """
    return torchaudio.transforms.Resample(rate, target_rate)

# Transcribe every clip listed in `df['path']` and store the text in `df['results']`.
# NOTE: a commented-out experiment that loaded the MP3s through AudioSegment
# (a Windows workaround) used to live here; it referenced undefined names and was removed.
predictions = []
for path in df['path']:
    # Per the torchaudio docs, load() returns a (channels, frames) tensor and the
    # file's sample rate.
    audio_input, sample_rate = torchaudio.load(path)

    # Convert to the 16 kHz rate that is passed to the processor below.
    audio_input = resampler(sample_rate)(audio_input)

    # The processor expects a 1-D waveform per example, not a (channels, frames)
    # tensor. Averaging over the channel axis leaves a mono clip unchanged and
    # down-mixes a multi-channel one.
    speech = audio_input.mean(dim=0).numpy()

    # Normalise/pad the waveform and return a PyTorch tensor.
    input_values = processor(speech, sampling_rate=16000, return_tensors="pt").input_values

    # Inference only: skip autograd bookkeeping to save memory and time.
    with torch.no_grad():
        logits = model(input_values).logits

    # Greedy CTC decoding: most likely token per frame, then decode to text.
    predicted_ids = torch.argmax(logits, dim=-1)
    predictions.append(processor.decode(predicted_ids[0]))

# assign() returns a new frame, so this never writes into a slice of another frame.
df = df.assign(results=predictions)

# Last expression of the cell: preview without the `path` column (df keeps it).
df.drop(columns='path')

In [None]:
# Alternate path for checking that result-saving works without running the model:
# the reference sentences are used as stand-in "results".
# The fill only happens when `results` is still empty, so running the whole
# notebook top to bottom no longer overwrites real model predictions with the
# reference text just before the CSV is written.
df = df.head().copy()  # .copy(): do not assign into a slice of another frame
if df['results'].isna().all():
    predictions = df['sentence'].tolist()
    df['results'] = predictions

# Last expression of the cell: preview without the `path` column (df keeps it).
df.drop(columns='path')

In [None]:
# Write the frame to disk, index included (pandas' default).
df.to_csv(path_or_buf='tested.csv', index=True)