In [None]:
# Install the Wav2Vec2 code from GitHub (presumably the source of the `wav2vec2` package imported below).
!pip3 install -q git+https://github.com/vasudevgupta7/gsoc-wav2vec2@main
# NOTE(review): SpeechRecognition is installed but not imported in the cells shown here — confirm it is still needed.
!pip install SpeechRecognition
# Audio/video helpers: ffmpeg-python backs `import ffmpeg`; MoviePy and pydub are used by the *_2_wav converters.
!pip install ffmpeg-python
!pip install MoviePy
!pip install pydub

# Download and unpack the pretrained model archive; the "saved-model" path is what hub.KerasLayer loads afterwards.
!wget https://huggingface.co/vasudevgupta/gsoc-wav2vec2-960h/resolve/main/saved-model.tar.gz
!tar -xf saved-model.tar.gz

import soundfile as sf
import os
import tensorflow as tf
import tensorflow_hub as hub
import subprocess
import ffmpeg
import librosa
from pydub import AudioSegment
from google.colab import files
from moviepy.editor import AudioFileClip

from wav2vec2 import Wav2Vec2Processor

# Load the local "saved-model" path as a Keras layer; tf_forward calls this `model` for inference.
model = hub.KerasLayer("saved-model")


@tf.function(jit_compile=True)
def tf_forward(speech):
    """Run `model` in inference mode and pick the highest-scoring index along the last axis.

    `speech` is handed to `model` unchanged; every size-1 dimension of the
    arg-max result is squeezed away before it is returned.
    """
    scores = model(speech, training=False)
    best_ids = tf.argmax(scores, axis=-1)
    return tf.squeeze(best_ids)


# Two Wav2Vec2Processor instances: the tokenizer-mode one decodes model output into text,
# the other is applied to raw audio inside preprocess_speech.
tokenizer = Wav2Vec2Processor(is_tokenizer=True)
processor = Wav2Vec2Processor(is_tokenizer=False)

# Fixed input length, in samples, that preprocess_speech truncates / zero-pads to.
# At the 16 kHz rate produced by preprocess_audio this is 246000 / 16000 = 15.375 s.
AUDIO_MAXLEN = 246000
# When False, preprocess_speech returns the processed audio without truncating or padding it.
DO_PADDING = True

#run the processor on a waveform and fix its length to AUDIO_MAXLEN samples
def preprocess_speech(audio):
    """Turn a raw waveform into a batched float32 tensor for the model.

    The waveform is converted to a float32 tensor, passed through `processor`,
    and given a leading batch axis. When DO_PADDING is set, the second axis is
    cut to AUDIO_MAXLEN entries and right-padded with zeros up to exactly
    AUDIO_MAXLEN; otherwise the processed tensor is returned as is.
    """
    batch = processor(tf.constant(audio, dtype=tf.float32))[None]
    if not DO_PADDING:
        return batch

    batch = batch[:, :AUDIO_MAXLEN]
    missing = AUDIO_MAXLEN - batch.shape[1]
    zeros = tf.zeros((batch.shape[0], missing), dtype=batch.dtype)
    return tf.concat([batch, zeros], axis=-1)

#convert files from webm to wav format
def webm_2_wav(name):
    """Convert a .webm file to a .wav file with the same stem and return its path.

    Args:
        name: Path of the .webm file to convert.

    Returns:
        str: Path of the written .wav file, like the other *_2_wav converters
        return (callers pass it on to split_wav, which needs a path).
    """
    # splitext only strips the final extension, so "a.b.webm" and "./rec.webm"
    # keep their full stem.
    output_file = os.path.splitext(name)[0] + ".wav"
    wav = AudioSegment.from_file(name, format="webm")
    # export() returns the open output file object; close it here instead of
    # leaking it (or returning it in place of the path).
    wav.export(output_file, format="wav").close()
    return output_file

#convert files from mp4 to wav format
def mp4_2_wav(name):
    """Extract the audio track of an .mp4 file into a 16 kHz, 16-bit PCM stereo .wav.

    Args:
        name: Path of the .mp4 file to convert.

    Returns:
        str: Path of the written .wav file (same stem as `name`).

    Raises:
        subprocess.CalledProcessError: if ffmpeg exits with a non-zero status.
    """
    input_file = name
    # splitext only strips the final extension, so dotted names and "./" paths
    # keep their full stem.
    output_file = os.path.splitext(name)[0] + ".wav"
    # -y overwrites an existing output so re-running the cell does not leave a
    # stale file behind; check=True surfaces ffmpeg failures here instead of
    # returning a path that was never written.
    subprocess.run(
        ['ffmpeg', '-y', '-i', input_file, '-acodec', 'pcm_s16le', '-ar', '16000', '-ac', '2', output_file],
        check=True,
    )
    return output_file

#convert files from mov to wav format
def mov_2_wav(name):
    """Write the audio track of a .mov file to a .wav file with the same stem.

    Args:
        name: Path of the .mov file to convert.

    Returns:
        str: Path of the written .wav file.
    """
    input_file = name
    # splitext only strips the final extension, so dotted names and "./" paths
    # keep their full stem.
    output_file = os.path.splitext(name)[0] + ".wav"

    audio_clip = AudioFileClip(input_file)
    try:
        audio_clip.write_audiofile(output_file)
    finally:
        # Release the clip's reader even if writing fails.
        audio_clip.close()
    return output_file

#converter between webm|mp4|flac|mov and wav
def converter(uploaded):
    """Return the path of an audio file for the first uploaded file.

    Only the first key of `uploaded` is considered. .wav and .flac files are
    returned unchanged; .webm, .mp4 and .mov files are converted to .wav by the
    matching *_2_wav helper. The extension is matched case-insensitively.

    Args:
        uploaded: Mapping keyed by uploaded file name (as produced by
            `files.upload()` in this notebook).

    Returns:
        str: Path of the file to feed into split_wav.

    Raises:
        ValueError: if `uploaded` is empty or the extension is not supported
            (previously this fell through and returned None).
    """
    if not uploaded:
        raise ValueError("no file was uploaded")

    name = next(iter(uploaded))
    extension = name.rsplit(".", 1)[-1].lower()

    if extension in ("wav", "flac"):
        return name
    if extension == "webm":
        return webm_2_wav(name)
    if extension == "mp4":
        return mp4_2_wav(name)
    if extension == "mov":
        return mov_2_wav(name)
    raise ValueError(
        f"unsupported file type for {name!r}; expected wav, flac, webm, mp4 or mov"
    )


#split an audio file into consecutive .wav files (15 seconds each by default)
def split_wav(audio, segment_duration=15000):
    """Cut an audio file into consecutive fixed-length .wav segment files.

    Segments are written next to the input as "<stem>_segment<i>.wav"; the last
    one holds whatever remains and may be shorter.

    Args:
        audio: Path of the audio file to split.
        segment_duration: Segment length in milliseconds. The default of
            15000 ms is 240000 samples at 16 kHz, which fits within
            AUDIO_MAXLEN (246000).

    Returns:
        list[str]: Paths of the written segment files, in order.

    Raises:
        ValueError: if `segment_duration` is not positive.
    """
    if segment_duration <= 0:
        raise ValueError("segment_duration must be a positive number of milliseconds")

    input_file = audio

    # from_file() detects the format instead of forcing the WAV reader, so the
    # .flac files that converter() passes through unchanged can be loaded too.
    recording = AudioSegment.from_file(input_file)
    stem = os.path.splitext(input_file)[0]

    names_of_segments = []
    for i, start in enumerate(range(0, len(recording), segment_duration)):
        output_file = stem + f"_segment{i}.wav"
        # export() returns the open output file object; close it right away.
        recording[start:start + segment_duration].export(output_file, format="wav").close()
        names_of_segments.append(output_file)
    return names_of_segments

#load an audio file as a mono waveform sampled at 16 kHz
def preprocess_audio(output_file):
    """Read an audio file and return a one-dimensional waveform at 16 kHz.

    Args:
        output_file: Path of the audio file to read.

    Returns:
        The samples as a 1-D array; multi-channel input is downmixed and
        anything not already at 16 kHz is resampled.
    """
    speech, samplerate = sf.read(output_file)

    if speech.ndim > 1:
        # Average every channel: adding only channels 0 and 1 dropped any
        # further channels and doubled the amplitude of stereo input.
        speech = speech.mean(axis=1)
    #Resample to 16khz
    if samplerate != 16000:
        speech = librosa.resample(speech, orig_sr=samplerate, target_sr=16000)
    return speech

  Preparing metadata (setup.py) ... [?25l[?25hdone
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
--2023-03-25 15:18:09--  https://huggingface.co/vasudevgupta/gsoc-wav2vec2-960h/resolve/main/saved-model.tar.gz
Resolving huggingface.co (huggingface.co)... 35.173.225.216, 34.203.133.210, 54.82.45.103, ...
Connecting to huggingface.co (huggingface.co)|35.173.225.216|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://cdn-lfs.huggingface.co/vasudevgupta/gsoc-wav2vec2-960h/2a93d38e08cf94ca6c9e5501ac61ea72aa29e244ef66a767024b70080478de4f?response-content-disposition=attachment%3B+filename*%3DUTF-8%27%27saved-mode

In [None]:
# Ask the user to upload a file through Colab; converter() reads the first key of `uploaded` as the file name.
uploaded = files.upload()

Saving rec12.mp4 to rec12.mp4


In [None]:
# Convert the upload to an audio file if needed, cut it into segment files,
# then load each segment as a mono 16 kHz waveform.
output_file = converter(uploaded)
wavs = split_wav(output_file)
preprocessed_wavs = [preprocess_audio(wav) for wav in wavs]

In [None]:
#translate speech into text, one segment at a time
transcripts = []
for segment in preprocessed_wavs:
    audio = preprocess_speech(segment)
    tf_out = tf_forward(audio)
    # Strip each piece so the join below yields exactly one space per boundary.
    transcripts.append(tokenizer.decode(tf_out.numpy().tolist()).strip())

# Join with a space: concatenating the pieces directly fused the last word of
# one segment with the first word of the next.
script = " ".join(transcripts)

In [None]:
# Display the assembled transcript as the cell output.
script

"PITHON IS A HIGH LEVEL INTERPRETED GENERAL PURPOSE PROCAMMING LANGUAGE DESIGN FOR A CODE READABILITY A PYTHON PROGRAMME IS A SEQUENCE OF DEFINITIONS AND COMMANDS WHICH CAN ALSO BE KNOWN AS SCRITS ITHON SOURCE COAT FILES WITH THE DOT PIE EXTENSIONAN BE THOUGHT OF AS MODULES AND YOU CAN FIND YOURSELF WRITING MULTIPLE MODULES FOR LARGER PROGAL WHEN A PYTHON PROGRAMME IS RUN THE PYTHON INTERPRETER WHICH IS A PYTHON VIRTUAL MACHINE THAT CAN READ PYTHON CODE E VALUATES THE DEFINITIONS AND EXECUTES THE COMMANDS INTHESCRIPS IT DOES THIS LINE BY LINE KEEP IN MIND THE INTERPRETER HAS TO BE INSTALLED ON THE COMPUTER YOU WANT TO RUN YOUR PYTHON CODE OR ELSE IT WON'T BE ABLE TO UNDERSTAND IT I THOUGHT COMMANDS CAN BE CONSIDERED A STATEMENT THAT INSTRUCTS ANER TO DO SOMETHING THERE IS NOCOMPILED TIME CHECKING LIKE INSTATICALLY TYPE LANGUAGES SO UNFORTUNATELY THAT DOES MEAN TYPE ERRORS ARMLY CAUGHT DURING COMPILED TIME FOR STATICALLY TYPE LANGUAGES AN FIND THEIR WAY TO BE RUN TIME ERRORS IN PYVHON O