***SPEECH RECOGNITION***

In [1]:
!pip install gTTS speechrecognition pydub


Collecting gTTS
  Downloading gTTS-2.5.4-py3-none-any.whl.metadata (4.1 kB)
Collecting speechrecognition
  Downloading SpeechRecognition-3.11.0-py2.py3-none-any.whl.metadata (28 kB)
Collecting pydub
  Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Downloading gTTS-2.5.4-py3-none-any.whl (29 kB)
Downloading SpeechRecognition-3.11.0-py2.py3-none-any.whl (32.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m32.8/32.8 MB[0m [31m22.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pydub-0.25.1-py2.py3-none-any.whl (32 kB)
Installing collected packages: pydub, speechrecognition, gTTS
Successfully installed gTTS-2.5.4 pydub-0.25.1 speechrecognition-3.11.0


In [None]:
from google.colab import files
import speech_recognition as sr
from pydub import AudioSegment
import os

# Step 1: Function to upload the audio file
def upload_audio_file():
    """
    Ask the user to upload an audio file through Colab's upload dialog.

    Returns:
        The name of the first uploaded file, i.e. the first key of the
        mapping returned by ``files.upload()``.
    """
    print("Please upload your audio file (in mp3, wav, or aac format):")
    uploaded_files = files.upload()

    # Only the first uploaded entry is used; any additional files are ignored.
    first_name = [*uploaded_files.keys()][0]
    print(f"File uploaded: {first_name}")
    return first_name

# Step 2: Function to convert MP3 or other formats to WAV
def convert_to_wav(audio_file):
    """
    Make sure the given audio file is available in WAV format.

    MP3 and AAC inputs are decoded with pydub and exported next to the source
    file as ``<name>.wav``; WAV inputs are returned unchanged. The extension
    check is case-insensitive, and only the trailing extension is swapped, so
    a name such as ``take.mp3.final.mp3`` becomes ``take.mp3.final.wav``.

    Args:
        audio_file: Path (or bare file name) of the audio file.

    Returns:
        The path of the WAV file, or None when the extension is not one of
        MP3, AAC or WAV.
    """
    base_name, extension = os.path.splitext(audio_file)
    audio_format = extension.lower().lstrip(".")

    if audio_format == "wav":
        # If the audio is already in WAV format, return it as is
        print("Audio is already in WAV format.")
        return audio_file

    if audio_format in ("mp3", "aac"):
        # One decode path for both compressed formats; pydub picks the decoder
        # from the explicit format name.
        sound = AudioSegment.from_file(audio_file, format=audio_format)
        wav_file = base_name + ".wav"
        sound.export(wav_file, format="wav")
        print(f"Audio file converted to WAV: {wav_file}")
        return wav_file

    print("Unsupported file format. Only MP3, WAV, or AAC files are supported.")
    return None

# Step 3: Function to convert audio to text using Speech Recognition
def convert_audio_to_text(audio_file):
    """
    Transcribe an audio file (WAV format) with Google's Speech API.

    Args:
        audio_file: Path or file name of the WAV file. A path that exists as
            given is used directly; otherwise the name is looked up under
            Colab's ``/content`` directory.

    Returns:
        The recognized text, or None when the audio could not be read,
        understood, or sent to the recognition service (a message describing
        the failure is printed in each case).
    """
    recognizer = sr.Recognizer()

    # Prefer the path exactly as the caller supplied it (absolute paths, or a
    # working directory other than /content); fall back to the Colab upload
    # directory for bare file names that are not found.
    if os.path.exists(audio_file):
        audio_path = audio_file
    else:
        audio_path = os.path.join('/content', audio_file)

    try:
        with sr.AudioFile(audio_path) as source:
            print("Processing the audio file...")
            audio = recognizer.record(source)  # Read the entire audio file

        # Recognizing speech using Google's API
        print("Recognizing speech...")
        text = recognizer.recognize_google(audio)
        print(f"Transcription: {text}")
        return text

    except sr.UnknownValueError:
        print("Could not understand the audio")
        return None
    except sr.RequestError as e:
        print(f"Error with the request; {e}")
        return None
    except Exception as e:
        # Deliberate catch-all (e.g. missing or unreadable file) so the
        # notebook reports the problem instead of stopping with a traceback.
        print(f"An unexpected error occurred: {e}")
        return None

# Step 4: Putting everything together and handling file formats
def main():
    """
    Run the whole pipeline: upload an audio file, make sure it is WAV, then
    transcribe it and print the result.
    """
    uploaded_name = upload_audio_file()
    wav_name = convert_to_wav(uploaded_name)

    # Conversion signals failure with a falsy return value.
    if not wav_name:
        print("Audio file conversion failed. Please upload a valid MP3, AAC, or WAV file.")
        return

    transcript = convert_audio_to_text(wav_name)

    # Transcription likewise signals failure with a falsy return value.
    if not transcript:
        print("Transcription failed. Please try again with a clearer audio file.")
        return

    print("Final transcription result:")
    print(transcript)
# Entry point: run the upload -> convert -> transcribe pipeline when this cell
# is executed directly (the guard keeps it from running on import).
if __name__ == "__main__":
    main()


Please upload your audio file (in mp3, wav, or aac format):


Saving harvard.wav to harvard (2).wav
File uploaded: harvard (2).wav
Audio is already in WAV format.
Processing the audio file...
Recognizing speech...
Transcription: the stale smell of old beer lingers it takes heat to bring out the odor a cold dip restores health and zest a salt pickle taste fine with ham tacos al pastor are my favorite a zestful food is the hot cross bun
Final transcription result:
the stale smell of old beer lingers it takes heat to bring out the odor a cold dip restores health and zest a salt pickle taste fine with ham tacos al pastor are my favorite a zestful food is the hot cross bun


***SPEAKER VERIFICATION***

In [None]:
!pip install speechbrain






In [None]:
# Install necessary libraries
!pip install speechbrain

# Import necessary libraries
from google.colab import files
import shutil
import os
from speechbrain.pretrained import SpeakerRecognition

def _request_audio_upload(prompt):
    """
    Print ``prompt``, open Colab's upload dialog, and return what was uploaded.

    Returns:
        A ``(uploaded, name)`` tuple: the object returned by ``files.upload()``
        and its first key, which the rest of this cell uses as the file name.

    Raises:
        ValueError: If nothing was uploaded.
    """
    print(prompt)
    uploaded = files.upload()
    if not uploaded:
        raise ValueError("No audio file was uploaded; re-run this cell and choose a file.")
    return uploaded, next(iter(uploaded))


# Collect both recordings before moving anything: while the first file is still
# in the working directory, a second upload with the same name is saved under a
# different name instead of replacing it.
audio_file_1, audio_file_1_name = _request_audio_upload(
    "Please upload the first audio file (e.g., audio1.wav or audio1.mp3):"
)
audio_file_2, audio_file_2_name = _request_audio_upload(
    "Please upload the second audio file (e.g., audio2.wav or audio2.mp3):"
)

# Move the files to a working directory (optional but helps in organizing)
audio_dir = '/content/audio_files'
os.makedirs(audio_dir, exist_ok=True)

audio_path_1 = os.path.join(audio_dir, audio_file_1_name)
audio_path_2 = os.path.join(audio_dir, audio_file_2_name)
shutil.move(audio_file_1_name, audio_path_1)
shutil.move(audio_file_2_name, audio_path_2)

# Both uploads now live in /content/audio_files
print(f"First file: {audio_file_1_name} and second file: {audio_file_2_name} have been uploaded successfully.")

# Step 1: Load the pre-trained speaker recognition model
speaker_recognizer = SpeakerRecognition.from_hparams(source="speechbrain/spkrec-ecapa-voxceleb", savedir="tmpdir")

# Step 2: Perform speaker verification
print("Verifying if the speakers are the same...")

# Compare the two audio files; only `prediction` drives the message below,
# `score` is kept for inspection in later cells.
score, prediction = speaker_recognizer.verify_files(audio_path_1, audio_path_2)

# Step 3: Output the result
if prediction == 1:
    print("The speakers are the same.")
else:
    print("The speakers are different.")


Please upload the first audio file (e.g., audio1.wav or audio1.mp3):


Saving harvard.wav to harvard (3).wav
Please upload the second audio file (e.g., audio2.wav or audio2.mp3):


Saving F_0101_10y4m_1.wav to F_0101_10y4m_1 (1).wav
First file: harvard (3).wav and second file: F_0101_10y4m_1 (1).wav have been uploaded successfully.


hyperparams.yaml:   0%|          | 0.00/1.92k [00:00<?, ?B/s]

  wrapped_fwd = torch.cuda.amp.custom_fwd(fwd, cast_inputs=cast_inputs)


embedding_model.ckpt:   0%|          | 0.00/83.3M [00:00<?, ?B/s]

mean_var_norm_emb.ckpt:   0%|          | 0.00/1.92k [00:00<?, ?B/s]

classifier.ckpt:   0%|          | 0.00/5.53M [00:00<?, ?B/s]

label_encoder.txt:   0%|          | 0.00/129k [00:00<?, ?B/s]

  state_dict = torch.load(path, map_location=device)
  stats = torch.load(path, map_location=device)


Verifying if the speakers are the same...
The speakers are different.


***LANGUAGE DETECTION***

In [None]:
!pip install speechrecognition langdetect


Collecting langdetect
  Downloading langdetect-1.0.9.tar.gz (981 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m981.5/981.5 kB[0m [31m11.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: langdetect
  Building wheel for langdetect (setup.py) ... [?25l[?25hdone
  Created wheel for langdetect: filename=langdetect-1.0.9-py3-none-any.whl size=993221 sha256=6db5a3e3c8fd0fb2719a1cd5f268b676676f23c6094d23d27f4cd9cad2dd979e
  Stored in directory: /root/.cache/pip/wheels/95/03/7d/59ea870c70ce4e5a370638b5462a7711ab78fba2f655d05106
Successfully built langdetect
Installing collected packages: langdetect
Successfully installed langdetect-1.0.9


In [None]:
import speech_recognition as sr
from langdetect import detect

# Function to recognize speech from an audio file
def recognize_speech(audio_path, language="en-US"):
    """
    Transcribe an audio file with the Google Web Speech API.

    Args:
        audio_path: Path to an audio file that ``sr.AudioFile`` can open.
        language: Language tag passed to the recognizer. The default,
            "en-US", keeps existing calls unchanged. Pass another tag
            (e.g. "es-ES") when the speech is not English; otherwise the
            service transcribes it with its English model.

    Returns:
        The recognized text, or None when the audio could not be understood
        or the recognition service could not be reached.
    """
    recognizer = sr.Recognizer()
    audio = sr.AudioFile(audio_path)

    # Read the whole file into memory before sending it for recognition.
    with audio as source:
        audio_data = recognizer.record(source)

    try:
        # Recognize speech using Google Web Speech API
        text = recognizer.recognize_google(audio_data, language=language)
        print(f"Recognized text: {text}")
        return text
    except sr.UnknownValueError:
        print("Sorry, I could not understand the audio")
        return None
    except sr.RequestError as e:
        print(f"Could not request results from Google Speech Recognition service; {e}")
        return None

# Function to detect language
def detect_language(text):
    """
    Detect the language of ``text`` with langdetect.

    Args:
        text: The text to classify.

    Returns:
        The detected language code (e.g. "en", "es"), or None when langdetect
        cannot classify the text (for example, text with no usable letters).
    """
    from langdetect import DetectorFactory, LangDetectException

    # langdetect is randomized by default; pinning the seed makes repeated
    # runs on the same text return the same language.
    DetectorFactory.seed = 0

    try:
        language = detect(text)
    except LangDetectException as e:
        print(f"Could not detect language: {e}")
        return None

    print(f"Detected language: {language}")
    return language

# Main function to upload an audio file and identify the language
def main():
    """
    Upload a single audio file, transcribe it, and report the language of the
    transcript.
    """
    from google.colab import files

    uploads = files.upload()  # Upload the audio file

    # Exactly one file is expected; anything else is rejected up front.
    if len(uploads) != 1:
        print("Please upload exactly one audio file.")
        return

    # Unpack the sole key, which is the uploaded file's name.
    (uploaded_name,) = uploads
    print(f"File uploaded: {uploaded_name}")

    transcript = recognize_speech(uploaded_name)
    if not transcript:
        print("Could not extract text for language identification.")
        return

    # Detect the language of the recognized text
    detect_language(transcript)

# Entry point: run the language-identification flow when this cell is executed directly.
if __name__ == "__main__":
    main()


Saving La población de Madr (2).wav to La población de Madr (2) (1).wav
File uploaded: La población de Madr (2) (1).wav
Recognized text: capital
Detected language: it
