In [8]:
# Required Libraries
# !pip install webrtcvad pydub faster-whisper transformers

# Step 1 -> Voice-to-Text Conversion using Whisper

In [9]:
# Importing Libraries
import wave
import webrtcvad
from pydub import AudioSegment
from faster_whisper import WhisperModel
import os

In [10]:
# Convert audio file (M4A or MP3) to WAV and ensure correct format
def convert_audio_to_wav(input_file, wav_file):
    file_extension = os.path.splitext(input_file)[1].lower()

    if file_extension in [".m4a", ".mp3"]:
        audio = AudioSegment.from_file(input_file, format=file_extension[1:])
        audio = audio.set_frame_rate(32000).set_channels(1)    # Sample rate 32KHz (determines how many audio samples are taken per second)
        audio.export(wav_file, format="wav")
    else:
        raise ValueError("Unsupported file format. Please provide a .m4a or .mp3 file.")

In [11]:
# Read WAV file
def read_wave(path):
    with wave.open(path, 'rb') as wf:
        pcm_data = wf.readframes(wf.getnframes())
        sample_rate = wf.getframerate()
    return pcm_data, sample_rate

# Frame generator for VAD
def frame_generator(frame_duration_ms, audio, sample_rate):
    n = int(sample_rate * (frame_duration_ms / 1000.0) * 2)
    offset = 0
    while offset + n <= len(audio):
        yield audio[offset:offset + n]
        offset += n

In [12]:
# Apply VAD filtering
def vad_filter(audio_path, vad_mode=1, vad_threshold=0.5):
    vad = webrtcvad.Vad(vad_mode)
    pcm_data, sample_rate = read_wave(audio_path)

    frames = frame_generator(30, pcm_data, sample_rate)           # Frame duration 30ms
    frames_with_voice = [frame for frame in frames if vad.is_speech(frame, sample_rate)]

    filtered_audio = b''.join(frames_with_voice)
    return filtered_audio, sample_rate

# Save filtered audio to WAV
def save_wave(file_name, pcm_data, sample_rate):
    with wave.open(file_name, 'wb') as wf:
        wf.setnchannels(1)          # Audio Channel:1 (mono)
        wf.setsampwidth(2)          # Sample width 2 bytes (16bit)
        wf.setframerate(sample_rate)
        wf.writeframes(pcm_data)

In [13]:
# Transcribe audio using faster-whisper
def transcribe_audio(input_file):
    # Convert to WAV
    wav_path = "temp_audio.wav"
    convert_audio_to_wav(input_file, wav_path)

    # Apply VAD filter
    filtered_audio, sample_rate = vad_filter(wav_path, vad_mode=1, vad_threshold=0.5)        # VAD Threshold 0.5

    # Save filtered audio
    filtered_audio_path = "filtered_audio.wav"
    save_wave(filtered_audio_path, filtered_audio, sample_rate)

    # Load Whisper model
    model = WhisperModel("base")

    # Transcribe the filtered audio
    segments, info = model.transcribe(filtered_audio_path, language="en", beam_size=5)

    # Collect transcription
    transcribed_text = " ".join(segment.text for segment in segments)

    return transcribed_text

In [14]:
# Transcribing Text
audio_path = "Recording.m4a"  # Replace with your audio file (.m4a or .mp3)
transcribed_text = transcribe_audio(audio_path)
print("Transcribed Text:", transcribed_text)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/2.31k [00:00<?, ?B/s]

vocabulary.txt:   0%|          | 0.00/460k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.20M [00:00<?, ?B/s]

model.bin:   0%|          | 0.00/145M [00:00<?, ?B/s]

Transcribed Text:  What is machine learning?


# Step 2 -> Text Input into LLM

In [15]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# Load the pre-trained LLaMA model and tokenizer
model_name = "openlm-research/open_llama_3b"  # Model name
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

# Move model to GPU if available
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

You are using the default legacy behaviour of the <class 'transformers.models.llama.tokenization_llama.LlamaTokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565 - if you loaded a llama tokenizer from a GGUF file you can ignore this message
You are using the default legacy behaviour of the <class 'transformers.models.llama.tokenization_llama_fast.LlamaTokenizerFast'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggin

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 3200, padding_idx=0)
    (layers): ModuleList(
      (0-25): 26 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear(in_features=3200, out_features=3200, bias=False)
          (k_proj): Linear(in_features=3200, out_features=3200, bias=False)
          (v_proj): Linear(in_features=3200, out_features=3200, bias=False)
          (o_proj): Linear(in_features=3200, out_features=3200, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=3200, out_features=8640, bias=False)
          (up_proj): Linear(in_features=3200, out_features=8640, bias=False)
          (down_proj): Linear(in_features=8640, out_features=3200, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): LlamaRMSNorm()
      )
    )
    (norm): LlamaRMSNorm()
    (rot

In [17]:
# Tokenize the input text
inputs = tokenizer(transcribed_text, return_tensors="pt").to(device)

# Generate a concise response from the LLaMA model
outputs = model.generate(
    inputs.input_ids,
    num_return_sequences=1,     # Ensure only one response is generated
    do_sample=True,             # Enable sampling to use top_k, top_p, and temperature
    top_k=50,                   # Consider only the top 50 tokens to reduce randomness
    top_p=0.9,                  # Nucleus sampling with top_p (cumulative probability)
    temperature=0.7,            # Lower temperature for less randomness
    no_repeat_ngram_size=3,     # Prevent repeating any 3-word sequences
    max_new_tokens=200             # Limit the number of new tokens generated
)

# Decode the response
response = tokenizer.decode(outputs[0], skip_special_tokens=True)

print(response)

What is Machine Learning?
The term Machine Learning refers to the process of using algorithms and statistical models to automate decision making. The goal of this type of learning is to improve the accuracy and efficiency of algorithms and systems.
How is Machine learning used in the Financial Services industry?
Machine learning has been used in many different fields. One example is in the financial services industry. In this industry, machine learning can be used to make predictions about the future. For example, a bank might use machine learning to predict the likelihood that a customer will default on a loan. This type of prediction can help the bank make better decisions about which customers to lend to and which ones to avoid.
Another example of machine learning in the finance industry is the use of algorithms to make trading decisions. Algorithms are programs that are designed to make decisions based on the information available to them. For instance, an algorithm might be design

In [18]:
# Remove the transcribed text from the beginning of the response
if response.startswith(transcribed_text):
    response = response[len(transcribed_text):].strip()

print("LLM Response:", response)

LLM Response: The term Machine Learning refers to the process of using algorithms and statistical models to automate decision making. The goal of this type of learning is to improve the accuracy and efficiency of algorithms and systems.
How is Machine learning used in the Financial Services industry?
Machine learning has been used in many different fields. One example is in the financial services industry. In this industry, machine learning can be used to make predictions about the future. For example, a bank might use machine learning to predict the likelihood that a customer will default on a loan. This type of prediction can help the bank make better decisions about which customers to lend to and which ones to avoid.
Another example of machine learning in the finance industry is the use of algorithms to make trading decisions. Algorithms are programs that are designed to make decisions based on the information available to them. For instance, an algorithm might be designed to buy an

#Step 3 -> Text To Speech Conversion

In [23]:
# !pip install git+https://github.com/huggingface/parler-tts.git

In [24]:
import torch
from parler_tts import ParlerTTSForConditionalGeneration
from transformers import AutoTokenizer
import soundfile as sf
import numpy as np

# Function to limit the number of sentences
def limit_sentences(text, max_sentences=2):
    sentences = text.split('.')
    limited_text = '.'.join(sentences[:max_sentences]) + '.'
    return limited_text

# Function to apply Voice Activity Detection (VAD)
def apply_vad(audio_arr, vad_threshold=0.01):
    # Assuming audio_arr is normalized between -1 and 1
    # Thresholding audio to remove low-energy segments
    audio_arr = np.where(np.abs(audio_arr) > vad_threshold, audio_arr, 0)
    return audio_arr

# Function to convert text to speech with tunable parameters
def text_to_speech(prompt, description, output_file, pitch=1.0, gender='female', speed=1.0, vad_threshold=0.01):
    device = "cuda:0" if torch.cuda.is_available() else "cpu"

    # Limit the description to a maximum of 2 sentences
    description = limit_sentences(description)

    # Adjust the description based on the gender
    if gender == 'male':
        description = description.replace("female", "male")
    else:
        description = description.replace("male", "female")

    # Load the model and tokenizer
    model = ParlerTTSForConditionalGeneration.from_pretrained("parler-tts/parler-tts-large-v1").to(device)
    tokenizer = AutoTokenizer.from_pretrained("parler-tts/parler-tts-large-v1")

    # Tokenize the description and prompt
    input_ids = tokenizer(description, return_tensors="pt").input_ids.to(device)
    prompt_input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(device)

    # Generate speech
    with torch.no_grad():
        generation = model.generate(input_ids=input_ids, prompt_input_ids=prompt_input_ids)

    # Convert generated audio to numpy array
    audio_arr = generation.cpu().numpy().squeeze()

    # Apply Voice Activity Detection (VAD)
    audio_arr = apply_vad(audio_arr, vad_threshold=vad_threshold)

    # Adjust pitch and speed - Placeholder
    audio_arr = np.interp(audio_arr, (audio_arr.min(), audio_arr.max()), (-1, 1))  # Normalize audio for pitch adjustment

    # Save the audio file
    sf.write(output_file, audio_arr, model.config.sampling_rate)
    print(f"Audio saved to {output_file}")

In [25]:
text_prompt = response
voice_description = response
output_filename = "output.wav"

# Adjust tunable parameters here
text_to_speech(text_prompt, voice_description, output_filename, pitch=1.0, gender='male', speed=1.0, vad_threshold=0.01)


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Audio saved to output.wav
