In [3]:
# Install the runtime dependencies into the environment of the running kernel.
# `%pip` (instead of `!pip`) targets the interpreter this notebook executes on,
# and `-q` on every line keeps the cell output short.
%pip install -q transformers==4.37.2
%pip install -q bitsandbytes==0.41.3 accelerate==0.25.0
%pip install -q git+https://github.com/openai/whisper.git
# NOTE(review): gradio and gTTS are not pinned; pin them once known-good
# versions have been confirmed so that a fresh run stays reproducible.
%pip install -q gradio
%pip install -q gTTS

  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone


In [4]:
import torch
from transformers import BitsAndBytesConfig, pipeline

# Hugging Face model identifier of the vision-language model used by `img2txt`.
model_id = 'llava-hf/llava-1.5-7b-hf'
# Load the weights in 4-bit precision and run the computation in float16.
quant_config = BitsAndBytesConfig(
    load_in_4bit = True,
    bnb_4bit_compute_dtype = torch.float16
)

# Image-to-text pipeline; the quantization settings are forwarded to the
# model loader through `model_kwargs`.
pipe = pipeline(
    "image-to-text",
    model = model_id,
    model_kwargs = {'quantization_config': quant_config}
)
# Device string used later when loading the Whisper model in `transcribe`;
# it is not passed to the pipeline above.
device = "cuda" if torch.cuda.is_available() else "cpu"



Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [5]:
import whisper
import gradio as gr
import time
import warnings
import os
from gtts import gTTS
from PIL import Image

# Suppress every Python warning for the rest of the session. This is a global
# filter, so it also hides warnings raised by cells further down.
warnings.filterwarnings('ignore')

In [6]:
import datetime

#Log File
# One log file per kernel session. The timestamp is formatted explicitly so
# the file name contains neither spaces nor ':' (which is not allowed in
# file names on Windows).
tstamp = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S.%f")
logfile = f"log_{tstamp}.txt"

def writehistory(text):
    """Append `text` followed by a newline to the session log file.

    Args:
        text: Message to record; it is formatted with `str()` semantics, so
            non-string values are accepted as well.
    """
    # The `with` block closes the file on exit; no explicit close() is needed.
    with open(logfile, 'a', encoding='utf-8') as f:
        f.write(f"{text}\n")

In [27]:
import requests
import re

def img2txt(input_text, input_image):
  """Answer a question about an image with the image-to-text pipeline.

  Args:
    input_text: The user's question as a string. Any non-string value (for
      example the tuple `transcribe` returns when no audio was recorded)
      selects a generic "describe the image" prompt instead.
    input_image: Path or file object accepted by `PIL.Image.open`.

  Returns:
    The text generated after the "Assistant:" marker, 'No response found.'
    when that marker is missing or nothing follows it, or
    'No response generated.' when the pipeline returned no text at all.
  """
  writehistory(f'Input text: {input_text} - Type: {type(input_text)} - Dir: {dir(input_text)}')
  if not isinstance(input_text, str):
    prompt_instructions = """
    Describe the image using as much details as possible.
    You are a helpful AI Assistant who is able to answer questions about the image.
    What is the image all about?
    Now generate the helpful answer."""
  else:
    prompt_instructions = """
    Act as an expert in imagery descriptive analysis, using as much detail as possible from the image
    """ + input_text
  writehistory(f'Prompt Instructions: {prompt_instructions}')
  # NOTE(review): the llava-1.5 model card appears to document an upper-case
  # "USER: ... ASSISTANT:" template - confirm before changing the prompt.
  prompt = "User: <image>\n" + prompt_instructions + "\nAssistant:"

  # Keep the image open only for the duration of the inference call so the
  # underlying file handle is released afterwards.
  with Image.open(input_image) as image:
    outputs = pipe(image, prompt = prompt, generate_kwargs= {"max_new_tokens": 200})

  # Guard against None and an empty result list before indexing into it.
  if not outputs or not outputs[0]['generated_text']:
    return 'No response generated.'

  # DOTALL lets `.` span newlines, so multi-line answers are kept in full
  # instead of being cut at the first line break.
  match = re.search(r'Assistant:\s*(.*)', outputs[0]["generated_text"], re.DOTALL)
  response = match.group(1).strip() if match else ''
  return response if response else 'No response found.'

#response = img2txt('what is this image about?', 'image.png')

In [28]:
# Whisper models already loaded in this kernel, keyed by (name, device), so
# the weights are loaded once instead of on every request.
_WHISPER_MODELS = {}


def _get_whisper_model(name, target_device):
  """Return the Whisper model `name` on `target_device`, loading it on first use."""
  key = (name, target_device)
  if key not in _WHISPER_MODELS:
    _WHISPER_MODELS[key] = whisper.load_model(name, device = target_device)
  return _WHISPER_MODELS[key]


def transcribe(audio):
  """Transcribe a recorded audio file to text with Whisper.

  Args:
    audio: Path of the audio file, or None / '' when nothing was recorded.

  Returns:
    The decoded text as a string. When `audio` is None or '', the tuple
    ('', '', None) is returned instead; `img2txt` uses that non-string value
    to fall back to its generic prompt, so the sentinel is kept as is.
  """
  if audio is None or audio == '':
    return ('','',None)

  model = _get_whisper_model("medium", device)

  # Load the waveform and pad/trim it to the fixed-length window that
  # `whisper.decode` operates on.
  samples = whisper.load_audio(audio)
  samples = whisper.pad_or_trim(samples)

  mel = whisper.log_mel_spectrogram(samples).to(model.device)

  # Half precision is only requested on CUDA; on the CPU fallback the decode
  # runs in float32. No separate language-detection call is made because its
  # result was never used.
  options = whisper.DecodingOptions(fp16 = (model.device.type == "cuda"))
  result = whisper.decode(model, mel, options)
  return result.text

In [29]:
def text2speech(text, file_path):
  """Render `text` as English speech with gTTS and save it to `file_path`.

  Args:
    text: The sentence(s) to speak.
    file_path: Destination of the generated audio file.

  Returns:
    `file_path`, unchanged, so the caller can hand it straight to the UI.
  """
  speech = gTTS(lang = 'en', slow = False, text = text)
  speech.save(file_path)
  return file_path

In [10]:
# Replace `locale.getpreferredencoding` with a stub that always returns
# "UTF-8". According to the original author's note, the shell command in the
# next cell fails with an error message without this override
# (presumably a locale/encoding check of the notebook host - TODO confirm).
import locale
locale.getpreferredencoding = lambda: "UTF-8"

In [11]:
# Create a 10-second silent mono MP3 named audio_file.mp3; the Gradio audio
# output component below is initialised with this file. `-y` overwrites an
# existing file so that re-running the cell does not stop at ffmpeg's
# interactive "Overwrite?" prompt.
!ffmpeg -y -f lavfi -i anullsrc=r=44100:cl=mono -t 10 -q:a 9 -acodec libmp3lame audio_file.mp3

ffmpeg version 4.4.2-0ubuntu0.22.04.1 Copyright (c) 2000-2021 the FFmpeg developers
  built with gcc 11 (Ubuntu 11.2.0-19ubuntu1)
  configuration: --prefix=/usr --extra-version=0ubuntu0.22.04.1 --toolchain=hardened --libdir=/usr/lib/x86_64-linux-gnu --incdir=/usr/include/x86_64-linux-gnu --arch=amd64 --enable-gpl --disable-stripping --enable-gnutls --enable-ladspa --enable-libaom --enable-libass --enable-libbluray --enable-libbs2b --enable-libcaca --enable-libcdio --enable-libcodec2 --enable-libdav1d --enable-libflite --enable-libfontconfig --enable-libfreetype --enable-libfribidi --enable-libgme --enable-libgsm --enable-libjack --enable-libmp3lame --enable-libmysofa --enable-libopenjpeg --enable-libopenmpt --enable-libopus --enable-libpulse --enable-librabbitmq --enable-librubberband --enable-libshine --enable-libsnappy --enable-libsoxr --enable-libspeex --enable-libsrt --enable-libssh --enable-libtheora --enable-libtwolame --enable-libvidstab --enable-libvorbis --enable-libvpx --enab

In [30]:
def process_inputs(audio_path, image_path):
  """Run the voice -> image question -> spoken answer chain for one request.

  Args:
    audio_path: Path of the recorded question, or None when nothing was
      recorded.
    image_path: Path of the uploaded image, or None when no image was given.

  Returns:
    A tuple `(transcript, text_output, audio_output)`: the recognised speech
    ('' when there was none), the textual answer, and the path of the MP3
    file containing the spoken answer.
  """
  #Extracting the text from the audio
  speech2text_output = transcribe(audio_path)

  #Answering the (spoken) question about the image
  if image_path:
    # The raw transcribe() result is passed on unchanged: img2txt uses a
    # non-string value to select its generic description prompt.
    text_output = img2txt(speech2text_output, image_path)
  else:
    text_output = 'No image provided.'

  #Transforming the text to an audio
  # NOTE: the output file name is fixed, so every request overwrites it.
  audio_output = text2speech(text_output, "audio_file.mp3")

  # transcribe() returns a tuple sentinel when no audio was supplied; show an
  # empty transcript in the UI instead of the tuple's repr.
  transcript = speech2text_output if isinstance(speech2text_output, str) else ''
  return transcript, text_output, audio_output

In [31]:
#Gradio interface
import gradio as gr
import base64
import os

# Web UI: microphone recording + image in, transcript + answer text + spoken
# answer out. The three outputs match, in order, the tuple returned by
# `process_inputs`.
iface = gr.Interface(
    fn = process_inputs,
    inputs = [
        gr.Audio(sources=['microphone'], type='filepath'),
        gr.Image(type='filepath')
    ],
    outputs = [
        gr.Textbox(label='Speech to Text'),
        gr.Textbox(label='Text Output'),
        # Initialised with the placeholder MP3 created in an earlier cell.
        gr.Audio('audio_file.mp3')
    ],
    title = 'Voice Assistant for multimodal data',
    description = 'Upload an image and interact via voice input and audio response'
)
# debug=True keeps this cell running so errors and logs stay visible in the
# notebook; interrupt the cell to stop the server.
iface.launch(debug=True)

Setting queue=True in a Colab notebook requires sharing enabled. Setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
Running on public URL: https://4b458d054a83c12ffe.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)


Keyboard interruption in main thread... closing server.
Killing tunnel 127.0.0.1:7860 <> https://4b458d054a83c12ffe.gradio.live


